<a href="https://www.kaggle.com/code/tgomesjuliana/crossfit-competitions-and-athletes?scriptVersionId=135961436" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Data Cleaning

## Imports

In [1]:
import pandas as pd
import os

## Functions

In [2]:
def selecting_columns_athletes_information(year, competition):
    """Load and clean one athletes-information CSV.

    Reads ``df_{year}_{competition}_athletes_information.csv`` from
    ``../input/crossfit-competitions`` (relative to the working directory),
    keeps only the columns listed in the body, converts the numeric ones to
    float and tags every row with the year and competition passed in.

    Args:
        year: Value interpolated into the file name (e.g. ``"2021"``); it is
            also stored unchanged in the ``year`` column.
        competition: Value interpolated into the file name (e.g. ``"open"``);
            it is also stored unchanged in the ``competition`` column.

    Returns:
        A ``pandas.DataFrame`` with the selected columns plus ``year`` and
        ``competition``.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        ValueError: If a selected column is missing from the file, or a
            numeric column still holds a non-numeric value after the
            placeholders handled below have been replaced.
    """
    file_path = os.path.join(
        "..", "input", "crossfit-competitions",
        f"df_{year}_{competition}_athletes_information.csv",
    )

    # Columns converted to float vs. columns kept as raw text.
    numeric_columns = ['competitorId', 'age', 'overallRank', 'overallScore', 'affiliateId']
    text_columns = ['competitorName', 'firstName', 'lastName', 'gender', 'height',
                    'weight', 'countryOfOriginName', 'regionName', 'affiliateName']

    # Read everything as object first: the numeric columns contain textual
    # placeholders that must be replaced before they can be cast to float.
    df = pd.read_csv(file_path, usecols=numeric_columns + text_columns, dtype='object')

    # Assign through .loc instead of `df[col].replace(..., inplace=True)`:
    # the chained in-place form mutates a temporary under pandas
    # Copy-on-Write and would leave the placeholders in the frame.

    # "DQ" (disqualified) becomes the sentinel rank -1.
    df.loc[df['overallRank'] == "DQ", 'overallRank'] = -1

    # A literal "None" affiliate id becomes a real missing value.
    df.loc[df['affiliateId'] == "None", 'affiliateId'] = float('nan')

    df[numeric_columns] = df[numeric_columns].astype(float)

    # Record where the rows came from.
    df['year'] = year
    df['competition'] = competition

    return df
# Columns dropped from every athletes-information file:
# 'status', 'postCompStatus', 'profilePicS3key', 'countryOfOriginCode', 'countryShortCode', 'regionId',
# 'divisionId', 'teamCaptain'

In [3]:
def selecting_columns_athletes_scores(year, competition):
    """Load and clean one athletes-scores CSV.

    Reads ``df_{year}_{competition}_athletes_scores.csv`` from
    ``../input/crossfit-competitions`` (relative to the working directory),
    keeps only the columns listed in the body, converts the numeric ones to
    float and tags every row with the year and competition passed in.

    Args:
        year: Value interpolated into the file name (e.g. ``"2021"``); it is
            also stored unchanged in the ``year`` column.
        competition: Value interpolated into the file name (e.g. ``"open"``);
            it is also stored unchanged in the ``competition`` column.

    Returns:
        A ``pandas.DataFrame`` with the selected columns plus ``year`` and
        ``competition``.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        ValueError: If a selected column is missing from the file, or a
            numeric column still holds a non-numeric value after the
            placeholders handled below have been replaced.
    """
    file_path = os.path.join(
        "..", "input", "crossfit-competitions",
        f"df_{year}_{competition}_athletes_scores.csv",
    )

    # Columns converted to float vs. columns kept as raw text.
    numeric_columns = ['competitorId', 'ordinal', 'rank', 'score', 'valid']
    text_columns = ['scoreDisplay']

    # Read everything as object first: the numeric columns contain textual
    # placeholders that must be replaced before they can be cast to float.
    df = pd.read_csv(file_path, usecols=numeric_columns + text_columns, dtype='object')

    # Assign through .loc instead of `df[col].replace(..., inplace=True)` /
    # `df[col].fillna(..., inplace=True)`: the chained in-place forms mutate
    # a temporary under pandas Copy-on-Write and would leave the frame as-is.

    # "WD" (withdrawn) becomes the sentinel rank -1.
    df.loc[df['rank'] == "WD", 'rank'] = -1

    # A missing 'valid' flag is treated as 0.
    df.loc[df['valid'].isna(), 'valid'] = 0

    df[numeric_columns] = df[numeric_columns].astype(float)

    # Record where the rows came from.
    df['year'] = year
    df['competition'] = competition

    return df
# Columns dropped from the athletes-scores files:
# - In all files: 'scoreIdentifier', 'mobileScoreDisplay', 'scaled', 'video', 'heat', 'lane', 'breakdown'
# - In quarterfinals and semifinals files: 'judge', 'judge_user_id', 'affiliate', 'time'

## Athletes Information

In [4]:
# Load the cleaned athletes-information table for every season and stage.
# The generator yields the frames year by year and, within a year, stage by
# stage, which is exactly the order of the names being unpacked into.
(
    df_2021_open_athletes_information,
    df_2021_quarterfinals_athletes_information,
    df_2021_semifinals_athletes_information,
    df_2021_games_athletes_information,
    df_2022_open_athletes_information,
    df_2022_quarterfinals_athletes_information,
    df_2022_semifinals_athletes_information,
    df_2022_games_athletes_information,
    df_2023_open_athletes_information,
    df_2023_quarterfinals_athletes_information,
    df_2023_semifinals_athletes_information,
    df_2023_games_athletes_information,
) = (
    selecting_columns_athletes_information(season, stage)
    for season in ("2021", "2022", "2023")
    for stage in ("open", "quarterfinals", "semifinals", "games")
)

## Athletes Scores

In [5]:
# Load the cleaned athletes-scores table for each (year, competition) pair
# listed below; every call reads one CSV and returns its own DataFrame.
df_2021_open_athletes_scores = selecting_columns_athletes_scores("2021", "open")
df_2021_quarterfinals_athletes_scores = selecting_columns_athletes_scores("2021", "quarterfinals")
df_2021_semifinals_athletes_scores = selecting_columns_athletes_scores("2021", "semifinals")
df_2021_games_athletes_scores = selecting_columns_athletes_scores("2021", "games")
df_2022_open_athletes_scores = selecting_columns_athletes_scores("2022", "open")
df_2022_quarterfinals_athletes_scores = selecting_columns_athletes_scores("2022", "quarterfinals")
df_2022_semifinals_athletes_scores = selecting_columns_athletes_scores("2022", "semifinals")
df_2022_games_athletes_scores = selecting_columns_athletes_scores("2022", "games")
df_2023_open_athletes_scores = selecting_columns_athletes_scores("2023", "open")
df_2023_quarterfinals_athletes_scores = selecting_columns_athletes_scores("2023", "quarterfinals")
df_2023_semifinals_athletes_scores = selecting_columns_athletes_scores("2023", "semifinals")