<a href="https://www.kaggle.com/code/tgomesjuliana/crossfit-competitions-data-analysis?scriptVersionId=136053324" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Imports

In [1]:
import pandas as pd
import math
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Functions

In [2]:
def clean_height(height):
    """Convert a raw height string to whole centimeters, rounded up.

    Strings containing 'cm' (e.g. '186 cm') are taken as centimeters;
    strings containing 'in' (e.g. '78 in') are taken as inches and
    multiplied by 2.54. 'cm' is checked before 'in'. The numeric part is
    whatever precedes the first occurrence of the unit.

    Args:
        height: Raw height value, normally a string; may be missing.

    Returns:
        An int number of centimeters (via math.ceil), or float('nan') when
        the value is missing, is not a string, carries no recognized unit,
        or its numeric part cannot be parsed as a finite number.
    """
    # Missing values (NaN/None) and stray non-string values have no unit to
    # inspect; treat them all as unknown instead of raising on `'cm' in ...`.
    if not isinstance(height, str):
        return float('nan')

    if 'cm' in height:
        unit, to_cm = 'cm', 1.0
    elif 'in' in height:
        unit, to_cm = 'in', 2.54
    else:
        return float('nan')

    try:
        value = float(height.split(unit)[0].strip())
        # math.ceil raises on nan/inf, which is handled below as unparseable.
        return math.ceil(value * to_cm)
    except (ValueError, OverflowError):
        # A malformed numeric part (e.g. 'cm', '5 ft 10 in') must not abort a
        # whole-column .apply(); report it as unknown like other bad input.
        return float('nan')

In [3]:
# Function to clean the weight values
def clean_weight(weight):
    """Convert a raw weight string to whole kilograms, rounded up.

    Strings containing 'kg' (e.g. '75 kg') are taken as kilograms; strings
    containing 'lb' (e.g. '210 lb') are taken as pounds and divided by
    2.205. 'kg' is checked before 'lb'. The numeric part is whatever
    precedes the first occurrence of the unit.

    Args:
        weight: Raw weight value, normally a string; may be missing.

    Returns:
        An int number of kilograms (via math.ceil), or float('nan') when
        the value is missing, is not a string, carries no recognized unit,
        or its numeric part cannot be parsed as a finite number.
    """
    # Missing values (NaN/None) and stray non-string values have no unit to
    # inspect; treat them all as unknown instead of raising on `'kg' in ...`.
    if not isinstance(weight, str):
        return float('nan')

    try:
        if 'kg' in weight:
            return math.ceil(float(weight.split('kg')[0].strip()))
        if 'lb' in weight:
            weight_lb = float(weight.split('lb')[0].strip())
            # Convert the weight from 'lb' to 'kg'
            return math.ceil(weight_lb / 2.205)
    except (ValueError, OverflowError):
        # A malformed numeric part (or nan/inf, which math.ceil rejects) must
        # not abort a whole-column .apply(); report it as unknown.
        return float('nan')

    # No recognized unit in the string.
    return float('nan')

# Exploratory Data Analysis

## Athletes Information

In [4]:
# Define the list of years and competitions
years = ['2021', '2022', '2023']
competitions = ['open', 'quarterfinals', 'semifinals', 'games']

# Read every year/competition file into a list and concatenate once at the
# end. Calling pd.concat inside the loop re-copies all rows accumulated so far
# on every iteration and starts from an empty DataFrame.
frames = []
for year in years:
    for competition in competitions:
        # Construct the file path for each dataset
        file_path = f"../input/crossfit-competitions/df_{year}_{competition}_athletes_information.csv"

        # Read the dataset into a temporary DataFrame
        temp_data = pd.read_csv(file_path)
        frames.append(temp_data)

combined_data = pd.concat(frames, ignore_index=True)

# Replace value in 'height' column for competitorId 901503 with '186 cm'
combined_data.loc[combined_data['competitorId'] == 901503, 'height'] = '186 cm'

# Apply clean_height function to create 'height_cm' column
combined_data['height_cm'] = combined_data['height'].apply(clean_height)

# Keep only rows with 150 < height_cm < 200 (NaN heights compare False and are
# dropped too). Take an explicit copy so the column assignment below writes to
# an independent frame rather than a slice of the unfiltered one, which is the
# pattern that triggers pandas' SettingWithCopyWarning.
valid_height = (combined_data['height_cm'] > 150) & (combined_data['height_cm'] < 200)
combined_data = combined_data[valid_height].copy()

# Apply clean_weight function to create 'weight_kg' column
combined_data['weight_kg'] = combined_data['weight'].apply(clean_weight)

# Keep only rows with 50 < weight_kg < 110 (NaN weights are dropped too)
valid_weight = (combined_data['weight_kg'] > 50) & (combined_data['weight_kg'] < 110)
combined_data = combined_data[valid_weight]

# Sort by 'height_cm'; rebinding to the returned frame avoids an in-place
# sort on a filtered selection.
combined_data = combined_data.sort_values(by='height_cm')

# Display the resulting DataFrame
combined_data

Unnamed: 0,competitorId,competitorName,firstName,lastName,gender,countryOfOriginName,regionName,affiliateId,affiliateName,age,height,weight,overallRank,overallScore,year,competition,height_cm,weight_kg
826083,2222327.0,Aleisha Randall,Aleisha,Randall,F,Australia,Oceania,22029.0,CrossFit Aere,32.0,151 cm,75 kg,105586.0,398297.0,2023,open,151.0,75.0
761016,2416357.0,Mariane Conegero,Mariane,Conegero,F,Brazil,South America,28422.0,MAD CrossFit,33.0,151 cm,51 kg,40519.0,171899.0,2023,open,151.0,51.0
214900,2012155.0,Ingrid Amorim,Ingrid,Amorim,F,Brazil,South America,25883.0,CrossFit Oito Sete,24.0,151 cm,51 kg,77436.0,293165.0,2021,open,151.0,51.0
219972,2069625.0,Ameera Ali,Ameera,Ali,F,Bahrain,Asia,24697.0,MR7 CrossFit,40.0,151 cm,66 kg,82509.0,306080.0,2021,open,151.0,66.0
223787,2111379.0,Sandy Marquette,Sandy,Marquette,F,Australia,Oceania,1152.0,CrossFit Effects (FX),49.0,151 cm,62 kg,86324.0,316670.0,2021,open,151.0,62.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352980,1906788.0,Florent DEVOS,Florent,DEVOS,M,France,Europe,22864.0,CrossFit Mayenne,33.0,199 cm,98 kg,92190.0,271646.0,2022,open,199.0,98.0
309488,1104609.0,Tim Tolbert,Tim,Tolbert,M,United States,North America,17779.0,CrossFit Viento,43.0,78 in,210 lb,48698.0,158077.0,2022,open,199.0,96.0
68759,1196408.0,Bill McDonnell,Bill,McDonnell,M,United States,North America,,,37.0,78 in,200 lb,68759.0,274635.0,2021,open,199.0,91.0
122443,2069104.0,Corey Callaway,Corey,Callaway,M,United States,North America,26113.0,CrossFit CLT,30.0,78 in,235 lb,122444.0,437396.0,2021,open,199.0,107.0


### Numeric Variables

##### Age

In [5]:
# Years and competition stages to plot
years = [2021, 2022, 2023]
competitions = ['open', 'quarterfinals', 'semifinals', 'games']

# 4x3 grid: competitions run down the rows, years across the columns.
# Titles are listed row by row to match that layout.
panel_titles = [f"{competition} {year}" for competition in competitions for year in years]
fig = make_subplots(
    rows=4,
    cols=3,
    subplot_titles=panel_titles,
    shared_xaxes=True,
    shared_yaxes=True,
)

# Fill the grid one year (column) at a time, one competition (row) at a time
for col_idx, year in enumerate(years, start=1):
    in_year = combined_data['year'] == year
    for row_idx, competition in enumerate(competitions, start=1):
        # Athletes of this year and competition only
        subset = combined_data[in_year & (combined_data['competition'] == competition)]

        # Age histogram for this panel
        fig.add_trace(
            go.Histogram(
                x=subset['age'],
                nbinsx=8,
                name=f"{competition} {year}",
            ),
            row=row_idx,
            col=col_idx,
        )

# Figure-level title and size; the per-panel titles make a legend redundant
fig.update_layout(
    title="Age Distribution by Year and Competition",
    showlegend=False,
    height=800,
    width=1000,
)

# Render the figure
fig.show()

##### Height

In [6]:
# Years and competition stages to plot
years = [2021, 2022, 2023]
competitions = ['open', 'quarterfinals', 'semifinals', 'games']

# 4x3 grid: competitions run down the rows, years across the columns.
# Titles are listed row by row to match that layout.
panel_titles = [f"{competition} {year}" for competition in competitions for year in years]
fig = make_subplots(
    rows=4,
    cols=3,
    subplot_titles=panel_titles,
    shared_xaxes=True,
    shared_yaxes=True,
)

# Fill the grid one year (column) at a time, one competition (row) at a time
for col_idx, year in enumerate(years, start=1):
    in_year = combined_data['year'] == year
    for row_idx, competition in enumerate(competitions, start=1):
        # Athletes of this year and competition only
        subset = combined_data[in_year & (combined_data['competition'] == competition)]

        # Height (cm) histogram for this panel
        fig.add_trace(
            go.Histogram(
                x=subset['height_cm'],
                nbinsx=6,
                name=f"{competition} {year}",
            ),
            row=row_idx,
            col=col_idx,
        )

# Figure-level title and size; the per-panel titles make a legend redundant
fig.update_layout(
    title="Height Distribution by Year and Competition",
    showlegend=False,
    height=800,
    width=1000,
)

# Render the figure
fig.show()

##### Weight

In [7]:
# Years and competition stages to plot
years = [2021, 2022, 2023]
competitions = ['open', 'quarterfinals', 'semifinals', 'games']

# 4x3 grid: competitions run down the rows, years across the columns.
# Titles are listed row by row to match that layout.
panel_titles = [f"{competition} {year}" for competition in competitions for year in years]
fig = make_subplots(
    rows=4,
    cols=3,
    subplot_titles=panel_titles,
    shared_xaxes=True,
    shared_yaxes=True,
)

# Fill the grid one year (column) at a time, one competition (row) at a time
for col_idx, year in enumerate(years, start=1):
    in_year = combined_data['year'] == year
    for row_idx, competition in enumerate(competitions, start=1):
        # Athletes of this year and competition only
        subset = combined_data[in_year & (combined_data['competition'] == competition)]

        # Weight (kg) histogram for this panel
        fig.add_trace(
            go.Histogram(
                x=subset['weight_kg'],
                nbinsx=6,
                name=f"{competition} {year}",
            ),
            row=row_idx,
            col=col_idx,
        )

# Figure-level title and size; the per-panel titles make a legend redundant
fig.update_layout(
    title="Weight Distribution by Year and Competition",
    showlegend=False,
    height=800,
    width=1000,
)

# Render the figure
fig.show()

### Categorical Variables

##### Gender