<a href="https://www.kaggle.com/code/tgomesjuliana/crossfit-competitions-data-analysis?scriptVersionId=136141947" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Imports

In [1]:
import pandas as pd
import numpy as np
import math
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Definitions

In [2]:
# Competition seasons covered by the dataset (grid columns in the plots below)
years = [
    2021,
    2022,
    2023,
]
# Competition stages, in the order they are iterated and plotted (grid rows)
competitions = [
    'open',
    'quarterfinals',
    'semifinals',
    'games',
]

# Functions

In [3]:
def clean_height(height):
    """Convert a raw height string to whole centimetres, rounded up.

    Parameters
    ----------
    height : str or missing value
        Text such as ``'186 cm'`` or ``'69 in'``. The number is read from the
        part of the string that precedes the unit.

    Returns
    -------
    int or float
        ``math.ceil`` of the height in centimetres (inches are multiplied by
        2.54). ``float('nan')`` is returned when the value is missing, is not
        a string, carries neither unit, or has no parsable finite number in
        front of the unit.
    """
    # Anything that is not text (NaN, None, pd.NA, bare numbers) has no unit to
    # interpret, so treat it as missing instead of failing on the `in` test.
    if not isinstance(height, str):
        return float('nan')

    # 'cm' is checked first, matching the original precedence.
    if 'cm' in height:
        unit, cm_per_unit = 'cm', 1.0
    elif 'in' in height:
        unit, cm_per_unit = 'in', 2.54
    else:
        return float('nan')

    try:
        value = float(height.split(unit)[0].strip())
    except ValueError:
        # Malformed entry such as 'tall cm' or ' cm'.
        return float('nan')

    # math.ceil cannot convert NaN or infinity to an integer.
    if not math.isfinite(value):
        return float('nan')

    return math.ceil(value * cm_per_unit)

In [4]:
# Function to clean the weight values
def clean_weight(weight):
    """Convert a raw weight string to whole kilograms, rounded up.

    Parameters
    ----------
    weight : str or missing value
        Text such as ``'85 kg'`` or ``'197 lb'``. The number is read from the
        part of the string that precedes the unit.

    Returns
    -------
    int or float
        ``math.ceil`` of the weight in kilograms (pounds are divided by
        2.205). ``float('nan')`` is returned when the value is missing, is not
        a string, carries neither unit, or has no parsable finite number in
        front of the unit.
    """
    # Anything that is not text (NaN, None, pd.NA, bare numbers) has no unit to
    # interpret, so treat it as missing instead of failing on the `in` test.
    if not isinstance(weight, str):
        return float('nan')

    # 'kg' is checked first, matching the original precedence.
    if 'kg' in weight:
        unit, units_per_kg = 'kg', 1.0
    elif 'lb' in weight:
        unit, units_per_kg = 'lb', 2.205
    else:
        return float('nan')

    try:
        value = float(weight.split(unit)[0].strip())
    except ValueError:
        # Malformed entry such as 'heavy kg' or ' lb'.
        return float('nan')

    # math.ceil cannot convert NaN or infinity to an integer.
    if not math.isfinite(value):
        return float('nan')

    return math.ceil(value / units_per_kg)

In [5]:
# Function to create histograms
def create_histogram(column, nbinsx, data=None):
    """Show a grid of histograms of ``column``, one per competition and year.

    Rows follow ``competitions`` and columns follow ``years`` (both are
    module-level lists), so the grid grows with those lists.

    Parameters
    ----------
    column : str
        Name of the column whose values are histogrammed.
    nbinsx : int
        Passed through to ``go.Histogram(nbinsx=...)``.
    data : pandas.DataFrame, optional
        Frame with 'year', 'competition' and ``column`` columns. Defaults to
        the module-level ``information_data``.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    if data is None:
        data = information_data

    n_rows, n_cols = len(competitions), len(years)

    # Titles are consumed row by row, hence competition is the outer loop here.
    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=[f"{competition} {year}" for competition in competitions for year in years],
        shared_xaxes=True,
        shared_yaxes=True,
    )

    for col_idx, year in enumerate(years, start=1):
        for row_idx, competition in enumerate(competitions, start=1):
            # Rows belonging to this (year, competition) cell of the grid
            subset = data[(data['year'] == year) & (data['competition'] == competition)]

            fig.add_trace(
                go.Histogram(
                    x=subset[column],
                    nbinsx=nbinsx,
                    name=f"{competition} {year}",
                ),
                row=row_idx,
                col=col_idx,
            )

    fig.update_layout(
        title=column + " distribution by year and competition",
        showlegend=False,
        height=800,
        width=1000,
    )
    # layout.xaxis / layout.yaxis only address the first subplot, so label the
    # bottom row and the left column explicitly instead.
    fig.update_xaxes(title_text=column, row=n_rows)
    fig.update_yaxes(title_text='athletes', col=1)

    # Show the plot
    fig.show()

In [6]:
# Function to create bars
def create_bar(column, data=None):
    """Show a grid of bar charts counting the values of ``column``.

    One subplot is drawn per competition (rows) and year (columns), using the
    module-level ``competitions`` and ``years`` lists.

    Parameters
    ----------
    column : str
        Name of the column whose value counts are plotted.
    data : pandas.DataFrame, optional
        Frame with 'year', 'competition' and ``column`` columns. Defaults to
        the module-level ``information_data``.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    if data is None:
        data = information_data

    n_rows, n_cols = len(competitions), len(years)

    # Titles are consumed row by row, hence competition is the outer loop here.
    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=[f"{competition} {year}" for competition in competitions for year in years],
        shared_xaxes=True,
        shared_yaxes=True,
    )

    for col_idx, year in enumerate(years, start=1):
        for row_idx, competition in enumerate(competitions, start=1):
            # Rows belonging to this (year, competition) cell of the grid
            subset = data[(data['year'] == year) & (data['competition'] == competition)]

            # Number of rows per distinct value, ordered by the value itself
            athletes_counts = subset[column].value_counts().sort_index()

            fig.add_trace(
                go.Bar(
                    x=athletes_counts.index,
                    y=athletes_counts.values,
                ),
                row=row_idx,
                col=col_idx,
            )

    fig.update_layout(
        title=column + " distribution by year and competition",
        showlegend=False,
        height=800,
        width=1000,
    )
    # layout.xaxis / layout.yaxis only address the first subplot, so label the
    # bottom row and the left column explicitly instead.
    fig.update_xaxes(title_text=column, row=n_rows)
    fig.update_yaxes(title_text='athletes', col=1)

    # Show the plot
    fig.show()

In [7]:
# Function to create bars comparing two columns
def create_bar_two_columns(column_x, column_y, data=None):
    """Show a grid of bar charts of the mean of ``column_y`` per ``column_x``.

    One subplot is drawn per competition (rows) and year (columns), using the
    module-level ``competitions`` and ``years`` lists.

    Parameters
    ----------
    column_x : str
        Column to group by; its distinct values become the bars.
    column_y : str
        Column whose mean is computed for each group.
    data : pandas.DataFrame, optional
        Frame with 'year', 'competition', ``column_x`` and ``column_y``
        columns. Defaults to the module-level ``information_data``.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    if data is None:
        data = information_data

    n_rows, n_cols = len(competitions), len(years)

    # Titles are consumed row by row, hence competition is the outer loop here.
    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=[f"{competition} {year}" for competition in competitions for year in years],
        shared_xaxes=True,
        shared_yaxes=True,
    )

    for col_idx, year in enumerate(years, start=1):
        for row_idx, competition in enumerate(competitions, start=1):
            # Rows belonging to this (year, competition) cell of the grid
            subset = data[(data['year'] == year) & (data['competition'] == competition)]

            # Mean of column_y within each column_x group
            group_means = subset.groupby(column_x)[column_y].mean()

            fig.add_trace(
                go.Bar(
                    x=group_means.index,
                    y=group_means.values,
                ),
                row=row_idx,
                col=col_idx,
            )

    fig.update_layout(
        title='average '+column_y+' by year, competition, and '+column_x,
        showlegend=False,
        height=800,
        width=1000,
    )
    # layout.xaxis / layout.yaxis only address the first subplot, so label the
    # bottom row and the left column explicitly instead.
    fig.update_xaxes(title_text=column_x, row=n_rows)
    fig.update_yaxes(title_text='average '+column_y, col=1)

    # Show the plot
    fig.show()

# Exploratory Data Analysis

## Athletes Information

In [8]:
# Read every (year, competition) athlete-information file, then stack them with
# a single concat: growing a DataFrame inside the loop re-copies all previously
# loaded rows on each iteration.
information_frames = [
    pd.read_csv(f"../input/crossfit-competitions/df_{year}_{competition}_athletes_information.csv")
    for year in years
    for competition in competitions
]
information_data = pd.concat(information_frames, ignore_index=True)

# Replace value in 'height' column for competitorId 901503 with '186 cm'
information_data.loc[information_data['competitorId'] == 901503, 'height'] = '186 cm'

# Apply clean_height function to create 'height_cm' column
information_data['height_cm'] = information_data['height'].apply(clean_height)

# Keep only rows with 150 < height_cm < 200 (rows with NaN height are dropped
# too). .copy() makes the result an independent frame so the column assignment
# below does not write into a slice of the unfiltered data.
information_data = information_data[
    (information_data['height_cm'] > 150) & (information_data['height_cm'] < 200)
].copy()

# Apply clean_weight function to create 'weight_kg' column
information_data['weight_kg'] = information_data['weight'].apply(clean_weight)

# Keep only rows with 50 < weight_kg < 110 (rows with NaN weight are dropped too)
information_data = information_data[
    (information_data['weight_kg'] > 50) & (information_data['weight_kg'] < 110)
].copy()

# Display the resulting DataFrame
information_data

Unnamed: 0,competitorId,competitorName,firstName,lastName,gender,countryOfOriginName,regionName,affiliateId,affiliateName,age,height,weight,overallRank,overallScore,year,competition,height_cm,weight_kg
0,469656.0,Jeffrey Adler,Jeffrey,Adler,M,Canada,North America,18059.0,CrossFit Wonderland,27.0,69 in,197 lb,1.0,101.0,2021,open,176.0,90.0
1,34796.0,Scott Panchik,Scott,Panchik,M,United States,North America,7991.0,CrossFit Mentality,33.0,69 in,187 lb,2.0,141.0,2021,open,176.0,85.0
2,105875.0,Travis Mead,Travis,Mead,M,United States,North America,9155.0,Iron Valley CrossFit,34.0,73 in,205 lb,3.0,165.0,2021,open,186.0,93.0
3,310970.0,Saxon Panchik,Saxon,Panchik,M,United States,North America,22505.0,CrossFit Cliffside,25.0,69 in,180 lb,4.0,217.0,2021,open,176.0,82.0
4,11435.0,Richard Froning Jr.,Richard,Froning Jr.,M,United States,North America,3220.0,CrossFit Mayhem,33.0,69 in,194 lb,5.0,254.0,2021,open,176.0,88.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
867622,872275.0,Caroline Stanley,Caroline,Stanley,F,United States,North America East,3617.0,College Hill CrossFit,23.0,63 in,145 lb,,0.0,2023,games,161.0,66.0
867623,174547.0,Amanda Barnhart,Amanda,Barnhart,F,United States,North America East,14583.0,CrossFit High Gear,31.0,67 in,160 lb,,0.0,2023,games,171.0,73.0
867624,121033.0,Sydney Wells,Sydney,Wells,F,United States,North America East,19593.0,CrossFit East Nashville,28.0,66 in,151 lb,,0.0,2023,games,168.0,69.0
867625,503582.0,Alexis Raptis,Alexis,Raptis,F,United States,North America East,25335.0,TTT CrossFit,24.0,65 in,155 lb,,0.0,2023,games,166.0,71.0


In [9]:
# Column dtypes and non-null counts of the cleaned athlete-information frame
information_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 395829 entries, 0 to 867626
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   competitorId         395829 non-null  float64
 1   competitorName       395827 non-null  object 
 2   firstName            395828 non-null  object 
 3   lastName             395825 non-null  object 
 4   gender               395829 non-null  object 
 5   countryOfOriginName  393088 non-null  object 
 6   regionName           395829 non-null  object 
 7   affiliateId          340122 non-null  float64
 8   affiliateName        340122 non-null  object 
 9   age                  395829 non-null  float64
 10  height               395829 non-null  object 
 11  weight               395829 non-null  object 
 12  overallRank          395749 non-null  float64
 13  overallScore         395829 non-null  float64
 14  year                 395829 non-null  int64  
 15  competition      

### Numeric Variables

In [10]:
# Numeric columns only; summary statistics shown with one row per column
num_info_data = information_data.select_dtypes(include='number')
num_info_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
competitorId,395829.0,1248228.0,722709.380987,86.0,652848.0,1237528.0,1940550.0,2523369.0
affiliateId,340122.0,13403.66,8562.87049,4.0,5546.0,13293.0,20160.0,32333.0
age,395829.0,35.03528,8.098122,16.0,29.0,34.0,41.0,54.0
overallRank,395749.0,55450.42,42553.356247,-1.0,17643.0,48305.0,87363.0,157742.0
overallScore,395829.0,203275.0,144033.36833,0.0,79018.0,186490.0,307686.0,582777.0
year,395829.0,2022.035,0.820119,2021.0,2021.0,2022.0,2023.0,2023.0
height_cm,395829.0,174.0966,9.19527,151.0,168.0,175.0,181.0,199.0
weight_kg,395829.0,77.9153,12.894299,51.0,68.0,79.0,87.0,109.0


In [11]:
# Pairwise correlation between the athlete attributes and their results
num_info_corr_matrix = num_info_data[['age', 'overallRank', 'overallScore', 'height_cm', 'weight_kg']].corr()

# Create the heatmap using Plotly.
# zmin/zmax pin the diverging RdBu scale to the full correlation range so the
# neutral midpoint colour always means zero correlation; without them the scale
# stretches to the observed min/max and the midpoint drifts.
fig = go.Figure(data=go.Heatmap(
    z=num_info_corr_matrix.values,
    x=num_info_corr_matrix.columns,
    y=num_info_corr_matrix.index,
    colorscale='RdBu',
    zmin=-1,
    zmax=1,
    colorbar=dict(title='Correlation'),
))

# Set the layout options for the heatmap; the reversed y axis puts the first
# column at the top so the diagonal runs from top-left to bottom-right.
fig.update_layout(
    title='correlation heatmap',
    xaxis=dict(title='Columns'),
    yaxis=dict(title='Columns', autorange='reversed'),
)

# Show the plot
fig.show()

##### Age

In [12]:
# Histogram grid of 'age' per competition and year (nbinsx=8)
create_histogram('age', 8)

##### Height

In [13]:
# Histogram grid of 'height_cm' per competition and year (nbinsx=5)
create_histogram('height_cm', 5)

##### Weight

In [14]:
# Histogram grid of 'weight_kg' per competition and year (nbinsx=6)
create_histogram('weight_kg', 6)

### Categorical Variables

In [15]:
# Object-dtype (text) columns only; count / unique / top / freq, one row per column
cat_info_data = information_data.select_dtypes(include='object')
cat_info_data.describe().transpose()

Unnamed: 0,count,unique,top,freq
competitorName,395827,209399,Michael Smith,33
firstName,395828,31014,Michael,4158
lastName,395825,90996,Smith,2113
gender,395829,2,M,263119
countryOfOriginName,393088,175,United States,189833
regionName,395829,8,North America,142682
affiliateName,340122,12767,CrossFit Torian,338
height,395829,68,70 in,22405
weight,395829,189,185 lb,11568
competition,395829,4,open,366063


##### Gender

In [16]:
# Row counts per 'gender' value for each competition and year
create_bar('gender')

##### Region

In [17]:
# Row counts per 'regionName' value for each competition and year
create_bar('regionName')

In [18]:
# Mean 'overallScore' per 'regionName' for each competition and year
create_bar_two_columns('regionName', 'overallScore')

In [19]:
# Mean 'overallRank' per 'regionName' for each competition and year
create_bar_two_columns('regionName', 'overallRank')

## Athletes Scores

In [20]:
# Read every (year, competition) athlete-scores file, then stack them with a
# single concat: growing a DataFrame inside the loop re-copies all previously
# loaded rows on each iteration.
# The combination 2023 / "games" is excluded, so no scores file is read for it.
scores_frames = [
    pd.read_csv(f"../input/crossfit-competitions/df_{year}_{competition}_athletes_scores.csv")
    for year in years
    for competition in competitions
    if not (year == 2023 and competition == "games")
]
scores_data = pd.concat(scores_frames, ignore_index=True)

# Display the resulting DataFrame
scores_data

Unnamed: 0,ordinal,rank,score,valid,scoreDisplay,competitorId,year,competition
0,1.0,20.0,6050185.0,1.0,11:55,469656.0,2021,open
1,2.0,8.0,2250646.0,1.0,9:14,469656.0,2021,open
2,3.0,27.0,1800405.0,1.0,8:15,469656.0,2021,open
3,4.0,46.0,317180405.0,1.0,317 lbs,469656.0,2021,open
4,1.0,33.0,6050155.0,1.0,12:25,34796.0,2021,open
...,...,...,...,...,...,...,...,...
2718702,6.0,29.0,3.0,1.0,CAP +29,2094617.0,2023,semifinals
2718703,7.0,29.0,3.0,1.0,CAP +68,2094617.0,2023,semifinals
2718704,1.0,24.0,18.0,1.0,CAP +85,1313021.0,2023,semifinals
2718705,2.0,23.0,21.0,1.0,0,1313021.0,2023,semifinals


In [21]:
# Column dtypes and memory usage of the combined scores frame
scores_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2718707 entries, 0 to 2718706
Data columns (total 8 columns):
 #   Column        Dtype  
---  ------        -----  
 0   ordinal       float64
 1   rank          float64
 2   score         float64
 3   valid         float64
 4   scoreDisplay  object 
 5   competitorId  float64
 6   year          int64  
 7   competition   object 
dtypes: float64(5), int64(1), object(2)
memory usage: 165.9+ MB


### Numeric Variables

In [22]:
# Numeric columns only; summary statistics shown with one row per column
num_scor_data = scores_data.select_dtypes(include='number')
num_scor_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ordinal,2718707.0,2.359155,1.120357,1.0,1.0,2.0,3.0,15.0
rank,2718707.0,54969.54,37729.66,1.0,21830.0,52488.0,83809.0,152032.0
score,2718707.0,11837820.0,40236590.0,1.0,1230000.0,1620000.0,2160204.0,2147484000.0
valid,2718707.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
competitorId,2718707.0,1537351.0,719461.0,86.0,954519.0,1753102.0,2136984.0,2523369.0
year,2718707.0,2022.097,0.8416065,2021.0,2021.0,2022.0,2023.0,2023.0


In [23]:
# Pairwise correlation between the per-event score columns
num_scor_corr_matrix = num_scor_data[['ordinal', 'rank', 'score']].corr()

# Create the heatmap using Plotly.
# zmin/zmax pin the diverging RdBu scale to the full correlation range so the
# neutral midpoint colour always means zero correlation; without them the scale
# stretches to the observed min/max and the midpoint drifts.
fig = go.Figure(data=go.Heatmap(
    z=num_scor_corr_matrix.values,
    x=num_scor_corr_matrix.columns,
    y=num_scor_corr_matrix.index,
    colorscale='RdBu',
    zmin=-1,
    zmax=1,
    colorbar=dict(title='Correlation'),
))

# Set the layout options for the heatmap; the reversed y axis puts the first
# column at the top so the diagonal runs from top-left to bottom-right.
fig.update_layout(
    title='correlation heatmap',
    xaxis=dict(title='Columns'),
    yaxis=dict(title='Columns', autorange='reversed'),
)

# Show the plot
fig.show()

### Categorical Variables

In [24]:
# Object-dtype (text) columns only; count / unique / top / freq, one row per column
cat_scor_data = scores_data.select_dtypes(include='object')
cat_scor_data.describe().transpose()

Unnamed: 0,count,unique,top,freq
scoreDisplay,2718704,13142,180 reps,52365
competition,2718707,4,open,2523657
