<a href="https://www.kaggle.com/code/tgomesjuliana/crossfit-competitions-data-analysis?scriptVersionId=136328283" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Imports

In [1]:
import pandas as pd
import numpy as np
import math
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Definitions

In [2]:
# Seasons covered by the analysis; used to build the input file names and the subplot columns
years = [2021, 2022, 2023]
# Competition stages; used to build the input file names and the subplot rows
competitions = ['open', 'quarterfinals', 'semifinals', 'games']

# Functions

In [3]:
def clean_height(height):
    """
    Convert a raw height string such as '180 cm' or '69 in' to centimeters.

    Parameters:
        height: Raw height value. Expected to be a string holding a number followed
            by 'cm' or 'in'; anything else is treated as missing.

    Returns:
        int: Height in centimeters, rounded up with math.ceil
            (inches are multiplied by 2.54 first).
        float('nan'): When the value is missing, is not a string, carries no
            recognized unit, or its numeric part cannot be parsed.

    """
    # Missing values (NaN / None / pd.NA) and any other non-string input carry no unit to parse
    if not isinstance(height, str):
        return float('nan')

    # 'cm' is checked before 'in', mirroring the precedence of the unit lookup
    for unit, to_cm in (('cm', 1.0), ('in', 2.54)):
        if unit in height:
            try:
                # Numeric part is whatever precedes the first occurrence of the unit
                return math.ceil(float(height.split(unit)[0].strip()) * to_cm)
            except (ValueError, OverflowError):
                # Empty / non-numeric prefix, or a 'nan' / 'inf' literal that ceil cannot convert
                return float('nan')

    return float('nan')

In [4]:
# Function to clean the weight values
def clean_weight(weight):
    """
    Convert a raw weight string such as '90 kg' or '197 lb' to kilograms.

    Parameters:
        weight: Raw weight value. Expected to be a string holding a number followed
            by 'kg' or 'lb'; anything else is treated as missing.

    Returns:
        int: Weight in kilograms, rounded up with math.ceil
            (pounds are divided by 2.205 first).
        float('nan'): When the value is missing, is not a string, carries no
            recognized unit, or its numeric part cannot be parsed.

    """
    # Missing values (NaN / None / pd.NA) and any other non-string input carry no unit to parse
    if not isinstance(weight, str):
        return float('nan')

    # 'kg' is checked before 'lb', mirroring the precedence of the unit lookup
    for unit, per_kg in (('kg', 1.0), ('lb', 2.205)):
        if unit in weight:
            try:
                # Numeric part is whatever precedes the first occurrence of the unit
                return math.ceil(float(weight.split(unit)[0].strip()) / per_kg)
            except (ValueError, OverflowError):
                # Empty / non-numeric prefix, or a 'nan' / 'inf' literal that ceil cannot convert
                return float('nan')

    return float('nan')

In [5]:
# Histogram grid: one panel per competition (row) and year (column)
def create_histogram(df, column, nbinsx):
    """
    Plot the distribution of one column as a grid of histograms.

    The figure is a fixed 4x3 grid with shared x and y axes. Panels are laid out
    with the module-level `competitions` as rows and `years` as columns, and each
    panel only uses the rows of `df` matching that year and competition.

    Parameters:
        df (DataFrame): Data to plot; must contain 'year' and 'competition' columns.
        column (str): Column whose values are binned.
        nbinsx (int): Forwarded to go.Histogram as `nbinsx` (plotly treats it as the
            maximum number of desired bins).

    Returns:
        None (the figure is displayed with fig.show())

    """
    panel_titles = [f"{competition} {year}" for competition in competitions for year in years]
    fig = make_subplots(
        rows=4,
        cols=3,
        subplot_titles=panel_titles,
        shared_xaxes=True,
        shared_yaxes=True,
    )

    # Years advance along the grid columns, competitions along the grid rows
    for grid_col, year in enumerate(years, start=1):
        in_year = df['year'] == year
        for grid_row, competition in enumerate(competitions, start=1):
            subset = df[in_year & (df['competition'] == competition)]
            fig.add_trace(
                go.Histogram(
                    x=subset[column],
                    nbinsx=nbinsx,
                    name=f"{competition} {year}",
                ),
                row=grid_row,
                col=grid_col,
            )

    fig.update_layout(
        title=f"{column} distribution by year and competition",
        showlegend=False,
        height=800,
        width=1000,
        xaxis_title=column,
        yaxis_title='athletes',
    )

    fig.show()

In [6]:
# Bar-chart grid: value counts of one column per competition (row) and year (column)
def create_bar(df, column):
    """
    Plot how often each value of a column occurs, as a grid of bar charts.

    The figure is a fixed 4x3 grid with shared y axes. Panels are laid out with the
    module-level `competitions` as rows and `years` as columns, and each panel only
    uses the rows of `df` matching that year and competition.

    Parameters:
        df (DataFrame): Data to plot; must contain 'year' and 'competition' columns.
        column (str): Column whose distinct values become the bars.

    Returns:
        None (the figure is displayed with fig.show())

    """
    panel_titles = [f"{competition} {year}" for competition in competitions for year in years]
    fig = make_subplots(rows=4, cols=3, subplot_titles=panel_titles, shared_yaxes=True)

    # Years advance along the grid columns, competitions along the grid rows
    for grid_col, year in enumerate(years, start=1):
        in_year = df['year'] == year
        for grid_row, competition in enumerate(competitions, start=1):
            subset = df[in_year & (df['competition'] == competition)]

            # Occurrences of each value, ordered by the value itself
            counts = subset[column].value_counts().sort_index()

            fig.add_trace(
                go.Bar(x=counts.index, y=counts.values),
                row=grid_row,
                col=grid_col,
            )

    fig.update_layout(
        title=f"{column} distribution by year and competition",
        showlegend=False,
        height=800,
        width=1000,
        xaxis_title=column,
        yaxis_title='athletes',
    )

    fig.show()

In [7]:
# Bar-chart grid: mean of one column per category of another, per competition and year
def create_bar_two_columns_average(df, column_x, column_y):
    """
    Plot the mean of `column_y` for every value of `column_x`, as a grid of bar charts.

    The figure is a fixed 4x3 grid with shared y axes. Panels are laid out with the
    module-level `competitions` as rows and `years` as columns, and each panel only
    uses the rows of `df` matching that year and competition.

    Parameters:
        df (DataFrame): Data to plot; must contain 'year' and 'competition' columns.
        column_x (str): Grouping column; its values become the bars.
        column_y (str): Column averaged within each group; gives the bar heights.

    Returns:
        None (the figure is displayed with fig.show())

    """
    panel_titles = [f"{competition} {year}" for competition in competitions for year in years]
    fig = make_subplots(rows=4, cols=3, subplot_titles=panel_titles, shared_yaxes=True)

    # Years advance along the grid columns, competitions along the grid rows
    for grid_col, year in enumerate(years, start=1):
        in_year = df['year'] == year
        for grid_row, competition in enumerate(competitions, start=1):
            subset = df[in_year & (df['competition'] == competition)]

            # Mean of column_y within each column_x group
            group_means = subset.groupby(column_x)[column_y].mean()

            fig.add_trace(
                go.Bar(x=group_means.index, y=group_means.values),
                row=grid_row,
                col=grid_col,
            )

    fig.update_layout(
        title=f"Average {column_y} by Year, Competition, and {column_x}",
        showlegend=False,
        height=800,
        width=1000,
        xaxis_title=column_x,
        yaxis_title=f"Average {column_y}",
    )

    fig.show()

In [8]:
# Bar-chart grid: counts of one column after de-duplicating on another, per competition and year
def create_bar_two_columns_duplicate(df, column_x, column_y):
    """
    Plot value counts of `column_x` after dropping rows that repeat a `column_y` value.

    The figure is a fixed 4x3 grid with shared y axes. Panels are laid out with the
    module-level `competitions` as rows and `years` as columns. Within each panel the
    matching rows are de-duplicated on `column_y` (pandas keeps the first occurrence
    by default) before the `column_x` values are counted.

    Parameters:
        df (DataFrame): Data to plot; must contain 'year' and 'competition' columns.
        column_x (str): Column whose distinct values become the bars.
        column_y (str): Column used to drop duplicate rows; also used as the y-axis title.

    Returns:
        None (the figure is displayed with fig.show())

    """
    panel_titles = [f"{competition} {year}" for competition in competitions for year in years]
    fig = make_subplots(rows=4, cols=3, subplot_titles=panel_titles, shared_yaxes=True)

    # Years advance along the grid columns, competitions along the grid rows
    for grid_col, year in enumerate(years, start=1):
        in_year = df['year'] == year
        for grid_row, competition in enumerate(competitions, start=1):
            subset = df[in_year & (df['competition'] == competition)]

            # One row per distinct column_y value, then count the column_x categories
            deduplicated = subset.drop_duplicates(subset=column_y)
            counts = deduplicated[column_x].value_counts().sort_index()

            fig.add_trace(
                go.Bar(x=counts.index, y=counts.values),
                row=grid_row,
                col=grid_col,
            )

    fig.update_layout(
        title=f"{column_x} Distribution by Year and Competition",
        showlegend=False,
        height=800,
        width=1000,
        xaxis_title=column_x,
        yaxis_title=column_y,
    )

    fig.show()

# Exploratory Data Analysis

## Athletes Information

In [9]:
# Stack the per-year, per-competition athlete information files into a single frame
information_data = pd.concat(
    [
        pd.read_csv(f"../input/crossfit-competitions/df_{year}_{competition}_athletes_information.csv")
        for year in years
        for competition in competitions
    ],
    ignore_index=True,
)

# Manual overallRank corrections, keyed by competitorId
conditions = {
    455677: 38.0,
    592472: 39.0,
    1019212: 40.0,
    901702: 39.0,
    975774: 40.0
}

# The corrections only apply to the 2021 rows of the listed competitors
needs_rank_fix = information_data['competitorId'].isin(list(conditions)) & information_data['year'].eq(2021)
information_data.loc[needs_rank_fix, 'overallRank'] = information_data.loc[needs_rank_fix, 'competitorId'].map(conditions)

# Collapse the two North America sub-regions into a single region
north_america_aliases = dict.fromkeys(['North America East', 'North America West'], 'North America')
information_data['regionName'] = information_data['regionName'].replace(north_america_aliases)

# Manual height override for competitorId 901503
information_data.loc[information_data['competitorId'].eq(901503), 'height'] = '186 cm'

# Normalize the raw height / weight strings to centimeters and kilograms
information_data['height_cm'] = information_data['height'].apply(clean_height)
information_data['weight_kg'] = information_data['weight'].apply(clean_weight)

# Keep only athletes whose height and weight fall inside the accepted ranges (bounds inclusive)
height_in_range = information_data['height_cm'].between(150, 200)
weight_in_range = information_data['weight_kg'].between(50, 110)
information_data = information_data[height_in_range & weight_in_range]

# Display the resulting DataFrame
information_data

Unnamed: 0,competitorId,competitorName,firstName,lastName,gender,countryOfOriginName,regionName,affiliateId,affiliateName,age,height,weight,overallRank,overallScore,year,competition,height_cm,weight_kg
0,469656.0,Jeffrey Adler,Jeffrey,Adler,M,Canada,North America,18059.0,CrossFit Wonderland,27.0,69 in,197 lb,1.0,101.0,2021,open,176.0,90.0
1,34796.0,Scott Panchik,Scott,Panchik,M,United States,North America,7991.0,CrossFit Mentality,33.0,69 in,187 lb,2.0,141.0,2021,open,176.0,85.0
2,105875.0,Travis Mead,Travis,Mead,M,United States,North America,9155.0,Iron Valley CrossFit,34.0,73 in,205 lb,3.0,165.0,2021,open,186.0,93.0
3,310970.0,Saxon Panchik,Saxon,Panchik,M,United States,North America,22505.0,CrossFit Cliffside,25.0,69 in,180 lb,4.0,217.0,2021,open,176.0,82.0
4,11435.0,Richard Froning Jr.,Richard,Froning Jr.,M,United States,North America,3220.0,CrossFit Mayhem,33.0,69 in,194 lb,5.0,254.0,2021,open,176.0,88.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
867622,872275.0,Caroline Stanley,Caroline,Stanley,F,United States,North America,3617.0,College Hill CrossFit,23.0,63 in,145 lb,,0.0,2023,games,161.0,66.0
867623,174547.0,Amanda Barnhart,Amanda,Barnhart,F,United States,North America,14583.0,CrossFit High Gear,31.0,67 in,160 lb,,0.0,2023,games,171.0,73.0
867624,121033.0,Sydney Wells,Sydney,Wells,F,United States,North America,19593.0,CrossFit East Nashville,28.0,66 in,151 lb,,0.0,2023,games,168.0,69.0
867625,503582.0,Alexis Raptis,Alexis,Raptis,F,United States,North America,25335.0,TTT CrossFit,24.0,65 in,155 lb,,0.0,2023,games,166.0,71.0


In [10]:
# Column dtypes and non-null counts of the cleaned athlete information frame
information_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 399342 entries, 0 to 867626
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   competitorId         399342 non-null  float64
 1   competitorName       399340 non-null  object 
 2   firstName            399341 non-null  object 
 3   lastName             399338 non-null  object 
 4   gender               399342 non-null  object 
 5   countryOfOriginName  396587 non-null  object 
 6   regionName           399342 non-null  object 
 7   affiliateId          343166 non-null  float64
 8   affiliateName        343166 non-null  object 
 9   age                  399342 non-null  float64
 10  height               399342 non-null  object 
 11  weight               399342 non-null  object 
 12  overallRank          399262 non-null  float64
 13  overallScore         399342 non-null  float64
 14  year                 399342 non-null  int64  
 15  competition      

### Numeric Variables

In [11]:
# Keep only the numeric columns and show their summary statistics, one row per column
num_info_data = information_data.select_dtypes(include=[np.number])
num_info_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
competitorId,399342.0,1249626.0,722747.744123,86.0,655925.0,1239003.0,1942362.0,2523369.0
affiliateId,343166.0,13407.67,8562.63402,4.0,5550.0,13295.0,20161.0,32333.0
age,399342.0,35.03684,8.101685,16.0,29.0,34.0,41.0,54.0
overallRank,399262.0,55504.24,42532.260165,-1.0,17732.0,48394.0,87406.75,157742.0
overallScore,399342.0,203432.6,143953.597002,0.0,79336.0,186778.0,307736.0,582777.0
year,399342.0,2022.035,0.820031,2021.0,2021.0,2022.0,2023.0,2023.0
height_cm,399342.0,173.9847,9.33317,150.0,168.0,175.0,181.0,200.0
weight_kg,399342.0,77.8033,13.075751,50.0,68.0,79.0,87.0,110.0


In [12]:
# Calculate the correlation between desired columns
num_info_corr_matrix = num_info_data[['age', 'overallRank', 'overallScore', 'height_cm', 'weight_kg']].corr()

# Create the heatmap using Plotly
fig = go.Figure(data=go.Heatmap(
    z=num_info_corr_matrix.values,
    x=num_info_corr_matrix.columns,
    y=num_info_corr_matrix.columns,
    colorscale='RdBu',
    colorbar=dict(title='Correlation'),
))

# Set the layout options for the heatmap
fig.update_layout(
    title='correlation heatmap',
    xaxis=dict(title='Columns'),
    yaxis=dict(title='Columns', autorange='reversed'),
)

# Show the plot
fig.show()

**Insights:**
* overallRank and overallScore are obviously strongly correlated  
* height and weight are also very correlated  
* age has a super small correlation to overallRank and overallScore  
* height and weight have an even smaller correlation to overallRank and overallScore  
* age doesn't seem to be correlated to height and weight

##### Age

In [13]:
# Age distribution for every year / competition panel (nbinsx=8)
create_histogram(information_data, 'age', 8)

**Insights:**
* all years and competitions follow a normal distribution for age
* open has older athletes, while games has younger ones
    * are older athletes being eliminated between competitions or is this a data scraping problem?
        * I scraped the individual category only, leaving age group, team and adaptive behind, so why do we have older athletes for open?

##### Height

In [14]:
# Height (cm) distribution for every year / competition panel (nbinsx=5)
create_histogram(information_data, 'height_cm', 5)

**Insights:**
* all years and competitions follow a normal distribution for height
* should we break down the height analysis and / or the model by gender?
* games 2023 has an athlete with a height never seen in the games of the past two years
    * will this be a problem when predicting this athlete's results?

##### Weight

In [15]:
# Weight (kg) distribution for every year / competition panel (nbinsx=6)
create_histogram(information_data, 'weight_kg', 6)

**Insights:**
* all years and competitions follow a normal distribution for weight
* should we break down the weight analysis and / or the model by gender?

### Categorical Variables

In [16]:
# Keep only the object-typed (text) columns and show count / unique / top / freq per column
cat_info_data = information_data.select_dtypes(include=[object])
cat_info_data.describe().T

Unnamed: 0,count,unique,top,freq
competitorName,399340,211399,Michael Smith,33
firstName,399341,31309,Michael,4166
lastName,399338,91678,Smith,2132
gender,399342,2,M,263902
countryOfOriginName,396587,175,United States,191436
regionName,399342,6,North America,217141
affiliateName,343166,12772,CrossFit Torian,342
height,399342,71,70 in,22430
weight,399342,195,185 lb,11569
competition,399342,4,open,369450


##### Gender

In [17]:
# Number of athletes per gender for every year / competition panel
create_bar(information_data, 'gender')

**Insights:**
* open has more male than female athletes, while games has an equal number of men and women
    * the men's competitions are more competitive, as there are more athletes per available spot than in the women's competitions

##### Region

In [18]:
# Number of athletes per region for every year / competition panel
create_bar(information_data, 'regionName')

**Insights:**
* half of the athletes represent north america, a quarter of them represent europe and the rest is more or less evenly distributed between oceania, asia, south america and africa  
    * what would this chart look like if, instead of # of athletes, we plotted something like # of athletes / # of inhabitants?
        * oceania would look more significant, while asia, south america and africa would be even more insignificant
    * can we relate this big or small representativeness with how much the region invests on sports and athletes?
        * I know north america invests heavily in athletes and south america almost doesn't, for example

In [19]:
# Mean overallScore per region for every year / competition panel
create_bar_two_columns_average(information_data, 'regionName', 'overallScore')

**Disclaimer:**
* for open and quarterfinals, the smaller the overallScore, the better!  
* for semifinals and games, the higher the overallScore, the better!

**Insights:**
* north america and europe not only have more athletes but also constantly have the best average overallScore
* oceania doesn't have so many athletes but consistently has a great average overallScore
* south america seems to consistently have a medium average overallScore
* africa and asia don't seem to be consistent in their average overallScore

In [20]:
# Mean overallRank per region for every year / competition panel
create_bar_two_columns_average(information_data, 'regionName', 'overallRank')

**Disclaimer:**
* for all competitions, the smaller the overallRank, the better!

**Insights:**
* north america and europe not only have more athletes but also constantly have the best average overallRank
* oceania doesn't have so many athletes but consistently has a great average overallRank
* south america seems to consistently have a medium average overallRank
* africa and asia don't seem to be consistent in their average overallRank

## Athletes Scores

In [21]:
def _score_type(score):
    """
    Classify a scoreDisplay string as 'weight', 'repetition', 'time' or 'error'.

    Rules are tried in order and the first match wins:
        1. contains 'lb' or 'kg'            -> 'weight'
        2. contains 'rep', 'cal' or 'pt'    -> 'repetition'
        3. contains 'CAP' or ':'            -> 'time'
        4. purely numeric and below 100     -> 'repetition'
        5. purely numeric and 100 or above  -> 'weight'
        6. anything else                    -> 'error'

    Parameters:
        score (str): Raw scoreDisplay value (must not be missing).

    Returns:
        str: One of 'weight', 'repetition', 'time', 'error'.

    """
    if 'lb' in score or 'kg' in score:
        return 'weight'
    if 'rep' in score or 'cal' in score or 'pt' in score:
        return 'repetition'
    if 'CAP' in score or ':' in score:
        return 'time'
    if score.isnumeric():
        # Unit-less numbers: the 100 threshold separates repetition counts from lifted weights
        return 'repetition' if int(score) < 100 else 'weight'
    return 'error'


# Read every scores file and concatenate them in a single call.
# The 2023 games file is skipped (presumably not available yet - TODO confirm).
scores_data = pd.concat(
    [
        pd.read_csv(f"../input/crossfit-competitions/df_{year}_{competition}_athletes_scores.csv")
        for year in years
        for competition in competitions
        if not (year == 2023 and competition == "games")
    ],
    ignore_index=True,
)

# Rename the 'ordinal' column to 'workout'
scores_data = scores_data.rename(columns={'ordinal': 'workout'})

# Keep only rows with a real score: 'scoreDisplay' neither null nor the '--' placeholder.
# The mask is built from the same frame it filters, so no index alignment is involved.
has_score = scores_data['scoreDisplay'].notna() & (scores_data['scoreDisplay'] != '--')
scores_data = scores_data.loc[has_score].copy()

# Derive the score_type column from the textual score
scores_data['score_type'] = scores_data['scoreDisplay'].map(_score_type)

# Number of distinct workouts held in each (competition, year)
total_workouts = scores_data.groupby(['competition', 'year'])['workout'].nunique().reset_index().rename(columns={'workout': 'total_workouts'})

# Attach 'total_workouts' to every score row
scores_data = scores_data.merge(total_workouts, on=['competition', 'year'], how='left')

# Display the resulting DataFrame
scores_data

Unnamed: 0,workout,rank,score,valid,scoreDisplay,competitorId,year,competition,score_type,total_workouts
0,1.0,20.0,6050185.0,1.0,11:55,469656.0,2021,open,time,4
1,2.0,8.0,2250646.0,1.0,9:14,469656.0,2021,open,time,4
2,3.0,27.0,1800405.0,1.0,8:15,469656.0,2021,open,time,4
3,4.0,46.0,317180405.0,1.0,317 lbs,469656.0,2021,open,weight,4
4,1.0,33.0,6050155.0,1.0,12:25,34796.0,2021,open,time,4
...,...,...,...,...,...,...,...,...,...,...
2714561,6.0,29.0,3.0,1.0,CAP +29,2094617.0,2023,semifinals,time,7
2714562,7.0,29.0,3.0,1.0,CAP +68,2094617.0,2023,semifinals,time,7
2714563,1.0,24.0,18.0,1.0,CAP +85,1313021.0,2023,semifinals,time,7
2714564,2.0,23.0,21.0,1.0,0,1313021.0,2023,semifinals,repetition,7


In [22]:
# Column dtypes and memory usage of the cleaned scores frame
scores_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2714566 entries, 0 to 2714565
Data columns (total 10 columns):
 #   Column          Dtype  
---  ------          -----  
 0   workout         float64
 1   rank            float64
 2   score           float64
 3   valid           float64
 4   scoreDisplay    object 
 5   competitorId    float64
 6   year            int64  
 7   competition     object 
 8   score_type      object 
 9   total_workouts  int64  
dtypes: float64(5), int64(2), object(3)
memory usage: 227.8+ MB


### Numeric Variables

In [23]:
# Keep only the numeric columns and show their summary statistics, one row per column
num_scor_data = scores_data.select_dtypes(include=[np.number])
num_scor_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
workout,2714566.0,2.358942,1.120442,1.0,1.0,2.0,3.0,15.0
rank,2714566.0,54905.44,37719.62,1.0,21821.0,52403.0,83795.0,152032.0
score,2714566.0,11830820.0,40230540.0,1.0,1220167.5,1620000.0,2160188.0,2147484000.0
valid,2714566.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
competitorId,2714566.0,1537333.0,719665.6,86.0,954268.0,1752895.5,2139128.0,2523369.0
year,2714566.0,2022.099,0.841156,2021.0,2021.0,2022.0,2023.0,2023.0
total_workouts,2714566.0,3.825847,0.6373951,3.0,3.0,4.0,4.0,15.0


In [24]:
# Calculate the correlation between desired columns
num_scor_corr_matrix = num_scor_data[['rank', 'score']].corr()

# Create the heatmap using Plotly
fig = go.Figure(data=go.Heatmap(
    z=num_scor_corr_matrix.values,
    x=num_scor_corr_matrix.columns,
    y=num_scor_corr_matrix.columns,
    colorscale='RdBu',
    colorbar=dict(title='Correlation'),
))

# Set the layout options for the heatmap
fig.update_layout(
    title='correlation heatmap',
    xaxis=dict(title='Columns'),
    yaxis=dict(title='Columns', autorange='reversed'),
)

# Show the plot
fig.show()

**Insights:**
* counterintuitively, rank and score are essentially uncorrelated

### Categorical Variables

In [25]:
# Keep only the object-typed (text) columns and show count / unique / top / freq per column
cat_scor_data = scores_data.select_dtypes(include=[object])
cat_scor_data.describe().T

Unnamed: 0,count,unique,top,freq
scoreDisplay,2714566,13141,180 reps,52365
competition,2714566,4,open,2519516
score_type,2714566,3,repetition,1959005


##### Workouts

In [26]:
# Count score types per year / competition panel, keeping one row per distinct workout
create_bar_two_columns_duplicate(scores_data, 'score_type', 'workout')

**Insights:**
* open always has 4 workouts, quarterfinals has 5, semifinals has 6-7 workouts and games has 14-15 workouts
* most of the workouts are usually a time type of workout (called 'for time', when you have a task to complete and your score is the time it takes you to finish it)
* repetition (called 'AMRAP', when your task is to complete as many rounds or repetitions as possible given a time cap) and weight (literally how much weight you lift in a given task) types of workout are not necessarily present in every competition so they can vary