# Imports

In [1]:
import os

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Definitions

In [2]:
# Root folder of the project data.
# Set the CROSSFIT_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset or empty, the original local path is used.
# The value stays a plain string ending in "/" because file paths are built
# with `base_path + "<relative path>"`.
base_path = (
    os.environ.get("CROSSFIT_DATA_DIR")
    or "C:/Users/Zi/Documents/data-science-crossfit/"
).rstrip("/\\") + "/"

# Data Analysis

### Games

In [3]:
# Read the consolidated games CSV and keep every row whose year is not 2023.
games_csv_path = base_path + "csv_files/version_2_clean_data/df_games_conso.csv"
df_games_conso = pd.read_csv(games_csv_path).loc[lambda games: games['year'] != 2023]

In [4]:
# Show the games DataFrame as the cell output (first and last rows are rendered).
df_games_conso

Unnamed: 0,competitorId,competitorName,firstName,lastName,status,gender,countryOfOriginCode,countryOfOriginName,regionId,regionName,affiliateId,affiliateName,age,height,weight,overallRank,overallScore,genderId,year,bmi
0,164070,James Fitzgerald,James,Fitzgerald,ACT,M,0,,0,,0.0,,33.0,176.0,80.0,1,272,1,2007,25.826446
1,57785,Brett Marshall,Brett,Marshall,ACT,M,0,,0,,0.0,,33.0,168.0,71.0,2,270,1,2007,25.155896
2,10091,Josh Everett,Josh,Everett,ACT,M,0,,0,,0.0,,32.0,176.0,84.0,3,267,1,2007,27.117769
3,27065,Chris Spealler,Chris,Spealler,ACT,M,0,,0,,0.0,,28.0,166.0,65.0,4,261,1,2007,23.588329
4,7173,Breck Berry,Breck,Berry,ACT,M,0,,0,,0.0,,31.0,171.0,78.0,5,252,1,2007,26.674874
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1628,314703,Caroline Conners,Caroline,Conners,CUT,F,1,United States,1,North America,1926.0,CrossFit MF,29.0,155.0,61.0,36,281,2,2022,25.390219
1629,1558034,Julia Kato,Julia,Kato,CUT,F,19,Brazil,3,South America,17913.0,CrossFit Al Ain,22.0,159.0,67.0,37,258,2,2022,26.502116
1630,762495,Elena Carratala Sanahuja,Elena,Carratala Sanahuja,CUT,F,15,Spain,4,Europe,19523.0,CrossFit 4 Friends,28.0,164.0,63.0,38,223,2,2022,23.423557
1631,505225,Michelle Merand,Michelle,Merand,CUT,F,9,South Africa,5,Africa,0.0,,33.0,158.0,60.0,39,210,2,2022,24.034610


In [5]:
# Print the index range, column names, non-null counts and dtypes of the games DataFrame.
df_games_conso.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1633 entries, 0 to 1632
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   competitorId         1633 non-null   int64  
 1   competitorName       1633 non-null   object 
 2   firstName            1633 non-null   object 
 3   lastName             1632 non-null   object 
 4   status               1633 non-null   object 
 5   gender               1633 non-null   object 
 6   countryOfOriginCode  1633 non-null   int64  
 7   countryOfOriginName  1140 non-null   object 
 8   regionId             1633 non-null   int64  
 9   regionName           1178 non-null   object 
 10  affiliateId          1633 non-null   float64
 11  affiliateName        839 non-null    object 
 12  age                  1633 non-null   float64
 13  height               1633 non-null   float64
 14  weight               1633 non-null   float64
 15  overallRank          1633 non-null   int64 

##### Numeric Variables

In [6]:
# Keep only the numeric columns of the games data, then show their summary
# statistics with one row per column.
numerical_games = df_games_conso.select_dtypes(include='number')
numerical_games.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
competitorId,1633.0,317673.375383,562906.224038,1616.0,15679.0,58489.0,310970.0,2377152.0
countryOfOriginCode,1633.0,9.276791,22.034021,0.0,0.0,1.0,5.0,115.0
regionId,1633.0,1.853644,2.12421,0.0,0.0,1.0,4.0,7.0
affiliateId,1633.0,3944.567055,6285.218117,0.0,0.0,372.0,5490.0,29609.0
age,1633.0,27.901944,4.540692,13.0,25.0,27.881491,30.0,54.0
height,1633.0,171.785128,8.450826,150.0,165.374825,171.0,178.0,199.0
weight,1633.0,77.746615,12.987179,50.0,65.663854,80.0,88.375447,109.0
overallRank,1633.0,33.088794,28.944463,-1.0,13.0,26.0,40.0,141.0
overallScore,1633.0,326.676056,296.719965,0.0,48.0,237.0,562.0,1435.0
genderId,1633.0,1.461115,0.498638,1.0,1.0,1.0,2.0,2.0


In [7]:
# Pairwise correlation (pandas default method) between the numeric columns,
# leaving out competitorId and year.
# NOTE(review): the remaining *Id / *Code columns look like identifiers rather
# than measurements, so their coefficients may not be meaningful — confirm.
numerical_games_correlation = numerical_games.drop(columns=['competitorId', 'year']).corr()

# Draw the correlation matrix as a heatmap.
# zmin/zmax pin the colour range to the full [-1, 1] correlation range. With the
# automatic range, the diverging 'RdBu' scale is centred on the midpoint of the
# observed values rather than on 0, so the neutral colour would not correspond
# to "no correlation".
fig = go.Figure(data=go.Heatmap(
    z=numerical_games_correlation.values,
    x=numerical_games_correlation.columns,
    y=numerical_games_correlation.columns,
    colorscale='RdBu',
    zmin=-1,
    zmax=1,
    colorbar=dict(title='Correlation'),
))

# Title and axis labels; the y axis is reversed so the first column is drawn
# at the top of the figure.
fig.update_layout(
    title='correlation heatmap',
    xaxis=dict(title='Columns'),
    yaxis=dict(title='Columns', autorange='reversed'),
)

# Show the plot
fig.show()

In [8]:
# Draw one histogram per selected numeric column.
histogram_source = numerical_games[['year', 'regionId', 'age', 'height', 'weight', 'bmi']]
for column, column_values in histogram_source.items():
    distribution_label = f"{column} Distribution"

    # Histogram trace for this column; nbinsx caps the number of bins at 20
    trace = go.Histogram(x=column_values, nbinsx=20, name=distribution_label)

    # Wrap the trace in its own figure and label it
    fig = go.Figure(data=[trace])
    fig.update_layout(
        title=distribution_label,
        xaxis_title=column,
        yaxis_title="Frequency",
    )

    fig.show()

In [9]:
# Draw one histogram per body-measurement column, coloured by 'genderId'.
# In the rows displayed earlier, genderId 1 appears with gender 'M' and
# genderId 2 with gender 'F'.
# NOTE(review): px.histogram stacks the colour groups by default
# (barmode='relative'); the opacity setting suggests an overlay may have been
# intended — confirm.
gender_histogram_columns = numerical_games[['height', 'weight', 'bmi']].columns
for column in gender_histogram_columns:
    histogram_options = dict(
        x=column,
        color='genderId',
        nbins=20,
        title=f"{column} Distribution by Gender",
        labels={'genderId': 'Gender', column: column},
        opacity=0.7,
    )
    fig = px.histogram(numerical_games, **histogram_options)

    fig.show()

##### Categorical Variables

In [10]:
# Keep only the object-dtype (text) columns of the games data, then show their
# count / unique / top / freq summary with one row per column.
categorical_games = df_games_conso.select_dtypes(include=object)
categorical_games.describe().transpose()

Unnamed: 0,count,unique,top,freq
competitorName,1633,951,Ben Smith,11
firstName,1633,624,Jason,21
lastName,1632,846,Smith,31
status,1633,5,ACT,909
gender,1633,2,M,880
countryOfOriginName,1140,115,United States,596
regionName,1178,7,North America,694
affiliateName,839,448,CrossFit Mayhem,17


In [11]:
# Draw one bar chart of category frequencies per selected categorical column.
category_source = categorical_games[['status', 'gender', 'regionName']]
for column, column_values in category_source.items():
    distribution_label = f"{column} Distribution"

    # Occurrences of each category, ordered by category label.
    # value_counts() leaves out missing values by default, so rows without a
    # value in this column are not represented in the chart.
    count = column_values.value_counts().sort_index()

    # Bar trace: one bar per category
    trace = go.Bar(x=count.index, y=count.to_numpy(), name=distribution_label)

    # Wrap the trace in its own figure and label it
    fig = go.Figure(data=[trace])
    fig.update_layout(
        title=distribution_label,
        xaxis_title=column,
        yaxis_title="Count",
    )

    fig.show()

In [12]:
# Mean overallScore for each regionName. Rows without a regionName are left
# out, because groupby drops missing group keys by default.
# NOTE(review): the mean pools every year in the data; if the scoring scale
# differs between years the values may not be comparable — confirm.
average_score = df_games_conso['overallScore'].groupby(df_games_conso['regionName']).mean()

# Bar trace: one bar per region
trace = go.Bar(x=average_score.index, y=average_score.to_numpy())

# Wrap the trace in a figure and label it
fig = go.Figure(data=[trace])
fig.update_layout(
    title="Average overallScore by regionName",
    xaxis_title="regionName",
    yaxis_title="Average overallScore",
)

fig.show()

In [13]:
# Average final placement (overallRank) for each regionName.
# overallRank contains values below 1 (the describe() output earlier in the
# notebook shows a minimum of -1), which cannot be real placements — presumably
# a marker for athletes without a final rank (TODO confirm). Those rows are
# excluded so they do not pull the regional means down.
# Rows without a regionName are left out, because groupby drops missing group
# keys by default.
ranked_games = df_games_conso[df_games_conso['overallRank'] >= 1]
average_rank = ranked_games.groupby('regionName')['overallRank'].mean()

# Bar trace: one bar per region
trace = go.Bar(
    x=average_rank.index,
    y=average_rank.values
)

# Create the figure and add the trace
fig = go.Figure(trace)

# Update the layout
fig.update_layout(
    title="Average overallRank by regionName",
    xaxis=dict(title="regionName"),
    yaxis=dict(title="Average overallRank"),
)

# Show the plot
fig.show()