# Imports

In [1]:
import os

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Definitions

In [2]:
# Root folder of the project; later cells build file paths by appending to this string.
# The CROSSFIT_BASE_PATH environment variable overrides the default so the notebook
# can run on machines other than the original author's.
base_path = os.environ.get("CROSSFIT_BASE_PATH", "C:/Users/Zi/Documents/data-science-crossfit/")

# Paths are built by plain string concatenation, so the root must end with a separator.
if not base_path.endswith(("/", "\\")):
    base_path += "/"

# Data Analysis

In [3]:
# Read the cleaned Games + Open table and keep every row whose year is not 2023.
df_games_and_open = (
    pd.read_csv(base_path + "csv_files/version_2_clean_data/df_games_and_open.csv")
    .loc[lambda frame: frame['year'] != 2023]
)

In [4]:
# Preview the loaded table (the notebook renders a truncated view of rows and columns).
df_games_and_open

Unnamed: 0,competitorId,competitorName,firstName,lastName,gender,genderId,age,ageNull,height,heightNull,...,status,statusId,year,overallRank,overallScore,openCompetitor,openRank,openScore,gamesCompetitions,openCompetitions
0,1616,Russ Greene,Russ,Greene,M,1,20,0,178.000000,0,...,ACT,1,2007,11,232,0,140133,604776,1,0
1,1616,Russ Greene,Russ,Greene,M,1,21,0,178.000000,0,...,ACT,1,2008,53,21,0,140133,604776,2,0
2,1685,Christopher Woods,Christopher,Woods,M,1,29,0,163.000000,0,...,ACT,1,2008,32,19,0,140133,604776,1,0
3,1690,Travis Mayer,Travis,Mayer,M,1,23,0,181.000000,0,...,ACT,1,2014,29,483,1,17,566,1,1
4,1690,Travis Mayer,Travis,Mayer,M,1,25,0,181.000000,0,...,ACT,1,2016,10,702,1,3,86,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1709,2377148,Melanie Ward,Melanie,Ward,F,2,27,1,165.374825,1,...,ACT,1,2007,14,237,0,140133,604776,1,1
1710,2377149,Susan Kopcha,Susan,Kopcha,F,2,27,1,165.374825,1,...,WD,2,2007,16,162,0,140133,604776,1,1
1711,2377150,Melanie Ayres,Melanie,Ayres,F,2,27,1,165.374825,1,...,WD,2,2007,18,81,0,140133,604776,1,1
1712,2377151,Jaime Stumpf,Jaime,Stumpf,F,2,27,1,165.374825,1,...,WD,2,2007,19,76,0,140133,604776,1,1


### Numeric Variables

In [5]:
# Keep only the numeric columns, then summarise them with one row per variable.
numerical_games = df_games_and_open.select_dtypes(include="number")
numerical_games.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
competitorId,1633.0,317673.375383,562906.224038,1616.0,15679.0,58489.0,310970.0,2377152.0
genderId,1633.0,1.461115,0.498638,1.0,1.0,1.0,2.0,2.0
age,1633.0,27.86038,4.544724,13.0,25.0,27.0,30.0,54.0
ageNull,1633.0,0.047152,0.21203,0.0,0.0,0.0,0.0,1.0
height,1633.0,171.785128,8.450826,150.0,165.374825,171.0,178.0,199.0
heightNull,1633.0,0.103491,0.304692,0.0,0.0,0.0,0.0,1.0
weight,1633.0,77.746615,12.987179,50.0,65.663854,80.0,88.375447,109.0
weightNull,1633.0,0.100429,0.300663,0.0,0.0,0.0,0.0,1.0
bmi,1633.0,26.169534,2.659314,15.65617,24.013081,26.21641,28.08626,39.55556
bmiNull,1633.0,0.109614,0.312504,0.0,0.0,0.0,0.0,1.0


##### Heatmap

In [6]:
# Pairwise correlation (DataFrame.corr default method) between all numeric columns
numerical_games_correlation = numerical_games.corr()

# Create the heatmap using Plotly.
# zmin/zmax pin the colour range to the full [-1, 1] correlation interval so the
# neutral midpoint of the diverging 'RdBu' scale sits exactly at zero correlation.
# With an auto-scaled range the midpoint drifts to (min + max) / 2 of the observed
# values, which is above zero here because the diagonal is always 1.
fig = go.Figure(data=go.Heatmap(
    z=numerical_games_correlation.values,
    x=numerical_games_correlation.columns,
    y=numerical_games_correlation.columns,
    zmin=-1,
    zmax=1,
    colorscale='RdBu',
    colorbar=dict(title='Correlation'),
))

# Set the layout options for the heatmap.
# The y axis is reversed so the diagonal runs from the top-left to the bottom-right.
fig.update_layout(
    title='Correlation Heatmap',
    xaxis=dict(title='Columns', tickangle=-45, automargin=True, tickfont=dict(size=10)),
    yaxis=dict(title='Columns', autorange='reversed', tickangle=0, automargin=True, tickfont=dict(size=10)),
    height=600,
)

# Show the plot
fig.show()


In [7]:
# List the numeric column names available for the plots below.
numerical_games.columns

Index(['competitorId', 'genderId', 'age', 'ageNull', 'height', 'heightNull',
       'weight', 'weightNull', 'bmi', 'bmiNull', 'affiliateId',
       'countryOfOriginCode', 'regionId', 'statusId', 'year', 'overallRank',
       'overallScore', 'openCompetitor', 'openRank', 'openScore',
       'gamesCompetitions', 'openCompetitions'],
      dtype='object')

##### Histograms

In [8]:
# Draw one 20-bin histogram per selected numeric variable.
for column in (
    'age', 'height', 'weight', 'bmi', 'regionId', 'year', 'overallRank', 'overallScore',
    'openRank', 'openScore', 'gamesCompetitions', 'openCompetitions',
):
    # Single-trace figure holding the distribution of the current variable
    fig = go.Figure(
        data=[
            go.Histogram(
                x=numerical_games[column],
                nbinsx=20,
                name=f"{column} Distribution",
            )
        ]
    )

    # Title and axis captions
    fig.update_layout(
        title=f"{column} Distribution",
        xaxis_title=column,
        yaxis_title="Frequency",
    )

    fig.show()

##### Histograms per gender

In [9]:
# Body-measurement histograms, with bars coloured by the numeric gender code.
for column in ('height', 'weight', 'bmi'):
    fig = px.histogram(
        data_frame=numerical_games,
        x=column,
        color='genderId',
        labels={'genderId': 'Gender', column: column},
        title=f"{column} Distribution by Gender",
        nbins=20,
        opacity=0.7,
    )
    fig.show()

### Categorical Variables

In [10]:
# Keep only the object-dtype (text) columns, then summarise them with one row per variable.
categorical_games = df_games_and_open.select_dtypes(include="object")
categorical_games.describe().transpose()

Unnamed: 0,count,unique,top,freq
competitorName,1633,951,Annie Thorisdottir,11
firstName,1633,624,Jason,21
lastName,1632,846,Smith,31
gender,1633,2,M,880
affiliateName,839,448,CrossFit Mayhem,17
countryOfOriginName,1140,115,United States,596
regionName,1178,7,North America,694
status,1633,5,ACT,909


##### Bar charts

In [11]:
# Draw one bar chart per selected categorical variable.
for column in ('gender', 'regionName', 'status'):
    # Rows per category, ordered by category label.
    # value_counts leaves out missing values by default.
    category_counts = categorical_games[column].value_counts().sort_index()

    # Single-trace bar figure: categories on x, row counts on y
    fig = go.Figure(
        data=[
            go.Bar(
                x=category_counts.index,
                y=category_counts.values,
                name=f"{column} Distribution",
            )
        ]
    )

    # Title and axis captions
    fig.update_layout(
        title=f"{column} Distribution",
        xaxis_title=column,
        yaxis_title="Count",
    )

    fig.show()

### Averages

##### OverallScore

In [12]:
# For each grouping variable, plot the mean overallScore of every group as a bar chart.
for column in (
    'gender', 'age', 'height', 'weight', 'bmi', 'regionName', 'status', 'year',
    'gamesCompetitions', 'openCompetitions',
):
    # Mean overallScore per distinct value of the grouping variable.
    # groupby leaves out rows whose grouping value is missing (pandas default).
    mean_score_by_group = df_games_and_open.groupby(column)['overallScore'].mean()

    # Single-trace bar figure: group values on x, mean score on y
    fig = go.Figure(
        data=[go.Bar(x=mean_score_by_group.index, y=mean_score_by_group.values)]
    )

    # Title and axis captions
    fig.update_layout(
        title=f"Average overallScore by {column}",
        xaxis_title=column,
        yaxis_title="Average overallScore",
    )

    fig.show()

##### OverallRank

In [13]:
# For each grouping variable, plot the mean overallRank of every group as a bar chart.
for column in (
    'gender', 'age', 'height', 'weight', 'bmi', 'regionName', 'status', 'year',
    'gamesCompetitions', 'openCompetitions',
):
    # Mean overallRank per distinct value of the grouping variable.
    # groupby leaves out rows whose grouping value is missing (pandas default).
    mean_rank_by_group = df_games_and_open.groupby(column)['overallRank'].mean()

    # Single-trace bar figure: group values on x, mean rank on y
    fig = go.Figure(
        data=[go.Bar(x=mean_rank_by_group.index, y=mean_rank_by_group.values)]
    )

    # Title and axis captions
    fig.update_layout(
        title=f"Average overallRank by {column}",
        xaxis_title=column,
        yaxis_title="Average overallRank",
    )

    fig.show()