# Imports

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

# Definitions

In [11]:
base_path = "C:/Users/Zi/Documents/data-science-crossfit/"

# Feature Engineering

### Games

In [12]:
df_games_conso = pd.read_csv(base_path + "csv_files/version_2_clean_data/df_games_conso.csv")
df_games_conso = df_games_conso[df_games_conso['year'] != 2023]

In [13]:
df_games_conso

Unnamed: 0,competitorId,competitorName,firstName,lastName,status,gender,countryOfOriginCode,countryOfOriginName,regionId,regionName,affiliateId,affiliateName,age,height,weight,overallRank,overallScore,genderId,year,bmi
0,164070,James Fitzgerald,James,Fitzgerald,ACT,M,0,,0,,0.0,,33.0,176.0,80.0,1,272,1,2007,25.826446
1,57785,Brett Marshall,Brett,Marshall,ACT,M,0,,0,,0.0,,33.0,168.0,71.0,2,270,1,2007,25.155896
2,10091,Josh Everett,Josh,Everett,ACT,M,0,,0,,0.0,,32.0,176.0,84.0,3,267,1,2007,27.117769
3,27065,Chris Spealler,Chris,Spealler,ACT,M,0,,0,,0.0,,28.0,166.0,65.0,4,261,1,2007,23.588329
4,7173,Breck Berry,Breck,Berry,ACT,M,0,,0,,0.0,,31.0,171.0,78.0,5,252,1,2007,26.674874
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1628,314703,Caroline Conners,Caroline,Conners,CUT,F,1,United States,1,North America,1926.0,CrossFit MF,29.0,155.0,61.0,36,281,2,2022,25.390219
1629,1558034,Julia Kato,Julia,Kato,CUT,F,19,Brazil,3,South America,17913.0,CrossFit Al Ain,22.0,159.0,67.0,37,258,2,2022,26.502116
1630,762495,Elena Carratala Sanahuja,Elena,Carratala Sanahuja,CUT,F,15,Spain,4,Europe,19523.0,CrossFit 4 Friends,28.0,164.0,63.0,38,223,2,2022,23.423557
1631,505225,Michelle Merand,Michelle,Merand,CUT,F,9,South Africa,5,Africa,0.0,,33.0,158.0,60.0,39,210,2,2022,24.034610


### Open

In [14]:
df_open_conso = pd.read_csv(base_path + "csv_files/version_2_clean_data/df_open_conso.csv")
df_open_conso = df_open_conso[df_open_conso['year'] != 2023]

In [15]:
df_open_conso

Unnamed: 0,competitorId,overallRank,overallScore,year
0,47661,1,43,2011
1,11435,3,61,2011
2,151906,4,75,2011
3,10169,5,112,2011
4,5284,6,157,2011
...,...,...,...,...
1143,254824,172,876,2022
1144,1232297,193,969,2022
1145,505225,279,1376,2022
1146,1190381,307,1533,2022


### Merged

In [16]:
# Rename columns 'overallRank' and 'overallScore' to 'openRank' and 'openScore'
df_open_conso = df_open_conso.rename(columns={'overallRank': 'openRank', 'overallScore': 'openScore'})

# Merge the two DataFrames based on 'competitorId' and 'year'
df_merged = df_games_conso.merge(df_open_conso[['competitorId', 'year', 'openRank', 'openScore']], on=['competitorId', 'year'], how='left')

# Create 'openCompetitor' column and set it to 1 for open competitors
df_merged['openCompetitor'] = 1

# Set 'openCompetitor' to 0 for non-open competitors (where 'openRank' is null)
df_merged.loc[df_merged['openRank'].isnull(), 'openCompetitor'] = 0

# Fill null values in 'openScore' with 0
df_merged['openScore'] = df_merged['openScore'].fillna(0)

# Fill null values in 'openRank' with the maximum value in the column
df_merged['openRank'] = df_merged['openRank'].fillna(df_merged['openRank'].max())

In [17]:
df_merged

Unnamed: 0,competitorId,competitorName,firstName,lastName,status,gender,countryOfOriginCode,countryOfOriginName,regionId,regionName,...,height,weight,overallRank,overallScore,genderId,year,bmi,openRank,openScore,openCompetitor
0,164070,James Fitzgerald,James,Fitzgerald,ACT,M,0,,0,,...,176.0,80.0,1,272,1,2007,25.826446,140133.0,0.0,0
1,57785,Brett Marshall,Brett,Marshall,ACT,M,0,,0,,...,168.0,71.0,2,270,1,2007,25.155896,140133.0,0.0,0
2,10091,Josh Everett,Josh,Everett,ACT,M,0,,0,,...,176.0,84.0,3,267,1,2007,27.117769,140133.0,0.0,0
3,27065,Chris Spealler,Chris,Spealler,ACT,M,0,,0,,...,166.0,65.0,4,261,1,2007,23.588329,140133.0,0.0,0
4,7173,Breck Berry,Breck,Berry,ACT,M,0,,0,,...,171.0,78.0,5,252,1,2007,26.674874,140133.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1628,314703,Caroline Conners,Caroline,Conners,CUT,F,1,United States,1,North America,...,155.0,61.0,36,281,2,2022,25.390219,51.0,285.0,1
1629,1558034,Julia Kato,Julia,Kato,CUT,F,19,Brazil,3,South America,...,159.0,67.0,37,258,2,2022,26.502116,133.0,701.0,1
1630,762495,Elena Carratala Sanahuja,Elena,Carratala Sanahuja,CUT,F,15,Spain,4,Europe,...,164.0,63.0,38,223,2,2022,23.423557,44.0,259.0,1
1631,505225,Michelle Merand,Michelle,Merand,CUT,F,9,South Africa,5,Africa,...,158.0,60.0,39,210,2,2022,24.034610,279.0,1376.0,1


##### Mutual Information

In [18]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X, y, discrete_features):
    # Compute mutual information scores
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    
    # Create a Series with MI scores, using column names as index
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    
    # Sort the MI scores in descending order
    mi_scores = mi_scores.sort_values(ascending=False)
    
    return mi_scores

# Select the relevant data and target variables
X = df_merged.select_dtypes(include=[np.number]).copy()
y = X.pop('overallScore')

# Identify discrete features (integer columns)
discrete_features = X.dtypes == int

# Compute MI scores using the make_mi_scores function
mi_scores = make_mi_scores(X, y, discrete_features)

# Output the MI scores
mi_scores

overallRank            0.938913
year                   0.904747
countryOfOriginCode    0.469767
regionId               0.465831
openRank               0.434165
openScore              0.389867
openCompetitor         0.341312
competitorId           0.199925
affiliateId            0.185698
height                 0.162182
weight                 0.161047
bmi                    0.105456
age                    0.055637
genderId               0.025598
Name: MI Scores, dtype: float64

In [19]:
def plot_mi_scores(scores):
    # Sort the scores in ascending order
    scores = scores.sort_values(ascending=True)
    
    # Extract the scores and feature names as lists
    width = list(scores)
    ticks = list(scores.index)
    
    # Create a horizontal bar chart with Plotly
    fig = go.Figure(go.Bar(
        x=width,
        y=ticks,
        orientation='h'
    ))
    
    # Update the layout of the chart
    fig.update_layout(
        title="Mutual Information Scores",  # Set the chart title
        yaxis_title="Features",  # Set the y-axis label
        xaxis_title="Mutual Information Score"  # Set the x-axis label
    )
    
    # Display the chart
    fig.show()

plot_mi_scores(mi_scores)

In [21]:
df_merged.columns

Index(['competitorId', 'competitorName', 'firstName', 'lastName', 'status',
       'gender', 'countryOfOriginCode', 'countryOfOriginName', 'regionId',
       'regionName', 'affiliateId', 'affiliateName', 'age', 'height', 'weight',
       'overallRank', 'overallScore', 'genderId', 'year', 'bmi', 'openRank',
       'openScore', 'openCompetitor'],
      dtype='object')

In [30]:
data = df_merged[['regionId', 'genderId', 'age', 'height', 'weight', 'bmi', 'overallRank', 'overallScore', 'openRank', 'openScore',
                  'openCompetitor']]

# Get the list of column names, excluding the 'overallScores' column
x_columns = [col for col in data.columns if col != 'overallScore']

# Create a scatter plot for each column
for column in x_columns:
    fig = px.scatter(data_frame=data, x=column, y='overallScore', title=f'Scatter plot of {column} vs. overallScore')
    fig.show()
