# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

# Definitions

In [2]:
base_path = "C:/Users/Zi/Documents/data-science-crossfit/"

# Feature Engineering

In [3]:
df_games_and_open = pd.read_csv(base_path + "csv_files/version_2_clean_data/df_games_and_open.csv")
df_games_and_open = df_games_and_open[df_games_and_open['year'] != 2023]

##### Mutual Information

In [5]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X, y, discrete_features):
    # Compute mutual information scores
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    
    # Create a Series with MI scores, using column names as index
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    
    # Sort the MI scores in descending order
    mi_scores = mi_scores.sort_values(ascending=False)
    
    return mi_scores

# Select the relevant data and target variables
X = df_games_and_open.select_dtypes(include=[np.number]).copy()
y = X.pop('overallScore')

# Identify discrete features (integer columns)
discrete_features = X.dtypes == int

# Compute MI scores using the make_mi_scores function
mi_scores = make_mi_scores(X, y, discrete_features)

# Output the MI scores
mi_scores

overallRank            0.930419
year                   0.897714
countryOfOriginCode    0.477399
openScore              0.473782
regionId               0.468607
openRank               0.453544
openCompetitor         0.316697
statusId               0.306112
competitorId           0.201775
affiliateId            0.187185
gamesCompetitions      0.178987
openCompetitions       0.177511
height                 0.166275
weight                 0.153618
bmiNull                0.099392
bmi                    0.099323
weightNull             0.080892
heightNull             0.076430
ageNull                0.070163
age                    0.042313
genderId               0.009250
Name: MI Scores, dtype: float64

In [6]:
def plot_mi_scores(scores):
    # Sort the scores in ascending order
    scores = scores.sort_values(ascending=True)
    
    # Extract the scores and feature names as lists
    width = list(scores)
    ticks = list(scores.index)
    
    # Create a horizontal bar chart with Plotly
    fig = go.Figure(go.Bar(
        x=width,
        y=ticks,
        orientation='h'
    ))
    
    # Update the layout of the chart
    fig.update_layout(
        title="Mutual Information Scores",  # Set the chart title
        yaxis_title="Features",  # Set the y-axis label
        xaxis_title="Mutual Information Score"  # Set the x-axis label
    )
    
    # Display the chart
    fig.show()

plot_mi_scores(mi_scores)

In [7]:
df_games_and_open.columns

Index(['competitorId', 'competitorName', 'firstName', 'lastName', 'gender',
       'genderId', 'age', 'ageNull', 'height', 'heightNull', 'weight',
       'weightNull', 'bmi', 'bmiNull', 'affiliateName', 'affiliateId',
       'countryOfOriginName', 'countryOfOriginCode', 'regionName', 'regionId',
       'status', 'statusId', 'year', 'overallRank', 'overallScore',
       'openCompetitor', 'openRank', 'openScore', 'gamesCompetitions',
       'openCompetitions'],
      dtype='object')

In [8]:
data = df_games_and_open[['regionId', 'genderId', 'age', 'height', 'weight', 'bmi', 'overallRank', 'overallScore', 'openRank', 'openScore',
                  'openCompetitor']]

# Get the list of column names, excluding the 'overallScores' column
x_columns = [col for col in data.columns if col != 'overallScore']

# Create a scatter plot for each column
for column in x_columns:
    fig = px.scatter(data_frame=data, x=column, y='overallScore', title=f'Scatter plot of {column} vs. overallScore')
    fig.show()