In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('../../datasets/athletics.csv')
df.head()
df['Event'].unique()

array(['Shot Put', 'Weight', 'Javelin', 'Discus', 'Hammer'], dtype=object)

In [3]:
df.isna().sum()

Event          0
Male_Female    0
EventID        0
Athlete        0
Flight1        0
Flight2        0
Flight3        0
Flight4        0
Flight5        0
Flight6        0
dtype: int64

In [4]:
# drop the EventID column
df.drop('EventID', axis=1, inplace=True)
df.head()

Unnamed: 0,Event,Male_Female,Athlete,Flight1,Flight2,Flight3,Flight4,Flight5,Flight6
0,Shot Put,Male,Ryan Whiting,21.29,21.0,0.0,0.0,21.8,21.53
1,Shot Put,Male,Cory Martin,20.85,0.0,0.0,0.0,20.61,20.93
2,Shot Put,Male,Kurt Roberts,20.89,19.63,0.0,0.0,0.0,20.55
3,Shot Put,Male,Derrick Vicars,19.36,0.0,0.0,0.0,19.19,19.83
4,Shot Put,Male,Jacob Thormaehlen,18.35,18.92,0.0,19.1,0.0,19.52


In [5]:
# Data analysis by gender for each event
df_grouped = df.groupby(['Event', 'Male_Female']).mean().reset_index()

# Rank for the best athlete for each event considering the max flight, for each gender
df['Max_Flight'] = df[['Flight1', 'Flight2', 'Flight3', 'Flight4', 'Flight5', 'Flight6']].max(axis=1)
df_ranked = df.sort_values(['Event', 'Male_Female', 'Max_Flight'], ascending=[True, True, False])
df_ranked = df_ranked.groupby(['Event', 'Male_Female']).first().reset_index()

# Plot the data using Plotly
fig = px.bar(df_ranked, x='Event', y='Max_Flight', color='Male_Female', barmode='group')
fig.show()

In [6]:
# Rank for the best athlete for each event considering the max flight, for each gender
df['Max_Flight'] = df[['Flight1', 'Flight2', 'Flight3', 'Flight4', 'Flight5', 'Flight6']].max(axis=1)
df_ranked = df.sort_values(['Event', 'Male_Female', 'Max_Flight'], ascending=[True, True, False])

# Print the rank list of the best flights
for gender in df_ranked['Male_Female'].unique():
    print(f"Rank list for {gender} events:")
    for i, event in enumerate(df_ranked[df_ranked['Male_Female'] == gender]['Event'].unique()):
        print(f"{i+1} - {event}:")
        event_ranked = df_ranked[(df_ranked['Event'] == event) & (df_ranked['Male_Female'] == gender)]
        for k, (j, row) in enumerate(event_ranked.iterrows()):
            if k > 4:
                break
            print(f"\t{j+1} - Athlete: {row['Athlete']}, Flight Value: {row['Max_Flight']}")

Rank list for Female events:
1 - Discus:
	158 - Athlete: Gia Lewis-Smallwood, Flight Value: 65.96
	918 - Athlete: Stephanie Brown Trafton, Flight Value: 65.18
	68 - Athlete: Gia Lewis-Smallwood, Flight Value: 65.13
	1892 - Athlete: Andrew Evans, Flight Value: 64.46
	1191 - Athlete: Gia Lewis-Smallwood, Flight Value: 64.01
2 - Hammer:
	2054 - Athlete: Deanna Price, Flight Value: 78.12
	1546 - Athlete: Gwen Berry, Flight Value: 76.31
	82 - Athlete: Amanda Bingson, Flight Value: 75.73
	770 - Athlete: Sultana Frizell, Flight Value: 75.73
	172 - Athlete: Amanda Bingson, Flight Value: 75.07
3 - Javelin:
	1148 - Athlete: Kara Patterson, Flight Value: 64.94
	2062 - Athlete: Kara Winger, Flight Value: 62.88
	1831 - Athlete: Kara Winger, Flight Value: 62.8
	143 - Athlete: Kara Patterson, Flight Value: 62.42
	144 - Athlete: Brittany Borman, Flight Value: 62.05
4 - Shot Put:
	38 - Athlete: Michelle Carter, Flight Value: 20.24
	1140 - Athlete: Michelle Carter, Flight Value: 20.02
	1866 - Athlete: R

In [7]:
# calculate mean of flights 1 to 3
df['mean_y'] = df[['Flight1', 'Flight2', 'Flight3']].mean(axis=1)

# calculate mean of flights 4 to 6
df['mean_x'] = df[['Flight4', 'Flight5', 'Flight6']].mean(axis=1)

In [8]:
import plotly.express as px

# loop through each unique event in the dataset
for event in df['Event'].unique():
    # filter dataset by event
    event_df = df[df['Event'] == event]

    # plot data with a different color for each gender
    fig = px.scatter(event_df, x='mean_x', y='mean_y', color='Male_Female',
                     title=event, labels={'mean_x': 'Mean of Flights 4 to 6', 'mean_y': 'Mean of Flights 1 to 3'})
    fig.show()

In [9]:
# create a new column 'gender' with 0 for female and 1 for male
df['gender'] = df['Male_Female'].apply(lambda x: 1 if x == 'Male' else 0)

# create a new dataframe with the relevant columns
df_rf = df[['gender', 'Flight1', 'Flight2', 'Flight3', 'Flight4', 'Flight5', 'Flight6', 'Event']]

In [10]:
from sklearn.model_selection import train_test_split

# split the data into training and testing sets
X = df_rf.drop('Event', axis=1)
y = df_rf['Event']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
y.unique()

array(['Shot Put', 'Weight', 'Javelin', 'Discus', 'Hammer'], dtype=object)

In [11]:
import plotly.express as px
from sklearn.cluster import KMeans

# Prepare the data
df["Gender"] = df["Male_Female"].apply(lambda x: 1 if x == "Male" else 0)
X = df[["Flight1", "Flight2", "Flight3", "Flight4", "Flight5", "Flight6"]]
X["mean_x"] = X[["Flight1", "Flight2", "Flight3"]].mean(axis=1)
X["mean_y"] = X[["Flight4", "Flight5", "Flight6"]].mean(axis=1)

# Perform K-means clustering
kmeans = KMeans(n_clusters=5, random_state=0)
kmeans.fit(X[["mean_x", "mean_y"]])

# Add the predicted clusters to the original dataframe
df["Cluster"] = kmeans.predict(X[["mean_x", "mean_y"]])

# Plot the clusters
fig = px.scatter(df, x="mean_x", y="mean_y", color="Cluster")
fig.show()

# Predict the Event based on the clusters
event_map = {
    0: "Shot Put",
    1: "Weight",
    2: "Javelin",
    3: "Discus",
    4: "Hammer"
}
df["Event"] = df["Cluster"].map(event_map)

In [12]:
from sklearn.metrics import silhouette_score
silhouette_score(X[["mean_x", "mean_y"]], kmeans.labels_)

0.5005713697872408

In [13]:
from sklearn.ensemble import GradientBoostingClassifier

df = df.dropna()
# Split the data into features (X) and target (y)
X = df.drop(['Event','Male_Female','Athlete','Max_Flight', 'mean_y', 'mean_x', 'gender', 'Cluster'], axis=1)
y = df['Event']
y.unique()

array(['Shot Put', 'Discus', 'Weight', 'Javelin', 'Hammer'], dtype=object)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
X_train.head()

Unnamed: 0,Flight1,Flight2,Flight3,Flight4,Flight5,Flight6,Gender
713,0.0,16.16,16.19,15.62,16.36,16.26,0
1676,23.76,23.84,23.86,23.83,24.09,24.3,0
154,59.36,60.52,0.0,0.0,0.0,60.46,1
192,16.28,15.75,16.48,0.0,0.0,15.89,0
2071,19.71,20.74,0.0,0.0,0.0,20.57,1


In [15]:
import plotly.graph_objs as go
def gen_cm_plotly(cm):
    # compute success percentages and format as strings with percentage symbol
    total_predictions = cm.sum(axis=1)
    success_percentages = np.zeros_like(cm, dtype=float)
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            if total_predictions[i] > 0:
                success_percentages[i,j] = cm[i,j] / total_predictions[i] * 100
    success_percentages = np.char.add(success_percentages.round(1).astype(str), '%')

    # create text labels for each cell
    text = [[f"Count: {cm[j][i]:,}<br>Success: {success_percentages[j][i]}%" for i in range(len(cm[j]))] for j in range(len(cm))]

    # plot the confusion matrix using plotly
    fig = go.Figure(data=go.Heatmap(
                    z=cm,
                    x=y.unique(),
                    y=y.unique(),
                    colorscale='inferno',
                    text=text,
                    hovertemplate='%{text}<extra></extra>'
                    ))
    # add annotations as percentages to heatmap
    for i in range(len(cm)):
        for j in range(len(cm)):
            fig.add_annotation(x=j, y=i, text=success_percentages[i][j], showarrow=False, font=dict(color='white', size=12))
            
    fig.update_layout(
        title='Confusion Matrix',
        xaxis_title='Predicted Event',
        yaxis_title='True Event',
        font=dict(
            size=14,
            color='black'
        )
    )
    fig.update_traces(textfont_color='white')
    fig.show()

In [16]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# Create and fit a Gradient Boosting classifier
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
# make predictions on the testing set
y_pred = gb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# create a confusion matrix
cm = confusion_matrix(y_test, y_pred)
gen_cm_plotly(cm)

Accuracy: 0.9492063492063492


In [17]:
from sklearn.ensemble import RandomForestClassifier
# Create and fit a Random Forest classifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
# make predictions on the testing set
y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# create a confusion matrix
cm = confusion_matrix(y_test, y_pred)
gen_cm_plotly(cm)

Accuracy: 0.9634920634920635


In [18]:
X_train.columns, X_train['Gender'].unique()

(Index(['Flight1', 'Flight2', 'Flight3', 'Flight4', 'Flight5', 'Flight6',
        'Gender'],
       dtype='object'),
 array([0, 1], dtype=int64))

In [19]:
#let`s try a new data to predict
new_data = pd.DataFrame({
    'Flight1': 61.18,
    'Flight2': 65,
    'Flight3': 0,
    'Flight4': 51.74,
    'Flight5': 56.74,
    'Flight6': 63.32,
    'Gender': 1
}, index=[0])

# use the trained model to predict the output for the new data
new_pred = rf.predict(new_data)
proba = rf.predict_proba(new_data)

new_pred, proba

(array(['Weight'], dtype=object), array([[0.06, 0.27, 0.04, 0.  , 0.63]]))

In [20]:
#let`s try a new data to predict
new_data = pd.DataFrame({
    'Flight1': 61.7,
    'Flight2': 58,
    'Flight3': 57.2,
    'Flight4': 0,
    'Flight5': 47,
    'Flight6': 58.5,
    'Gender': 0
}, index=[0])

# use the trained model to predict the output for the new data
new_pred = rf.predict(new_data)
proba = rf.predict_proba(new_data)

new_pred, proba

(array(['Discus'], dtype=object), array([[0.85, 0.05, 0.08, 0.  , 0.02]]))

In [21]:
#let`s try a new data to predict
new_data = pd.DataFrame({
    'Flight1': 60.7,
    'Flight2': 62,
    'Flight3': 57.2,
    'Flight4': 61,
    'Flight5': 60,
    'Flight6': 61.5,
    'Gender': 0
}, index=[0])

# use the trained model to predict the output for the new data
new_pred = rf.predict(new_data)
proba = rf.predict_proba(new_data)

new_pred, proba

(array(['Javelin'], dtype=object), array([[0.03, 0.03, 0.85, 0.  , 0.09]]))