In [1]:
import numpy as np
import pandas as pd
from scipy.stats import gaussian_kde

In [2]:
# Load the raw race results; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="marathon_results.csv")

In [3]:
# Parse the "HH:MM:SS" finish strings into timedeltas, then express them as
# fractional minutes so they can be used as a numeric feature.
finish_delta = pd.to_timedelta(df["Finish"])
df["tot_minutes"] = finish_delta.dt.total_seconds() / 60

In [4]:
# Show the frame, now including the derived tot_minutes column.
df

Unnamed: 0,Age,M/F,Country,5K,10K,15K,20K,Half,25K,30K,35K,40K,Finish,Pace,Overall,Gender,Division,tot_minutes
0,25,M,ETH,00:14:43,00:29:43,00:44:57,01:00:29,01:04:02,01:16:07,01:32:00,01:47:59,02:02:39,02:09:17,00:04:56,1,1,1,129.283333
1,30,M,ETH,00:14:43,00:29:43,00:44:58,01:00:28,01:04:01,01:16:07,01:31:59,01:47:59,02:02:42,02:09:48,00:04:58,2,2,2,129.800000
2,29,M,KEN,00:14:43,00:29:43,00:44:57,01:00:29,01:04:02,01:16:07,01:32:00,01:47:59,02:03:01,02:10:22,00:04:59,3,3,3,130.366667
3,28,M,KEN,00:14:43,00:29:44,00:45:01,01:00:29,01:04:02,01:16:07,01:32:00,01:48:03,02:03:47,02:10:47,00:05:00,4,4,4,130.783333
4,32,M,KEN,00:14:43,00:29:44,00:44:58,01:00:28,01:04:01,01:16:07,01:32:00,01:47:59,02:03:27,02:10:49,00:05:00,5,5,5,130.816667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26293,64,F,USA,00:50:15,01:43:31,02:36:53,03:32:26,03:43:46,04:25:53,05:19:44,06:17:19,07:13:34,07:38:56,00:17:31,26594,12015,269,458.933333
26294,61,F,USA,00:48:36,01:39:39,02:39:13,03:35:58,03:47:55,04:32:44,05:31:58,06:28:56,07:26:19,07:51:30,00:17:59,26595,12016,270,471.500000
26295,66,F,USA,00:53:03,01:47:16,02:41:45,03:37:07,03:48:21,04:33:51,05:38:56,06:38:51,07:36:18,07:59:33,00:18:18,26596,12017,91,479.550000
26296,53,M,USA,00:49:04,01:40:12,02:33:31,03:31:41,03:43:35,04:29:20,05:31:11,06:33:35,07:35:38,08:00:37,00:18:20,26597,14580,2055,480.616667


Split the data between training and testing

In [5]:
# Reproducible random half of the rows for training (fixed seed).
train_df = df.sample(random_state=1, frac=0.5)

In [6]:
# Everything that was not sampled for training becomes the test set.
test_df = df.drop(index=train_df.index)

Split the training data again into two gender groups. For the male runners, estimate the joint
density of finish time and age with a KDE (kdem) and the prior probability of being male (prob_m).
Do the same thing for the female runners (kdef, prob_f).

In [7]:
# Partition the training sample by the recorded gender flag.
male_mask = train_df["M/F"] == "M"
female_mask = train_df["M/F"] == "F"
train_dfm = train_df[male_mask]
train_dff = train_df[female_mask]

# One 2-D KDE per gender over (finish minutes, age). gaussian_kde expects one
# row per variable, hence the transpose. The two groups use different
# bandwidth factors (0.3 for male, 0.5 for female).
kde_columns = ["tot_minutes", "Age"]
kdem = gaussian_kde(train_dfm[kde_columns].T, bw_method=0.3)
kdef = gaussian_kde(train_dff[kde_columns].T, bw_method=0.5)

# Class priors taken from the relative group sizes in the training sample.
mc, fc = len(train_dfm), len(train_dff)
n_labelled = fc + mc

prob_f = fc / n_labelled
prob_m = mc / n_labelled

Define a predictor function for each gender that returns the posterior probability of that gender given finish time and age, to be applied to the test data.

In [8]:
def predictor_f(t, a):
    """Posterior probability that a runner is female given finish time and age.

    Applies Bayes' rule using the per-gender densities ``kdef`` / ``kdem``
    and the class priors ``prob_f`` / ``prob_m`` defined at notebook level.

    Parameters
    ----------
    t : finish time(s) in minutes; a scalar or a 1-D sequence.
    a : age(s), matching ``t`` in length.

    Returns
    -------
    P(F | t, a), one value per (t, a) pair. Where both weighted densities
    are exactly 0 the ratio is 0/0 and is therefore undefined.
    """
    # Evaluate each KDE exactly once and reuse the result: the density
    # evaluation is the expensive step and was previously done twice per call.
    point = [t, a]
    weighted_f = kdef(point) * prob_f
    weighted_m = kdem(point) * prob_m
    return weighted_f / (weighted_f + weighted_m)

In [9]:
def predictor_m(t, a):
    """Posterior probability that a runner is male given finish time and age.

    Applies Bayes' rule using the per-gender densities ``kdem`` / ``kdef``
    and the class priors ``prob_m`` / ``prob_f`` defined at notebook level.

    Parameters
    ----------
    t : finish time(s) in minutes; a scalar or a 1-D sequence.
    a : age(s), matching ``t`` in length.

    Returns
    -------
    P(M | t, a), one value per (t, a) pair. Where both weighted densities
    are exactly 0 the ratio is 0/0 and is therefore undefined.
    """
    # Evaluate each KDE exactly once and reuse the result: the density
    # evaluation is the expensive step and was previously done twice per call.
    point = [t, a]
    weighted_f = kdef(point) * prob_f
    weighted_m = kdem(point) * prob_m
    return weighted_m / (weighted_f + weighted_m)

In [10]:
# Score every test runner with the female posterior.
test_df["prob_F"] = predictor_f(t=test_df["tot_minutes"], a=test_df["Age"])

In [11]:
# Score every test runner with the male posterior.
test_df["prob_M"] = predictor_m(t=test_df["tot_minutes"], a=test_df["Age"])

In [12]:
# Inspect the test rows with the prob_F / prob_M columns attached.
test_df

Unnamed: 0,Age,M/F,Country,5K,10K,15K,20K,Half,25K,30K,35K,40K,Finish,Pace,Overall,Gender,Division,tot_minutes,prob_F,prob_M
1,30,M,ETH,00:14:43,00:29:43,00:44:58,01:00:28,01:04:01,01:16:07,01:31:59,01:47:59,02:02:42,02:09:48,00:04:58,2,2,2,129.800000,0.112831,8.871690e-01
9,33,M,UKR,00:15:14,00:30:34,00:46:05,01:01:43,01:05:07,01:17:18,01:33:11,01:49:43,02:06:16,02:13:52,00:05:07,10,10,10,133.866667,0.124469,8.755307e-01
10,33,M,USA,00:14:46,00:29:50,00:45:33,01:01:20,01:04:48,01:17:08,01:33:12,01:49:52,02:06:55,02:13:52,00:05:07,11,11,11,133.866667,0.124469,8.755307e-01
14,42,M,ITA,00:15:53,00:32:17,00:48:32,01:04:49,01:08:21,01:21:17,01:38:02,01:54:55,02:11:25,02:18:44,00:05:18,15,15,1,138.733333,0.175468,8.245325e-01
15,29,M,USA,00:16:08,00:32:19,00:48:41,01:05:01,01:08:38,01:21:18,01:38:01,01:54:54,02:11:37,02:19:12,00:05:19,16,16,15,139.200000,0.115613,8.843874e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26287,49,M,USA,00:39:11,01:21:33,02:06:32,02:57:51,03:08:56,03:48:37,04:40:33,05:30:51,06:18:53,06:39:52,00:15:16,26588,14575,2484,399.866667,0.515097,4.849033e-01
26288,47,F,USA,00:43:22,01:27:40,02:13:36,03:01:58,03:12:16,03:49:07,04:41:53,05:34:15,06:23:00,06:42:01,00:15:20,26589,12014,1832,402.016667,0.374358,6.256425e-01
26295,66,F,USA,00:53:03,01:47:16,02:41:45,03:37:07,03:48:21,04:33:51,05:38:56,06:38:51,07:36:18,07:59:33,00:18:18,26596,12017,91,479.550000,0.999996,3.711321e-06
26296,53,M,USA,00:49:04,01:40:12,02:33:31,03:31:41,03:43:35,04:29:20,05:31:11,06:33:35,07:35:38,08:00:37,00:18:20,26597,14580,2055,480.616667,0.999999,6.374998e-07


Classifying a runner as female whenever prob_F >= 0.5, the KDE model correctly predicts the gender of the test runners from finish time and age with just over 75% accuracy.

In [13]:
# Fraction of test runners whose thresholded prediction (prob_F >= 0.5 means
# "F") agrees with the recorded gender. The mean of the boolean agreement
# mask is that fraction, computed vectorised rather than with the builtin
# sum(), which walks the Series one element at a time in Python.
((test_df["prob_F"] >= 0.5) == (test_df["M/F"] == "F")).mean()

0.7512358354247471

## Using K-NN

In [14]:
from sklearn.neighbors import KNeighborsClassifier

Use half the data for training and the other half for testing. Store the features as a list of (age, time) tuples and the labels as an array of genders.

In [15]:
# Reproducible 50% random sample of the rows for training (fixed seed).
train_df = df.sample(frac=0.5, random_state=1)
# Build the (age, time) tuples in a single pass over the two columns instead
# of issuing two .iloc lookups per row from a Python-level index loop.
train_X = list(train_df[["Age", "tot_minutes"]].itertuples(index=False, name=None))
# Labels as a flat array of "M"/"F" values.
train_Y = train_df["M/F"].to_numpy()

In [16]:
# The rows that were not sampled for training form the test set.
test_df = df.drop(train_df.index)
# Build the (age, time) tuples in a single pass over the two columns instead
# of issuing two .iloc lookups per row from a Python-level index loop.
test_X = list(test_df[["Age", "tot_minutes"]].itertuples(index=False, name=None))
# Ground-truth labels as a flat array of "M"/"F" values.
test_Y = test_df["M/F"].to_numpy()

In [17]:
# Fit a K-NN classifier with the library's default settings on the training
# tuples, then echo the fitted estimator as the cell output.
knn = KNeighborsClassifier().fit(train_X, train_Y)
knn

KNeighborsClassifier()

In [18]:
# Predict a gender label for every test runner and show the result.
prediction = knn.predict(X=test_X)
prediction

array(['M', 'M', 'M', ..., 'M', 'M', 'M'], dtype=object)

In [19]:
# Ground-truth labels, shown for a quick visual comparison with the predictions.
test_Y

array(['M', 'M', 'M', ..., 'F', 'M', 'M'], dtype=object)

This gives an accuracy of nearly 73%.

In [20]:
# Share of test runners whose predicted label equals the true label.
is_correct = prediction == test_Y
accuracy = np.mean(is_correct)
accuracy

0.7281162065556316

## Attempted improvement: increase the size of the training data (K-NN only)

Doing the same thing as above with the size of the training data increased to 90%. 

In [21]:
# Reproducible 90% random sample of the rows for training (fixed seed).
train_df = df.sample(frac=0.9, random_state=1)
# Build the (age, time) tuples in a single pass over the two columns instead
# of issuing two .iloc lookups per row from a Python-level index loop.
train_X = list(train_df[["Age", "tot_minutes"]].itertuples(index=False, name=None))
# Labels as a flat array of "M"/"F" values.
train_Y = train_df["M/F"].to_numpy()

In [22]:
# The remaining rows (those not sampled for training) form the test set.
test_df = df.drop(train_df.index)
# Build the (age, time) tuples in a single pass over the two columns instead
# of issuing two .iloc lookups per row from a Python-level index loop.
test_X = list(test_df[["Age", "tot_minutes"]].itertuples(index=False, name=None))
# Ground-truth labels as a flat array of "M"/"F" values.
test_Y = test_df["M/F"].to_numpy()

In [23]:
# Refit a default K-NN classifier on the larger training set and echo the
# fitted estimator as the cell output.
knn = KNeighborsClassifier().fit(train_X, train_Y)
knn

KNeighborsClassifier()

In [24]:
# Predict a gender label for every runner in the smaller test set.
prediction = knn.predict(X=test_X)

In [25]:
# Share of test runners whose predicted label equals the true label.
is_correct = prediction == test_Y
accuracy = np.mean(is_correct)
accuracy

0.7212927756653993

## Conclusion

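Overall, we have a good accuracy score for both predictions using KDE and K-NN. However, the prediction with KDE has a higher score (about 75% versus 73%). Increasing the size of the K-NN training data from 50% to 90% did not improve the predictions: the accuracy dropped slightly (from 72.8% to 72.1%). This difference is small and comes from a single random split with a much smaller test set, so it should be read with caution.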