In [52]:
import pandas as pd 
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.manifold import Isomap
from keras.models import Sequential
from keras.layers import Dense

In [53]:
def load_data():
    dfs = []
    for filename in os.listdir('CSVs/'):
        year_df = pd.read_csv('CSVs/'+filename)
        year_df['winner'] = 1*(year_df['rank'] == 1)
        dfs.append(year_df)
    df = pd.concat(dfs)
    df.index = range(0,df.shape[0])
    return df

def process_data(df):
    
    # defining country mapping
    country_to_id_mapping = {}
    id_to_country_mapping = {}
    
    id_count = 0
    for code in df['code'].unique():
        country_to_id_mapping[code] = id_count
        id_to_country_mapping[id_count] = code
        
        id_count += 1
        
    # new data
    new_ids = []
    for code in df['code']:
        new_ids.append(country_to_id_mapping[code])
    df['country_id'] = new_ids
    
    df['top_5'] = (df['rank'] < 5)*1
    df['top_10'] = (df['rank'] < 10)*1
    df['top_15'] = (df['rank'] < 15)*1
    df['top_25'] = (df['rank'] < 25)*1
    df['top_50'] = (df['rank'] < 50)*1
    df['top_100'] = (df['rank'] < 100)*1
    
    # dropping columns
    df = df.drop(columns = ['problem1','problem2','problem3','problem4','problem5','problem6'])
    
    df['next_year_winner'] = 0
    new_dfs = []
    for year in df['year'].unique():
        if year < 2020:
            tmp1 = df[df['year'] == year + 1].copy()
            next_year_winner = tmp1[tmp1['rank'] == 1]['country_id'].iloc[0]
            new_df = df[df['year'] == year].copy()
            new_df['next_year_winner'] = (new_df['country_id'] == next_year_winner)*1
            
            for i in [5,10,15,25,50,100]:
                top_i = tmp1[tmp1['top_'+str(i)] == 1]
                countries = list(top_i['country_id'].unique())
                new_df['next_year_top_'+str(i)] = new_df['country_id'].isin(countries)*1
                
            
            new_dfs.append(new_df)
        else:
            new_df = df[df['year'] == 2020].copy()
            new_df['next_year_winner'] = 0
            new_dfs.append(new_df)
            
    df = pd.concat(new_dfs)
    
    dfs = []
    for code in df['code'].unique():
        code_df = df[df['code'] == code].sort_values(by = ['year'])
        code_df['total_wins'] = code_df['winner'].cumsum()
        code_df['total_gold'] = code_df['gold_medals'].cumsum()
        code_df['total_silver'] = code_df['silver_medals'].cumsum()
        code_df['total_bronze'] = code_df['bronze_medals'].cumsum()
        code_df['min_rank'] = code_df['rank'].cummin()
        code_df['max_rank'] = code_df['rank'].cummax()
        code_df['average_rank'] = code_df['rank'].cumsum()/np.arange(1,code_df.shape[0]+1)
        for i in [2,3,4]:
            code_df['average_rank_'+str(i)] = code_df['rank'].rolling(window = i).mean()
            code_df['recent_wins_'+str(i)] = code_df['rank'].rolling(window = i).sum()

        dfs.append(code_df)
    df = pd.concat(dfs).fillna(0)

    return df, country_to_id_mapping,id_to_country_mapping

In [54]:
df = load_data()
df, country_to_id_mapping,id_to_country_mapping = process_data(df)

In [55]:
features = ['total', 'rank','winner', 'country_id', 
            'top_5','top_10', 'top_15', 'top_25', 'top_50', 'top_100',
            'total_wins', 'min_rank',
            'max_rank', 'average_rank', 'average_rank_2', 'recent_wins_2',
            'average_rank_3', 'recent_wins_3', 'average_rank_4', 'recent_wins_4']

target = 'next_year_winner'

In [56]:
train, test = df[df['year'] <= 2019],  df[df['year'] == 2020]
X_train, y_train = train[features], train[target].astype(int)
X_test = test[features]

In [34]:
X_train.shape

(942, 20)

In [35]:
model = Sequential()
model.add(Dense(16, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=150, batch_size=10)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


<tensorflow.python.keras.callbacks.History at 0x1ffa3e495c8>

In [37]:
pred_probs = model.predict(X_test)
country_ids = X_test['country_id']
for_df = []
for i,j in zip(pred_probs, country_ids):
    win_prob = i
    country = id_to_country_mapping[j]
    odds = win_prob/(1-win_prob)
    for_df.append({'country':country,'prob':win_prob,'odds':odds})
results = pd.DataFrame(for_df)
results.to_csv('winning_odds.csv')

Unnamed: 0,country,prob,odds
1,USA,[0.35674542],[0.55459446]
0,CHN,[0.2452125],[0.3248762]
3,RUS,[0.14422467],[0.168531]
11,KOR,[0.07283309],[0.07855445]
4,THA,[0.010528684],[0.010640716]
2,SGP,[3.4686323e-07],[3.4686335e-07]
14,UKR,[3.2689397e-07],[3.2689405e-07]
13,POL,[1.3064596e-07],[1.3064597e-07]
10,JPN,[1.567338e-08],[1.567338e-08]
16,UNK,[9.543903e-09],[9.543903e-09]


In [47]:
from sklearn.model_selection import GridSearchCV
parameters = {'activation':["logistic", "relu"],
             'solver' : ['lbfgs','sgd','adam'],
             'alpha' : [0.00001,0.0001,0.001,0.01]}
nn = MLPClassifier(random_state = 42, max_iter = 1000)
clf = GridSearchCV(nn, parameters)
clf.fit(X_train, y_train)
optimal_params = clf.best_params_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

In [49]:
optimal_params

{'activation': 'logistic', 'alpha': 1e-05, 'solver': 'sgd'}

In [59]:
model = MLPClassifier()
model.fit(X_train, y_train)
model.score(X_train, y_train)
pred_probs = model.predict_proba(X_test)
country_ids = X_test['country_id']
for_df = []
for i,j in zip(pred_probs, country_ids):
    win_prob = i[1]
    country = id_to_country_mapping[j]
    odds = win_prob/(1-win_prob)
    for_df.append({'country':country,'prob':win_prob,'odds':odds})
results = pd.DataFrame(for_df)
results.sort_values(by = ['prob'], ascending = False).head(20)

Unnamed: 0,country,prob,odds
0,CHN,0.3225284,0.4760767
1,USA,0.3129136,0.4554211
3,RUS,0.205589,0.2587942
11,KOR,0.04468823,0.04677868
4,THA,0.0244886,0.02510334
14,UKR,8.981567e-05,8.982374e-05
2,SGP,6.418087e-05,6.418498e-05
13,POL,5.638332e-05,5.63865e-05
16,UNK,2.993448e-06,2.993457e-06
22,AUS,3.984886e-07,3.984887e-07


In [60]:
model.score(X_train, y_train)

0.9904458598726115

In [61]:
nn_pipe = Pipeline([('pca',PCA()),('minmax',MinMaxScaler()),('nn', MLPClassifier())])
nn_pipe.fit(X_train, y_train)
pred_probs = nn_pipe.predict_proba(X_test)
country_ids = X_test['country_id']
for_df = []
for i,j in zip(pred_probs, country_ids):
    win_prob = i[1]
    country = id_to_country_mapping[j]
    odds = win_prob/(1-win_prob)
    for_df.append({'country':country,'prob':win_prob,'odds':odds})
results = pd.DataFrame(for_df)
results.sort_values(by = ['prob'], ascending = False).head(20)

Unnamed: 0,country,prob,odds
0,CHN,0.446413,0.806401
1,USA,0.322227,0.475421
11,KOR,0.139072,0.161538
3,RUS,0.114708,0.129571
4,THA,0.009416,0.009505
2,SGP,0.008185,0.008252
14,UKR,0.006699,0.006744
15,CAN,0.006185,0.006223
6,ROU,0.006072,0.006109
18,BRA,0.005784,0.005818


In [62]:
nn_pipe.score(X_train, y_train)

0.9904458598726115

In [46]:
for i in [5,10,15,25,50,100]:
    target = 'next_year_top_'+str(i)
    train, test = df[df['year'] <= 2019],  df[df['year'] == 2020]
    X_train, y_train = train[features], train[target].astype(int)
    X_test = test[features]

    model = MLPClassifier()
    model.fit(X_train, y_train)
    model.score(X_train, y_train)
    
    pred_probs = model.predict_proba(X_test)
    country_ids = X_test['country_id']
    for_df = []
    for i,j in zip(pred_probs, country_ids):
        win_prob = i[1]
        country = id_to_country_mapping[j]
        odds = win_prob/(1-win_prob)
        for_df.append({'country':country,'prob':win_prob,'odds':odds})
    results = pd.DataFrame(for_df)
    results.to_csv('odds_top_'+str(i))