In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (auc, classification_report, roc_auc_score, accuracy_score,
                             f1_score, log_loss, roc_curve, confusion_matrix, precision_score, recall_score)
from sklearn.preprocessing import StandardScaler
from math import sin, cos, sqrt, atan2, radians
import random
import statsmodels.api as sm

In [3]:
mm_data = pd.read_csv('MM_DATA1.csv')

In [4]:
def distance(lat1, lon1, lat2, lon2):

    # approximate radius of earth in km
    R = 6373.0

    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c
    
    return distance

In [5]:
mm_data['dist1'] = mm_data.apply(lambda row: distance(row['host_lat'], row['host_long'], row['team1_lat'], row['team1_long']), axis=1)
mm_data['dist2'] = mm_data.apply(lambda row: distance(row['host_lat'], row['host_long'], row['team2_lat'], row['team2_long']), axis=1)

In [6]:
mm_data['diff_dist'] = mm_data['dist1'] - mm_data['dist2']

In [7]:
mm_data['exp_win1'] = (mm_data['team1_adjoe']**11.5)/ ((mm_data['team1_adjde']**11.5)+(mm_data['team1_adjoe']**11.5))
mm_data['exp_win2'] = (mm_data['team2_adjoe']**11.5)/ ((mm_data['team2_adjde']**11.5)+(mm_data['team2_adjoe']**11.5))

In [8]:
mm_data['team1_log5'] = (mm_data['exp_win1'] - (mm_data['exp_win1']*mm_data['exp_win2']))/ (mm_data['exp_win1']+mm_data['exp_win2']-(2*mm_data['exp_win1']*mm_data['exp_win2']))

In [10]:
mm_data[['team1_adjoe','team1_id','season','team2_id','team1_adjde','exp_win1','exp_win2','team1_log5']].head()

Unnamed: 0,team1_adjoe,team1_id,season,team2_id,team1_adjde,exp_win1,exp_win2,team1_log5
0,111.4954,1104,2002,1194,93.877,0.878465,0.411542,0.91178
1,117.3877,1112,2002,1364,96.9262,0.900485,0.707177,0.789333
2,118.5999,1181,2002,1457,87.7504,0.96966,0.367453,0.982148
3,109.6136,1231,2002,1428,89.9224,0.906968,0.812015,0.692962
4,115.0406,1242,2002,1221,90.5399,0.940149,0.501475,0.939816


In [11]:
mm_data['seed_diff'] = mm_data['team1_seed'] - mm_data['team2_seed']

In [12]:
mm_data[['game_id','exp_win1','exp_win2','team2_score','team1_score','diff_dist','team1_log5','team1_id','team2_id']].head()

Unnamed: 0,game_id,exp_win1,exp_win2,team2_score,team1_score,diff_dist,team1_log5,team1_id,team2_id
0,2002-1104-1194,0.878465,0.411542,78,86,-88.820532,0.91178,1104,1194
1,2002-1112-1364,0.900485,0.707177,81,86,-697.542289,0.789333,1112,1364
2,2002-1181-1457,0.96966,0.367453,37,84,-191.666514,0.982148,1181,1457
3,2002-1231-1428,0.906968,0.812015,56,75,2148.977013,0.692962,1231,1428
4,2002-1242-1221,0.940149,0.501475,59,70,-39.535171,0.939816,1242,1221


In [13]:
mm_data.dropna(subset=['team1_log5'], inplace=True)

In [14]:
mm_data.head()

Unnamed: 0,team1_id,team1_score,team2_id,team2_score,team1_seed,team2_seed,season,host_lat,host_long,team1_lat,...,team2_adjde,game_id,result,dist1,dist2,diff_dist,exp_win1,exp_win2,team1_log5,seed_diff
0,1104,86,1194,78,2,15,2002,35.6017,-77.3725,33.2144,...,99.9263,2002-1104-1194,1,970.079692,1058.900224,-88.820532,0.878465,0.411542,0.91178,-13
1,1112,86,1364,81,3,14,2002,35.1107,-106.61,32.232071,...,97.6704,2002-1112-1364,1,513.690472,1211.232761,-697.542289,0.900485,0.707177,0.789333,-11
2,1181,84,1457,37,1,16,2002,35.6017,-77.3725,36.00159,...,99.9754,2002-1181-1457,1,148.431171,340.097686,-191.666514,0.96966,0.367453,0.982148,-15
3,1231,75,1428,56,5,12,2002,38.5556,-121.4689,39.166383,...,95.6459,2002-1231-1428,1,3008.248664,859.271651,2148.977013,0.906968,0.812015,0.692962,-7
4,1242,70,1221,59,1,16,2002,38.6272,-90.1978,38.957351,...,96.6499,2002-1242-1221,1,439.123404,478.658575,-39.535171,0.940149,0.501475,0.939816,-15


In [15]:
result0 = mm_data.sample(n=488, frac=None, replace=False,  weights=None, random_state=1, axis=None)

In [16]:
result1 = mm_data.drop(labels=result0.axes[0])

In [17]:
result0['result']=0

In [37]:
result0.head()

Unnamed: 0,team1_id,team1_score,team2_id,team2_score,team1_seed,team2_seed,season,host_lat,host_long,team1_lat,...,team2_adjde,game_id,result,dist1,dist2,diff_dist,exp_win1,exp_win2,team1_log5,seed_diff
932,1268,79,1355,74,5,12,2016,47.6589,-117.425,38.988607,...,100.309,2016-1268-1355,0,3372.40685,1632.597996,1739.808854,0.906823,0.74106,0.772759,-7
724,1332,68,1329,55,12,5,2013,42.6875,-83.2342,44.044515,...,90.3122,2013-1332-1329,0,3193.515857,1392.697047,1800.81881,0.818842,0.882985,0.37461,7
525,1437,73,1352,70,2,15,2010,43.05,-87.95,40.039388,...,100.2647,2010-1437-1352,0,1100.175416,700.367644,399.807772,0.903115,0.474357,0.911734,-13
6,1246,83,1434,68,4,13,2002,38.6272,-90.1978,38.028081,...,96.7884,2002-1246-1434,0,500.611426,414.108067,86.503358,0.895478,0.80144,0.679753,-9
248,1228,90,1112,89,1,3,2005,30.25,-97.75,39.730827,...,95.3458,2005-1228-1112,0,1255.694798,1273.74586,-18.051062,0.971417,0.886596,0.812983,-2


In [38]:
result0['diff_dist']=result0['diff_dist']*-1

In [39]:
result0['team1_log5']=1-result0['team1_log5']

In [40]:
result0['seed_diff']=result0['seed_diff']*(-1)

In [42]:
result0['team2_adjde']=result0['team2_adjde']*-1

In [43]:
result0.head()

Unnamed: 0,team1_id,team1_score,team2_id,team2_score,team1_seed,team2_seed,season,host_lat,host_long,team1_lat,...,team2_adjde,game_id,result,dist1,dist2,diff_dist,exp_win1,exp_win2,team1_log5,seed_diff
932,1268,79,1355,74,5,12,2016,47.6589,-117.425,38.988607,...,-100.309,2016-1268-1355,0,3372.40685,1632.597996,-1739.808854,0.906823,0.74106,0.227241,7
724,1332,68,1329,55,12,5,2013,42.6875,-83.2342,44.044515,...,-90.3122,2013-1332-1329,0,3193.515857,1392.697047,-1800.81881,0.818842,0.882985,0.62539,-7
525,1437,73,1352,70,2,15,2010,43.05,-87.95,40.039388,...,-100.2647,2010-1437-1352,0,1100.175416,700.367644,-399.807772,0.903115,0.474357,0.088266,13
6,1246,83,1434,68,4,13,2002,38.6272,-90.1978,38.028081,...,-96.7884,2002-1246-1434,0,500.611426,414.108067,-86.503358,0.895478,0.80144,0.320247,9
248,1228,90,1112,89,1,3,2005,30.25,-97.75,39.730827,...,-95.3458,2005-1228-1112,0,1255.694798,1273.74586,18.051062,0.971417,0.886596,0.187017,2


In [44]:
mm_train = pd.concat([result0, result1])

In [45]:
mm_train.set_index('game_id',inplace =True)

In [46]:
mm_train.head()

Unnamed: 0_level_0,team1_id,team1_score,team2_id,team2_score,team1_seed,team2_seed,season,host_lat,host_long,team1_lat,...,team2_de,team2_adjde,result,dist1,dist2,diff_dist,exp_win1,exp_win2,team1_log5,seed_diff
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-1268-1355,1268,79,1355,74,5,12,2016,47.6589,-117.425,38.988607,...,100.037,-100.309,0,3372.40685,1632.597996,-1739.808854,0.906823,0.74106,0.227241,7
2013-1332-1329,1332,68,1329,55,12,5,2013,42.6875,-83.2342,44.044515,...,91.5887,-90.3122,0,3193.515857,1392.697047,-1800.81881,0.818842,0.882985,0.62539,-7
2010-1437-1352,1437,73,1352,70,2,15,2010,43.05,-87.95,40.039388,...,96.8173,-100.2647,0,1100.175416,700.367644,-399.807772,0.903115,0.474357,0.088266,13
2002-1246-1434,1246,83,1434,68,4,13,2002,38.6272,-90.1978,38.028081,...,94.0668,-96.7884,0,500.611426,414.108067,-86.503358,0.895478,0.80144,0.320247,9
2005-1228-1112,1228,90,1112,89,1,3,2005,30.25,-97.75,39.730827,...,97.7107,-95.3458,0,1255.694798,1273.74586,18.051062,0.971417,0.886596,0.187017,2


In [None]:
##test data

In [47]:
mm_1718 = pd.read_csv('MM_DATA1_1718.csv')

In [48]:
mm_1718['dist1'] = mm_data.apply(lambda row: distance(row['host_lat'], row['host_long'], row['team1_lat'], row['team1_long']), axis=1)
mm_1718['dist2'] = mm_data.apply(lambda row: distance(row['host_lat'], row['host_long'], row['team2_lat'], row['team2_long']), axis=1)

In [49]:
mm_1718['diff_dist'] = mm_1718['dist1'] - mm_1718['dist2']

In [50]:
mm_1718['exp_win1'] = (mm_1718['team1_adjoe']**11.5)/ ((mm_1718['team1_adjde']**11.5)+(mm_1718['team1_adjoe']**11.5))
mm_1718['exp_win2'] = (mm_1718['team2_adjoe']**11.5)/ ((mm_1718['team2_adjde']**11.5)+(mm_1718['team2_adjoe']**11.5))

In [51]:
mm_1718['team1_log5'] = (mm_1718['exp_win1'] - (mm_1718['exp_win1']*mm_1718['exp_win2']))/ (mm_1718['exp_win1']+mm_1718['exp_win2']-(2*mm_1718['exp_win1']*mm_1718['exp_win2']))

In [52]:
mm_1718['seed_diff'] = mm_1718['team1_seed'] - mm_1718['team2_seed']

In [53]:
mm_1718.dropna(subset=['team1_log5'], inplace=True)

In [61]:
test_result0 = mm_1718.sample(n=67, frac=None, replace=False,  weights=None, random_state=1, axis=None)

In [62]:
test_result0['diff_dist']=test_result0['diff_dist']*-1
test_result0['team1_log5']=1-test_result0['team1_log5']
test_result0['seed_diff']=test_result0['seed_diff']*(-1)

In [63]:
test_result1 = mm_1718.drop(labels=test_result0.axes[0])

In [64]:
test_result0['result']=0

In [65]:
mm_test = pd.concat([test_result0, test_result1])

In [66]:
scaler = StandardScaler()
mm_train_scaled = scaler.fit_transform(mm_train[['team1_log5','diff_dist','seed_diff']])
logit = LogisticRegression()
logit.fit(mm_train_scaled, mm_train['result'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [55]:
#mm_test = mm_1718

In [67]:
mm_test['preds'] = logit.predict_proba(scaler.fit_transform(mm_test[['team1_log5','diff_dist','seed_diff']]))[:,1]
mm_test['prediction'] = logit.predict(scaler.fit_transform(mm_test[['team1_log5','diff_dist','seed_diff']]))

In [71]:
#mm_test.set_index('game_id',inplace = True)

In [69]:
mm_test[['preds','prediction']]

Unnamed: 0_level_0,preds,prediction
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-1458-1437,0.704610,1
2017-1112-1388,0.480877,0
2018-1305-1400,0.212954,0
2017-1139-1457,0.108457,0
2017-1314-1332,0.377826,0
2018-1276-1222,0.537011,1
2017-1196-1458,0.346491,0
2017-1452-1137,0.101311,0
2017-1242-1345,0.432047,0
2018-1243-1246,0.596590,1


In [59]:
#mm_test['re']=1

In [70]:
log_loss(mm_test['result'].values, mm_test['preds'].values)

0.5717815106560902