In [14]:
from preprocessing import preprocess_players, preprocess
import os
import sys
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.metrics import recall_score, accuracy_score
import xgboost
import warnings
import numpy as np
warnings.filterwarnings("ignore")

In [8]:
sys.path.append(os.path.join( '..', 'data'))
from build_db import connect
from db_helpers import parse_date

In [6]:
# connect to db
db_name = 'dota-draft-test'
with open(os.path.expanduser('~/.pgpass')) as f:
    for line in f:
        host, port, db, user, password = [x.strip() for x in line.split(':')]
        if db == db_name:
            con, meta = connect(user=user, password=password, db=db, host=host, port=port)
            break

In [15]:
start_date = parse_date('2016-12-12')
query = '''SELECT m.match_id as match_id,
            m.picks_bans as picks_bans,
            m.radiant_win as radiant_win
            FROM matches as m
            JOIN start_times as st
            ON st.match_id = m.match_id
            WHERE st.start_time >={};'''.format(start_date)
df = pd.read_sql(query , con)
df = preprocess(df)

In [16]:
df.head()

Unnamed: 0,match_id,team1_win,t1_1,t1_2,t1_3,t1_4,t1_5,t1_6,t1_7,t1_8,...,t2_106,t2_107,t2_108,t2_109,t2_110,t2_111,t2_112,t2_113,t2_114,team1
0,3054498770,True,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,2840174893,False,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,2840258400,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2840346671,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2840515558,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# split
y = df['team1_win'].values
X = df.drop(['team1_win', 'match_id' ], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Quick Comparison
Let's do a quick look at how each model behaves without tuning.
#### Logistic Regression

In [18]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.53785071466384327

#### kNN

In [19]:
# this takes forever btw
kNN = KNeighborsClassifier()
kNN.fit(X_train, y_train)
kNN.score(X_test, y_test)

KeyboardInterrupt: 

#### Decision Tree

In [12]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

0.50996978851963748

#### Random Forest

In [14]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.52179542511868793

#### Boosting

In [15]:
xgb = xgboost.XGBClassifier()
xgb.fit(X_train, y_train)
xgb.score(X_test, y_test)

0.54967630556754421

# Grid Search

#### Logistic Regression

In [19]:
params = {'penalty': ['l1', 'l2'], 'n_jobs': [-1]}
gs = GridSearchCV(LogisticRegression(), param_grid=params)
best_lr = gs.fit(X_train, y_train)
best_lr.score(X_test, y_test)

0.55338800172637026

#### Decision Tree

In [44]:
params = {'criterion': ['gini', 'entropy'], 'max_depth': [10, 20, 50], 'min_impurity_split':[.6, .7, .8]}
gs = GridSearchCV(DecisionTreeClassifier(), param_grid=params)
best_dt = gs.fit(X_train, y_train)
best_dt.score(X_test, y_test)

0.52982304704359084

In [45]:
best_dt.best_params_

{'criterion': 'entropy', 'max_depth': 20, 'min_impurity_split': 0.7}

#### Random Forest

In [23]:
params = {'n_estimators': [15], 'criterion': ['gini', 'entropy'], 'max_features':['sqrt', 60],
          'n_jobs':[-1], 'max_depth': [None, 10]}
gs = GridSearchCV(RandomForestClassifier(), param_grid=params)
best_rf = gs.fit(X_train, y_train)
best_rf.score(X_test, y_test)

0.54061286145878296

In [24]:
best_rf.best_params_

{'criterion': 'entropy',
 'max_depth': 10,
 'max_features': 'sqrt',
 'n_estimators': 15,
 'n_jobs': -1}

#### Boosting

In [26]:
# warning: takes a long time to run
params = {'reg_alpha': [0, 1], 'reg_lambda': [0, 1], 'n_jobs':[-1], 'max_depth': [2, 3, 4], 
          'booster': ['gbtree', 'gblinear', 'dart']}
gs = GridSearchCV(xgboost.XGBClassifier(), params)
best_xgb = gs.fit(X_train, y_train)
best_xgb.score(X_test, y_test)

0.55390591281829948

In [27]:
best_xgb.best_params_

{'booster': 'gblinear',
 'max_depth': 2,
 'n_jobs': -1,
 'reg_alpha': 1,
 'reg_lambda': 0}

If we decide to build a decision tree to classify based on ordered picks, we will probably have to be careful of overfitting. In support of this conjecture we have the fact that random forests and boosting show a significant improvement. Oddly, logistic regression works very well out of the box.

Unfortunately the accuracy of our models is still marginally better than guessing. The Conley & Perry were able to get ~60-70% accuracy with logistic regression and kNN.

One confounding factor is that our data is spread out over a longer time period. Patches changing the game over time could decrease the signal.

# Patch 7.0
Let's see if we can get more signal by staying on the most recent patch. We'll lose a lot of our data though.

In [4]:
df_7 = preprocess_matches_after('dota-draft-test', '2017-08-20')

In [5]:
df_7['match_id'].min()

3389647091

In [52]:
df_7.head()

Unnamed: 0,match_id,start_time,picks_bans,radiant_win,duration,radiant_1,radiant_2,radiant_3,radiant_4,radiant_5,...,dire_105,dire_106,dire_107,dire_108,dire_109,dire_110,dire_111,dire_112,dire_113,dire_114
0,3054498770,1489484471,"[{u'is_pick': False, u'hero_id': 80, u'order':...",False,4223,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,2840174893,1481526623,"[{u'is_pick': False, u'hero_id': 74, u'order':...",False,2234,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2840258400,1481529758,"[{u'is_pick': False, u'hero_id': 16, u'order':...",True,1762,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2840346671,1481532894,"[{u'is_pick': False, u'hero_id': 73, u'order':...",True,1261,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2840515558,1481538627,"[{u'is_pick': False, u'hero_id': 28, u'order':...",False,1929,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
y = df_7['radiant_win']
X = df_7.drop(['match_id', 'start_time', 'picks_bans', 'radiant_win'], axis=1)

#### Logistic Regression

In [66]:
bag = BaggingClassifier(base_estimator=LogisticRegression(), 
                        n_estimators=100,
                        max_samples=1.0,
                        bootstrap=True,
                        n_jobs=-1)
bag.fit(X, y)
recalls = []
accuracies = []
for estimator, samples in zip(bag.estimators_, bag.estimators_samples_):
    # compute predictions on out-of-bag samples
    mask = ~samples
    y_pred = estimator.predict(X[mask])
    # compute some statistic
    recalls.append(recall_score(y[mask], y_pred))
    accuracies.append(accuracy_score(y[mask], y_pred))
# Do something with stats, e.g. find confidence interval
print(np.percentile(recalls, [2.5, 97.5]))
print(np.percentile(accuracies, [2.5, 97.5]))

[ 0.50158982  0.58129778]
[ 0.52255362  0.55089008]


In [55]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.52038115404976182

We did marginally better. Could their better accuracy score bet due to sampling from both dire and radiant when making predictions? My hypothesis was that this would hurt their accuracy, though in our case with less data it may be beneficial.