In [1]:
from __future__ import division, print_function
import os
import sys
import numpy as np
import pandas as pd
import src.preprocessing as preprocessing
from src.preprocessing import preprocess, preprocess_players, polynomial_features
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, cross_val_score
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV, Lasso
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import shuffle
import warnings
from sklearn.preprocessing import PolynomialFeatures
from src.build_db import connect
from src.db_helpers import parse_date
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.decomposition import PCA, NMF, TruncatedSVD
from scipy.sparse import coo_matrix, csc_matrix, hstack
from collections import defaultdict
warnings.filterwarnings("ignore")

In [2]:
# connect to db
db_name = 'dota_db'
with open(os.path.expanduser('~/.pgpass')) as f:
    for line in f:
        host, port, db, user, password = [x.strip() for x in line.split(':')]
        if db == db_name:
            dota_con, meta = connect(user=user, password=password, db=db, host=host, port=port)
            break


In [3]:
query = '''
SELECT DISTINCT ON(match_id) match_id, players, radiant_win FROM match_history WHERE duration >= 900;
'''
df = pd.read_sql(query, dota_con)
df = preprocess_players(df)

In [4]:
y = df['radiant_win'].values
X = df.drop(['radiant_win', 'match_id'], axis=1).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=5000)
X_poly_train = polynomial_features(X_train)
X_poly_test = polynomial_features(X_test)
X_poly = polynomial_features(X)

# Synergy and Countering

In [5]:
def synergy_matrix(X, y):
    winrates = np.ones((114,114)) * .5
    for i in xrange(0, 113):
        for j in xrange(i + 1, 114):
            radiant_matches = (X[:, i] == 1) & (X[:, j + 114] == 1)
            dire_matches = (X[:, j] == 1) & (X[:, i + 114] == 1)
            try:
                wr = (y[radiant_matches].sum() +\
                    (y[dire_matches].shape[0] - y[dire_matches].sum()) )\
                    / (radiant_matches.sum() + dire_matches.sum())
            except ZeroDivisionError:
                wr = .5
            if np.isnan([wr]):
                wr = .5
            winrates[i, j] = wr
            winrates[j, i] = wr
    return winrates

def counter_matrix(X, y):
    winrates = np.ones((114, 114)) * .5
    for i in xrange(0, 113):
        for j in xrange(i + 1, 114):
            radiant_matches = (X[:, i] == 1) & (X[:, j + 114] == 1)
            dire_matches = (X[:, j] == 1) & (X[:, i + 114] == 1)
            try:
                wr = (y[radiant_matches].sum() +\
                     (y[dire_matches].shape[0] - y[dire_matches].sum()) )\
                     / (radiant_matches.sum() + dire_matches.sum())
            except ZeroDivisionError:
                wr = .5
            if np.isnan([wr]):
                wr = .5
            winrates[i, j] = wr - .5
            winrates[j, i] = .5 - wr
    return winrates

def calculate_synergy(X_i, sm):
    team1 = 0
    team2 = 0
    team1_heroes = np.argwhere( X_i[:114])
    team2_heroes = np.argwhere( X_i[114:228] )
    for i in xrange(4):
        for j in xrange(i, 5):
            team1 += sm[team1_heroes[i], team1_heroes[j]]
            team2 += sm[team2_heroes[i], team2_heroes[j]]
    return (team1 - team2)[0]
    
def calculate_countering(X_i, cm):
    team1_heroes = np.argwhere( X_i[:114])
    team2_heroes = np.argwhere( X_i[114:228] )
    countering = 0
    for i in xrange(5):
        for j in xrange(5):
            try:
                countering += cm[team1_heroes[i], team2_heroes[j]]
            except IndexError:
                import pdb; pdb.set_trace()
    return countering[0]

# Learning Rate

In [6]:
def my_cross_val(estimators, X, y, train_size=None):
    scores = [[] for est in estimators ]
    for i in xrange(5):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=5000)
        if train_size:
            while True:
                X_train, y_train = shuffle(X_train, y_train)
                X_train = X_train[:train_size, :]
                y_train = y_train[:train_size]
                if 1 in y_train and 0 in y_train:
                    break
        # add countering and synergy
        cm = counter_matrix(X_train, y_train)
        sm = synergy_matrix(X_train, y_train)
        X_train_countering = np.apply_along_axis(lambda x: calculate_countering(x, cm), axis=1, arr=X_train).reshape(-1, 1)
        X_train_synergy = np.apply_along_axis(lambda x: calculate_synergy(x, cm), axis=1, arr=X_train).reshape(-1,1)
        X_test_countering = np.apply_along_axis(lambda x: calculate_countering(x, cm), axis=1, arr=X_test).reshape(-1, 1)
        X_test_synergy = np.apply_along_axis(lambda x: calculate_synergy(x, cm), axis=1, arr=X_test).reshape(-1,1)                
        X_train_sc = np.concatenate([X_train, X_train_synergy, X_train_countering], axis=1)
        X_test_sc = np.concatenate([X_test, X_test_synergy, X_test_countering], axis=1)
        for est in estimators:
            est.fit(X_train_sc, y_train)

        for i, est in enumerate(estimators):
            scores[i].append(est.score(X_test_sc, y_test))
    return scores
        

In [7]:
cm = counter_matrix(X_train, y_train)
sm = synergy_matrix(X_train, y_train)
X_train_countering = np.apply_along_axis(lambda x: calculate_countering(x, cm), axis=1, arr=X_train).reshape(-1, 1)
X_train_synergy = np.apply_along_axis(lambda x: calculate_synergy(x, cm), axis=1, arr=X_train).reshape(-1,1)
X_test_countering = np.apply_along_axis(lambda x: calculate_countering(x, cm), axis=1, arr=X_test).reshape(-1, 1)
X_test_synergy = np.apply_along_axis(lambda x: calculate_synergy(x, cm), axis=1, arr=X_test).reshape(-1,1)                
X_train_sc = np.concatenate([X_train, X_train_synergy, X_train_countering], axis=1)
X_test_sc = np.concatenate([X_test, X_test_synergy, X_test_countering], axis=1)

In [23]:
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats
params = {'n_estimators': stats.geom(p=.005), 'max_leaf_nodes': stats.geom(p=.001),
         'max_features': ['log2', 'auto']}
rs = RandomizedSearchCV(RandomForestClassifier(), n_iter=50, n_jobs=-1, param_distributions=params)
best_model = rs.fit(X_train_sc, y_train)
best_model.score(X_test_sc, y_test)

0.63660000000000005

In [26]:
params = {'max_depth': stats.geom(p=.3), 'reg_alpha' : stats.expon(scale=10)}
rs = RandomizedSearchCV(XGBClassifier(), n_iter=50, n_jobs=-1, param_distributions=params)
best_xgb = rs.fit(X_train_sc, y_train)
best_model.score(X_test_sc, y_test)

0.63660000000000005

In [27]:
y_test.sum() / y_test.shape[0]

0.56459999999999999

In [28]:
estimators = [LogisticRegression(C=.1), RidgeClassifier(alpha=1000),
              RandomForestClassifier(**best_model.best_params_),
              XGBClassifier(**best_xgb.best_params_)]
scores = my_cross_val(estimators, X, y)
mean_scores = map(np.mean, scores)
lr_score, ridge_score, rf_score, xgb_score = mean_scores

In [29]:
print(lr_score, ridge_score, rf_score, xgb_score)

0.61704 0.62816 0.62772 0.62728


In [24]:
best_model.best_params_

{'max_features': 'auto', 'max_leaf_nodes': 745, 'n_estimators': 456}

In [25]:
best_model.score(X_train_sc, y_train)

0.71330927475012884

In [11]:
cross_val_score(RandomForestClassifier(**best_model.best_params_), X, y, cv=5).mean()

0.59781063720160721

In [None]:
training_sizes = []
test_scores = defaultdict(list)
training_size = 4
models = {'random_forest': rf, 'logreg': logreg, 'ridge': ridge, 'xgb': xgb}
while training_size <= X_train.shape[0]:
    training_sizes.append(training_size)
    for model_name, model in models.iteritems():
        score = model(training_size)
        test_scores[model_name].append(score)
    training_size = 2 * training_size

In [None]:
training_sizes.append(X_train.shape[0])
for model_name, model in models.iteritems():
    score = model(None)
    test_scores[model_name].append(score)

In [None]:
fig, ax = plt.subplots()
for model_name, scores in test_scores.iteritems():
    ax.plot(training_sizes, scores, label=model_name)
ax.legend()
plt.show()