In [1]:
from __future__ import division, print_function
import os
import sys
import numpy as np
import pandas as pd
import src.preprocessing as preprocessing
from src.preprocessing import preprocess, preprocess_players
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, cross_val_score
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV, Lasso
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import warnings
from sklearn.preprocessing import PolynomialFeatures
from src.build_db import connect
from src.db_helpers import parse_date
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.decomposition import PCA, NMF, TruncatedSVD
from scipy.sparse import coo_matrix, csc_matrix, hstack
warnings.filterwarnings("ignore")

In [2]:
# connect to db
db_name = 'dota_db'
with open(os.path.expanduser('~/.pgpass')) as f:
    for line in f:
        host, port, db, user, password = [x.strip() for x in line.split(':')]
        if db == db_name:
            dota_con, meta = connect(user=user, password=password, db=db, host=host, port=port)
            break

In [3]:
query = '''
SELECT DISTINCT ON(match_id) match_id, players, radiant_win FROM match_history WHERE duration >= 900;
'''
high_skill_df = pd.read_sql(query, dota_con)
high_skill_df = preprocess_players(high_skill_df)

In [75]:
y = high_skill_df['radiant_win'].values
X = high_skill_df.drop(['radiant_win', 'match_id'], axis=1).values

In [76]:
def polynomial_features(X):
    X_sparse = csc_matrix(X)
    sparse_product = []
    for i in xrange(X.shape[1] -1 ):
        for j in xrange(i, X.shape[1]):
            sparse_product.append(X_sparse[:, i].multiply(X_sparse[:, j]))
    X_sparse_poly = hstack(sparse_product)
    return X_sparse_poly

In [77]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=5000)

In [78]:
X_poly_train = polynomial_features(X_train)
X_poly_test = polynomial_features(X_test)

# Gradient Boosting

In [79]:
gb = XGBClassifier(n_estimators=1000, n_jobs=-1, max_depth=10)
gb.fit(X_train, y_train)
gb.score(X_test, y_test)

0.60560000000000003

In [80]:
gb.score(X_train, y_train)

0.97591924298614696

###### Polynomial features

In [81]:
gb = XGBClassifier(n_estimators=1000, n_jobs=-1)
gb.fit(X_poly_train, y_train)
gb.score(X_poly_test, y_test)

0.61899999999999999

In [82]:
gb.score(X_poly_train, y_train)

0.69102459629808255

# Random Forests

In [83]:
rf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.624

In [84]:
rf.score(X_train, y_train)

0.99998336964294621

###### Polynomial features

In [85]:
rf = RandomForestClassifier(n_estimators=500, n_jobs=-1, )
rf.fit(X_poly_train, y_train)
rf.score(X_poly_test, y_test)

0.62860000000000005

In [86]:
rf.score(X_poly_train, y_train)

0.99998336964294621

# Ridge Regression

In [87]:
ridge = RidgeClassifier(alpha=1000, fit_intercept=False)
ridge.fit(X_train, y_train)
ridge.score(X_test, y_test)

0.627

In [88]:
ridge.score(X_train, y_train)

0.62621609485955665

###### Polynomial Features

In [89]:
ridge = RidgeClassifier(alpha=1000)
ridge.fit(X_poly_train, y_train)
ridge.score(X_poly_test, y_test)

0.63539999999999996

In [90]:
ridge.score(X_poly_train, y_train)

0.67930019457517754

In [91]:
ridge.intercept_

array([ 0.1408771])

# Flipping Teams

In [92]:
high_skill_df.head()

Unnamed: 0,match_id,radiant_win,radiant_1,radiant_2,radiant_3,radiant_4,radiant_5,radiant_6,radiant_7,radiant_8,...,dire_105,dire_106,dire_107,dire_108,dire_109,dire_110,dire_111,dire_112,dire_113,dire_114
0,3470130536,False,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
1,3470130574,True,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2,3470130667,True,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,3470130732,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,3470130756,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
X_dire = np.concatenate([X_train[:, 114:], X_train[:, :114], np.ones((X_train.shape[0],1))], axis=1)
X_radiant = np.concatenate([X_train, np.zeros((X_train.shape[0], 1))], axis=1)
y_dire = ~y_train

In [94]:
X_train_sym = np.concatenate([X_radiant, X_dire], axis=0)
y_train_sym = np.concatenate([y_train, y_dire], axis=0)
X_test_sym = np.concatenate([X_test, np.zeros((X_test.shape[0], 1))], axis=1)

In [95]:
lr = RidgeClassifier(alpha=1000)
lr.fit(X_train_sym, y_train_sym)
lr.score(X_test_sym, y_test)

0.625