In [4]:
from __future__ import division, print_function
import os
import sys
import numpy as np
import pandas as pd
import src.preprocessing as preprocessing
from src.preprocessing import preprocess
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
from sklearn.preprocessing import PolynomialFeatures
from src.build_db import connect
# Suppress every warning for the rest of the session (no category or module filter is given).
warnings.filterwarnings("ignore")

In [3]:
# Connect to the database using the credentials stored in ~/.pgpass.
# Each usable line is expected to look like host:port:database:user:password.
db_name = 'dota_db'
with open(os.path.expanduser('~/.pgpass')) as f:
    for line in f:
        line = line.strip()
        # Blank lines and '#' comments carry no credentials; skip them
        # instead of failing on the five-way unpack below.
        if not line or line.startswith('#'):
            continue
        fields = [x.strip() for x in line.split(':')]
        if len(fields) != 5:
            # NOTE(review): entries whose fields contain escaped colons ('\:')
            # are not parsed here and are skipped.
            continue
        host, port, db, user, password = fields
        if db == db_name:
            con, meta = connect(user=user, password=password, db=db, host=host, port=port)
            break
    else:
        # Fail here, with a clear message, rather than with a NameError on
        # `con` in a later cell.
        raise ValueError("no entry for database '{}' found in ~/.pgpass".format(db_name))

In [6]:
# Load one row per match. ORDER BY keeps the row order stable between runs;
# DISTINCT ON by itself does not promise any particular order.
# NOTE(review): which of several rows sharing a match_id is kept is still
# unspecified -- add a tie-breaker column to ORDER BY if that matters.
df = pd.read_sql(
    'SELECT DISTINCT ON (match_id) match_id, players, radiant_win '
    'FROM match_history ORDER BY match_id;',
    con,
)

In [7]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,match_id,players,radiant_win
0,3470130536,"[{u'backpack_2': 0, u'gold_spent': 6360, u'gol...",False
1,3470130574,"[{u'backpack_2': 0, u'gold_spent': 12830, u'go...",True
2,3470130667,"[{u'backpack_2': 0, u'gold_spent': 12940, u'go...",True
3,3470130732,"[{u'backpack_2': 0, u'gold_spent': 13190, u'go...",False
4,3470130756,"[{u'backpack_2': 36, u'gold_spent': 11590, u'g...",True


In [8]:
# Keep the `players` value of the first row as a sample to explore.
example = df['players'].iloc[0]

In [11]:
# Show the hero id and player slot recorded for the first entry of the sample.
tuple(example[0][key] for key in ('hero_id', 'player_slot'))

(12, 0)

In [20]:
# Run the project's player parser on the sample.
# NOTE(review): the output shows a pair of five-element id lists, presumably
# the hero ids of each team -- confirm against src.preprocessing.
preprocessing.parse_players(example)

([12, 103, 37, 7, 40], [27, 83, 4, 109, 71])

In [22]:
# Build the modelling frame from the raw matches with the project's helper.
# NOTE(review): presumably one 0/1 indicator column per team and hero id
# (radiant_*, dire_*), judging by the preview further down -- confirm in src.preprocessing.
pr_df = preprocessing.preprocess_players(df)

In [24]:
# Check the dimensions of the preprocessed frame.
pr_df.shape

(12797, 230)

In [27]:
# Preview the first rows of the preprocessed frame.
pr_df.head()

Unnamed: 0,match_id,radiant_win,radiant_0,radiant_1,radiant_2,radiant_3,radiant_4,radiant_5,radiant_6,radiant_7,...,dire_105,dire_106,dire_107,dire_108,dire_109,dire_110,dire_111,dire_112,dire_113,dire_114
0,3470130536,False,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,3470130574,True,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,3470130667,True,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,3470130732,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,3470130756,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Target is the match outcome; features are every remaining column except
# the match identifier.
y = pr_df['radiant_win'].values
X = pr_df.drop(['radiant_win', 'match_id'], axis=1)
# Fix the seed so the split, and every score computed from it, is the same
# on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Logistic Regression

In [30]:
# Fit a logistic regression with default settings, then report its score on
# the held-out split.
lr = LogisticRegression().fit(X_train, y_train)
lr.score(X_test, y_test)

0.64124999999999999

# Random Forest

In [33]:
# Seed the forest: its bootstrap and feature sampling are random, so without
# a seed the reported score changes on every run, even for the same split.
rf = RandomForestClassifier(n_estimators=400, random_state=42)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.63468749999999996

# XGBoost

In [32]:
# Fit a 200-round gradient-boosted classifier, then report its score on the
# held-out split.
xgb = XGBClassifier(n_estimators=200).fit(X_train, y_train)
xgb.score(X_test, y_test)

0.64000000000000001

# Ridge Classifier (L2-Regularized)

In [38]:
# Fit a ridge classifier with penalty strength alpha=2, then report its score
# on the held-out split.
rlr = RidgeClassifier(alpha=2).fit(X_train, y_train)
rlr.score(X_test, y_test)

0.64312499999999995

In [39]:
# Dimensions of the training features.
X_train.shape

(9597, 228)

In [40]:
# Dimensions of the held-out features.
X_test.shape

(3200, 228)