In [2]:
from __future__ import division, print_function
import os
import src.preprocessing as preprocessing
import sys
import numpy as np
import pandas as pd
from src.preprocessing import preprocess
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
from sklearn.preprocessing import PolynomialFeatures
warnings.filterwarnings("ignore")

In [2]:
sys.path.append(os.path.join( '..', 'data'))
from build_db import connect

In [3]:
# connect to db
db_name = 'dota_db'
with open(os.path.expanduser('~/.pgpass')) as f:
    for line in f:
        host, port, db, user, password = [x.strip() for x in line.split(':')]
        if db == db_name:
            con, meta = connect(user=user, password=password, db=db, host=host, port=port)
            break

In [4]:
df = pd.read_sql('SELECT * FROM matches WHERE duration >= 600 LIMIT 20000;', con)

In [5]:
processed_df = preprocess(df)

In [6]:
y = processed_df['team1_win'].values
X = processed_df.drop(['match_id', 'team1_win'], axis=1).values
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Logistic Regression

In [7]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.58599999999999997

In [8]:
params = {'C': [1, .1, .05], 'penalty': ['l1', 'l2'], 'n_jobs': [-1]}
gs = GridSearchCV(LogisticRegression(), params)
lr_best = gs.fit(X_train, y_train)
lr_best.score(X_test, y_test)

0.58699999999999997

In [74]:
lr_best.best_params_

{'C': 0.05, 'n_jobs': -1, 'penalty': 'l2'}

# XGBoost

In [10]:
xgb = XGBClassifier(n_estimators=300)
xgb.fit(X_train, y_train)
xgb.score(X_test, y_test)

0.58320000000000005

# Random Forest

In [9]:
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.5716

In [42]:
feature_args = np.argsort(rf.feature_importances_)[::-1]
# desc
features_by_importance = processed_df.drop(['match_id', 'team1_win'], axis=1).columns[feature_args]

In [75]:
features_by_importance[:30]

Index([u'team1', u't1_7', u't2_84', u't1_87', u't2_36', u't2_5', u't1_5',
       u't2_7', u't2_18', u't1_74', u't1_27', u't2_87', u't2_75', u't2_41',
       u't2_4', u't1_84', u't2_29', u't2_8', u't1_31', u't2_99', u't1_18',
       u't2_27', u't1_47', u't1_41', u't2_95', u't2_2', u't2_31', u't2_16',
       u't1_2', u't2_47'],
      dtype='object')

In [76]:
hero_ids = [int(x.split('_')[1]) for x in features_by_importance[1:]]

In [46]:
db_name = 'dota-draft-test'
with open(os.path.expanduser('~/.pgpass')) as f:
    for line in f:
        host, port, db, user, password = [x.strip() for x in line.split(':')]
        if db == db_name:
            con2, meta = connect(user=user, password=password, db=db, host=host, port=port)
            break
hero_df = pd.read_sql('SELECT * FROM heroes;', con2)

In [58]:
hero_df = hero_df.set_index('id')

In [77]:
for hero_id in hero_ids:
    print(hero_df.loc[hero_id, 'name'])

npc_dota_hero_earthshaker
npc_dota_hero_ogre_magi
npc_dota_hero_disruptor
npc_dota_hero_necrolyte
npc_dota_hero_crystal_maiden
npc_dota_hero_crystal_maiden
npc_dota_hero_earthshaker
npc_dota_hero_sven
npc_dota_hero_invoker
npc_dota_hero_shadow_shaman
npc_dota_hero_disruptor
npc_dota_hero_silencer
npc_dota_hero_faceless_void
npc_dota_hero_bloodseeker
npc_dota_hero_ogre_magi
npc_dota_hero_tidehunter
npc_dota_hero_juggernaut
npc_dota_hero_lich
npc_dota_hero_bristleback
npc_dota_hero_sven
npc_dota_hero_shadow_shaman
npc_dota_hero_viper
npc_dota_hero_faceless_void
npc_dota_hero_troll_warlord
npc_dota_hero_axe
npc_dota_hero_lich
npc_dota_hero_sand_king
npc_dota_hero_axe
npc_dota_hero_viper
npc_dota_hero_slardar
npc_dota_hero_sniper
npc_dota_hero_sniper
npc_dota_hero_spirit_breaker
npc_dota_hero_silencer
npc_dota_hero_night_stalker
npc_dota_hero_kunkka
npc_dota_hero_tidehunter
npc_dota_hero_ancient_apparition
npc_dota_hero_omniknight
npc_dota_hero_invoker
npc_dota_hero_troll_warlord
npc_dota_

# Regularized Logistic Regression

In [11]:
rlr = RidgeClassifier(alpha=.1)
rlr.fit(X_train, y_train)
rlr.score(X_test, y_test)

0.58679999999999999

# Regularized Logistic Regression with Polynomial Features

In [None]:
pf = PolynomialFeatures(interaction_only=True)
# X_poly = pf.fit_transform(X)
# X_poly_train, X_poly_test, y_train, y_test = train_test_split(X_poly, y)

In [12]:
df['picks_bans'].iloc[0]

[{u'hero_id': 58, u'is_pick': False, u'order': 0, u'team': 0},
 {u'hero_id': 60, u'is_pick': False, u'order': 1, u'team': 1},
 {u'hero_id': 88, u'is_pick': False, u'order': 2, u'team': 0},
 {u'hero_id': 7, u'is_pick': False, u'order': 3, u'team': 1},
 {u'hero_id': 16, u'is_pick': True, u'order': 4, u'team': 0},
 {u'hero_id': 31, u'is_pick': True, u'order': 5, u'team': 1},
 {u'hero_id': 13, u'is_pick': True, u'order': 6, u'team': 1},
 {u'hero_id': 90, u'is_pick': True, u'order': 7, u'team': 0},
 {u'hero_id': 99, u'is_pick': False, u'order': 8, u'team': 1},
 {u'hero_id': 36, u'is_pick': False, u'order': 9, u'team': 0},
 {u'hero_id': 65, u'is_pick': False, u'order': 10, u'team': 1},
 {u'hero_id': 114, u'is_pick': False, u'order': 11, u'team': 0},
 {u'hero_id': 18, u'is_pick': True, u'order': 12, u'team': 1},
 {u'hero_id': 40, u'is_pick': True, u'order': 13, u'team': 0},
 {u'hero_id': 55, u'is_pick': True, u'order': 14, u'team': 1},
 {u'hero_id': 49, u'is_pick': True, u'order': 15, u'team'

In [14]:
x = processed_df.iloc[0, 3:]
x[x == 1]

t1_16    1
t1_40    1
t1_49    1
t1_71    1
t1_90    1
t2_13    1
t2_18    1
t2_20    1
t2_31    1
t2_55    1
Name: 0, dtype: object