In [1]:
import json
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

In [2]:
# Read the OpenDota API key from the environment so that no credential is stored
# in the notebook. An empty string means "no key configured".
# NOTE(review): the key that used to be hard-coded here has been shared along with
# the notebook and should be revoked and replaced.
api_key = os.environ.get("OPENDOTA_API_KEY", "")

In [3]:
# download basic hero stats that we can use for embedding

_hero_embedding_query = "https://api.opendota.com/api/heroStats"

# Send the key as a query parameter only when one is configured. The previous URL
# ended in a dangling "api_key=", so the key defined above was never transmitted.
_query_params = {"api_key": api_key} if api_key else None

# A timeout stops a stalled connection from hanging the kernel indefinitely.
resp = requests.get(_hero_embedding_query, params=_query_params, timeout=30)

# Raise a descriptive HTTPError on 4xx/5xx rather than a bare AssertionError
# (assert statements are also removed when Python runs with -O).
resp.raise_for_status()

# One row per hero, indexed by the hero "id" field of the response.
stats = pd.DataFrame(resp.json()).set_index("id")

In [4]:
# list every column available in the downloaded hero stats table
stats.columns

Index(['name', 'localized_name', 'primary_attr', 'attack_type', 'roles', 'img',
       'icon', 'base_health', 'base_health_regen', 'base_mana',
       'base_mana_regen', 'base_armor', 'base_mr', 'base_attack_min',
       'base_attack_max', 'base_str', 'base_agi', 'base_int', 'str_gain',
       'agi_gain', 'int_gain', 'attack_range', 'projectile_speed',
       'attack_rate', 'move_speed', 'turn_rate', 'cm_enabled', 'legs',
       'hero_id', 'turbo_picks', 'turbo_wins', 'pro_ban', 'pro_win',
       'pro_pick', '1_pick', '1_win', '2_pick', '2_win', '3_pick', '3_win',
       '4_pick', '4_win', '5_pick', '5_win', '6_pick', '6_win', '7_pick',
       '7_win', '8_pick', '8_win', 'null_pick', 'null_win'],
      dtype='object')

In [5]:
# One-hot encode the roles of each hero: every hero carries a list of role names,
# which is flattened to a "|"-delimited string and then expanded into one
# indicator column per distinct role.
_roles_delimited = stats["roles"].str.join("|")
roles_encoding = _roles_delimited.str.get_dummies(sep="|")
roles_encoding

Unnamed: 0_level_0,Carry,Disabler,Durable,Escape,Initiator,Jungler,Nuker,Pusher,Support
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1,0,0,1,0,0,1,0,0
2,1,1,1,0,1,1,0,0,0
3,0,1,1,0,0,0,1,0,1
4,1,1,0,0,1,1,1,0,0
5,0,1,0,0,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...
128,0,1,0,1,0,0,1,0,1
129,1,1,1,0,1,0,0,0,0
135,1,0,1,0,0,0,0,0,0
136,1,1,0,1,1,0,0,0,1


In [6]:
# basic embedding: roles + attack_type + primary_attribute

# Force integer indicator columns. Newer pandas releases return bool columns from
# get_dummies; combined with the integer role columns that would turn `.values`
# into an object array, which breaks the later matrix product and model fitting.
_categorical_encoding = pd.get_dummies(stats[["attack_type", "primary_attr"]], dtype=np.int64)

# NOTE(review): reset_index(drop=True) discards the hero `id` index, so rows are
# addressed by position from here on. This only lines up with the draft dataset if
# that dataset uses the same 0-based hero ordering — confirm.
embedding = pd.concat([_categorical_encoding, roles_encoding], axis=1).reset_index(drop=True)
embedding  # we have our embedding

Unnamed: 0,attack_type_Melee,attack_type_Ranged,primary_attr_agi,primary_attr_int,primary_attr_str,Carry,Disabler,Durable,Escape,Initiator,Jungler,Nuker,Pusher,Support
0,1,0,1,0,0,1,0,0,1,0,0,1,0,0
1,1,0,0,0,1,1,1,1,0,1,1,0,0,0
2,0,1,0,1,0,0,1,1,0,0,0,1,0,1
3,1,0,1,0,0,1,1,0,0,1,1,1,0,0
4,0,1,0,1,0,0,1,0,0,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,0,1,0,0,1,0,1,0,1,0,0,1,0,1
119,1,0,0,0,1,1,1,1,0,1,0,0,0,0
120,1,0,0,0,1,1,0,1,0,0,0,0,0,0
121,1,0,0,0,1,1,1,0,1,1,0,0,0,1


In [7]:
# load in dataset of drafts match outcomes
# NOTE(review): unpickling can execute arbitrary code, so "dota.pickle" must come
# from a trusted source.

dataset = pd.read_pickle("dota.pickle")
# The pickle is unpacked as exactly seven items; the 4th and 5th are discarded.
radiant_win, radiant_draft, dire_draft, _, _, num_matches, num_heroes = dataset

In [8]:
# sorted unique hero indices that appear in any draft
heroes_list = np.unique(np.concatenate([radiant_draft, dire_draft], axis=0).flatten())

# Convert the embedding table to a plain array with one row per hero.
# The type check keeps this cell re-runnable: after a first run `embedding` is
# already an ndarray, which has no `.loc`, so the lookup must not be repeated.
if isinstance(embedding, pd.DataFrame):
    embedding = embedding.loc[np.arange(num_heroes)].values
assert embedding.shape[0] == num_heroes

# build the draft vector described in paper, a (num_heroes x 1) vector with +1 in the ith entry if the ith hero is picked by radiant, -1 if picked by the dire, 0 otherwise.
# Rows are written directly into a preallocated int64 matrix instead of collecting
# float64 vectors in a list and converting afterwards, which avoids holding
# several copies of the full dataset in memory at once.
drafts = np.zeros((num_matches, num_heroes), dtype=np.int64)

for idx in range(num_matches):
    match_row = drafts[idx]  # view into `drafts`, so the writes below land in the matrix
    match_row[radiant_draft[idx]] = 1
    match_row[dire_draft[idx]] = -1

# our dataset of drafts
assert drafts.shape == (num_matches, num_heroes)

# embedding dataset for each match: signed sum of the hero feature rows
# (radiant heroes add their features, dire heroes subtract theirs)
K = drafts @ embedding

# combined dataset, both embedding + hero indication
X = np.concatenate((K, drafts), axis=1)
assert X.shape == (num_matches, embedding.shape[1] + num_heroes)

# copy so later changes to the labels cannot touch the loaded dataset
y = radiant_win.copy()

In [9]:
# create train, val and test split
from sklearn.model_selection import train_test_split

# Fixed seed so the split, and every score computed on it, is identical after a
# kernel restart. Without it each run shuffles the matches differently.
SPLIT_SEED = 123

# 80/20 into (train + val)/test, then 75/25 into train/val: 60/20/20 overall.
idx_train, idx_test = train_test_split(
    np.arange(num_matches), test_size=0.2, train_size=0.8, shuffle=True, random_state=SPLIT_SEED
)
idx_train, idx_val = train_test_split(
    idx_train, train_size=0.75, test_size=0.25, shuffle=True, random_state=SPLIT_SEED
)

# keep the split points for X and K the same
X_train, X_val, X_test = X[idx_train], X[idx_val], X[idx_test]
K_train, K_val, K_test = K[idx_train], K[idx_val], K[idx_test]
y_train, y_val, y_test = y[idx_train], y[idx_val], y[idx_test]


In [10]:
# Boosted decision tree model (XGBoost) tuned with an exhaustive grid search.
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier


# Hyper-parameter values tried by the search:
# 2 * 2 * 3 * 3 * 4 * 4 = 576 combinations in total.
search_params = {
    'max_depth': [3, 5],
    'min_child_weight': [1, 3],
    'gamma': [1e-2, 1e-1, 5e-1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
    'reg_alpha': [1e-5, 1e-2, 0.1, 5],
}

# Settings shared by every candidate model. "max_depth" also appears in the grid
# above, so the value given here is replaced for each candidate.
static_model_params = {
    "learning_rate": 0.1,
    "n_estimators": 15,
    "max_depth": 4,
    # "objective": "binary:logistic",
    "eval_metric": "logloss",
    "use_label_encoder": False,
    "seed": 123,
}

# Cross-validation settings: a single worker process and 5 folds.
n_jobs = 1
cv = 5

_base_model = XGBClassifier(**static_model_params)
boosted_gsearch_pca = GridSearchCV(
    estimator=_base_model,
    param_grid=search_params,
    n_jobs=n_jobs,
    cv=cv,
)


  from pandas import MultiIndex, Int64Index


In [14]:
# fit on a smaller subset of the training data because of RAM limitations

In [15]:
# Tuning subset: the first 100,000 rows of the training split and their labels.
_n_subset = int(1e5)
X_temp = X_train[:_n_subset]
y_temp = y_train[:_n_subset]

In [16]:
# peek at the labels of the tuning subset
y_temp

array([0, 0, 1, ..., 0, 1, 0], dtype=int64)

In [17]:
# Run the grid search on the tuning subset: every parameter combination is fitted
# once per CV fold, using the single worker configured above.
boosted_gsearch_pca.fit(X_temp, y_temp)  # slow: took about 90 minutes when originally run

In [18]:
# show the model refitted with the best parameter combination found by the search
boosted_gsearch_pca.best_estimator_

In [21]:
from sklearn.metrics import accuracy_score

In [25]:
# keep a short handle on the best grid-search model for the evaluations below
model = boosted_gsearch_pca.best_estimator_

In [26]:
# accuracy on the tuning subset the grid search was fitted on (i.e. training accuracy)
accuracy_score(y_temp, model.predict(X_temp))

0.60397

In [27]:
# accuracy on the validation split
accuracy_score(y_val, model.predict(X_val))

0.5967162806979662

In [28]:
# accuracy on the held-out test split
accuracy_score(y_test, model.predict(X_test))

0.5969606752600929

In [29]:
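# Not an improvement on logistic regression, unfortunately. It is not clear why this
# performs so poorly; one possible cause is that n_estimators is fixed at 15 with a
# learning rate of 0.1, which may leave the boosted model underfit.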

In [42]:
# A much larger model than the grid-searched one: trees up to depth 20 and 300
# boosting rounds, with every other setting left at the library default.
test = XGBClassifier(max_depth=20, n_estimators=300)

In [43]:
# fit the large model on the first 500,000 rows of the training split
test.fit(X_train[:int(5e5)], y_train[:int(5e5)])





In [44]:
# X_temp is the first 100,000 training rows, all of which were part of the data
# the large model was fitted on, so this is training accuracy rather than a
# generalisation estimate.
accuracy_score(y_temp, test.predict(X_temp))

1.0

In [45]:
# accuracy of the large model on the held-out test split
accuracy_score(y_test, test.predict(X_test))

0.6163286004056795

In [46]:
# accuracy over the whole training split; only its first 500,000 rows were used
# to fit the large model, so this mixes fitted and unseen rows
accuracy_score(y_train, test.predict(X_train))

0.7210130880931233

In [47]:
# accuracy of the large model on the validation split
accuracy_score(y_val, test.predict(X_val))

0.6160547929967823

In [48]:
from sklearn.manifold import LocallyLinearEmbedding

# Modified locally linear embedding with all other settings left at their defaults.
# NOTE: this rebinds `embedding`, which earlier in the notebook held the hero
# feature matrix; cells that need that matrix must be re-run before this one.
embedding = LocallyLinearEmbedding(method="modified")

In [49]:
# Fit the manifold embedding on the full training split and keep the transformed
# coordinates.
# NOTE(review): earlier cells subsample because of RAM limits; this fit uses every
# training row and is presumably very expensive — consider a subsample, and confirm
# it completes.
E = embedding.fit_transform(X_train)