In [329]:
import pandas as pd
import numpy as np

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB


In [370]:
# Read in training data
df = pd.read_csv("data/dota/trainingdata.txt", sep=',', header=None).reset_index()
characters = set()
for i in range(10):
    characters = set(list(characters) + list(set(df[i])))
    if i < 5:
        df[i] = '1' + df[i]
    else:
        df[i] = '2' + df[i]
df.columns = [str(x) for x in df.columns]
df.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,9,10
0,0,1Sven,1Lone Druid,1Venomancer,1Clockwerk,1Shadow Shaman,2Invoker,2Gyrocopter,2Anti-Mage,2Alchemist,2Slark,2
1,1,1Riki,1Tinker,1Puck,1Leshrac,1Nyx Assassin,2Slardar,2Sand King,2Spectre,2Necrolyte,2Warlock,1
2,2,1Invoker,1Mirana,1Pudge,1Magnus,1Keeper of the Light,2Rubick,2Tidehunter,2Queen of Pain,2Faceless Void,2Sniper,2
3,3,1Riki,1Centaur Warrunner,1Treant Protector,1Queen of Pain,1Broodmother,2Rubick,2Weaver,2Troll Warlord,2Alchemist,2Drow Ranger,1
4,4,1Razor,1Kunkka,1Drow Ranger,1Leshrac,1Zeus,2Riki,2Bane,2Visage,2Invoker,2Timbersaw,1


In [372]:
# Creating index of team_heroes
char_index = set()
for i in range(10):
    char_index = set(list(char_index) +  df[str(i)].to_list())
n_char = len(char_index)
char_index = list(char_index)
len(char_index), char_index[:5]


(194, ['1Anti-Mage', '2Anti-Mage', '1Tidehunter', '1Wisp', '2Pudge'])

In [391]:
%%time
# Converting heros to onehot
results = np.zeros((len(df), n_char))
for i in range(len(df)):
    for j in range(10):
        current_char = df[str(j)][i]
        results[i, char_index.index(current_char)] = 1
results

X_train = results
y_train = np.where(df['10'] == 2, 0, df['10'])
X_train[:10,:], y_train[:10]

CPU times: user 1.8 s, sys: 15.4 ms, total: 1.82 s
Wall time: 1.82 s


(array([[0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]), array([0, 1, 0, 1, 1, 0, 1, 1, 1, 0]))

In [377]:
# Input test cases from file
f = open("data/dota/test_case_1.txt")
x = f.readlines(1)[0]
n = int(x)
X_input = np.zeros((n, n_char))
for i in range(n):
    s = f.readlines(1)[0].strip().split(",")
    pred_char = ["1"+x if cnt < 5 else "2"+x for cnt, x in zip(range(len(s)), s)]
    for current_char in pred_char:
        try:
            X_input[i, char_index.index(current_char)] = 1 if current_char[0] == "1" else - 1
        except:
            pass
X_input

array([[ 0.,  0.,  0., ..., -1.,  0., -1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0., -1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [392]:
# Input test cases from STDIN (Hackerrank requirement)
n = int(input())
X_input = np.zeros((n, n_char))
for i in range(n):
    s = input().split(",")
    pred_char = ["1"+x if cnt < 5 else "2"+x for cnt, x in zip(range(len(s)), s)]
    for current_char in pred_char:
        try:
            X_input[i, char_index.index(current_char)] = 1 if current_char[0] == "1" else - 1
        except:
            pass
X_input

 1
 a,b,c,d


array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]])

# Decision Tree Classifier
Most of the other classifiers fared worse on the testing set, even though they fare well in cross validation on train. Hence these classifiers have been relegated for the simpler Decision Tree, despite it being far weaker in train

In [389]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [None]:
# Print output
for i in dt.predict(X_input):
    if i == 0:
        print(2)
    else:
        print(1)

# Other classifiers

# Other classifiers

In [393]:
%%time
sgd = SGDClassifier(penalty='l2',
                    loss = 'log',
                    learning_rate = 'adaptive',
                    eta0 = 0.0001,
                    alpha = 0.00001,
                    random_state = 42)
cross_val_score(sgd, X_train, y_train, cv = 3)

parameters = {'loss': ['hinge', 'log', 'modified_huber'],
             'penalty': ['l1','l2','elasticnet'],
             'alpha': [0.00001, 0.0001, 0.01],
              'learning_rate': ['optimal', 'adaptive'],
              'eta0': [0.0001, 0.001, 0.01]
             }

rs = RandomizedSearchCV(sgd, parameters, cv = 5)
rs.fit(X_train, y_train)
rs.best_score_, rs.best_params_

CPU times: user 1min 19s, sys: 396 ms, total: 1min 20s
Wall time: 55.4 s


(0.5955333333333334,
 {'penalty': 'elasticnet',
  'loss': 'modified_huber',
  'learning_rate': 'optimal',
  'eta0': 0.001,
  'alpha': 0.01})

In [390]:
# Manual check of test case accuracy
expected_output = [int(x) for x in open("data/dota/test_case_1_output.txt").read().strip().split("\n")]
check_results = pd.DataFrame({"dt_pred":np.where(dt.predict(X_input)==0,2,1), "prediction": np.where(sgd.predict(X_input) == 0, 2, 1), "actual": expected_output})
check_results['accuracy'] = check_results['prediction'] == check_results['actual']
check_results['dt_accuracy'] = check_results['dt_pred'] == check_results['actual']
check_results['accuracy'].mean(), check_results['dt_accuracy'].mean()

(0.5033333333333333, 0.5186666666666667)

In [394]:
%%time
rf = RandomForestClassifier(n_estimators = 200,
                           max_depth = 200,
                           min_samples_split = 10,
                           min_samples_leaf = 10,
                           random_state = 42)
cross_val_score(rf, X_train, y_train, cv = 3)

parameters = {'n_estimators':[200, 300, 500],
             'max_depth': [100, 200, 500],
             'min_samples_split':[ 10, 50],
             'min_samples_leaf': [5, 10]}

rs = RandomizedSearchCV(rf, parameters, cv = 5)
rs.fit(X_train, y_train)
rs.best_score_, rs.best_params_

CPU times: user 5min 37s, sys: 3.1 s, total: 5min 40s
Wall time: 5min 42s


(0.5923333333333334,
 {'n_estimators': 300,
  'min_samples_split': 50,
  'min_samples_leaf': 10,
  'max_depth': 200})

In [396]:
%%time
nb = GaussianNB()
cross_val_score(nb, X_train, y_train, cv = 3)

CPU times: user 143 ms, sys: 53.8 ms, total: 197 ms
Wall time: 198 ms


array([0.5874825 , 0.5822    , 0.57491498])

In [397]:
%%time
ada = AdaBoostClassifier(random_state = 42)
cross_val_score(ada, X_train, y_train, cv = 3)

CPU times: user 4.02 s, sys: 28.9 ms, total: 4.05 s
Wall time: 4.06 s


array([0.58728254, 0.591     , 0.5905181 ])

In [398]:
%%time
kn = KNeighborsClassifier(n_neighbors = 10)
cross_val_score(kn, X_train, y_train, cv = 3)

CPU times: user 44.9 s, sys: 138 ms, total: 45.1 s
Wall time: 45.2 s


array([0.5334933 , 0.5414    , 0.53730746])

In [399]:
%%time
nn = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(6, 10), random_state=1, max_iter=1000)
cross_val_score(nn, X_train, y_train, cv = 3)

parameters = {
    'hidden_layer_sizes': [(50,50,50)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

rs = RandomizedSearchCV(nn, parameters, cv = 3)
rs.fit(X_train, y_train)
rs.best_score_, rs.best_params_

KeyboardInterrupt: 

In [401]:
# VotingClassifier by combining
from sklearn.ensemble import VotingClassifier
eclf = VotingClassifier(estimators=[
        ('sgd', sgd), ('rf', rf), ('nb', nb), ('ada', ada), ('nn', nn)], voting='hard')
cross_val_score(eclf, X_train, y_train, cv = 3)

array([0.58808238, 0.5926    , 0.58611722])