### About

This is the notebook for building PredictionByHero model using random forest (RF) and gradient boosting (GB).

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
#load and randomise data
dataset = pd.read_csv('heroSelect.csv', index_col = 0)
dataset = dataset.take(np.random.permutation(len(dataset)))

In [3]:
print('dataset', dataset.shape)
train, test = train_test_split(dataset, test_size = 0.1)
print('train:', train.shape, 'test:', test.shape)

dataset (173365, 273)
train: (156028, 273) test: (17337, 273)


###Model:

In [4]:
def get_data_feed(dataset):
    team_data = dataset.ix[:,1:273]
    winners = pd.get_dummies(dataset['team1Win'])
    return team_data, winners 

In [5]:
trainX, trainY = get_data_feed(train)
testX, testY = get_data_feed(test)


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


In [6]:
def RFmodel(train_x, train_y, test_x, test_y):
    model = RandomForestClassifier(min_samples_split=15,
                               min_samples_leaf=15,
                               criterion="gini",
                               n_estimators=200,
                               random_state=189,
                               max_features="auto")

    print("Training model.")
    #train model
    model.fit(train_x, train_y)
    predicted_labels = model.predict(test_x) 
    print("FINISHED classifying. accuracy score : ")
    print(accuracy_score(test_y, predicted_labels))

In [7]:
def GBmodel(train_x, train_y, test_x, test_y):
    model = GradientBoostingClassifier(min_samples_split=15,
                               min_samples_leaf=15,
                               max_features='sqrt',
                               n_estimators=200,
                               random_state=189,
                               subsample=0.8)

    print("Training model.")
    #train model
    model.fit(train_x, train_y)
    predicted_labels = model.predict(test_x) 
    print("FINISHED classifying. accuracy score : ")
    print(accuracy_score(test_y, predicted_labels))

In [8]:
RFmodel(trainX, trainY, testX, testY)

Training model.
FINISHED classifying. accuracy score : 
0.5255234469631425


In [9]:
def get_data_feedGB(dataset):
    team_data = dataset.ix[:,1:273]
    winners = dataset['team1Win']
    return team_data, winners 

In [None]:
trainX, trainY = get_data_feedGB(train)
testX, testY = get_data_feedGB(test)

In [None]:
GBmodel(trainX, trainY, testX, testY)

Training model.
