### About

This is the notebook for building PredictionByHero model using random forest (RF) and gradient boosting (GB). We will also look at PCA to see if reduction in dimension could help us.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB



In [2]:
#load and randomise data
dataset = pd.read_csv('heroSelect.csv', index_col = 0)
dataset = dataset.take(np.random.permutation(len(dataset)))

In [3]:
print('dataset', dataset.shape)

dataset (173365, 273)


### Using LR and MN
Let's see how simple logistic regression and multinomial perform.

In [4]:
x = dataset.drop('team1Win', axis=1)
y = dataset['team1Win']

#print results
print('Logistic Regression accuracy:', np.mean(cross_val_score(LogisticRegression(), x, y, scoring='accuracy', cv=2)))
print('MultinominalNB accuracy:', np.mean(cross_val_score(MultinomialNB(), x, y, scoring='accuracy', cv=2)))

Logistic Regression accuracy: 0.529270608683
MultinominalNB accuracy: 0.528111211201


### Defining the model

In [5]:
def RFmodel(train_x, train_y, test_x, test_y):
    model = RandomForestClassifier(min_samples_split=15,
                               min_samples_leaf=15,
                               criterion="gini",
                               n_estimators=200,
                               random_state=189,
                               max_features="auto")

    print("Training model.")
    #train model
    model.fit(train_x, train_y)
    predicted_labels = model.predict(test_x) 
    print("FINISHED classifying. accuracy score : ")
    print(accuracy_score(test_y, predicted_labels))

In [6]:
def GBmodel(train_x, train_y, test_x, test_y):
    model = GradientBoostingClassifier(min_samples_split=15,
                               min_samples_leaf=15,
                               max_features='sqrt',
                               n_estimators=200,
                               random_state=189,
                               subsample=0.8)

    print("Training model.")
    #train model
    model.fit(train_x, train_y)
    predicted_labels = model.predict(test_x) 
    print("FINISHED classifying. accuracy score : ")
    print(accuracy_score(test_y, predicted_labels))

### Try the model without principal component analysis

In [7]:
def get_data_feed_RF(dataset):
    team_data = dataset.iloc[:,1:273]
    winners = pd.get_dummies(dataset['team1Win'])
    return team_data, winners

In [8]:
def get_data_feed_GB(dataset):
    team_data = dataset.iloc[:,1:273]
    winners = dataset['team1Win']
    return team_data, winners

In [9]:
train, test = train_test_split(dataset, test_size = 0.1)
trainX_RF, trainY_RF = get_data_feed_RF(train)
testX_RF, testY_RF = get_data_feed_RF(test)
trainX_GB, trainY_GB = get_data_feed_GB(train)
testX_GB, testY_GB = get_data_feed_GB(test)

#### Model Running:

In [10]:
RFmodel(trainX_RF, trainY_RF, testX_RF, testY_RF)

Training model.
FINISHED classifying. accuracy score : 
0.527369210359


In [11]:
GBmodel(trainX_GB, trainY_GB, testX_GB, testY_GB)

Training model.
FINISHED classifying. accuracy score : 
0.532272019381


### Using Principal Component Analysis on the dataset

In [12]:
# Separating out the features
x = dataset.iloc[:,1:273].values
# Separating out the target
y = dataset.iloc[:,0:1]

# Standardizing the features
x = StandardScaler().fit_transform(x)

In [13]:
pd.DataFrame(data = x).head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,262,263,264,265,266,267,268,269,270,271
0,-0.199846,-0.252918,-0.130888,-0.326827,2.368155,-0.352802,-0.366285,-0.118332,-0.23818,-0.278275,...,-0.125234,-0.197619,0.0,0.0,0.0,0.0,-0.163001,0.0,0.0,0.0
1,-0.199846,-0.252918,-0.130888,3.059723,-0.42227,-0.352802,-0.366285,-0.118332,-0.23818,-0.278275,...,-0.125234,-0.197619,0.0,0.0,0.0,0.0,-0.163001,0.0,0.0,0.0
2,-0.199846,-0.252918,-0.130888,3.059723,-0.42227,-0.352802,2.730115,-0.118332,-0.23818,3.593567,...,-0.125234,-0.197619,0.0,0.0,0.0,0.0,-0.163001,0.0,0.0,0.0
3,-0.199846,-0.252918,-0.130888,-0.326827,-0.42227,-0.352802,-0.366285,-0.118332,-0.23818,-0.278275,...,-0.125234,-0.197619,0.0,0.0,0.0,0.0,-0.163001,0.0,0.0,0.0
4,-0.199846,-0.252918,-0.130888,-0.326827,-0.42227,-0.352802,-0.366285,-0.118332,-0.23818,-0.278275,...,-0.125234,-0.197619,0.0,0.0,0.0,0.0,-0.163001,0.0,0.0,0.0
5,-0.199846,3.95385,-0.130888,-0.326827,-0.42227,-0.352802,-0.366285,-0.118332,-0.23818,-0.278275,...,-0.125234,-0.197619,0.0,0.0,0.0,0.0,-0.163001,0.0,0.0,0.0
6,-0.199846,-0.252918,-0.130888,-0.326827,-0.42227,-0.352802,-0.366285,-0.118332,-0.23818,-0.278275,...,-0.125234,-0.197619,0.0,0.0,0.0,0.0,-0.163001,0.0,0.0,0.0
7,-0.199846,-0.252918,-0.130888,-0.326827,-0.42227,-0.352802,-0.366285,-0.118332,-0.23818,-0.278275,...,-0.125234,-0.197619,0.0,0.0,0.0,0.0,-0.163001,0.0,0.0,0.0
8,-0.199846,-0.252918,-0.130888,-0.326827,-0.42227,-0.352802,-0.366285,-0.118332,-0.23818,-0.278275,...,-0.125234,-0.197619,0.0,0.0,0.0,0.0,-0.163001,0.0,0.0,0.0
9,-0.199846,-0.252918,-0.130888,-0.326827,-0.42227,-0.352802,-0.366285,-0.118332,-0.23818,-0.278275,...,-0.125234,-0.197619,0.0,0.0,0.0,0.0,-0.163001,0.0,0.0,0.0


In [14]:
pca = PCA(n_components=136, whiten=True) #reduce dimension by half
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents)
finalDf = pd.concat([principalDf, y], axis=1)
finalDf.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,127,128,129,130,131,132,133,134,135,team1Win
0,-0.695495,0.588624,0.05337,0.262934,0.207734,-1.068153,-0.371155,-0.32729,1.109409,-1.348514,...,-0.783698,-0.507744,-0.365657,0.279444,-0.32601,0.881307,1.925227,-0.020426,0.544384,-1.0
1,-0.106369,-1.296062,-0.506209,0.999421,-1.721088,0.450704,0.048172,0.615494,0.809916,-0.214167,...,-0.743328,-0.234031,1.755046,-0.360779,3.469277,1.410107,-0.282242,0.067796,-2.562912,-1.0
2,1.604015,-0.057264,-0.418116,1.76137,-0.988363,1.413017,-1.482368,1.672793,0.611771,-0.832969,...,1.667768,0.894235,-0.772172,0.22101,-0.376656,0.45375,3.242296,3.097436,1.840584,-1.0
3,-0.232472,-0.16185,0.107759,0.51388,-0.432835,-0.279387,-0.070221,-1.225025,-0.282572,2.445482,...,1.833812,-0.704022,-0.389914,-0.816592,0.20805,-0.294238,0.310845,-1.101396,-0.426887,-1.0
4,-1.797457,-0.365956,-1.521082,0.415012,-0.278537,-0.520815,-0.410934,1.925119,1.180003,0.498312,...,0.036346,-0.644029,0.283569,-1.686825,-0.842367,0.048724,0.738326,1.120965,1.158546,-1.0
5,-0.459209,0.426411,-0.469873,0.131485,-1.425457,-1.15699,-1.391055,0.773985,1.895068,-0.789987,...,-1.163933,0.091501,-0.581819,-0.865447,-1.494994,-0.097117,1.048388,0.098692,-0.468137,1.0
6,0.834276,0.045706,-0.071843,1.634345,0.985174,-1.309315,-0.602668,-0.744599,-0.906137,-0.081359,...,2.242615,-1.699425,-1.72958,-0.154835,-0.793104,-2.541849,-0.963479,1.440992,0.571478,-1.0
7,1.385556,-0.478857,0.109226,-0.130996,-1.033345,-0.561888,-0.497096,0.629695,-0.938655,0.739291,...,-0.24207,1.617481,-0.157324,0.10096,1.243656,-0.312762,-1.516538,-1.617369,-0.243068,-1.0
8,0.003463,-0.329272,0.384721,-0.240837,-1.392502,-0.49236,0.696097,-0.055994,-1.961645,-1.090175,...,1.07695,1.411304,-1.459586,2.021538,-0.688621,-1.362194,0.497146,-3.707704,0.094551,-1.0
9,-2.520273,-2.459542,0.413584,0.113831,-0.452076,-0.658358,0.122794,0.474335,-0.933415,0.278085,...,-1.288216,0.604381,-0.390425,-0.044636,0.068274,-0.000998,1.160504,-0.59469,0.974285,1.0


In [15]:
train, test = train_test_split(finalDf, test_size = 0.1)

### Model Running:

In [16]:
def get_PCA_data_feed_RF(dataset):
    team_data = dataset.iloc[:,0:135]
    winners = pd.get_dummies(dataset['team1Win'])
    return team_data, winners

In [17]:
def get_PCA_data_feed_GB(dataset):
    team_data = dataset.iloc[:,0:135]
    winners = dataset['team1Win']
    return team_data, winners

In [18]:
trainX_RF, trainY_RF = get_PCA_data_feed_RF(train)
testX_RF, testY_RF = get_PCA_data_feed_RF(test)
trainX_GB, trainY_GB = get_PCA_data_feed_GB(train)
testX_GB, testY_GB = get_PCA_data_feed_GB(test)

In [19]:
RFmodel(trainX_RF, trainY_RF, testX_RF, testY_RF)

Training model.
FINISHED classifying. accuracy score : 
0.500490280902


In [20]:
GBmodel(trainX_GB, trainY_GB, testX_GB, testY_GB)

Training model.
FINISHED classifying. accuracy score : 
0.505335409817


## Conclusion
We can see PCA, in fact, worsen the performance of our models by information loss during dimension reduction.

Also bear in mind that our baseline percentage is 54%.

And by only looking at hero selections, Gradient Boosting Classifier(GB) has the best accuracy (53.2%) but still not enough to pass our baseline.