### About

This is the notebook for building PredictionByHero model using neural network.

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.naive_bayes import MultinomialNB



In [2]:
#load and randomise data
dataset = pd.read_csv('heroSelect.csv', index_col = 0)
dataset = dataset.take(np.random.permutation(len(dataset)))


We would like to know how well our neural network is when compared with LR and MNB.

In [3]:
x = dataset.drop('team1Win', axis=1)
y = dataset['team1Win']

#print results
print('Logistic Regression accuracy:', np.mean(cross_val_score(LogisticRegression(), x, y, scoring='accuracy', cv=2)))
print('MultinominalNB accuracy:', np.mean(cross_val_score(MultinomialNB(), x, y, scoring='accuracy', cv=2)))


Logistic Regression accuracy: 0.528290017201323
MultinominalNB accuracy: 0.5285380498423687


This tells us percentages of matches are won by team 1. If we blindly choose team 1 to be the winner for all matches, this tells us the "accuracy" for this non-sense method and serves as the baseline. 

In [4]:
t1win = 0
for idx, x in dataset['team1Win'].iteritems():
    if(x==1.0):
        t1win+=1
print(t1win)
print((183452-t1win)/183452)


88670
0.516658308440355


In [5]:
print('dataset', dataset.shape)
dataset, validation = train_test_split(dataset, test_size = 0.1)
train, test = train_test_split(dataset, test_size = 0.1)
print('train:', train.shape, 'validation:', validation.shape, 'test:', test.shape)

dataset (173365, 273)
train: (140425, 273) validation: (17337, 273) test: (15603, 273)


## Tensorflow parts:

In [6]:
sess = tf.InteractiveSession()

In [7]:
#input/output placeholders
x_team1    = tf.placeholder("float", shape=[None, 136], name='x_team1')
x_team2 = tf.placeholder("float", shape=[None, 136], name='x_team1')
y_true = tf.placeholder("float", shape=[None, 2], name='y_true')

#we'll use dropout layers for regularisation which need a keep probability
keep_prob1 = tf.placeholder("float", name='keep_prob1')
keep_prob2 = tf.placeholder("float", name='keep_prob2')

#there doesn't seem to be any other way to differenciate train and validation summaries for TensorBoard
loss_name     = tf.placeholder("string", name='loss_name')
accuracy_name = tf.placeholder("string", name='accuracy_name')

#### Weight init for fully connected layer:

In [8]:
def fc_weight_bias(in_size, out_size):
    initial_weight = tf.truncated_normal([in_size, out_size], stddev=0.2, mean=0.0)
    initial_bias = tf.constant(0.1, shape=[out_size])
    return tf.Variable(initial_weight), tf.Variable(initial_bias)

#### Our model structure

In [9]:
with tf.name_scope("hero_layers_1") as scope:
    W_hero1, b_hero1 = fc_weight_bias(136,100)      
    #note that team1 layer and team2 layer use the same weights and biases
    team1_layer1 = tf.nn.relu(tf.matmul(x_team1, W_hero1) + b_hero1)
    team2_layer1 = tf.nn.relu(tf.matmul(x_team2, W_hero1) + b_hero1)

#second hero layer
with tf.name_scope("hero_layers_2") as scope:    
    W_hero2, b_hero2 = fc_weight_bias(100,100)    
    #again, team1 and team2 use the same weights and biases
    team1_layer2 = tf.nn.relu(tf.matmul(team1_layer1, W_hero2) + b_hero2)
    team2_layer2 = tf.nn.relu(tf.matmul(team2_layer1, W_hero2) + b_hero2)

#now concatenate the team1 and team2 team outputs
with tf.name_scope("hero_layers_concat") as scope:
    team1_team2_concat = tf.concat([team1_layer2, team2_layer2], 1)
    team1_team2_drop = tf.nn.dropout(team1_team2_concat, keep_prob1)
    h_drop1 = tf.nn.dropout(team1_team2_drop, keep_prob1)

with tf.name_scope("hidden_layer_1") as scope:
    W_hidden1, b_hidden1 = fc_weight_bias(200,130)    
    h_hidden1 = tf.nn.relu(tf.matmul(h_drop1, W_hidden1) + b_hidden1)
    h_drop2 = tf.nn.dropout(h_hidden1, keep_prob2)

with tf.name_scope("hidden_layer_2") as scope:
    W_hidden2, b_hidden2 = fc_weight_bias(130,70)    
    h_hidden2 = tf.nn.relu(tf.matmul(h_drop2, W_hidden2) + b_hidden2)

with tf.name_scope("hidden_layer_3") as scope:
    W_hidden3, b_hidden3 = fc_weight_bias(70,25)    
    h_hidden3 = tf.nn.relu(tf.matmul(h_hidden2, W_hidden3) + b_hidden3)

with tf.name_scope("output_layer") as scope:
    W_hidden4, b_hidden4 = fc_weight_bias(25,2)    
    y_pred = tf.nn.softmax(tf.matmul(h_hidden3, W_hidden4) + b_hidden4)

In [10]:
with tf.name_scope("loss_calculations") as scope:
    #cross_entropy = -tf.reduce_sum(y_true * tf.log(y_pred + 1e-8))
    #weights_sum   = tf.add_n([tf.nn.l2_loss(variable) for variable in tf.global_variables()])
    #mean_loss     = cross_entropy + weights_sum
    #loss          = tf.reduce_mean(mean_loss)
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=y_pred)
    loss = tf.reduce_mean(cross_entropy)

with tf.name_scope("trainer") as scope:
    train_step    = tf.train.AdamOptimizer(0.0001).minimize(loss)

with tf.name_scope("accuracy_calculations") as scope:
    correct  = tf.equal(tf.argmax(y_pred, 1), tf.argmax(y_true, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, "float"))

In [11]:
sess.run(tf.global_variables_initializer())

#### DataFeed Function

In [12]:
def get_data_feed(dataset, kp1=1.0, kp2=1.0, loss_str='loss', accuracy_str='accuracy'):
    team1_data, team2_data = dataset.ix[:,1:137], dataset.ix[:,137:273]
    winners = pd.get_dummies(dataset['team1Win'])
    return {
        x_team1: team1_data,
        x_team2: team2_data,
        y_true: winners,
        loss_name: loss_str,
        accuracy_name: accuracy_str,
        keep_prob1: kp1,
        keep_prob2: kp2
    }  

In [13]:
train_feed      = get_data_feed(train,      loss_str = 'loss_train',      accuracy_str = 'accuracy_train')
validation_feed = get_data_feed(validation, loss_str = 'loss_validation', accuracy_str = 'accuracy_validation')
test_feed       = get_data_feed(test,       loss_str = 'loss_test',       accuracy_str = 'accuracy_test')

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


#### Batch Function

In [14]:
def get_batches(dataset, batch_size=1700): #1700 is about 1% of the entire training sets
    #randomise before every epoch
    dataset = dataset.take(np.random.permutation(len(dataset)))
    
    i = 0
    while i < len(dataset):
        yield dataset[i : i + batch_size]
        i = i + batch_size 

In [15]:
for i in range(100):    
    for mini_batch in get_batches(train):
        mini_batch_feed = get_data_feed(mini_batch, 0.5, 0.5)   
        train_step.run(feed_dict = mini_batch_feed)
    
    #log every epoch
    train_loss          = loss.eval(feed_dict = train_feed)
    validation_loss     = loss.eval(feed_dict = validation_feed)

    train_accuracy      = accuracy.eval(feed_dict = train_feed)
    validation_accuracy = accuracy.eval(feed_dict = validation_feed)

    print("epoch %d, loss: %g, train: %g, validation: %g"% (i, train_loss, train_accuracy, validation_accuracy)) 


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


epoch 0, loss: 0.697593, train: 0.502838, validation: 0.505508
epoch 1, loss: 0.695341, train: 0.501584, validation: 0.50695
epoch 2, loss: 0.694106, train: 0.504504, validation: 0.514622
epoch 3, loss: 0.693478, train: 0.507367, validation: 0.511969
epoch 4, loss: 0.693174, train: 0.508464, validation: 0.509142
epoch 5, loss: 0.69307, train: 0.508784, validation: 0.510181
epoch 6, loss: 0.692989, train: 0.509553, validation: 0.508392
epoch 7, loss: 0.692962, train: 0.510436, validation: 0.507758
epoch 8, loss: 0.692949, train: 0.511077, validation: 0.506374
epoch 9, loss: 0.692972, train: 0.509133, validation: 0.504355
epoch 10, loss: 0.692969, train: 0.50904, validation: 0.503259
epoch 11, loss: 0.692967, train: 0.508499, validation: 0.504124
epoch 12, loss: 0.692952, train: 0.508834, validation: 0.503605
epoch 13, loss: 0.692943, train: 0.508998, validation: 0.504413
epoch 14, loss: 0.692919, train: 0.509518, validation: 0.50447
epoch 15, loss: 0.692922, train: 0.509354, validation:

KeyboardInterrupt: 

In [None]:
accuracy.eval(feed_dict=test_feed)