### About

This is the notebook for building PredictionByHero model.

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [2]:
#load and randomise data
dataset = pd.read_csv('heroSelect.csv')
dataset = dataset.take(np.random.permutation(len(dataset)))

#split dependent/independent variables
x = dataset.drop('team1Win', axis=1)
y = dataset['team1Win']

In [3]:
dataset, validation = train_test_split(dataset, test_size = 0.1)
train, test = train_test_split(dataset, test_size = 0.1)
print('train:', train.shape, 'validation:', validation.shape, 'test:', test.shape)

train: (148595, 138) validation: (18346, 138) test: (16511, 138)


## Tensorflow parts:

In [4]:
sess = tf.InteractiveSession()

In [6]:
#input/output placeholders
x_teamComp = tf.placeholder("float", shape=[None, 136], name='x_teamComp')
y_ = tf.placeholder("float", shape=[None, 2], name='y_true')

#we'll use dropout layers for regularisation which need a keep probability
keep_prob1 = tf.placeholder("float", name='keep_prob1')
keep_prob2 = tf.placeholder("float", name='keep_prob2')

#there doesn't seem to be any other way to differenciate train and validation summaries for TensorBoard
loss_name     = tf.placeholder("string", name='loss_name')
accuracy_name = tf.placeholder("string", name='accuracy_name')

#### Weight init for fully connected layer:

In [7]:
def fc_weight_bias(in_size, out_size):
    initial_weight = tf.truncated_normal([in_size, out_size], stddev=0.2, mean=0.0)
    initial_bias = tf.constant(0.1, shape=[out_size])
    return tf.Variable(initial_weight), tf.Variable(initial_bias)

#### Our model structure

In [8]:
#first hero layer
with tf.name_scope("hero_layers_1") as scope:
    W_hero1, b_hero1 = fc_weight_bias(136,100)      
    #note that team1 layer and team2 layer use the same weights and biases
    hero_layer1 = tf.nn.relu(tf.matmul(x_teamComp, W_hero1) + b_hero1)
    
with tf.name_scope("hidden_layer_1") as scope:
    W_hidden1, b_hidden1 = fc_weight_bias(136,100)    
    h_hidden1 = tf.nn.relu(tf.matmul(hero_layer1, W_hidden1) + b_hidden1)
    h_drop1 = tf.nn.dropout(h_hidden1, keep_prob1)

with tf.name_scope("hidden_layer_2") as scope:
    W_hidden2, b_hidden2 = fc_weight_bias(100,60)    
    h_hidden2 = tf.nn.relu(tf.matmul(h_drop1, W_hidden2) + b_hidden2)
    h_drop2 =  tf.nn.dropout(h_hidden2, keep_prob1)
    
with tf.name_scope("hidden_layer_3") as scope:
    W_hidden3, b_hidden3 = fc_weight_bias(60,25)    
    h_hidden3 = tf.nn.relu(tf.matmul(h_drop2, W_hidden3) + b_hidden3)


with tf.name_scope("output_layer") as scope:
    W_hidden4, b_hidden4 = fc_weight_bias(25,2)    
    y = tf.nn.softmax(tf.matmul(h_hidden3, W_hidden4) + b_hidden4)

ValueError: Dimensions must be equal, but are 100 and 136 for 'hidden_layer_1/MatMul' (op: 'MatMul') with input shapes: [?,100], [136,100].

In [None]:
with tf.name_scope("loss_calculations") as scope:
    cross_entropy = -tf.reduce_sum(y_ * tf.log(y + 1e-8))
    weights_sum   = tf.add_n([tf.nn.l2_loss(variable) for variable in tf.all_variables()])
    loss          = cross_entropy + weights_sum
    mean_loss     = tf.reduce_mean(loss)

with tf.name_scope("trainer") as scope:
    train_step    = tf.train.AdamOptimizer(0.0001).minimize(loss)

with tf.name_scope("accuracy_calculations") as scope:
    correct  = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, "float"))

In [10]:
#summarize the accuracy and loss 
accuracy_summary = tf.summary.scalar(accuracy_name, accuracy)
mean_loss_summary = tf.summary.scalar(loss_name, mean_loss)

#summarize the distribution of output values
y_hist = tf.summary.histogram("y", y)

#gather all summaries
merged = tf.summary.merge_all()

writer = tf.summary.FileWriter("logdir", sess.graph_def)

NameError: name 'accuracy' is not defined

In [None]:
sess.run(tf.initialize_all_variables())

#### DataFeed Function

In [None]:
def get_data_feed(dataset, kp1=1.0, kp2=1.0, loss_str='loss', accuracy_str='accuracy'):
    team_data = dataset.ix[:,:136]
    winners = pd.get_dummies(dataset['team1Win'])
    return {
        x_teamComp: team_data,
        y_: winners,
        loss_name: loss_str,
        accuracy_name: accuracy_str,
        keep_prob1: kp1,
        keep_prob2: kp2
    }  

In [None]:
train_feed      = get_data_feed(train,      loss_str = 'loss_train',      accuracy_str = 'accuracy_train')
validation_feed = get_data_feed(validation, loss_str = 'loss_validation', accuracy_str = 'accuracy_validation')
test_feed       = get_data_feed(test,       loss_str = 'loss_test',       accuracy_str = 'accuracy_test')

#### Batch Function

In [None]:
def get_batches(dataset, batch_size=500):
    #randomise before every epoch
    dataset = dataset.take(np.random.permutation(len(dataset)))
    
    i = 0
    while i < len(dataset):
        yield dataset[i : i + batch_size]
        i = i + batch_size 

In [None]:
for i in range(100):    
    for mini_batch in get_batches(train):
        mini_batch_feed = get_data_feed(mini_batch, 0.5, 0.5)   
        train_step.run(feed_dict = mini_batch_feed)
    
    #log every epoch
    train_loss          = loss.eval(feed_dict = train_feed)
    validation_loss     = loss.eval(feed_dict = validation_feed)

    train_accuracy      = accuracy.eval(feed_dict = train_feed)
    validation_accuracy = accuracy.eval(feed_dict = validation_feed)

    train_summary_str      = merged.eval(feed_dict = train_feed)
    validation_summary_str = merged.eval(feed_dict = validation_feed)                

    writer.add_summary(train_summary_str, i)
    writer.add_summary(validation_summary_str, i)
    print("epoch %d, loss: %g, train: %g, validation: %g"% (i, train_loss, train_accuracy, validation_accuracy)) 

writer.close()

In [None]:
accuracy.eval(feed_dict=test_feed)