In [1]:
import numpy as np
import pandas as pd 
from keras.models import load_model, Model
from keras.layers import Dense, Activation, Dropout, Input, LSTM, Reshape, Lambda, RepeatVector, TimeDistributed
from keras.initializers import glorot_uniform
from keras.utils import to_categorical
from keras.optimizers import Adam
from keras import backend as K
import tensorflow as tf

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Sanity check: show what is in the current working directory so we can see
# whether matches.csv and deliveries.csv are available before loading them.
import os

working_dir_entries = os.listdir()
print(working_dir_entries)

['IPL-Functional_LSTM-multiout.ipynb', 'machine-learning-ex7', 'IPL-Sequential-LSTM.ipynb', '.ipynb_checkpoints', 'IPL-Sequential_LSTM-multiout.ipynb', 'org.octave.Octave.desktop', 'IPL', 'API requests.ipynb', 'machine-learning-ex6', 'machine-learning-ex1', 'machine-learning-ex8', 'deliveries.csv', 'IPL-Functional_LSTM-multiout-1.ipynb', 'machine-learning-ex4', 'machine-learning-ex2', 'IPL-Functional_LSTM.ipynb', 'machine-learning-ex5', 'matches.csv', 'machine-learning-ex3']


In [3]:
# Load the per-match summary table and the ball-by-ball table.
matches = pd.read_csv('matches.csv')
deliveries = pd.read_csv('deliveries.csv')

# 'player_dismissed' holds either NaN or a player name; replace it with a
# binary indicator column (1 = a wicket fell on this delivery, 0 = no wicket).
wicket_fell = deliveries['player_dismissed'].notna()
deliveries['player_dismissed'] = wicket_fell.astype(int)

# Display the available columns.
deliveries.columns


Index(['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'batsman', 'non_striker', 'bowler', 'is_super_over', 'wide_runs',
       'bye_runs', 'legbye_runs', 'noball_runs', 'penalty_runs',
       'batsman_runs', 'extra_runs', 'total_runs', 'player_dismissed',
       'dismissal_kind', 'fielder'],
      dtype='object')

In [4]:
# Turn the batting / bowling team columns from strings into one-hot feature columns.
#
# The suffixes are applied explicitly to every indicator column. Relying on
# join(lsuffix=..., rsuffix=...) only renames columns whose names collide, so a
# team that appears only as a batting side (or only as a bowling side) would
# silently keep its bare name and no longer match the names in feature_cols.
#
# NOTE: '_batfirst' / '_bowlfirst' mark the team batting / bowling on the given
# delivery (they are derived from 'batting_team' / 'bowling_team'). The names
# are kept unchanged because later cells refer to them.
#
# NOTE: this cell reassigns `deliveries`; run it once after loading the data.
# Re-running it without reloading raises because the indicator columns already exist.
A = deliveries['batting_team'].str.get_dummies()
B = deliveries['bowling_team'].str.get_dummies()
deliveries = (
    deliveries
    .join(A.add_suffix('_batfirst'))
    .join(B.add_suffix('_bowlfirst'))
)

deliveries.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,...,Kings XI Punjab_bowlfirst,Kochi Tuskers Kerala_bowlfirst,Kolkata Knight Riders_bowlfirst,Mumbai Indians_bowlfirst,Pune Warriors_bowlfirst,Rajasthan Royals_bowlfirst,Rising Pune Supergiant_bowlfirst,Rising Pune Supergiants_bowlfirst,Royal Challengers Bangalore_bowlfirst,Sunrisers Hyderabad_bowlfirst
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,0,1,0
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,0,1,0
2,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,3,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,0,1,0
3,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,4,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,0,1,0
4,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,5,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,0,1,0


In [5]:
# NOTE: this is not the last expression of the cell, so Jupyter displays nothing for it;
# it is only an inspection aid for choosing the feature columns below.
deliveries.columns

# Feature columns with the string-valued columns removed.
# I.e. the names of the batsman, non-striker and bowler, and any names relating to a wicket, are omitted.
# This is because this is only a prototype.
# Also, I'm not sure there are enough examples to avoid overfitting if the number of features becomes too large.

# Numeric per-delivery columns taken directly from deliveries.csv
# (with 'player_dismissed' expected to be a 0/1 indicator).
base_feature_cols = [
    'match_id', 'inning', 'over', 'ball', 'is_super_over',
    'wide_runs', 'bye_runs', 'legbye_runs', 'noball_runs', 'penalty_runs',
    'batsman_runs', 'extra_runs', 'total_runs', 'player_dismissed',
]

# Team names, in the order their one-hot columns are listed in the feature vector.
team_names = [
    'Chennai Super Kings',
    'Deccan Chargers',
    'Delhi Capitals',
    'Delhi Daredevils',
    'Gujarat Lions',
    'Kings XI Punjab',
    'Kochi Tuskers Kerala',
    'Kolkata Knight Riders',
    'Mumbai Indians',
    'Pune Warriors',
    'Rajasthan Royals',
    'Rising Pune Supergiant',
    'Rising Pune Supergiants',
    'Royal Challengers Bangalore',
    'Sunrisers Hyderabad',
]

# Final feature list: base columns, then the batting-team indicators,
# then the bowling-team indicators.
feature_cols = (
    base_feature_cols
    + [team + '_batfirst' for team in team_names]
    + [team + '_bowlfirst' for team in team_names]
)

In [29]:
# Number of hidden units for the shared LSTM cell below (could also be called n_j)
n_a = 64

# Number of features per delivery
feature_size = len(feature_cols)

# All match ids in the dataset, in order of first appearance
# (note: not in temporal order and some ids are quite random)
match_ids = deliveries.match_id.unique()

# Number of examples (one example per match)
m = len(match_ids)

# Number of deliveries for each match, aligned with match_ids.
# The counts are computed once; previously value_counts() was re-evaluated
# inside the comprehension for every match.
deliveries_per_match = deliveries.match_id.value_counts(sort=False).to_dict()
Number_of_deliveries = [deliveries_per_match[x] for x in match_ids]

# Longest game, in number of deliveries
max_game_length = max(Number_of_deliveries)

# Cumulative delivery counts: entry k is the row offset just past the last
# delivery of match k, i.e. where match k+1 starts in the original dataset.
# NOTE(review): this only marks match boundaries if the rows of each match are
# contiguous and appear in the same order as match_ids -- confirm for the data.
ind_start_of_match = np.cumsum(Number_of_deliveries)
# check =np.asarray(deliveries['ball'])[ind_start_of_match]


# Turn the pandas df into a np array, keeping only the features of interest
npdevs = np.asarray(deliveries[feature_cols])


# Set up global NN layers with shareable weights
LSTM_cell = LSTM(n_a, return_state = True)
reshapor = Reshape((1, feature_size))
densor = Dense(1,activation='sigmoid')

In [30]:
# Zero-pad every game at its end so that all games have the same length
# (max_game_length rows each).
#
# The padded array is allocated once and each match is copied into its slot.
# The previous approach called np.insert once per match, which re-copies the
# whole array every time (quadratic in the number of rows).
npdevs_pad = np.zeros(
    (len(Number_of_deliveries) * max_game_length, feature_size),
    dtype=npdevs.dtype,  # np.insert also kept the dtype of the original array
)
for match_idx, (src_end, n_balls) in enumerate(zip(ind_start_of_match, Number_of_deliveries)):
    # ind_start_of_match holds cumulative counts, so the rows of this match in
    # the unpadded array are [src_end - n_balls, src_end).
    dst_start = match_idx * max_game_length
    npdevs_pad[dst_start:dst_start + n_balls] = npdevs[src_end - n_balls:src_end]


In [31]:
# Find the indices of the match starts in the dataset with the extra padding rows added,
# i.e. a new match begins every max_game_length deliveries: 0, L, 2L, ..., (m-1)*L.
#
# These are already start offsets, so no entry refers to the end of the last
# game. The former trailing [:-1] therefore dropped the start of the last match
# and has been removed.
ind_start_of_match_pad = np.arange(len(ind_start_of_match)) * max_game_length

# Sanity check: the first delivery row of every match in the padded array
check = np.asarray(npdevs_pad)[ind_start_of_match_pad]


# Convert to tensor (not sure this is needed anymore)
ind_start_of_match_pad = tf.convert_to_tensor(ind_start_of_match_pad)


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [9]:
# X shape is m by game length by feature size

def IPL_model(LSTM_units, dropout):
    """
    Build a sequence model that emits one sigmoid output per delivery.

    The input is a batch of sequences of shape (max_game_length, feature_size),
    both read from the notebook's global scope. The sequence is passed through
    a single LSTM that returns its full output sequence, followed by a
    time-distributed Dense layer with one sigmoid unit.

    Arguments:
    LSTM_units -- dimensionality of the LSTM hidden state
    dropout -- dropout rate; accepted for interface compatibility but currently
               unused (a Dropout + second LSTM + Dropout stack was sketched here
               and is disabled)

    Returns:
    model -- a model instance in Keras
    """
    # One example = one (padded) game: max_game_length deliveries x feature_size features
    delivery_sequence = Input(shape=(max_game_length, feature_size))

    # LSTM over the deliveries, keeping the hidden output at every time step
    hidden_sequence = LSTM(units=LSTM_units, return_sequences=True)(delivery_sequence)

    # Same 1-unit sigmoid Dense layer applied independently at every time step
    per_delivery_head = TimeDistributed(Dense(units=1, activation='sigmoid'))
    per_delivery_output = per_delivery_head(hidden_sequence)

    return Model(inputs=delivery_sequence, outputs=per_delivery_output)

In [10]:
# Hyper-parameters for the sequence model
LSTM_units = 128   # size of the LSTM hidden state
dropout = 0.2      # dropout rate handed to IPL_model

# Instantiate the network
model = IPL_model(LSTM_units=LSTM_units, dropout=dropout)

In [11]:
# Adam optimizer with an explicit learning rate, moment coefficients and learning-rate decay
opt = Adam(
    lr=0.001,
    beta_1=0.9,
    beta_2=0.999,
    decay=0.01,
)

# Binary target at every time step -> binary cross-entropy; also track accuracy
model.compile(
    optimizer=opt,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [12]:
# Zero initial hidden / cell states, one row per match
a0 = np.zeros((m, n_a))
c0 = np.zeros((m, n_a))

# We don't care by how much a team wins or loses: win_by_runs is capped at 1,
# so the label is 1 when the column is at least 1 and 0 when it is 0.
# NOTE(review): this assumes the rows of `matches` are in the same order as the
# matches in the padded delivery array -- confirm, since match ids are not sorted.
Y_final = np.array(matches.win_by_runs.clip(upper=1))

# The same label is repeated for every ball of the game (padding rows included),
# giving Y the shape (m, max_game_length, 1).
Y = Y_final.reshape(m, 1, 1) * np.ones((1, max_game_length, 1))

# Each match occupies max_game_length consecutive rows of npdevs_pad, so a
# row-major reshape yields X with shape (m, max_game_length, feature_size).
X = npdevs_pad.reshape((m, max_game_length, feature_size))


In [13]:
# Show the architecture, then compare the shapes the model expects with the
# shapes of the arrays we are about to feed it.
model.summary()

shape_report = [
    ("Inputs: ", model.input_shape),
    ("Outputs:", model.output_shape),
    ("Actual input:", X.shape),
    ("Actual output:", Y.shape),
]
for label, shape in shape_report:
    print(label + str(shape))

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 267, 44)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 267, 128)          88576     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 267, 1)            129       
Total params: 88,705
Trainable params: 88,705
Non-trainable params: 0
_________________________________________________________________
Inputs: (None, 267, 44)
Outputs:(None, 267, 1)
Actual input:(756, 267, 44)
Actual output:(756, 267, 1)


In [14]:
# A poor val_accuracy here is not necessarily bad news: the metric is averaged
# over every time step of every game, so good performance towards the end of a
# game can be swamped by the uncertainty at its start.
# The last 10% of the matches are held out for validation.
model.fit(
    X,
    Y,
    epochs=200,
    validation_split=0.1,
    shuffle=True,
    verbose=2,
)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 680 samples, validate on 76 samples
Epoch 1/200
 - 7s - loss: 0.7236 - accuracy: 0.4944 - val_loss: 0.7158 - val_accuracy: 0.4222
Epoch 2/200
 - 7s - loss: 0.6883 - accuracy: 0.5424 - val_loss: 0.7048 - val_accuracy: 0.4276
Epoch 3/200
 - 7s - loss: 0.6848 - accuracy: 0.5447 - val_loss: 0.7019 - val_accuracy: 0.4276
Epoch 4/200
 - 7s - loss: 0.6847 - accuracy: 0.5444 - val_loss: 0.7034 - val_accuracy: 0.4276
Epoch 5/200
 - 7s - loss: 0.6844 - accuracy: 0.5446 - val_loss: 0.6950 - val_accuracy: 0.4276
Epoch 6/200
 - 7s - loss: 0.6832 - accuracy: 0.5381 - val_loss: 0.6746 - val_accuracy: 0.6175
Epoch 7/200
 - 7s - loss: 0.6805 - accuracy: 0.5526 - val_loss: 0.6747 - val_accuracy: 0.6175
Epoch 8/200
 - 8s - loss: 0.6772 - accuracy: 0.5647 - val_loss: 0.6785 - val_accuracy: 0.6136
Epoch 9/200
 - 7s - loss: 0.6710 - accuracy: 0.5710 - val_loss: 0.6712 - val_accuracy: 0.6182
Epoch 10/200
 

Epoch 81/200
 - 7s - loss: 0.6226 - accuracy: 0.6010 - val_loss: 0.6704 - val_accuracy: 0.6152
Epoch 82/200
 - 7s - loss: 0.6220 - accuracy: 0.6009 - val_loss: 0.6671 - val_accuracy: 0.6208
Epoch 83/200
 - 7s - loss: 0.6211 - accuracy: 0.6025 - val_loss: 0.6738 - val_accuracy: 0.6134
Epoch 84/200
 - 7s - loss: 0.6211 - accuracy: 0.6017 - val_loss: 0.6660 - val_accuracy: 0.6219
Epoch 85/200
 - 8s - loss: 0.6206 - accuracy: 0.6017 - val_loss: 0.6685 - val_accuracy: 0.6184
Epoch 86/200
 - 7s - loss: 0.6197 - accuracy: 0.6027 - val_loss: 0.6699 - val_accuracy: 0.6170
Epoch 87/200
 - 7s - loss: 0.6205 - accuracy: 0.6019 - val_loss: 0.6727 - val_accuracy: 0.6132
Epoch 88/200
 - 7s - loss: 0.6193 - accuracy: 0.6033 - val_loss: 0.6670 - val_accuracy: 0.6191
Epoch 89/200
 - 7s - loss: 0.6190 - accuracy: 0.6028 - val_loss: 0.6667 - val_accuracy: 0.6199
Epoch 90/200
 - 7s - loss: 0.6183 - accuracy: 0.6048 - val_loss: 0.6670 - val_accuracy: 0.6186
Epoch 91/200
 - 7s - loss: 0.6187 - accuracy: 0.60

Epoch 167/200
 - 7s - loss: 0.6038 - accuracy: 0.6167 - val_loss: 0.6672 - val_accuracy: 0.6180
Epoch 168/200
 - 7s - loss: 0.6032 - accuracy: 0.6171 - val_loss: 0.6662 - val_accuracy: 0.6189
Epoch 169/200
 - 7s - loss: 0.6032 - accuracy: 0.6175 - val_loss: 0.6662 - val_accuracy: 0.6184
Epoch 170/200
 - 7s - loss: 0.6031 - accuracy: 0.6169 - val_loss: 0.6654 - val_accuracy: 0.6195
Epoch 171/200
 - 7s - loss: 0.6030 - accuracy: 0.6174 - val_loss: 0.6655 - val_accuracy: 0.6182
Epoch 172/200
 - 7s - loss: 0.6030 - accuracy: 0.6172 - val_loss: 0.6652 - val_accuracy: 0.6178
Epoch 173/200
 - 7s - loss: 0.6040 - accuracy: 0.6178 - val_loss: 0.6646 - val_accuracy: 0.6190
Epoch 174/200
 - 7s - loss: 0.6030 - accuracy: 0.6183 - val_loss: 0.6687 - val_accuracy: 0.6147
Epoch 175/200
 - 7s - loss: 0.6024 - accuracy: 0.6189 - val_loss: 0.6641 - val_accuracy: 0.6218
Epoch 176/200
 - 7s - loss: 0.6025 - accuracy: 0.6185 - val_loss: 0.6677 - val_accuracy: 0.6160
Epoch 177/200
 - 7s - loss: 0.6028 - acc

<keras.callbacks.callbacks.History at 0x7f79b9c21a10>

In [21]:

# TODO: evaluate on a held-out set, e.g. score, acc = model.evaluate(x_test, y_test)
# (x_test / y_test are not defined in the visible cells).

# Per-delivery residuals (prediction minus label) for the first six matches
first_six = slice(0, 6)
y_predict = model.predict(X[first_six])
y_predict - Y[first_six]


array([[[-0.42830646],
        [-0.35539317],
        [-0.31181675],
        ...,
        [-0.00939202],
        [-0.00935304],
        [-0.00930732]],

       [[ 0.45081756],
        [ 0.38250166],
        [ 0.29523429],
        ...,
        [ 0.01730219],
        [ 0.01867646],
        [ 0.0201326 ]],

       [[ 0.38789913],
        [ 0.25861868],
        [ 0.17171052],
        ...,
        [ 0.05770382],
        [ 0.05866542],
        [ 0.0595746 ]],

       [[ 0.45161086],
        [ 0.39486498],
        [ 0.32585096],
        ...,
        [ 0.01739186],
        [ 0.01875523],
        [ 0.02015129]],

       [[-0.46068239],
        [-0.4476558 ],
        [-0.4108355 ],
        ...,
        [-0.00674045],
        [-0.00666755],
        [-0.006598  ]],

       [[ 0.44923455],
        [ 0.39714831],
        [ 0.32638815],
        ...,
        [ 0.05533758],
        [ 0.05632475],
        [ 0.0572699 ]]])