In [None]:
# Prepare Data
#
import pandas as pd;

# Load the final-over data set. It is a wide table that also carries features
# about the batsmen (striker and non-striker) and the bowler.
# match_id is the key, so it becomes the index straight away.
data = pd.read_csv('../data_files/final_over_data_v2.csv').set_index("match_id")

# runs_to_win is numeric. A chase of more than 36 in the last over is treated as
# lost anyway, so the value is binned into "runs per ball required" bands of six
# runs each, with everything above 36 collapsed into band '7'.
# Bin edges: 0, 6, 12, ..., 36, then 200 as a catch-all upper bound.
cut_bins = list(range(0, 37, 6)) + [200]
# One label per band: '1' .. '7'.
cut_labels = [str(band) for band in range(1, 8)]

# pd.cut uses right-closed intervals by default, so (0, 6] -> '1', (6, 12] -> '2', ...
# A value of exactly 0 (or above 200) falls outside every bin and becomes NaN.
rpb = pd.cut(data['runs_to_win'], bins=cut_bins, labels=cut_labels)

# Column position 3 is meant to be directly after runs_to_win
# (assumes runs_to_win sits at position 2 in the CSV layout - TODO confirm).
data.insert(3, 'RPB', rpb)
data.head()

In [20]:
## Data-quality spot check (kept for reference).
#data["bowler_eg_economy"].isnull()

# Train on a subset of the columns only: five inputs plus the match_result label.
# The raw runs_to_win is used instead of the binned RPB because it is easier to
# enter from the UI.
training_data = data.loc[:, [
    "runs_to_win",
    "wickets_in_hand",
    "bowler_eg_economy",
    "batsman_eg_sr",
    "non_striker_eg_sr",
    "match_result",
]]
training_data.head()

# To list any rows that contain null values:
#null_data = training_data[training_data.isnull().any(axis=1)]
#null_data.head(10)

Unnamed: 0_level_0,runs_to_win,wickets_in_hand,bowler_eg_economy,batsman_eg_sr,non_striker_eg_sr,match_result
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
829774,20,4,10.5604,180.7692,146.0465,Won
392241,26,4,10.723,109.0909,112.7907,Won
392201,8,6,10.4821,160.0,109.0909,Lost
392234,11,4,9.5215,182.2469,112.7907,Won
829732,5,6,10.723,157.3477,146.0465,Lost


In [21]:
# This is the simplest version of the analysis (my first version).
# We'll use tree-based classifiers: a Decision Tree, plus a Random Forest for comparison.
#
# 
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import graphviz
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split; without it every run reports different accuracies and the depth
# comparisons below cannot be compared between runs.
train_set, test_set = train_test_split(training_data, test_size=0.2, random_state=42)

# Model inputs: every selected column except the match_result label.
feature_columns = ["runs_to_win","wickets_in_hand","bowler_eg_economy","batsman_eg_sr","non_striker_eg_sr"]

# Training features and labels. For now only these few columns are used.
X = train_set[feature_columns]
y = train_set["match_result"]

def decision_tree(params,feature,results):
    """Build and fit a decision tree classifier.

    params  -- dict of keyword arguments forwarded to tree.DecisionTreeClassifier
    feature -- training features
    results -- training labels

    Returns whatever ``fit`` returns (the fitted classifier).
    """
    return tree.DecisionTreeClassifier(**params).fit(feature, results)

def check_accuracy(classifier):
    """Return the accuracy of ``classifier`` on the held-out test split.

    classifier -- a fitted model exposing ``predict``

    Reads the notebook-level ``test_set`` and ``feature_columns``: predictions
    are made on ``test_set[feature_columns]`` and compared against
    ``test_set["match_result"]`` with ``accuracy_score``.
    """
    # Score the model that was passed in. The previous version called the
    # notebook-level ``clf`` here, so the argument was silently ignored.
    test_results = classifier.predict(test_set[feature_columns])
    return accuracy_score(test_set["match_result"], test_results)

def random_forest(params,feature,results):
    """Build and fit a random forest classifier.

    params  -- dict of keyword arguments forwarded to RandomForestClassifier
    feature -- training features
    results -- training labels

    Returns whatever ``fit`` returns (the fitted classifier).
    """
    return RandomForestClassifier(**params).fit(feature, results)


In [22]:
# Test the Decision Tree with different parameters:
# sklearn defaults first (depth None), then a sweep over max_depth.
for depth in (None, 5, 10, 15, 20):
    params = {} if depth is None else {"max_depth": depth}
    label = "default parameters" if depth is None else f"Max depth {depth}"
    # Keep the notebook-level name `clf`: later cells read the most recent model.
    clf = decision_tree(params,X,y)
    print(f"Accuracy with {label} is: ", check_accuracy(clf))

Accuracy with default parameters is:  0.875
Accuracy with Max depth 5 is:  0.9027777777777778
Accuracy with Max depth 10 is:  0.8194444444444444
Accuracy with Max depth 15 is:  0.875
Accuracy with Max depth 20 is:  0.875


In [23]:
# Test Random Forests with different parameters:
# sklearn defaults first (depth None), then a sweep over max_depth.
for depth in (None, 5, 10, 15, 20):
    params = {} if depth is None else {"max_depth": depth}
    label = "default parameters" if depth is None else f"Max depth {depth}"
    # Keep the notebook-level name `clf`: later cells read the most recent model.
    clf = random_forest(params,X,y)
    print(f"Accuracy with {label} is: ", check_accuracy(clf))

Accuracy with default parameters is:  0.9166666666666666
Accuracy with Max depth 5 is:  0.9166666666666666
Accuracy with Max depth 10 is:  0.9027777777777778
Accuracy with Max depth 15 is:  0.9027777777777778
Accuracy with Max depth 20 is:  0.9305555555555556


In [None]:
# Plot the confusion matrix to understand the prediction results.
# sklearn.metrics.plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator (available since 1.0) replaces it.
from sklearn.metrics import ConfusionMatrixDisplay

X_test = test_set[feature_columns]
y_test = test_set["match_result"]

if hasattr(ConfusionMatrixDisplay, "from_estimator"):
    ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)
else:
    # Older scikit-learn releases only ship the function-style helper.
    from sklearn.metrics import plot_confusion_matrix
    plot_confusion_matrix(clf, X_test, y_test)

In [None]:
# Export the tree as an image.
import matplotlib.pyplot as plt

# plot_tree draws a single decision tree. If the most recent clf is an ensemble
# (it exposes estimators_, as a fitted random forest does), draw its first
# member instead of failing.
tree_to_plot = clf.estimators_[0] if hasattr(clf, "estimators_") else clf

# plot_tree expects class names in ascending order of the class labels, which is
# the order of clf.classes_. A hand-written ['Won', 'Lost'] list is in the
# reverse order and would swap the labels on every node.
cn = [str(label) for label in clf.classes_]

fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (8,8), dpi=1000)
tree.plot_tree(tree_to_plot,
               feature_names = feature_columns,
               class_names=cn,
               filled = True,
               ax = axes);
fig.savefig('v2_tree.png')

In [None]:
# Save model as a file
from joblib import dump

# Model chosen for future predictions: a random forest capped at depth 10,
# fitted on the training split (X, y).
params = dict(max_depth=10)
final_model = random_forest(params, X, y)

# Persist the fitted model next to the notebook.
dump(final_model, 'ml_model.joblib')

In [None]:
# Load Model from file
from joblib import load
from sklearn.metrics import accuracy_score

# Read back the model file written by the previous cell.
mymodel = load('ml_model.joblib')

# One hand-made final-over scenario; the columns are given in the same order
# as the features the model was trained on.
tdf = pd.DataFrame({
    'runs_to_win': [12],
    'wickets_in_hand': [5],
    'bowler_eg_economy': [9],
    'batsman_eg_sr': [170],
    'non_striker_eg_sr': [122],
})
#tdf.head()

pred_prob = mymodel.predict_proba(tdf)
print(pred_prob)            # one row of class probabilities
print(mymodel.classes_)     # class label for each probability column
print(mymodel.predict(tdf)) # predicted label for the scenario
print(pred_prob[0, 1])      # probability of the second entry in classes_
#accuracy_score(test_set["match_result"],test_results)
#predict_proba