In [1]:
# Prepare Data
#
import pandas as pd

# Load the final-over data and use match_id as the key / index column.
data = pd.read_csv('data_files/final_over_data.csv').set_index("match_id")

# runs_to_win is numerical. The working assumption of this notebook is that needing more
# than 36 runs in the last over is out of reach, so everything above 36 shares one bucket.
# Bin runs_to_win into 6-run-wide buckets (roughly "runs per ball required", rounded up):
#   '1' = 1-6 runs, '2' = 7-12, ... '6' = 31-36, '7' = more than 36.
# Binning keeps the model from seeing the continuous value.
cut_labels = ['1', '2', '3', '4', '5', '6', '7']

# The last edge is open-ended on purpose: with a finite cap (it used to be 200) any larger
# runs_to_win fell outside every bin and silently became NaN instead of bucket '7'.
# Bins are right-closed, so a runs_to_win of 0 or less still falls outside and yields NaN.
cut_bins = [0, 6, 12, 18, 24, 30, 36, float('inf')]

# Position 3 places RPB right after wickets_in_hand (match_id is the index, not a column).
data.insert(3, 'RPB', pd.cut(data['runs_to_win'], bins=cut_bins, labels=cut_labels))
data.head()

Unnamed: 0_level_0,innings_1_score,runs_to_win,wickets_in_hand,RPB,bowler_runs,bowler_econ,bowler_wickets,bowler_boundaries,batsman_runs,batsman_sr,boundaries,match_result
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
335988,240,43,6,7,38,12.6667,1,6,13,100.0,0,Won
335990,165,3,5,1,16,8.0,1,3,38,211.1111,6,Lost
335994,208,19,3,4,17,5.6667,2,1,35,233.3333,5,Won
335995,214,17,3,3,28,14.0,0,4,1,100.0,0,Lost
335996,182,67,1,7,19,6.3333,2,2,2,66.6667,0,Won


In [2]:
# This is the simplest version of Analysis (my first version)
# We'll use a Decision Tree Classifier
#
from sklearn import tree
from sklearn.model_selection import train_test_split
import graphviz  # not used in this cell; import kept from the original notebook

# Fixed seed so the train/test split and the fitted tree are the same on every
# Restart & Run All. Without it the accuracy and the tree change from run to run.
SEED = 42

# Split the data into Train (80%) and Test (20%).
train_set, test_set = train_test_split(data, test_size=0.2, random_state=SEED)

# Training columns. For now, only a few columns are used.
X = train_set[["innings_1_score", "RPB", "wickets_in_hand", "bowler_econ",
               "bowler_boundaries", "batsman_sr", "boundaries"]]
# Target: the match_result label column.
y = train_set["match_result"]

# For now, we'll create a default tree. We'll see the accuracy / complexity and then
# decide the parameters. One thing to adjust later is max_leaf_nodes.
# random_state also pins the classifier's internal tie-breaking between equal splits.
clf = tree.DecisionTreeClassifier(random_state=SEED)
clf = clf.fit(X, y)


In [None]:
# Export the tree as an image.
import matplotlib.pyplot as plt

# Feature names, in the same order as the columns the classifier was fitted on.
fn = ["innings_1_score", "RPB", "wickets_in_hand", "bowler_econ",
      "bowler_boundaries", "batsman_sr", "boundaries"]

# plot_tree expects class names in ascending class order, i.e. the order of clf.classes_.
# A hand-written ['Won', 'Lost'] is the reverse of the sorted labels and swaps the class
# shown on every node, so take the names from the fitted model instead.
cn = [str(label) for label in clf.classes_]

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(8, 8), dpi=1000)
tree.plot_tree(clf,
               feature_names=fn,
               class_names=cn,
               filled=True,
               ax=axes);
fig.savefig('decision_tree.png')

In [None]:
# Score the fitted decision tree on the held-out test rows.
from sklearn.metrics import accuracy_score

# Same feature columns, in the same order, that the classifier was trained on.
feature_columns = [
    "innings_1_score",
    "RPB",
    "wickets_in_hand",
    "bowler_econ",
    "bowler_boundaries",
    "batsman_sr",
    "boundaries",
]
test_results = clf.predict(test_set[feature_columns])

# Fraction of test matches whose predicted result equals the recorded match_result.
accuracy_score(test_set["match_result"], test_results)


In [None]:
# Plot the confusion matrix to understand the prediction results.
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2, so build
# the matrix explicitly and render it with ConfusionMatrixDisplay instead.
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

X_test = test_set[["innings_1_score", "RPB", "wickets_in_hand", "bowler_econ",
                   "bowler_boundaries", "batsman_sr", "boundaries"]]
y_test = test_set["match_result"]

# Pin the label order to clf.classes_ so rows/columns and tick labels stay aligned even
# if one class happens to be absent from the test split.
cm = confusion_matrix(y_test, clf.predict(X_test), labels=clf.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_).plot()
plt.show()

In [None]:
# Analyze which matches were not predicted correctly.
# Build the actual-vs-predicted table directly on test_set's own index (match_id).
# Putting the predictions in a separate, freshly indexed DataFrame and then joining it to
# the labels does not line up row by row, because the two indexes differ.
feature_cols = ["innings_1_score", "RPB", "wickets_in_hand", "bowler_econ",
                "bowler_boundaries", "batsman_sr", "boundaries"]
act_vs_predicted = test_set[["match_result"]].assign(
    predicted_result=clf.predict(test_set[feature_cols])
)

# Keep only the matches where the prediction disagrees with the recorded result.
mispredicted = act_vs_predicted[
    act_vs_predicted["match_result"] != act_vs_predicted["predicted_result"]
]
mispredicted

In [None]:
# Persist the fitted classifier to disk so it can be reloaded without retraining.
from joblib import dump

# The path is relative to the notebook's working directory.
dump(clf, filename='ml_model.joblib')

In [10]:
# Reload the saved classifier and try it on one hand-written final-over scenario.
from joblib import load
from sklearn.metrics import accuracy_score  # imported by the original cell; unused here

# NOTE: joblib files are pickle-based; only load model files you created yourself.
mymodel = load('ml_model.joblib')

# A single scenario; keys are listed in the same order as the training columns.
scenario = {
    'innings_1_score': 170,
    'RPB': 2,
    'wickets_in_hand': 5,
    'bowler_econ': 9,
    'bowler_boundaries': 7,
    'batsman_sr': 125,
    'boundaries': 3,
}
tdf = pd.DataFrame([scenario])

# Predicted match_result label for the scenario, shown as the cell output.
test_results = mymodel.predict(tdf)
test_results

array(['Lost'], dtype=object)