In [None]:
# Prepare Data
#
import pandas as pd;

# In this data set we only have the current-match stats of the bowlers and batsmen.
# Load the final-over data set and use match_id as the key / index column.
data = pd.read_csv('../data_files/final_over_data.csv').set_index("match_id")

# runs_to_win is continuous numerical data. Bucket it into "runs per ball
# required" classes so the model sees a small set of discrete values: each
# class spans six runs, and (per the original author's note) a chase needing
# more than 36 in the last over is treated as lost, so everything above 36
# collapses into class '7'.
#
# The last edge is open-ended (infinity) rather than a hard-coded 200, so no
# large runs_to_win value can fall outside the bins and silently become NaN.
# Note: pd.cut intervals are right-closed by default, so runs_to_win == 0 (and
# any negative value) is outside the first bin and is still mapped to NaN.
cut_labels = ['1', '2', '3', '4', '5', '6', '7']
cut_bins = [0, 6, 12, 18, 24, 30, 36, float("inf")]

# Insert the binned column at position 3 (intended to sit directly after the
# runs_to_win column), then preview the first rows.
data.insert(3, 'RPB', pd.cut(data['runs_to_win'], bins=cut_bins, labels=cut_labels))
data.head()

In [None]:
# This is the simplest version of Analysis (my first version)
# We'll use a Decision Tree Classifier
#
# 
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import graphviz
from sklearn.metrics import accuracy_score

# Split the data into train and test sets, holding out 20% of the rows.
# The split is seeded so the same rows land in each set on every run; without
# a seed the accuracy figures printed below change on each kernel restart.
RANDOM_SEED = 42
train_set, test_set = train_test_split(data, test_size=0.2, random_state=RANDOM_SEED)

# Training columns. For now only a few of the available columns are used.
feature_columns = ["innings_1_score","RPB","wickets_in_hand","bowler_econ","bowler_boundaries","batsman_sr","boundaries"]
X = train_set[feature_columns]
y = train_set["match_result"]
#y=y.astype('int')

def decision_tree(params,feature,results):
    """Build and fit a decision-tree classifier.

    Args:
        params: dict of keyword arguments forwarded unchanged to
            ``tree.DecisionTreeClassifier``.
        feature: training feature matrix passed to ``fit``.
        results: training target values passed to ``fit``.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    # fit() returns the estimator itself, so construction and training chain.
    return tree.DecisionTreeClassifier(**params).fit(feature, results)

def check_accuracy(classifier):
    """Return the accuracy of ``classifier`` on the held-out test split.

    Predictions are made on ``test_set[feature_columns]`` and compared against
    ``test_set["match_result"]``; both names are read from the notebook's
    global scope.

    Args:
        classifier: a fitted estimator exposing ``predict``.

    Returns:
        The value returned by ``accuracy_score`` for the test-set predictions.
    """
    # Score the estimator that was passed in. The previous version predicted
    # with the global ``clf`` and ignored its argument, so it could silently
    # report the accuracy of a different model.
    test_results = classifier.predict(test_set[feature_columns])
    return accuracy_score(test_set["match_result"], test_results)

def random_forest(params,feature,results):
    """Build and fit a random-forest classifier.

    Args:
        params: dict of keyword arguments forwarded unchanged to
            ``RandomForestClassifier``.
        feature: training feature matrix passed to ``fit``.
        results: training target values passed to ``fit``.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    # fit() returns the estimator itself, so construction and training chain.
    return RandomForestClassifier(**params).fit(feature, results)

# For now, we'll create a default tree. We'll see the accuracy / complexity and then decide the parameters.
# Some things to be adjusted are max_leaf_nodes
#


In [None]:
# Fit a decision tree with the default parameters, then with max_depth set to
# 5, 10, 15 and 20, printing the test accuracy after each fit.
# ``params`` and ``clf`` are rebound on every pass, so after the loop they
# refer to the last (max_depth 20) configuration and model.
for params in ({}, {"max_depth": 5}, {"max_depth": 10}, {"max_depth": 15}, {"max_depth": 20}):
    clf = decision_tree(params, X, y)
    if "max_depth" in params:
        description = "Max depth {}".format(params["max_depth"])
    else:
        description = "default parameters"
    print("Accuracy with {} is: ".format(description), check_accuracy(clf))

In [None]:
# Fit a random forest with the default parameters, then with max_depth set to
# 5, 10, 15 and 20, printing the test accuracy after each fit.
# ``params`` and ``clf`` are rebound on every pass, so after the loop they
# refer to the last (max_depth 20) configuration and model.
for params in ({}, {"max_depth": 5}, {"max_depth": 10}, {"max_depth": 15}, {"max_depth": 20}):
    clf = random_forest(params, X, y)
    if "max_depth" in params:
        description = "Max depth {}".format(params["max_depth"])
    else:
        description = "default parameters"
    print("Accuracy with {} is: ".format(description), check_accuracy(clf))

In [None]:
# Plot the confusion matrix to understand the prediction results.
#
# ``plot_confusion_matrix`` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so importing it unconditionally fails on current releases. Prefer
# ``ConfusionMatrixDisplay.from_estimator`` and fall back to the old function
# only when the installed scikit-learn predates it.
from sklearn.metrics import ConfusionMatrixDisplay

X_test = test_set[feature_columns]
y_test = test_set["match_result"]

if hasattr(ConfusionMatrixDisplay, "from_estimator"):
    ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)
else:
    # Older scikit-learn (< 1.0): only the function form is available.
    from sklearn.metrics import plot_confusion_matrix
    plot_confusion_matrix(clf, X_test, y_test)

In [None]:
# Export the tree as a image.
import matplotlib.pyplot as plt

# NOTE(review): plot_tree expects class names in ascending order of the
# classifier's classes -- confirm that ['Won', 'Lost'] matches clf.classes_.
cn=['Won','Lost']
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (8,8), dpi=1000)

# plot_tree can only draw a single decision tree. When the notebook is run top
# to bottom, ``clf`` is the random forest fitted last, which plot_tree rejects;
# in that case draw the first tree of the ensemble as a representative member.
tree_to_plot = clf.estimators_[0] if hasattr(clf, "estimators_") else clf

tree.plot_tree(tree_to_plot,
               feature_names = feature_columns,
               class_names=cn,
               filled = True,
               ax = axes);  # draw on the axes created above, not the implicit current axes
fig.savefig('v1_tree.png')

In [None]:
# Save model as a file
from joblib import dump
# Persist whichever estimator is currently bound to ``clf`` to a joblib file
# in the notebook's working directory.
dump(value=clf, filename='ml_model.joblib')

In [None]:
# Load Model from file
from joblib import load
from sklearn.metrics import accuracy_score

# Restore the estimator from the joblib file.
# NOTE(review): joblib.load unpickles the file, so only load files you trust.
mymodel = load(filename='ml_model.joblib')

# A single hand-written sample, with its keys in the same order as the
# feature columns used for training.
tdf = pd.DataFrame([{
    'innings_1_score': 170,
    'RPB': 2,
    'wickets_in_hand': 5,
    'bowler_econ': 9,
    'bowler_boundaries': 7,
    'batsman_sr': 125,
    'boundaries': 3,
}])

# Show the per-class probabilities, the class order they refer to, the
# predicted class, and finally the probability of the second listed class.
pred_prob = mymodel.predict_proba(tdf)
print(pred_prob)
print(mymodel.classes_)
print(mymodel.predict(tdf))
print(pred_prob[0, 1])
#clf.decision_path([{'innings_1_score': 170,'RPB': 2,'wickets_in_hand': 5,'bowler_econ': 9,'bowler_boundaries': 7,'batsman_sr': 125,'boundaries': 3}])

#accuracy_score(test_set["match_result"],test_results)