In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
pd.options.mode.chained_assignment = None

In [None]:
# Read the match records from the CSV file and preview the first rows.
# read_csv already returns a DataFrame, so no extra wrapping is needed.
dataset = pd.read_csv('Dataset/matches1234.csv')
dataset.head()

In [None]:
# Per-column flag: True where the column contains at least one missing value.
dataset.isna().any()

In [None]:
# Fill missing 'city' and 'umpire1' entries with fixed default values.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: the in-place form mutates an
# intermediate object and, under pandas copy-on-write, leaves `dataset`
# itself unchanged.
dataset['city'] = dataset['city'].fillna('Dubai')
dataset['umpire1'] = dataset['umpire1'].fillna('S Ravi')

In [None]:
# Re-check which columns still contain missing values after the fill step.
dataset.isna().any()

In [None]:
# Full franchise name -> short code, in the order that fixes the integer ids.
team_abbreviations = {
    'Mumbai Indians': 'MI',
    'Kolkata Knight Riders': 'KKR',
    'Royal Challengers Bangalore': 'RCB',
    'Deccan Chargers': 'DC',
    'Chennai Super Kings': 'CSK',
    'Rajasthan Royals': 'RR',
    'Delhi Daredevils': 'DD',
    'Gujarat Lions': 'GL',
    'Kings XI Punjab': 'KXIP',
    'Sunrisers Hyderabad': 'SRH',
    'Rising Pune Supergiants': 'RPS',
    'Kochi Tuskers Kerala': 'KTK',
    'Pune Warriors': 'PW',
}
# Replace the full names with their short codes everywhere in the frame.
dataset.replace(list(team_abbreviations.keys()),
                list(team_abbreviations.values()),
                inplace=True)

# Short code -> integer id (MI=1 ... PW=13), shared by every team column.
team_ids = {code: number
            for number, code in enumerate(team_abbreviations.values(), start=1)}

# Column-specific mappings; only 'winner' has the extra 'Draw' outcome (14).
encode = {column: dict(team_ids) for column in ('team1', 'team2', 'toss_winner')}
encode['winner'] = {**team_ids, 'Draw': 14}
dataset.replace(encode, inplace=True)

In [None]:
# Preview the first five rows after the team columns were encoded.
dataset.head(5)

In [None]:
# Feature matrix: the columns used as model inputs.
feature_columns = ['season', 'city', 'team1', 'team2', 'toss_winner',
                   'toss_decision', 'venue', 'umpire1']
X = dataset[feature_columns].copy()
# Target kept as a single-column DataFrame (later cells call .values.ravel()).
y = dataset[['winner']].copy()

In [None]:
from sklearn.model_selection import train_test_split
# Both splits hold out 20% and use the same seed.
split_options = dict(test_size=0.2, random_state=0)
# First split: modelling set vs. test set.
X_model, X_test, y_model, y_test = train_test_split(X, y, **split_options)
# Second split: the modelling set is divided into training and validation sets.
X_train, X_val, y_train, y_val = train_test_split(X_model, y_model, **split_options)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Categorical text columns that must become integer codes.
cols = ['city', 'toss_decision', 'venue', 'umpire1']
# One fitted encoder per column, kept so the mapping can be inspected/reused.
encoders = {}
for col in cols:
    # Fit on the complete column so that every category occurring in any
    # split has a code. Fitting on X_train only makes transform() raise
    # ValueError for a city/venue/umpire that appears only in the validation
    # or test rows, and fitting separate encoders for X / X_model / X_train
    # assigns different integers to the same category in each matrix.
    le = LabelEncoder().fit(X[col])
    encoders[col] = le
    for frame in (X, X_model, X_train, X_val, X_test):
        frame[col] = le.transform(frame[col])

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to every other matrix, so that all of them live
# in the same feature space. Fitting independent scalers for X and X_model
# would make them incomparable with X_train / X_test, although later cells
# fit or score models across these matrices.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)
X_test = sc.transform(X_test)
X_model = sc.transform(X_model)
X = sc.transform(X)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings, fitted on the training split.
# The single-column target frame is flattened to a 1-D label array.
classifier = LogisticRegression()
classifier.fit(X_train, np.ravel(y_train))

In [None]:
# Print the score on the training split, then on the test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(classifier.score(features, labels))

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes fitted on the training split (fit returns the estimator).
classifier = GaussianNB().fit(X_train, np.ravel(y_train))
# Training score first, test score second.
train_accuracy = classifier.score(X_train, y_train)
test_accuracy = classifier.score(X_test, y_test)
print(train_accuracy)
print(test_accuracy)

In [None]:
# Print the current classifier's score on the training and test splits.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(classifier.score(features, labels))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier with default settings on the training split.
classifier = KNeighborsClassifier()
classifier.fit(X_train, np.ravel(y_train))

In [None]:
# Training score first, test score second.
train_accuracy = classifier.score(X_train, y_train)
test_accuracy = classifier.score(X_test, y_test)
print(train_accuracy)
print(test_accuracy)

In [None]:
from sklearn.svm import SVC
# Support vector classifier with a linear kernel, fitted on the training split.
classifier = SVC(kernel='linear')
classifier.fit(X_train, np.ravel(y_train))

In [None]:
# Print the score on the training split, then on the test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(classifier.score(features, labels))

In [None]:
from sklearn.svm import SVC
# Support vector classifier with an RBF kernel, fitted on the training split.
classifier = SVC(kernel='rbf')
classifier.fit(X_train, np.ravel(y_train))

In [None]:
# Training score first, test score second.
train_accuracy = classifier.score(X_train, y_train)
test_accuracy = classifier.score(X_test, y_test)
print(train_accuracy)
print(test_accuracy)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default settings as a baseline (fit returns the estimator).
classifier = DecisionTreeClassifier().fit(X_train, np.ravel(y_train))
# Print the score on the training split, then on the test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(classifier.score(features, labels))

In [None]:
# Sweep max_depth for a decision tree (min_samples_leaf fixed at 5) and record
# the training and validation scores for each depth.
depths = np.arange(1, 15)
train_score = []
val_score = []
for depth in depths:
    # A fixed random_state keeps the curve reproducible: without it the tree's
    # internal randomness differs between runs and between depths, adding
    # noise to the comparison. The seed matches the random forest cells.
    classifier = DecisionTreeClassifier(max_depth=depth, min_samples_leaf=5, random_state=1)
    # Pass a flat label array, as in the other fit calls of this notebook.
    classifier.fit(X_train, y_train.values.ravel())
    train_score.append(classifier.score(X_train, y_train))
    val_score.append(classifier.score(X_val, y_val))

plt.plot(depths, train_score)
plt.plot(depths, val_score)
plt.legend(['Training Accuracy','Validation Accuracy'])
plt.title('Decision Tree Tuning')
plt.xlabel('Depth')
plt.ylabel('Accuracy')

In [None]:
# Sweep min_samples_leaf for a decision tree (max_depth fixed at 7) and record
# the training and validation scores for each setting.
leaf_sizes = np.arange(10, 20)
train_score = []
val_score = []
for leaf in leaf_sizes:
    # A fixed random_state keeps the curve reproducible: without it the tree's
    # internal randomness differs between runs and between settings, adding
    # noise to the comparison. The seed matches the random forest cells.
    classifier = DecisionTreeClassifier(max_depth=7, min_samples_leaf=leaf, random_state=1)
    # Pass a flat label array, as in the other fit calls of this notebook.
    classifier.fit(X_train, y_train.values.ravel())
    train_score.append(classifier.score(X_train, y_train))
    val_score.append(classifier.score(X_val, y_val))

plt.plot(leaf_sizes, train_score)
plt.plot(leaf_sizes, val_score)
plt.legend(['Training Accuracy','Validation Accuracy'])
plt.title('Decision Tree Tuning')
plt.xlabel('Minimum Samples Leaf')
plt.ylabel('Accuracy')

In [None]:
# Decision tree with the chosen settings (max_depth=7, min_samples_leaf=14).
# random_state is fixed so the reported scores are reproducible between runs;
# the seed matches the one used by the random forest cells.
classifier = DecisionTreeClassifier(max_depth=7, min_samples_leaf=14, random_state=1)
classifier.fit(X_train, y_train.values.ravel())
# Training score first, test score second.
print(classifier.score(X_train, y_train))
print(classifier.score(X_test, y_test))

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seeded random forest with otherwise default settings (fit returns the estimator).
classifier = RandomForestClassifier(random_state=1).fit(X_train, np.ravel(y_train))
# Training score first, test score second.
train_accuracy = classifier.score(X_train, y_train)
test_accuracy = classifier.score(X_test, y_test)
print(train_accuracy)
print(test_accuracy)

In [None]:
# Sweep the number of trees in a seeded random forest and record the training
# and validation scores for each forest size.
tree_counts = np.arange(1, 50)
train_score, val_score = [], []
for n_estimators in tree_counts:
    classifier = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1, random_state=1)
    classifier.fit(X_train, np.ravel(y_train))
    train_score.append(classifier.score(X_train, y_train))
    val_score.append(classifier.score(X_val, y_val))

# Training curve is drawn first, validation curve second (legend order).
for curve in (train_score, val_score):
    plt.plot(tree_counts, curve)
plt.legend(['Training Accuracy','Validation Accuracy'])
plt.title('Random Forest Tuning')
plt.xlabel('Number of Decision Trees')
plt.ylabel('Accuracy')

In [None]:
# Sweep min_samples_leaf for a seeded random forest and record the training
# and validation scores for each setting.
leaf_sizes = np.arange(1, 40)
train_score, val_score = [], []
for min_samples_leaf in leaf_sizes:
    classifier = RandomForestClassifier(min_samples_leaf=min_samples_leaf, n_jobs=-1, random_state=1)
    classifier.fit(X_train, np.ravel(y_train))
    train_score.append(classifier.score(X_train, y_train))
    val_score.append(classifier.score(X_val, y_val))

# Training curve is drawn first, validation curve second (legend order).
for curve in (train_score, val_score):
    plt.plot(leaf_sizes, curve)
plt.legend(['Training Accuracy','Validation Accuracy'])
plt.title('Random Forest Tuning')
plt.xlabel('Minimum Number of Samples at Leaf Node')
plt.ylabel('Accuracy')

In [None]:
from sklearn.model_selection import GridSearchCV, cross_validate
import math
# Grid search over random forest hyper-parameters.
# Changes compared with a regression-style setup:
#   * scoring is 'accuracy': the target holds nominal team ids, so squared or
#     absolute differences between class labels carry no meaning;
#   * the search runs on the modelling split (X_model / y_model) only, so the
#     held-out test rows do not influence the selected hyper-parameters;
#   * labels are passed as a flat array rather than a column-vector frame;
#   * the forests are seeded so repeated runs select the same parameters.
y_model_flat = y_model.values.ravel()
gsc = GridSearchCV(
        estimator=RandomForestClassifier(n_jobs=-1, random_state=1),
        param_grid={'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                     'min_samples_leaf': [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                     'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
                   }, cv=5, scoring='accuracy', verbose=0, n_jobs=-1)

grid_result = gsc.fit(X_model, y_model_flat)
best_params = grid_result.best_params_
# best_params contains exactly the three keys of param_grid above.
best_rfc = RandomForestClassifier(random_state=1, **best_params)

# 10-fold cross-validated accuracy of the selected configuration.
scoring = {'accuracy': 'accuracy'}

scores = cross_validate(best_rfc, X_model, y_model_flat, cv=10, scoring=scoring, return_train_score=True)
print("CV accuracy :", scores['test_accuracy'].mean(), "| std :", scores['test_accuracy'].std())

In [None]:
print(best_params)
# NOTE(review): the hyper-parameters below are hard-coded rather than taken
# from best_params — presumably copied from an earlier search run; confirm.
best_rfc = RandomForestClassifier(max_depth=None, min_samples_leaf=1, n_estimators=100, random_state=1)
best_rfc.fit(X_train, y_train.values.ravel())
# Score on the split the model was trained on. X_model was label-encoded and
# scaled independently of X_train, so scoring this model on X_model would
# feed it features from a different encoding/scale.
print(best_rfc.score(X_train, y_train))
print(best_rfc.score(X_test, y_test))

In [None]:
from xgboost import XGBClassifier
# XGBoost classifier with default settings.
# Fit on the training split: the following cell scores this model on X_train
# and X_test, which share one encoding/scaling, whereas X_model was encoded
# and scaled independently and is not comparable with them.
# NOTE(review): recent xgboost releases appear to require class labels
# 0..n_classes-1, while the 'winner' codes start at 1 — confirm against the
# installed version.
classifier = XGBClassifier()
classifier.fit(X_train, y_train.values.ravel())

In [None]:
# Print the score on the training split, then on the test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(classifier.score(features, labels))

In [None]:
from sklearn.model_selection import GridSearchCV, cross_validate
import math
# Grid search over max_depth and min_child_weight for XGBoost.
# Changes compared with a regression-style setup:
#   * scoring is 'accuracy': the target holds nominal team ids, so squared or
#     absolute differences between class labels carry no meaning;
#   * the search runs on the modelling split (X_model / y_model) only, so the
#     held-out test rows do not influence the selected hyper-parameters;
#   * labels are passed as a flat array rather than a column-vector frame.
# NOTE(review): recent xgboost releases appear to require class labels
# 0..n_classes-1, while the 'winner' codes start at 1 — confirm against the
# installed version.
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,10,2)
}
y_model_flat = y_model.values.ravel()
gsc = GridSearchCV(
        estimator=XGBClassifier(objective='multi:softprob', n_jobs=-1),
        param_grid=param_test1, cv=5, scoring='accuracy', n_jobs=-1)

grid_result = gsc.fit(X_model, y_model_flat)
best_params = grid_result.best_params_
best_xgb = XGBClassifier(min_child_weight=best_params['min_child_weight'], max_depth=best_params['max_depth'], n_jobs=-1)

# 10-fold cross-validated accuracy of the selected configuration.
scoring = {'accuracy': 'accuracy'}

scores = cross_validate(best_xgb, X_model, y_model_flat, cv=10, scoring=scoring, return_train_score=True)
print("CV accuracy :", scores['test_accuracy'].mean(), "| std :", scores['test_accuracy'].std())

In [None]:
print(best_params)
# Rebuild the XGBoost model from the two tuned hyper-parameters.
xgb_options = {
    'min_child_weight': best_params['min_child_weight'],
    'max_depth': best_params['max_depth'],
    'n_jobs': -1,
}
best_xgb = XGBClassifier(**xgb_options)
best_xgb.fit(X_train, y_train)
# Training score first, test score second.
train_accuracy = best_xgb.score(X_train, y_train)
test_accuracy = best_xgb.score(X_test, y_test)
print(train_accuracy)
print(test_accuracy)