In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Helper function
def to_datetime(df):
    df['match_dt'] = pd.to_datetime(df['match_dt'], format='%Y-%m-%d')
    return df

def rm_blankspace(df):
    df.rename(columns=lambda x: x.replace(' ', '_'), inplace=True)
    return df

def data_preprocessing(df):
    df = to_datetime(df)
    df = rm_blankspace(df)
    return df

In [3]:
# Importing datset
train_data = pd.read_csv('newdata//1//train_data.csv',)

test_data = pd.read_csv('newdata//1//test_data.csv')

In [4]:
train_data = data_preprocessing(train_data)
test_data = data_preprocessing(test_data)

In [6]:
cols = ['team1Num50Last15',
       'team1WinpLast5', 'team1OnlyAvgRunsLast15', 'team1WinpLast15F2F',
       'groundAvgRunsLast15', 'groundAvgWicketsLast15', 'lightAvgRunsLast15',
       'lightAvgWicketsLast15', 'seriesAvgRunsLast15',
       'seriesAvgWicketsLast15', 'inn1AvgRunsLast15', 'inn2AvgRunsLast15',
       'inn1AvgWicketsLast15', 'inn2AvgWicketsLast15', 'team1tossWinnerWins',
       'team1BatsFirstWins', 'team1BatsSecondWins', 'team1WinpLight',
       'team1WinpSeries', 'team1AvgRunsMargin', 'team1AvgWicketsMargin',
       'team1AvgWicketsLost', 'team1AvgRR']

In [7]:
train_data_test = train_data.copy()
train_data_test.sort_values(by='match_dt', inplace=True)
X_train, y_train, X_test, y_test = train_data_test[cols][:670], train_data_test['winner_01'][:670], train_data_test[cols][670:], train_data_test['winner_01'][670:]

In [8]:
# XGBOOST
clf_xgb = xgb.XGBClassifier(booster = 'gbtree', random_state=0, njobs=-1, verbosity=0, eval_metric='error', objective='binary:logistic')
param_xgb = {'n_estimators':[50,100,200,500],
            'learning_rate':[0.01, 0.05, 0.1, 0.2],
            'max_depth':[3,5,7,9]}

gs_xgb = GridSearchCV(clf_xgb, param_xgb, cv=5, n_jobs=-1)
gs_xgb.fit(X_train, y_train)
print(f"Best parameters are: {gs_xgb.best_params_}")
y_pred = gs_xgb.predict(X_test)
acc = np.mean(y_pred == y_test)
print(f'Accuracy: {acc}')

Best parameters are: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
Accuracy: 0.5719424460431655


In [9]:
a = list(zip(cols, gs_xgb.best_estimator_.feature_importances_*1000))
a.sort(key=lambda x: x[1], reverse=True)
feature_importance_xgb = pd.DataFrame(a, columns=['feat_name', 'model_feat_imp_train'])
feature_importance_xgb

Unnamed: 0,feat_name,model_feat_imp_train
0,team1Num50Last15,80.80397
1,team1WinpLast5,59.155521
2,inn1AvgWicketsLast15,57.640949
3,team1AvgWicketsLost,57.5107
4,team1WinpLight,55.77634
5,team1BatsFirstWins,50.797565
6,inn1AvgRunsLast15,48.862026
7,lightAvgRunsLast15,48.014484
8,groundAvgRunsLast15,47.978951
9,team1AvgRunsMargin,47.050209


In [10]:
# # CATBOOST
# clf_cat = cat.CatBoostClassifier(iterations= 100, random_state=0, verbose=0)
# param_cat = {'learning_rate':[0.01, 0.05, 0.1, 0.2],
#             'depth':[3,5,7,9],
#             'subsample':[0.05, 0.1, 0.2, 0.5],
#             'min_data_in_leaf':[10,40,70,100]}

# clf_cat = GridSearchCV(clf_cat, param_cat, cv=5, n_jobs=-1)
# clf_cat.fit(X_train, y_train)
# print(f"Best parameters are: {clf_cat.best_params_}")
# y_pred = clf_cat.predict(X_test)
# acc = np.mean(y_pred == y_test)
# print(f'Accuracy: {acc}')

In [11]:
# clf_cat.best_estimator_.get_feature_importance(prettified=True)

In [12]:
# CATBOOST
clf_cat = cat.CatBoostClassifier(iterations= 100, random_state=0, verbose=0)
clf_cat.fit(X_train, y_train)
y_pred = clf_cat.predict(X_test)
acc = np.mean(y_pred == y_test)
print(f'Accuracy: {acc}')

Accuracy: 0.5719424460431655


In [13]:
a = list(zip(cols, clf_cat.feature_importances_*10))
a.sort(key=lambda x: x[1], reverse=True)
feature_importance_cat = pd.DataFrame(a, columns=['feat_name', 'model_feat_imp_train'])
feature_importance_cat

Unnamed: 0,feat_name,model_feat_imp_train
0,team1WinpLast5,65.451934
1,team1OnlyAvgRunsLast15,63.1011
2,seriesAvgRunsLast15,60.204633
3,lightAvgRunsLast15,53.583463
4,team1Num50Last15,52.968947
5,groundAvgRunsLast15,51.331304
6,team1AvgRR,50.682441
7,team1tossWinnerWins,46.826899
8,team1AvgWicketsMargin,44.089968
9,team1AvgRunsMargin,44.001332


In [14]:
# # GB
# clf_gb = GradientBoostingClassifier(random_state=0)
# param_gb = {'learning_rate':[0.01, 0.05, 0.1, 0.2],
#             'max_depth':[3,5,7,9],
#             'n_estimators':[50,100,200,500],
#             'subsample':[0.05, 0.1, 0.2, 0.5],
#             'min_samples_split':[2,5,10,15]}

# clf_gb = GridSearchCV(clf_gb, param_gb, cv=5, n_jobs=-1)
# clf_gb.fit(X_train, y_train)
# print(f"Best parameters are: {clf_gb.best_params_}")
# y_pred = clf_gb.predict(X_test)
# acc = np.mean(y_pred == y_test)
# print(f'Accuracy: {acc}')

In [15]:
# feature_importances = clf_gb.best_estimator_.feature_importances_
# a = list(zip(cols, feature_importances))
# a.sort(key=lambda x: x[1], reverse=True)
# a

In [16]:
# GB
clf_gb = GradientBoostingClassifier(random_state=0)

clf_gb.fit(X_train, y_train)
y_pred = clf_gb.predict(X_test)
acc = np.mean(y_pred == y_test)
print(f'Accuracy: {acc}')

Accuracy: 0.5503597122302158


In [17]:
a = list(zip(cols, clf_gb.feature_importances_*1000))
a.sort(key=lambda x: x[1], reverse=True)
feature_importance_gb = pd.DataFrame(a, columns=['feat_name', 'model_feat_imp_train'])
feature_importance_gb

Unnamed: 0,feat_name,model_feat_imp_train
0,team1Num50Last15,77.298849
1,team1AvgRR,73.666108
2,team1AvgWicketsLost,66.109236
3,team1WinpLast5,57.483363
4,team1BatsFirstWins,55.017509
5,team1WinpLight,54.69448
6,team1WinpSeries,48.787388
7,team1AvgRunsMargin,48.617492
8,lightAvgRunsLast15,48.550623
9,seriesAvgRunsLast15,46.423331


In [18]:
# LGB
clf_lgb = lgb.LGBMClassifier(random_state=0, verbose=-1, num_leaves = 5)
param_lgb = {'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'min_data_in_leaf':[10,40,70,100],
            'max_depth':[3,5,7,9],}

clf_lgb = GridSearchCV(clf_lgb, param_lgb, cv=5, n_jobs=-1)
clf_lgb.fit(X_train, y_train)
print(f"Best parameters are: {clf_lgb.best_params_}")
y_pred = clf_lgb.predict(X_test)
acc = np.mean(y_pred == y_test)
print(f'Accuracy: {acc}')

Best parameters are: {'learning_rate': 0.01, 'max_depth': 5, 'min_data_in_leaf': 70}
Accuracy: 0.5539568345323741


In [19]:
a = list(zip(cols, clf_lgb.best_estimator_.feature_importances_))
a.sort(key=lambda x: x[1], reverse=True)
feature_importance_lgb = pd.DataFrame(a, columns=['feat_name', 'model_feat_imp_train'])
feature_importance_lgb

Unnamed: 0,feat_name,model_feat_imp_train
0,team1Num50Last15,71
1,lightAvgRunsLast15,44
2,seriesAvgRunsLast15,42
3,groundAvgRunsLast15,40
4,team1WinpLight,27
5,team1WinpLast5,21
6,inn2AvgRunsLast15,20
7,seriesAvgWicketsLast15,19
8,team1BatsFirstWins,18
9,team1tossWinnerWins,17


In [20]:
# # Testing the model
# clf_xgb = gs_xgb.best_estimator_
# clf_cat = clf_cat.best_estimator_
# clf_gb = clf_gb.best_estimator_
# clf_lgbm = clf_lgb.best_estimator_

# models = {
#     'xgb': clf_xgb,
#     'cat': clf_cat,
#     'gb' : clf_gb
#     'lgb': clf_lgbm
# }

# trained_models = [(name, model) for name, model in models.items()]

# ensemble = VotingClassifier(estimators=trained_models, voting='soft', verbose=False, n_jobs=-1)
# ensemble.fit(X_train, y_train)
# y_pred = ensemble.predict(X_test)

# acc = np.mean(y_pred == y_test)
# print(f'Accuracy for ensemble: {acc}')

In [21]:
# feature_importance = np.mean(np.array([clf.feature_importances_ for clf in ensemble.estimators_]), axis=0)
# a = list(zip(cols, feature_importance))
# a.sort(key=lambda x: x[1], reverse=True)
# feature_importance = pd.DataFrame(a, columns=['feat_name', 'model_feat_imp_train'])
# feature_importance

In [22]:
# Testing the model
clf_xgb = gs_xgb.best_estimator_
clf_lgb = clf_lgb.best_estimator_

models = {
    'xgb': clf_xgb,
    'cat': clf_cat,
    'gb' : clf_gb,
    'lgb': clf_lgb
}

trained_models = [(name, model) for name, model in models.items()]

ensemble = VotingClassifier(estimators=trained_models, voting='soft', verbose=False, n_jobs=-1)
ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)

acc = np.mean(y_pred == y_test)
print(f'Accuracy for ensemble: {acc}')

Accuracy for ensemble: 0.5503597122302158


In [23]:
feature_importance_en = []
for i in feature_importance_lgb['feat_name'].values:
    imp = []
    for df in [feature_importance_xgb, feature_importance_cat, feature_importance_gb, feature_importance_lgb]:
        imp.append(df[df['feat_name'] == i].values[0][1])
    feature_importance_en.append((i,np.mean(imp)))
feature_importance_en.sort(key=lambda x: x[1], reverse=True)
feature_importance_en = pd.DataFrame(feature_importance_en, columns=['feat_name', 'model_feat_imp_train'])
feature_importance_en

Unnamed: 0,feat_name,model_feat_imp_train
0,team1Num50Last15,70.517941
1,team1WinpLast5,50.772704
2,lightAvgRunsLast15,48.537142
3,seriesAvgRunsLast15,48.1577
4,team1WinpLight,44.281723
5,groundAvgRunsLast15,42.136118
6,team1BatsFirstWins,40.920529
7,team1AvgWicketsLost,40.358804
8,team1OnlyAvgRunsLast15,39.405574
9,team1AvgRR,39.311286


In [24]:
# Testing with base models
clf_xgb = xgb.XGBClassifier(booster = 'gbtree', random_state=0, njobs=-1, verbosity=0, eval_metric='error', objective='binary:logistic')
clf_cat = cat.CatBoostClassifier(iterations= 100, random_state=0, verbose=0)
clf_gb = GradientBoostingClassifier(random_state=0)
clf_lgb = lgb.LGBMClassifier(random_state=0, verbose=-1, num_leaves = 5, max_depth = 5)

models = {
    'xgb': clf_xgb,
    'cat': clf_cat,
    'gb' : clf_gb,
    'lgb': clf_lgb
}

trained_models = [(name, model) for name, model in models.items()]

ensemble = VotingClassifier(estimators=trained_models, voting='soft', verbose=False, n_jobs=-1)
ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)

acc = np.mean(y_pred == y_test)
print(f'Accuracy for ensemble: {acc}')

Accuracy for ensemble: 0.5611510791366906


In [25]:
feature_importance_en = []
for i in feature_importance_lgb['feat_name'].values:
    imp = []
    for df in [feature_importance_xgb, feature_importance_cat, feature_importance_gb, feature_importance_lgb]:
        imp.append(df[df['feat_name'] == i].values[0][1])
    feature_importance_en.append((i,np.mean(imp)))
feature_importance_en.sort(key=lambda x: x[1], reverse=True)
feature_importance_en = pd.DataFrame(feature_importance_en, columns=['feat_name', 'model_feat_imp_train'])
feature_importance_en

Unnamed: 0,feat_name,model_feat_imp_train
0,team1Num50Last15,70.517941
1,team1WinpLast5,50.772704
2,lightAvgRunsLast15,48.537142
3,seriesAvgRunsLast15,48.1577
4,team1WinpLight,44.281723
5,groundAvgRunsLast15,42.136118
6,team1BatsFirstWins,40.920529
7,team1AvgWicketsLost,40.358804
8,team1OnlyAvgRunsLast15,39.405574
9,team1AvgRR,39.311286
