In [11]:
# Importing libraries
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV

In [12]:
# Helper function
def to_datetime(df):
    """Parse the ``match_dt`` column as ``YYYY-MM-DD`` dates.

    The column is overwritten on the frame that was passed in, and that same
    frame object is returned.
    """
    date_col = 'match_dt'
    parsed_dates = pd.to_datetime(df[date_col], format='%Y-%m-%d')
    df[date_col] = parsed_dates
    return df

def rm_blankspace(df):
    """Replace every space in the column names with an underscore.

    The columns are renamed in place on the frame that was passed in, and
    that same frame object is returned.

    Non-string column labels (for example integer labels) are left unchanged;
    calling ``.replace`` on them would raise ``AttributeError``.
    """
    def _underscore(label):
        # Only text labels can contain spaces; pass anything else through.
        return label.replace(' ', '_') if isinstance(label, str) else label

    df.rename(columns=_underscore, inplace=True)
    return df

def data_preprocessing(df):
    """Run the cleaning steps on ``df``: date parsing, then column-name cleanup.

    Returns whatever ``rm_blankspace`` returns for the date-parsed frame.
    """
    return rm_blankspace(to_datetime(df))

In [13]:
# Importing dataset: read the train and test tables from their CSV files.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('newdata//3//train_data.csv', 'newdata//3//test_data.csv')
)

In [14]:
# Apply the same cleaning pipeline to both tables (train first, then test).
train_data, test_data = map(data_preprocessing, (train_data, test_data))

In [15]:
# Feature columns: everything from positional column 14 onward, except the
# final column.
# NOTE(review): the offset 14 and the dropped last column are assumed to
# separate metadata and target from features; confirm against the CSV schema.
cols = train_data.columns[14:-1].tolist()

In [16]:
# Hold-out split by date: order the rows by match_dt, fit on the first 670 and
# evaluate on everything after them. `train_data` itself is left unsorted.
train_data_test = train_data.sort_values(by='match_dt')
X_train, X_test = train_data_test[cols].iloc[:670], train_data_test[cols].iloc[670:]
y_train, y_test = train_data_test['winner_01'].iloc[:670], train_data_test['winner_01'].iloc[670:]

In [17]:
# XGBOOST
# Hyper-parameter grid explored exhaustively by the search below.
param_xgb = dict(
    n_estimators=[100, 200, 300, 500, 700, 1000],
    max_depth=[3, 5, 7, 9, 11],
    learning_rate=[0.1, 0.05, 0.01, 0.001],
    colsample_bytree=[0.8, 1.0],
)

# 5-fold grid search wrapped around a binary-logistic tree booster.
# NOTE(review): both the estimator and the search request n_jobs=-1; confirm
# the nested parallelism does not oversubscribe the machine.
clf_xgb = GridSearchCV(
    xgb.XGBClassifier(
        objective='binary:logistic',
        booster='gbtree',
        eval_metric='error',
        random_state=0,
        n_jobs=-1,
        verbosity=0,
    ),
    param_xgb,
    cv=5,
    n_jobs=-1,
)
clf_xgb.fit(X_train, y_train)
print(f"Best parameters are {clf_xgb.best_params_}")

# Accuracy of the selected model on the held-out rows.
y_pred = clf_xgb.predict(X_test)
acc = (y_pred == y_test).mean()
print(f'Accuracy: {acc}')

Best parameters are {'colsample_bytree': 0.8, 'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 700}
Accuracy: 0.539568345323741


In [18]:
# Rank features by the tuned XGBoost model's importances (scaled by 100),
# largest first, and show the top ten.
ranked_xgb = sorted(
    zip(cols, clf_xgb.best_estimator_.feature_importances_ * 100),
    key=lambda pair: pair[1],
    reverse=True,
)
feature_importance_xgb = pd.DataFrame(ranked_xgb, columns=['feat_name', 'model_feat_imp_train'])
feature_importance_xgb.head(10)

Unnamed: 0,feat_name,model_feat_imp_train
0,team1TotalWicketsPOMLast15,4.563894
1,team1Num50Last15,4.325505
2,team1CaptainRuns,4.233891
3,rel_strength,4.087769
4,seriesAvgRunsLast15,3.735983
5,team1WinpLight,3.556347
6,team1MaidenPercent,3.542907
7,team1WinpLast5,3.497636
8,team1DotPercentLast15,3.200645
9,team1OnlyAvgRunsLast15,3.147822


In [19]:
# CATBOOST
# Hyper-parameter grid explored exhaustively by the search below.
param_cat = dict(
    depth=[6, 7, 8, 9, 10],
    learning_rate=[0.1, 0.05, 0.01, 0.001],
    l2_leaf_reg=[1, 3, 5, 7, 9],
)

# 5-fold grid search wrapped around a 100-iteration CatBoost classifier.
clf_cat = GridSearchCV(
    cat.CatBoostClassifier(iterations=100, random_state=0, verbose=0),
    param_cat,
    cv=5,
    n_jobs=-1,
)
clf_cat.fit(X_train, y_train)
print(f"Best parameters are: {clf_cat.best_params_}")

# Accuracy of the selected model on the held-out rows.
y_pred = clf_cat.predict(X_test)
acc = (y_pred == y_test).mean()
print(f'Accuracy: {acc}')

Best parameters are: {'depth': 7, 'l2_leaf_reg': 7, 'learning_rate': 0.05}
Accuracy: 0.5827338129496403


In [20]:
# Rank features by the tuned CatBoost model's importances, largest first,
# and show the top ten.
ranked_cat = sorted(
    zip(cols, clf_cat.best_estimator_.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
feature_importance_cat = pd.DataFrame(ranked_cat, columns=['feat_name', 'model_feat_imp_train'])
feature_importance_cat.head(10)

Unnamed: 0,feat_name,model_feat_imp_train
0,team1TotalWicketsPOMLast15,6.626689
1,team1CaptainRuns,4.688425
2,rel_strength,4.643921
3,team1Num50Last15,3.80831
4,team1TotalRunsPOMLast15,3.626334
5,seriesAvgRunsLast15,3.585987
6,team1MaidenPercent,3.545671
7,team1BowlBoundaryPercentLast15,3.416896
8,team1AvgWicketsLost,3.090341
9,team1OnlyAvgRunsLast15,2.8485


In [21]:
# LGBM
# Hyper-parameter grid explored exhaustively by the search below.
param_lgb = dict(
    learning_rate=[0.1, 0.05, 0.01, 0.001],
    max_depth=[5, 7, 9, 11],
    colsample_bytree=[0.8, 1.0],
    min_data_in_leaf=[5, 10, 15, 20, 25, 30, 40, 50, 70, 100],
)

# 5-fold grid search wrapped around a binary LightGBM classifier with
# num_leaves fixed at 16.
# NOTE(review): both the estimator and the search request n_jobs=-1; confirm
# the nested parallelism does not oversubscribe the machine.
clf_lgb = GridSearchCV(
    lgb.LGBMClassifier(
        objective='binary',
        num_leaves=16,
        random_state=0,
        n_jobs=-1,
        verbose=-1,
    ),
    param_lgb,
    cv=5,
    n_jobs=-1,
)
clf_lgb.fit(X_train, y_train)
print(f"Best parameters are: {clf_lgb.best_params_}")

# Accuracy of the selected model on the held-out rows.
y_pred = clf_lgb.predict(X_test)
acc = (y_pred == y_test).mean()
print(f'Accuracy: {acc}')

Best parameters are: {'colsample_bytree': 0.8, 'learning_rate': 0.001, 'max_depth': 5, 'min_data_in_leaf': 40}
Accuracy: 0.5359712230215827


In [22]:
# Rank features by the tuned LightGBM model's importances, largest first.
ranked_lgb = sorted(
    zip(cols, clf_lgb.best_estimator_.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
feature_importance_lgb = pd.DataFrame(ranked_lgb, columns=['feat_name', 'model_feat_imp_train'])

# Rescale the importance column so that it sums to 100, then show the top ten.
one_percent = feature_importance_lgb['model_feat_imp_train'].sum() / 100
feature_importance_lgb['model_feat_imp_train'] = feature_importance_lgb['model_feat_imp_train'] / one_percent
feature_importance_lgb.head(10)

Unnamed: 0,feat_name,model_feat_imp_train
0,team1TotalWicketsPOMLast15,20.996094
1,rel_strength,8.105469
2,team1CaptainRuns,7.714844
3,team1BoundaryLast15,7.128906
4,team1ExtrasPercentLast15,6.640625
5,team1DotPercentLast15,5.566406
6,team1AvgWicketsLost,4.492188
7,inn2AvgRunsLast15,4.003906
8,groundAvgRunsLast15,3.613281
9,lightAvgWicketsLast15,3.613281


In [23]:
# Soft-voting ensemble built from the best estimator of each grid search.
models = dict(
    xgb=clf_xgb.best_estimator_,
    cat=clf_cat.best_estimator_,
    lgb=clf_lgb.best_estimator_,
)

# (name, estimator) pairs in the form VotingClassifier expects; per the
# scikit-learn docs, fit() trains clones of these rather than reusing them.
trained_models = list(models.items())

ensemble = VotingClassifier(estimators=trained_models, voting='soft', verbose=False, n_jobs=-1)
ensemble.fit(X_train, y_train)

# Accuracy of the ensemble on the held-out rows.
y_pred = ensemble.predict(X_test)
acc = (y_pred == y_test).mean()
print(f'Accuracy for ensemble: {acc}')

Accuracy for ensemble: 0.5683453237410072


In [24]:
estimators = ensemble.named_estimators_

# For every feature, gather each fitted member's importance expressed as a
# share of that member's total importance.
shares_by_feature = {}
for est in estimators.values():
    if not hasattr(est, 'feature_importances_'):
        continue
    raw_imp = est.feature_importances_
    norm_imp = raw_imp / sum(raw_imp)
    for feat, share in zip(cols, norm_imp):
        shares_by_feature.setdefault(feat, []).append(share)

# Average the shares across members, rank from largest to smallest and keep
# the ten highest-ranked features.
mean_shares = [(feat, sum(shares) / len(shares)) for feat, shares in shares_by_feature.items()]
mean_shares.sort(key=lambda pair: pair[1], reverse=True)
feature_importance = pd.DataFrame(mean_shares, columns=['feat_name', 'model_feat_imp_train']).head(10)

In [25]:
# Display the ten highest averaged ensemble feature importances.
feature_importance

Unnamed: 0,feat_name,model_feat_imp_train
0,team1TotalWicketsPOMLast15,0.107289
1,rel_strength,0.056124
2,team1CaptainRuns,0.055457
3,team1ExtrasPercentLast15,0.039463
4,team1Num50Last15,0.037855
5,team1BoundaryLast15,0.035976
6,team1DotPercentLast15,0.035937
7,inn2AvgRunsLast15,0.032829
8,seriesAvgRunsLast15,0.032219
9,groundAvgRunsLast15,0.031442
