In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Helper functions
def to_datetime(df):
    """Parse the ``match_dt`` column into pandas datetimes.

    The column is overwritten on the frame that was passed in (no copy is
    made), and that same frame object is returned.

    Args:
        df: DataFrame with a ``match_dt`` column formatted as ``%Y-%m-%d``.

    Returns:
        The same DataFrame, with ``match_dt`` converted to datetimes.
    """
    parsed_dates = pd.to_datetime(df['match_dt'], format='%Y-%m-%d')
    df['match_dt'] = parsed_dates
    return df

def rm_blankspace(df):
    """Replace every space in the column names with an underscore.

    The rename is applied in place on the frame that was passed in, and
    that same frame object is returned.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        The same DataFrame, with spaces in column names replaced by ``_``.
    """
    def _underscored(label):
        # Each individual space becomes one underscore (runs are not collapsed).
        return label.replace(' ', '_')

    df.rename(columns=_underscored, inplace=True)
    return df

def data_preprocessing(df):
    """Run the preprocessing steps on a raw match DataFrame.

    First converts ``match_dt`` via ``to_datetime``, then normalises the
    column names via ``rm_blankspace``.

    Args:
        df: Raw DataFrame containing a ``match_dt`` column.

    Returns:
        The DataFrame returned by the last preprocessing step.
    """
    return rm_blankspace(to_datetime(df))

In [3]:
# Importing dataset: read the train and test CSV files (train first, then test)
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('newdata//2//train_data.csv', 'newdata//2//test_data.csv')
)

In [4]:
# Apply the same preprocessing (date parsing + column-name clean-up) to both frames
train_data, test_data = map(data_preprocessing, (train_data, test_data))

In [5]:
# Feature columns: everything from position 14 up to, but excluding, the last column.
# NOTE(review): presumably the first 14 columns are identifiers/metadata and the
# last column is not a feature — confirm against the CSV schema.
cols = list(train_data.columns[14:-1])

In [6]:
# Chronological hold-out split: order matches by date, fit on the first 670 rows
# and evaluate on every row after that.
train_data_test = train_data.sort_values(by='match_dt')  # returns a new, sorted frame
X_train = train_data_test[cols].iloc[:670]
y_train = train_data_test['winner_01'].iloc[:670]
X_test = train_data_test[cols].iloc[670:]
y_test = train_data_test['winner_01'].iloc[670:]

In [7]:
# XGBOOST: exhaustive grid search with 5-fold CV on the training rows,
# then accuracy of the refitted best model on the held-out rows.
param_xgb = {
    'n_estimators': [100, 200, 300, 500, 700, 1000],
    'max_depth': [3, 5, 7, 9, 11],
    'learning_rate': [0.1, 0.05, 0.01, 0.001],
    'colsample_bytree': [0.8, 1.0],
}
clf_xgb = GridSearchCV(
    xgb.XGBClassifier(
        booster='gbtree',
        objective='binary:logistic',
        eval_metric='error',
        random_state=0,
        n_jobs=-1,
        verbosity=0,
    ),
    param_xgb,
    cv=5,
    n_jobs=-1,
)
clf_xgb.fit(X_train, y_train)
print(f"Best parameters are {clf_xgb.best_params_}")

# Hold-out accuracy: fraction of predictions that equal the true label
y_pred = clf_xgb.predict(X_test)
acc = (y_pred == y_test).mean()
print(f'Accuracy: {acc}')

Best parameters are {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100}
Accuracy: 0.5611510791366906


In [28]:
# Rank the tuned XGBoost model's feature importances (multiplied by 100),
# highest first, and show the ten largest.
xgb_importances = clf_xgb.best_estimator_.feature_importances_ * 100
xgb_ranked = sorted(zip(cols, xgb_importances), key=lambda pair: pair[1], reverse=True)
feature_importance_xgb = pd.DataFrame(xgb_ranked, columns=['feat_name', 'model_feat_imp_train'])
feature_importance_xgb.head(10)

Unnamed: 0,feat_name,model_feat_imp_train
0,team1Num50Last15,3.780222
1,team1TotalWicketsPOMLast15,3.736359
2,inn2AvgRunsLast15,3.453582
3,team1BoundaryLast15,3.387228
4,seriesAvgWicketsLast15,3.357276
5,team1DotPercentLast15,3.227316
6,groundAvgRunsLast15,3.145972
7,inn2AvgWicketsLast15,3.035065
8,seriesAvgRunsLast15,3.005761
9,team1WinpLast5,2.965754


In [9]:
# # CATBOOST
# clf_cat = cat.CatBoostClassifier(iterations= 100, random_state=0, verbose=0)
# param_cat = {'learning_rate':[0.01, 0.05, 0.1, 0.2],
#             'depth':[3,5,7,9],
#             'subsample':[0.05, 0.1, 0.2, 0.5],
#             'min_data_in_leaf':[10,40,70,100]}

# clf_cat = GridSearchCV(clf_cat, param_cat, cv=5, n_jobs=-1)
# clf_cat.fit(X_train, y_train)
# print(f"Best parameters are: {clf_cat.best_params_}")
# y_pred = clf_cat.predict(X_test)
# acc = np.mean(y_pred == y_test)
# print(f'Accuracy: {acc}')

In [10]:
# clf_cat.best_estimator_.get_feature_importance(prettified=True)

In [11]:
# CATBOOST: exhaustive grid search with 5-fold CV on the training rows,
# then accuracy of the refitted best model on the held-out rows.
param_cat = {
    'depth': [6, 7, 8, 9, 10],
    'learning_rate': [0.1, 0.05, 0.01, 0.001],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
}

clf_cat = GridSearchCV(
    cat.CatBoostClassifier(iterations=100, random_state=0, verbose=0),
    param_cat,
    cv=5,
    n_jobs=-1,
)
clf_cat.fit(X_train, y_train)
print(f"Best parameters are: {clf_cat.best_params_}")

# Hold-out accuracy: fraction of predictions that equal the true label
y_pred = clf_cat.predict(X_test)
acc = (y_pred == y_test).mean()
print(f'Accuracy: {acc}')

Best parameters are: {'depth': 10, 'l2_leaf_reg': 9, 'learning_rate': 0.05}
Accuracy: 0.6115107913669064


In [29]:
# Rank the tuned CatBoost model's feature importances, highest first,
# and show the ten largest.
cat_importances = clf_cat.best_estimator_.feature_importances_
cat_ranked = sorted(zip(cols, cat_importances), key=lambda pair: pair[1], reverse=True)
feature_importance_cat = pd.DataFrame(cat_ranked, columns=['feat_name', 'model_feat_imp_train'])
feature_importance_cat.head(10)

Unnamed: 0,feat_name,model_feat_imp_train
0,team1MaidenPercent,5.181075
1,team1TotalWicketsPOMLast15,4.686983
2,team1AvgRunsMargin,4.546686
3,team1BowlBoundaryPercentLast15,4.352268
4,team1CaptainRuns,4.213771
5,groundAvgRunsLast15,3.803622
6,team1WinpLast5,3.761273
7,team1Num50Last15,3.594079
8,inn2AvgRunsLast15,3.487041
9,lightAvgRunsLast15,3.418773


In [26]:
# LGBM: exhaustive grid search with 5-fold CV on the training rows,
# then accuracy of the refitted best model on the held-out rows.
param_lgb = {
    'learning_rate': [0.1, 0.05, 0.01, 0.001],
    'max_depth': [5, 7, 9, 11],
    'colsample_bytree': [0.8, 1.0],
    'min_data_in_leaf': [5, 10, 15, 20, 25, 30, 40, 50, 70, 100],
}

clf_lgb = GridSearchCV(
    lgb.LGBMClassifier(
        objective='binary',
        num_leaves=16,
        random_state=0,
        n_jobs=-1,
        verbose=-1,
    ),
    param_lgb,
    cv=5,
    n_jobs=-1,
)
clf_lgb.fit(X_train, y_train)
print(f"Best parameters are: {clf_lgb.best_params_}")

# Hold-out accuracy: fraction of predictions that equal the true label
y_pred = clf_lgb.predict(X_test)
acc = (y_pred == y_test).mean()
print(f'Accuracy: {acc}')

Best parameters are: {'colsample_bytree': 0.8, 'learning_rate': 0.001, 'max_depth': 9, 'min_data_in_leaf': 40}
Accuracy: 0.5323741007194245


In [34]:
# Rank the tuned LightGBM model's feature importances, highest first,
# rescale them so the column sums to 100, and show the ten largest.
lgb_importances = clf_lgb.best_estimator_.feature_importances_
lgb_ranked = sorted(zip(cols, lgb_importances), key=lambda pair: pair[1], reverse=True)
feature_importance_lgb = pd.DataFrame(lgb_ranked, columns=['feat_name', 'model_feat_imp_train'])

# Divide by (total / 100) to express each importance as a percentage of the total
raw_imp = feature_importance_lgb['model_feat_imp_train']
feature_importance_lgb['model_feat_imp_train'] = raw_imp / (raw_imp.sum() / 100)
feature_importance_lgb.head(10)

Unnamed: 0,feat_name,model_feat_imp_train
0,team1Num50Last15,9.86733
1,team1TotalWicketsPOMLast15,9.286899
2,team1CaptainRuns,7.131012
3,groundAvgRunsLast15,6.882255
4,team1MaidenPercent,4.892206
5,seriesAvgWicketsLast15,4.643449
6,team1AvgRR,4.643449
7,inn1AvgRunsLast15,4.228856
8,inn2AvgWicketsLast15,4.063018
9,seriesAvgRunsLast15,3.814262


In [44]:
# Soft-voting ensemble over the tuned models (LightGBM is currently left out)
models = {
    'xgb': clf_xgb.best_estimator_,
    'cat': clf_cat.best_estimator_,
    # 'lgb': clf_lgb.best_estimator_
}

# VotingClassifier takes its members as a list of (name, estimator) pairs
trained_models = list(models.items())

ensemble = VotingClassifier(
    estimators=trained_models,
    voting='soft',
    n_jobs=-1,
    verbose=False,
)
ensemble.fit(X_train, y_train)

# Hold-out accuracy: fraction of predictions that equal the true label
y_pred = ensemble.predict(X_test)
acc = (y_pred == y_test).mean()
print(f'Accuracy for ensemble: {acc}')

Accuracy for ensemble: 0.579136690647482


In [45]:
# Average the normalised feature importances over the fitted ensemble members
# and keep the ten features with the highest mean importance.
estimators = ensemble.named_estimators_

importance_by_feature = {}
for member in estimators.values():
    # Members that expose no importances are simply left out of the average
    if not hasattr(member, 'feature_importances_'):
        continue
    member_imp = member.feature_importances_
    member_share = member_imp / sum(member_imp)  # each model's importances sum to 1
    # NOTE(review): pairs importances with `cols` by position — assumes the
    # models were fitted on columns in exactly this order.
    for feat_name, share in zip(cols, member_share):
        importance_by_feature.setdefault(feat_name, []).append(share)

mean_importance = [
    (feat_name, sum(shares) / len(shares))
    for feat_name, shares in importance_by_feature.items()
]
mean_importance.sort(key=lambda pair: pair[1], reverse=True)
feature_importance = pd.DataFrame(
    mean_importance, columns=['feat_name', 'model_feat_imp_train']
).head(10)

In [46]:
def isEnsemble(ensemble):
    """Tell whether a fitted voting model holds anything other than one estimator.

    Args:
        ensemble: Object exposing a sized ``estimators_`` attribute.

    Returns:
        ``'no'`` when ``estimators_`` has exactly one element, ``'yes'``
        for any other length.
    """
    return 'no' if len(ensemble.estimators_) == 1 else 'yes'

In [47]:
# 'no' if `ensemble` holds exactly one fitted estimator, otherwise 'yes'
isEnsemble(ensemble)

'yes'

In [39]:
# Display the averaged feature-importance table (already truncated to 10 rows).
# NOTE(review): this cell's execution count ([39]) is lower than that of the
# cell that builds `feature_importance` ([45]), so the displayed output may be
# stale — re-run after Restart & Run All to confirm.
feature_importance

Unnamed: 0,feat_name,model_feat_imp_train
0,team1MaidenPercent,0.051811
1,team1TotalWicketsPOMLast15,0.04687
2,team1AvgRunsMargin,0.045467
3,team1BowlBoundaryPercentLast15,0.043523
4,team1CaptainRuns,0.042138
5,groundAvgRunsLast15,0.038036
6,team1WinpLast5,0.037613
7,team1Num50Last15,0.035941
8,inn2AvgRunsLast15,0.03487
9,lightAvgRunsLast15,0.034188
