In [119]:
# Importing libraries
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score 
import matplotlib.pyplot as plt

In [120]:
# Importing datset
train_data = pd.read_csv('newdata//4//train_data.csv')
test_data = pd.read_csv('newdata//4//test_data.csv')

In [126]:
cols = list(test_data.columns[13:])
X_train, X_val, y_train, y_val = train_test_split(train_data[cols],train_data['winner_01'], test_size = 0.2, random_state= 0, stratify = train_data['winner_01']) 

In [127]:
eval_set = [(X_val, y_val)]

In [137]:
clf_xgb = xgb.XGBClassifier()
clf_xgb.fit(X_train, y_train, eval_set=eval_set, verbose=False)
accuracy_score(y_val,clf_xgb.predict(X_val)), accuracy_score(y_train,clf_xgb.predict(X_train))

(0.531578947368421, 1.0)

In [129]:
a = list(zip(cols, clf_xgb.feature_importances_*100))
a.sort(key=lambda x: x[1], reverse=True)
feature_importance_xgb = pd.DataFrame(a, columns=['feat_name', 'model_feat_imp_train'])
feature_importance_xgb

Unnamed: 0,feat_name,model_feat_imp_train
0,rel_strength,5.433463
1,team1Num50Last15,4.146815
2,team1PlayerOfMatchLast15,3.396448
3,team1WinpLast5,3.301453
4,team1TotalWicketsPOMLast15,3.273926
5,seriesAvgRunsLast15,2.736686
6,team1BatBoundaryPercentLast15,2.61532
7,inn2AvgRunsLast15,2.600867
8,lightAvgWicketsLast15,2.544576
9,groundAvgRunsLast15,2.504896


In [12]:
# 6min
clf_xgb = xgb.XGBClassifier(booster = 'gbtree', early_stopping_rounds=50, random_state=0, n_jobs=-1, verbosity=0, eval_metric='error', objective='binary:logistic')
param_xgb = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [3, 5, 7, 9, 11],
    'learning_rate': [0.05, 0.01, 0.005, 0.001],
    'colsample_bytree': [0.5, 0.7, 0.8, 0.9, 1.0]
}
clf_xgb = GridSearchCV(clf_xgb, param_xgb, cv=5, n_jobs=-1)
clf_xgb.fit(X_train, y_train, eval_set=eval_set, verbose=False)
print(f"Best parameters are {clf_xgb.best_params_}")

Best parameters are {'colsample_bytree': 0.5, 'learning_rate': 0.005, 'max_depth': 11, 'n_estimators': 100}


In [13]:
accuracy_score(y_val,clf_xgb.predict(X_val)), accuracy_score(y_train,clf_xgb.predict(X_train))

(0.5263157894736842, 0.9973614775725593)

In [118]:
a = list(zip(cols, clf_xgb.best_estimator_.feature_importances_*100))
a.sort(key=lambda x: x[1], reverse=True)
feature_importance_xgb = pd.DataFrame(a, columns=['feat_name', 'model_feat_imp_train'])
feature_importance_xgb

Unnamed: 0,feat_name,model_feat_imp_train
0,rel_strength,6.14585
1,team1Num50Last15,4.158626
2,team1TotalWicketsPOMLast15,3.749948
3,team1WinpLast5,3.738032
4,team1PlayerOfMatchLast15,3.437165
5,team1BatBoundaryPercentLast15,3.302266
6,seriesAvgRunsLast15,3.082201
7,inn2AvgRunsLast15,3.037734
8,team1MaidenPercent,2.878897
9,team1EconBelow8Last15,2.818065


In [17]:
# 12min
clf_cat = cat.CatBoostClassifier(iterations= 1000, random_state=0, verbose=0, eval_metric = 'Accuracy',
                                 early_stopping_rounds=50, od_type='Iter')
param_cat = {
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.05, 0.01, 0.005, 0.001],
    'l2_leaf_reg': [0.5, 0.7, 0.3]
}

clf_cat = GridSearchCV(clf_cat, param_cat, cv=5, n_jobs=-1)
clf_cat.fit(X_train, y_train, eval_set=eval_set)
print(f"Best parameters are: {clf_cat.best_params_}")

Best parameters are: {'depth': 4, 'l2_leaf_reg': 0.3, 'learning_rate': 0.05}


In [18]:
accuracy_score(y_val,clf_cat.predict(X_val)), accuracy_score(y_train,clf_cat.predict(X_train))

(0.5368421052631579, 0.6055408970976254)

In [25]:
a = list(zip(cols, clf_cat.best_estimator_.feature_importances_))
a.sort(key=lambda x: x[1], reverse=True)
feature_importance_cat = pd.DataFrame(a, columns=['feat_name', 'model_feat_imp_train'])
feature_importance_cat.head(10)

Unnamed: 0,feat_name,model_feat_imp_train
0,team1tossWinnerWins,26.077228
1,team1BattingAvgLast15,24.947548
2,team1BatsFirstWins,20.998284
3,rel_strength,14.665785
4,team1BoundaryLast15,9.754401
5,team1DotPercentLast15,1.840065
6,groundAvgWicketsLast15,1.716689
7,team1Num50Last15,0.0
8,team1WinpLast5,0.0
9,team1OnlyAvgRunsLast15,0.0


In [20]:
# 8min
clf_lgb = lgb.LGBMClassifier(random_state=0, verbose=-1, num_leaves = 16, objective='binary', n_jobs=-1,
                             early_stopping_rounds=50, eval_metric='error', n_estimators=1000)
param_lgb = {
    'learning_rate': [0.05, 0.01, 0.005, 0.001],
    'max_depth':[5, 7, 9, 11],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}

clf_lgb = GridSearchCV(clf_lgb, param_lgb, cv=5, n_jobs=-1)
clf_lgb.fit(X_train, y_train, eval_set=eval_set)
print(f"Best parameters are: {clf_lgb.best_params_}")

Best parameters are: {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 5}


In [21]:
accuracy_score(y_val,clf_lgb.predict(X_val)), accuracy_score(y_train,clf_lgb.predict(X_train))

(0.4842105263157895, 0.7295514511873351)

In [22]:
a = list(zip(cols, clf_lgb.best_estimator_.feature_importances_))
a.sort(key=lambda x: x[1], reverse=True)
feature_importance_lgb = pd.DataFrame(a, columns=['feat_name', 'model_feat_imp_train'])
feature_importance_lgb['model_feat_imp_train'] = feature_importance_lgb['model_feat_imp_train']/(feature_importance_lgb['model_feat_imp_train'].sum()/100)
feature_importance_lgb.head(10)

Unnamed: 0,feat_name,model_feat_imp_train
0,seriesAvgRunsLast15,10.0
1,rel_strength,10.0
2,lightAvgRunsLast15,6.666667
3,lightAvgWicketsLast15,6.666667
4,team1BatsFirstWins,6.666667
5,team1TotalWicketsPOMLast15,6.666667
6,team1CaptainRuns,6.666667
7,team1BatBoundaryPercentLast15,6.666667
8,team1WinpLast5,3.333333
9,seriesAvgWicketsLast15,3.333333


In [23]:
estimators = [('xgb',clf_xgb),('cat',clf_cat),('lgb',clf_lgb)]
for estimator in estimators:
    print(estimator[0]+" :",round(accuracy_score(y_val,estimator[1].predict(X_val)),3), round(accuracy_score(y_train,estimator[1].predict(X_train)),3))

xgb : 0.526 0.997
cat : 0.537 0.606
lgb : 0.484 0.73


In [105]:
class Vote:
    ''' 
    We can pass trained classifiers inside this. Default sklearn Voting Classifier
    doesn't allow to pass prefit classifiers inside. Also, we can't experiment
    by passing different weights in the sklearn tool
    '''
    def __init__(self, estimators, weights = [True]*len(estimators)):
        self.estimators_ = estimators
        self.weights_ = weights

    def predict_proba(self,X):
        ans = np.zeros((len(X),2))
        n = len(self.estimators_)

        denom = 0
        for i in range(n):
            ans += (self.weights_[i]*self.estimators_[i][1].predict_proba(X))
            denom+=self.weights_[i] 

        return ans/denom 

    def predict(self,X):
        self.ans = self.predict_proba(X)
        return np.argmax(self.ans,axis=1)

In [106]:
ensemble = Vote(estimators,weights = [0,0,1])
y_valpred = ensemble.predict(X_val)
accuracy_score(y_val,y_valpred), accuracy_score(y_train,ensemble.predict(X_train))

(1.0, 1.0)

In [107]:
feature_desc = {
    'team1Num50Last15':'Ratio of team1 to team2 number of 50s by players in last 15 games.',
    'team1WinpLast5':'Ratio of team1 to team2 percent  win in last 5 games.',
    'team1OnlyAvgRunsLast15':'Average inning runs of team1 only in last 15 games.',
    'team1WinpLast15F2F':'Team1 win percent against Team2 in last 15 games.',
    'groundAvgRunsLast15':'Average runs scored in the ground in last 15 games.',
    'groundAvgWicketsLast15':'Average wickets lost in the ground in last 15 games.',
    'lightAvgRunsLast15':'Average runs scored in the lighting in last 15 games.',
    'lightAvgWicketsLast15':'Average wickets lost in the lighting in last 15 games.',
    'seriesAvgRunsLast15':'Average runs scored in the series in last 15 games.',
    'seriesAvgWicketsLast15':'Average wickets lost in the series in last 15 games.',
    'inn1AvgRunsLast15':'Average runs scored in inning 1 in last 15 games.',
    'inn2AvgRunsLast15':'Average runs scored in inning 2 in last 15 games.',
    'inn1AvgWicketsLast15':'Average wickets lost in inning 1 in last 15 games.',
    'inn2AvgWicketsLast15':'Average wickets lost in inning 2 in last 15 games.',
    'team1tossWinnerWins': 'Ratio of team1 to team2 percent of toss wins resulting in match wins.',
    'team1BatsFirstWins': 'Ratio of team1 to team2 percent of match wins resulting from batting first.',
    'team1BatsSecondWins': 'Ratio of team1 to team2 percent of match wins resulting from batting second.',
    'team1WinpLight': 'Ratio of team1 to team2 percent win given the lighting condition.',
    'team1WinpSeries': 'Ratio of team1 to team2 percent win given the series name',
    'team1AvgRunsMargin' : 'Ratio of team1 to team2 average of margin of runs won by',
    'team1AvgWicketsMargin' : 'Ratio of team1 to team2 average of margin of wickets won by',
    'team1AvgWicketsLost' :  'Ratio of team1 to team2 avg wickets lost w.r.t total balls faced.',
    'team1AvgRR' : 'Ratio of team1 to team2 average runs rate',
    'team1PlayerOfMatchLast15' : 'Ratio of team1 to team2 total number of player of the match won by players in last 15 games.',
    'team1TotalRunsPOMLast15' : 'Ratio of team1 to team2 total runs by player of the match in last 15 games.',
    'team1TotalWicketsPOMLast15' : 'Ratio of team1 to team2 total wickets by player of the match in last 15 games.',
    'team1SRover120Last15': 'Ratio of team1 to team2 percent player-level strike rate over 120 in last 15 games.',
    'team1CaptainRuns' : 'Ratio of team1 to team2 total runs scored by captains.',
    'team1BoundaryLast15' : 'Ratio of team1 to team2 total boundaries in last 15 games.',
    'team1BatBoundaryPercentLast15' : 'Ratio of team1 to team2 percent of runs scored that are boundaries in last 15 games.',
    'team1EconBelow8Last15' : 'Ratio of team1 to team2 percent player-level economy rate below 8 in last 15 games.',
    'team1DotPercentLast15' : 'Ratio of team1 to team2 percent of balls that are dots in last 15 games.',
    'team1BowlBoundaryPercentLast15' : 'Ratio of team1 to team2 percent of runs conceded that are boundaries in last 15 games.',
    'team1MaidenPercent': 'Ratio of team1 to team2 percent of maidens bowled.',
    'team1ExtrasPercentLast15' : 'Ratio of team1 to team2 percent of extras conceded in last 15 games.',
    'rel_strength' : 'Relative strength of team1 to team2.',
    'metaC_0' : 'Meta feature 0'
}

In [118]:
feature_importance = {}
for _, est in ensemble.estimators_:
    if hasattr(est.best_estimator_, 'feature_importances_'):
        norm_imp = est.best_estimator_.feature_importances_ / sum(est.best_estimator_.feature_importances_)
        for i, imp in zip(cols,norm_imp):
            feature_importance.setdefault(i, []).append(imp)
feature_importance = {k: sum(v) / len(v) for k, v in feature_importance.items()}
feature_importance = list(zip(feature_importance, feature_importance.values()))
feature_importance.sort(key=lambda x: x[1], reverse=True)
feature_importance = pd.DataFrame(feature_importance, columns=['feat_name', 'model_feat_imp_train']).head(10)
feature_importance['feat_description'] = feature_importance['feat_name'].map(feature_desc)
feature_importance['feat_id'] = [i+1 for i in feature_importance.index]
feature_importance['feat_rank_train'] = [i+1 for i in feature_importance.index]
feature_importance

Unnamed: 0,feat_name,model_feat_imp_train,feat_description,feat_id,feat_rank_train
0,seriesAvgRunsLast15,0.053131,Average runs scored in the series in last 15 g...,1,1
1,team1BoundaryLast15,0.046057,Ratio of team1 to team2 total boundaries in la...,2,2
2,inn2AvgRunsLast15,0.036576,Average runs scored in inning 2 in last 15 games.,3,3
3,team1DotPercentLast15,0.034676,Ratio of team1 to team2 percent of balls that ...,4,4
4,team1TotalWicketsPOMLast15,0.03458,Ratio of team1 to team2 total wickets by playe...,5,5
5,team1OnlyAvgRunsLast15,0.033433,Average inning runs of team1 only in last 15 g...,6,6
6,team1SRover120Last15,0.033236,Ratio of team1 to team2 percent player-level s...,7,7
7,lightAvgWicketsLast15,0.032897,Average wickets lost in the lighting in last 1...,8,8
8,team1ExtrasPercentLast15,0.032436,Ratio of team1 to team2 percent of extras conc...,9,9
9,rel_strength,0.032357,Relative strength of team1 to team2.,10,10


In [119]:
X_train

Unnamed: 0,team1Num50Last15,team1WinpLast5,team1OnlyAvgRunsLast15,team1WinpLast15F2F,groundAvgRunsLast15,groundAvgWicketsLast15,lightAvgRunsLast15,lightAvgWicketsLast15,seriesAvgRunsLast15,seriesAvgWicketsLast15,...,team1BattingAvgLast15,team1BoundaryLast15,team1BatBoundaryPercentLast15,team1EconBelow8Last15,team1DotPercentLast15,team1BowlBoundaryPercentLast15,team1MaidenPercent,team1ExtrasPercentLast15,rel_strength,metaC_0
0,1.000000,0.506173,152.400000,66.67,146.500000,4.750000,126.733333,5.633333,146.500000,4.750000,...,0.990150,1.666667,2.012967,0.827928,0.860689,0.962462,1.322238,0.883795,0.426503,0.322537
1,0.650000,0.207921,154.800000,0.00,146.500000,8.250000,161.233333,6.466667,168.100000,6.100000,...,0.988000,0.923077,0.982713,1.048848,0.947892,0.871339,0.993504,0.991732,-0.455738,0.112036
2,3.600000,1.487805,176.066667,0.00,144.933333,5.800000,143.466667,7.166667,145.833333,4.583333,...,3.621225,4.830189,2.001808,0.574945,0.847204,1.270339,0.104620,3.221996,6.857809,0.098293
3,0.909091,1.487805,150.100000,50.00,159.833333,6.750000,149.700000,6.800000,139.266667,6.833333,...,0.817195,0.919118,0.957773,1.152548,0.833770,0.867421,0.025253,1.384209,-0.362717,0.185395
4,0.631579,0.672131,185.066667,60.00,156.400000,6.966667,179.400000,6.766667,178.033333,7.433333,...,0.750259,0.760274,1.046715,1.097804,0.920841,0.927748,1.257079,0.562820,2.146790,0.126186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,1.800000,0.512195,141.666667,0.00,151.250000,7.000000,156.166667,6.833333,0.000000,0.000000,...,1.327609,1.530055,0.973263,0.932057,0.954084,1.001156,0.065268,1.259725,1.953748,0.230136
186,7.000000,81.000000,174.857143,0.00,142.818182,6.727273,147.366667,6.500000,164.214286,6.357143,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.027195
187,0.318182,0.506173,155.866667,0.00,156.928571,6.642857,159.033333,6.000000,0.000000,0.000000,...,0.663923,0.422713,0.629146,1.171394,1.036758,1.008402,0.392600,0.612687,-0.434652,0.985560
188,0.818182,1.327869,164.666667,0.00,189.000000,6.000000,182.133333,6.833333,174.800000,5.900000,...,0.985356,0.995475,1.059332,0.956391,1.020470,1.015711,1.063082,1.243480,-0.240486,0.361643


In [20]:
X_train['y_pred_01'] = ensemble.predict(X_train)
X_test['y_pred_01'] = ensemble.predict(X_test)

X_train['win_pred_score'] = ensemble.predict_proba(X_train)[:,1]
X_test['win_pred_score'] = ensemble.predict_proba(X_test)[:,1]

X_train['win_pred_score'] = np.where( (X_train['y_pred_01']==0), (1-X_train['win_pred_score']), X_train['win_pred_score'])
X_test['win_pred_score'] = np.where( (X_test['y_pred_01']==0), (1-X_test['win_pred_score']), X_test['win_pred_score'])

X_train['win_pred_team_id'] = np.where( (X_train['y_pred_01']==1), (X_train['team1_id']), X_train['team2_id'])
X_test['win_pred_team_id'] = np.where( (X_test['y_pred_01']==1), (X_test['team1_id']), X_test['team2_id'])

In [21]:
def train_hps_depth(ensemble):
    res = []
    for _,i in ensemble.estimators_:
        if hasattr(i, 'max_depth'):
            res.append(f"{i.max_depth}")
        elif i.get_param('depth') is not None:
            res.append(f"{i.get_param('depth')}")
    return ";".join(res)

def train_hps_trees(ensemble):
    res = []
    for _,i in ensemble.estimators_:
        if hasattr(i, 'n_estimators'):
            res.append(f"{i.n_estimators}")
        elif i.get_param('iterations') is not None:
            res.append(f"{i.get_param('iterations')}")
    return ";".join(res)

def train_hps_lr(ensemble):
    res = []
    for _,i in ensemble.estimators_:
        if hasattr(i, 'learning_rate'):
            res.append(f"{i.learning_rate}")
        elif i.get_param('learning_rate') is not None:
            res.append(f"{i.get_param('learning_rate')}")
    return ";".join(res)

def isEnsemble(ensemble):
    if len(ensemble.estimators_) == 1:
        return 'no'
    else:
        return 'yes'

In [22]:
def df_file1(ensemble):
    train_data['dataset_type'] = 'train'
    train_data['train_algorithm'] = ';'.join(dict(ensemble.estimators_).keys())
    train_data['is_ensemble'] = isEnsemble(ensemble)
    train_data['train_hps_trees'] = train_hps_trees(ensemble)
    train_data['train_hps_depth'] = train_hps_depth(ensemble)
    train_data['train_hps_lr'] = train_hps_lr(ensemble)
    train_data['match id'] = train_data['match_id']

    test_data['dataset_type'] = 'r1'
    test_data['train_algorithm'] = ';'.join(dict(ensemble.estimators_).keys())
    test_data['is_ensemble'] = isEnsemble(ensemble)
    test_data['train_hps_trees'] = train_hps_trees(ensemble)
    test_data['train_hps_depth'] = train_hps_depth(ensemble)
    test_data['train_hps_lr'] = train_hps_lr(ensemble)
    test_data['match id'] = test_data['match_id']

    df_file1 = pd.concat([test_data[['match id','dataset_type','win_pred_team_id','win_pred_score','train_algorithm', 'is_ensemble', 'train_hps_trees', 'train_hps_depth', 'train_hps_lr'] + list(feature_importance['feat_name'].head(10))], \
                        train_data[['match id','dataset_type','win_pred_team_id','win_pred_score','train_algorithm', 'is_ensemble', 'train_hps_trees', 'train_hps_depth', 'train_hps_lr'] + list(feature_importance['feat_name'].head(10))]])

    renaming_dict = {}
    for i,col in enumerate(list(feature_importance['feat_name'].head(10))):
        renaming_dict[col] = f'indep_feat_id{i+1}'
    df_file1.rename(columns=renaming_dict, inplace=True)

    for i in range(1,11):
        if f'indep_feat_id{i}' not in df_file1.columns:
            df_file1[f'indep_feat_id{i}'] = np.nan
    return df_file1

In [23]:
df_file1 = df_file1(ensemble)

In [24]:
df_file2 = feature_importance[['feat_id', 'feat_name', 'feat_description', 'model_feat_imp_train','feat_rank_train']]

In [25]:
df_file1.to_csv('sub//4//ensemble//2024_DS_Track_File1_SATOSHI NAKAMOTO.csv', index=False)
df_file2.to_csv('sub//4//ensemble//2024_DS_Track_File2_SATOSHI NAKAMOTO.csv', index=False)