In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
import pickle

In [2]:
# Helper function
def to_datetime(df):
    """Parse the ``match_dt`` column into pandas datetimes, in place.

    The column is read with the fixed format ``%Y-%m-%d`` and written back onto
    the same frame, so the caller's DataFrame is modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``match_dt`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so calls can be chained.
    """
    parsed_dates = pd.to_datetime(df['match_dt'], format='%Y-%m-%d')
    df['match_dt'] = parsed_dates
    return df

def rm_blankspace(df):
    """Replace every space in the column labels with an underscore, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with its columns renamed.
    """
    underscored = {label: label.replace(' ', '_') for label in df.columns}
    df.rename(columns=underscored, inplace=True)
    return df

def data_preprocessing(df):
    """Run the notebook's two clean-up steps on ``df`` and return it.

    First ``to_datetime`` parses ``match_dt``, then ``rm_blankspace`` replaces
    spaces in the column labels. Both helpers modify the frame they are given,
    so the caller's DataFrame is changed as well.
    """
    return rm_blankspace(to_datetime(df))

In [3]:
# Importing dataset
# Both paths are relative, so they resolve against the notebook's current
# working directory.
train_data = pd.read_csv('newdata//3//train_data.csv')

test_data = pd.read_csv('newdata//3//test_data.csv')

In [4]:
# Apply the same clean-up (date parsing + column renaming) to both splits.
train_data, test_data = map(data_preprocessing, (train_data, test_data))

In [5]:
# Model inputs: every column from position 14 up to, but excluding, the last one.
# NOTE(review): the selection is purely positional — presumably the first 14
# columns are identifiers/metadata and the last one is the target; confirm
# against the CSV layout before changing the data files.
cols = list(train_data.columns[14:-1])

In [6]:
# Feature matrices use the same `cols` for both splits; the target is the
# `winner_01` column of the training frame.
X_train = train_data[cols]
y_train = train_data['winner_01']
X_test = test_data[cols]

In [7]:
# Exhaustive 5-fold grid search for the XGBoost classifier.
# Author's timing note: 45min (presumably the wall-clock time of this cell).
# The grid has 6 * 5 * 4 * 5 = 600 combinations, i.e. 3000 fits with cv=5.
# NOTE(review): both the estimator and GridSearchCV ask for n_jobs=-1; nested
# parallelism like this may oversubscribe the CPU — confirm it is intended.
clf_xgb = xgb.XGBClassifier(booster = 'gbtree', random_state=0, n_jobs=-1, verbosity=0, eval_metric='error', objective='binary:logistic')
param_xgb = {
    'n_estimators': [100, 200, 300, 500, 700, 1000],
    'max_depth': [3, 5, 7, 9, 11],
    'learning_rate': [0.1, 0.05, 0.01, 0.001],
    'colsample_bytree': [0.5, 0.7, 0.8, 0.9, 1.0]
}
# `clf_xgb` is rebound here from the bare classifier to the search wrapper;
# later cells read `clf_xgb.best_estimator_`.
# No `scoring` is passed, so candidates are ranked by the estimator's own
# `score` method (accuracy for scikit-learn-style classifiers — confirm).
clf_xgb = GridSearchCV(clf_xgb, param_xgb, cv=5, n_jobs=-1)
clf_xgb.fit(X_train, y_train)
print(f"Best parameters are {clf_xgb.best_params_}")

Best parameters are {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 200}


In [8]:
# Persist the fitted search object (the whole GridSearchCV wrapper, not only
# its best_estimator_). open() does not create missing directories, so
# models//3// must already exist.
with open('models//3//clf_xgb.pkl', 'wb') as file:
    pickle.dump(clf_xgb, file)

In [9]:
# Exhaustive 5-fold grid search for the CatBoost classifier.
# Author's timing note: 32min (presumably the wall-clock time of this cell).
# The grid has 5 * 4 * 5 = 100 combinations, i.e. 500 fits with cv=5.
# `iterations` is fixed at 100 and is not part of the grid.
# NOTE(review): the best learning_rate recorded in this cell's output (0.001)
# is the smallest value in the grid while iterations stays at 100 — worth
# confirming the selected model is not under-trained.
clf_cat = cat.CatBoostClassifier(iterations= 100, random_state=0, verbose=0, eval_metric = 'Accuracy')
param_cat = {
    'depth': [6, 7, 8, 9, 10],
    'learning_rate': [0.1, 0.05, 0.01, 0.001],
    'l2_leaf_reg': [1, 3, 5, 7, 9]
}

# `clf_cat` is rebound from the bare classifier to the search wrapper; later
# cells read `clf_cat.best_estimator_`.
clf_cat = GridSearchCV(clf_cat, param_cat, cv=5, n_jobs=-1)
clf_cat.fit(X_train, y_train)
print(f"Best parameters are: {clf_cat.best_params_}")

Best parameters are: {'depth': 8, 'l2_leaf_reg': 5, 'learning_rate': 0.001}


In [10]:
# Persist the fitted CatBoost search object (the whole GridSearchCV wrapper).
# The models//3// directory must already exist.
with open('models//3//clf_cat.pkl', 'wb') as file:
    pickle.dump(clf_cat, file)

In [11]:
# Exhaustive 5-fold grid search for the LightGBM classifier.
# Author's timing note: 6min (presumably the wall-clock time of this cell).
# The grid has 4 * 4 * 4 * 10 = 640 combinations, i.e. 3200 fits with cv=5.
# `num_leaves` is fixed at 16 and is not part of the grid.
# NOTE(review): both the estimator and GridSearchCV ask for n_jobs=-1; nested
# parallelism like this may oversubscribe the CPU — confirm it is intended.
clf_lgb = lgb.LGBMClassifier(random_state=0, verbose=-1, num_leaves = 16, objective='binary', n_jobs=-1)
param_lgb = {
    'learning_rate': [0.1, 0.05, 0.01, 0.001],
    'max_depth':[5, 7, 9, 11],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'min_data_in_leaf': [5, 10, 15, 20, 25, 30, 40, 50, 70, 100]
}

# `clf_lgb` is rebound from the bare classifier to the search wrapper; later
# cells read `clf_lgb.best_estimator_`.
clf_lgb = GridSearchCV(clf_lgb, param_lgb, cv=5, n_jobs=-1)
clf_lgb.fit(X_train, y_train)
print(f"Best parameters are: {clf_lgb.best_params_}")

Best parameters are: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 7, 'min_data_in_leaf': 70}


In [12]:
# Persist the fitted LightGBM search object (the whole GridSearchCV wrapper).
# The models//3// directory must already exist.
with open('models//3//clf_lgb.pkl', 'wb') as file:
    pickle.dump(clf_lgb, file)

In [21]:
# Collect the best configuration found by each grid search, keyed by a short
# model name (insertion order: xgb, cat, lgb).
models = {
    label: search.best_estimator_
    for label, search in (('xgb', clf_xgb), ('cat', clf_cat), ('lgb', clf_lgb))
}

# VotingClassifier takes its members as a list of (name, estimator) pairs.
trained_models = list(models.items())

# Soft voting averages the members' predicted probabilities. Calling fit()
# trains the ensemble's members on the full training set.
ensemble = VotingClassifier(estimators=trained_models, voting='soft', verbose=False, n_jobs=-1)
ensemble.fit(X_train, y_train)

In [22]:
# Persist the fitted voting ensemble; a later cell reloads it from this path.
# The models//3// directory must already exist.
with open('models//3//ensemble.pkl', 'wb') as file:
    pickle.dump(ensemble, file)

In [16]:
# Human-readable description for each engineered feature, keyed by the feature's
# column name. The feature-importance cell maps `feat_name` through this dict to
# fill `feat_description`; a feature that has no entry here gets NaN there.
# The description strings are written verbatim to the File2 CSV.
feature_desc = {
    'team1Num50Last15':'Ratio of team1 to team2 number of 50s by players in last 15 games.',
    'team1WinpLast5':'Ratio of team1 to team2 percent  win in last 5 games.',
    'team1OnlyAvgRunsLast15':'Average inning runs of team1 only in last 15 games.',
    'team1WinpLast15F2F':'Team1 win percent against Team2 in last 15 games.',
    'groundAvgRunsLast15':'Average runs scored in the ground in last 15 games.',
    'groundAvgWicketsLast15':'Average wickets lost in the ground in last 15 games.',
    'lightAvgRunsLast15':'Average runs scored in the lighting in last 15 games.',
    'lightAvgWicketsLast15':'Average wickets lost in the lighting in last 15 games.',
    'seriesAvgRunsLast15':'Average runs scored in the series in last 15 games.',
    'seriesAvgWicketsLast15':'Average wickets lost in the series in last 15 games.',
    'inn1AvgRunsLast15':'Average runs scored in inning 1 in last 15 games.',
    'inn2AvgRunsLast15':'Average runs scored in inning 2 in last 15 games.',
    'inn1AvgWicketsLast15':'Average wickets lost in inning 1 in last 15 games.',
    'inn2AvgWicketsLast15':'Average wickets lost in inning 2 in last 15 games.',
    'team1tossWinnerWins': 'Ratio of team1 to team2 percent of toss wins resulting in match wins.',
    'team1BatsFirstWins': 'Ratio of team1 to team2 percent of match wins resulting from batting first.',
    'team1BatsSecondWins': 'Ratio of team1 to team2 percent of match wins resulting from batting second.',
    'team1WinpLight': 'Ratio of team1 to team2 percent win given the lighting condition.',
    'team1WinpSeries': 'Ratio of team1 to team2 percent win given the series name',
    'team1AvgRunsMargin' : 'Ratio of team1 to team2 average of margin of runs won by',
    'team1AvgWicketsMargin' : 'Ratio of team1 to team2 average of margin of wickets won by',
    'team1AvgWicketsLost' :  'Ratio of team1 to team2 avg wickets lost w.r.t total balls faced.',
    'team1AvgRR' : 'Ratio of team1 to team2 average runs rate',
    'team1PlayerOfMatchLast15' : 'Ratio of team1 to team2 total number of player of the match won by players in last 15 games.',
    'team1TotalRunsPOMLast15' : 'Ratio of team1 to team2 total runs by player of the match in last 15 games.',
    'team1TotalWicketsPOMLast15' : 'Ratio of team1 to team2 total wickets by player of the match in last 15 games.',
    'team1SRover120Last15': 'Ratio of team1 to team2 percent player-level strike rate over 120 in last 15 games.',
    'team1CaptainRuns' : 'Ratio of team1 to team2 total runs scored by captains.',
    'team1BoundaryLast15' : 'Ratio of team1 to team2 total boundaries in last 15 games.',
    'team1BatBoundaryPercentLast15' : 'Ratio of team1 to team2 percent of runs scored that are boundaries in last 15 games.',
    'team1EconBelow8Last15' : 'Ratio of team1 to team2 percent player-level economy rate below 8 in last 15 games.',
    'team1DotPercentLast15' : 'Ratio of team1 to team2 percent of balls that are dots in last 15 games.',
    'team1BowlBoundaryPercentLast15' : 'Ratio of team1 to team2 percent of runs conceded that are boundaries in last 15 games.',
    'team1MaidenPercent': 'Ratio of team1 to team2 percent of maidens bowled.',
    'team1ExtrasPercentLast15' : 'Ratio of team1 to team2 percent of extras conceded in last 15 games.',
    'rel_strength' : 'Relative strength of team1 to team2.'
}

In [60]:
# Reload the persisted ensemble (presumably so the reporting cells can run
# without refitting; they still need `cols`, `X_train`, `X_test` and the data
# frames from the earlier cells).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles written by this notebook or another trusted source.
with open('models//3//ensemble.pkl', 'rb') as file:
    ensemble = pickle.load(file)

In [61]:
def _mean_normalized_importance(fitted_estimators, feature_names):
    """Average each feature's share of importance across the ensemble members.

    Every member's ``feature_importances_`` vector is divided by its own sum so
    that members reporting on different scales contribute equally, then the
    shares are averaged per feature.

    Parameters
    ----------
    fitted_estimators : mapping of name -> fitted estimator
        Members without a ``feature_importances_`` attribute are ignored, as are
        members whose importances sum to zero (dividing by that sum would turn
        every averaged value into NaN).
    feature_names : sequence of str
        Column names, in the order the members were trained on.

    Returns
    -------
    dict
        ``{feature name: mean normalised importance}`` in ``feature_names`` order.

    Raises
    ------
    ValueError
        If a member reports a different number of importances than there are
        feature names; pairing them up anyway would mislabel the features.
    """
    shares_by_feature = {}
    for est_name, est in fitted_estimators.items():
        if not hasattr(est, 'feature_importances_'):
            continue
        raw = np.asarray(est.feature_importances_)
        if len(raw) != len(feature_names):
            raise ValueError(
                f"estimator '{est_name}' reports {len(raw)} importances "
                f"but {len(feature_names)} feature names were given"
            )
        total = raw.sum()
        if total == 0:
            continue
        for feat, share in zip(feature_names, raw / total):
            shares_by_feature.setdefault(feat, []).append(share)
    return {feat: sum(shares) / len(shares) for feat, shares in shares_by_feature.items()}


estimators = ensemble.named_estimators_

# Rank features by mean normalised importance, highest first (the sort is
# stable, so ties keep their `cols` order).
mean_importance = _mean_normalized_importance(estimators, cols)
ranked_importance = sorted(mean_importance.items(), key=lambda pair: pair[1], reverse=True)

# Keep the ten most important features; ids and ranks are both 1-based positions
# in that ranking.
feature_importance = pd.DataFrame(ranked_importance, columns=['feat_name', 'model_feat_imp_train']).head(10)
feature_importance['feat_description'] = feature_importance['feat_name'].map(feature_desc)
feature_importance['feat_id'] = [i+1 for i in feature_importance.index]
feature_importance['feat_rank_train'] = [i+1 for i in feature_importance.index]

In [62]:
# Score both splits with the fitted ensemble, one derived column at a time
# (train first, then test, for every step).
scored_splits = ((train_data, X_train), (test_data, X_test))

# Hard class label predicted by the ensemble.
for frame, features in scored_splits:
    frame['y_pred_01'] = ensemble.predict(features)

# Second column of predict_proba (presumably the probability of label 1 —
# confirm against ensemble.classes_).
for frame, features in scored_splits:
    frame['win_pred_score'] = ensemble.predict_proba(features)[:,1]

# Where label 0 was predicted, flip the score to 1 - p so the stored value is
# the probability of the predicted label.
for frame, features in scored_splits:
    predicted_zero = frame['y_pred_01'] == 0
    frame['win_pred_score'] = np.where(predicted_zero, 1 - frame['win_pred_score'], frame['win_pred_score'])

# Label 1 is reported as team1 winning, anything else as team2.
# NOTE(review): assumes winner_01 == 1 encodes a team1 win — confirm against
# how the target column was built.
for frame, features in scored_splits:
    predicted_one = frame['y_pred_01'] == 1
    frame['win_pred_team_id'] = np.where(predicted_one, frame['team1_id'], frame['team2_id'])

In [63]:
def _collect_hyperparam(ensemble, attr_name, param_key):
    """Join one hyper-parameter of every fitted member into a ';'-separated string.

    For each estimator in ``ensemble.estimators_`` the value is taken from the
    attribute ``attr_name`` when the estimator has it; otherwise, if the
    estimator offers a ``get_param(key)`` method, from ``get_param(param_key)``.
    Estimators that provide neither, or whose ``get_param`` returns ``None``,
    are left out instead of raising ``AttributeError``.
    """
    values = []
    for est in ensemble.estimators_:
        if hasattr(est, attr_name):
            values.append(f"{getattr(est, attr_name)}")
            continue
        getter = getattr(est, 'get_param', None)
        value = getter(param_key) if callable(getter) else None
        if value is not None:
            values.append(f"{value}")
    return ";".join(values)


def train_hps_depth(ensemble):
    """Return the members' tree depths (``max_depth`` attribute, else ``get_param('depth')``) joined by ';'."""
    return _collect_hyperparam(ensemble, 'max_depth', 'depth')


def train_hps_trees(ensemble):
    """Return the members' tree counts (``n_estimators`` attribute, else ``get_param('iterations')``) joined by ';'."""
    return _collect_hyperparam(ensemble, 'n_estimators', 'iterations')


def train_hps_lr(ensemble):
    """Return the members' learning rates (``learning_rate`` attribute, else ``get_param('learning_rate')``) joined by ';'."""
    return _collect_hyperparam(ensemble, 'learning_rate', 'learning_rate')

def isEnsemble(ensemble):
    """Return ``'no'`` when the model holds exactly one fitted estimator, else ``'yes'``.

    The count is taken from ``ensemble.estimators_``; any length other than 1
    (including 0) yields ``'yes'``.
    """
    return 'no' if len(ensemble.estimators_) == 1 else 'yes'

In [64]:
def df_file1(ensemble):
    """Assemble the per-match prediction table written out as File1.

    Works on the notebook-level ``train_data``, ``test_data`` and
    ``feature_importance`` objects. Both frames are tagged in place with the
    run metadata columns, then the test rows (``dataset_type`` ``'r1'``) are
    stacked above the train rows (``'train'``). The first ten names in
    ``feature_importance['feat_name']`` are carried along and renamed to
    ``indep_feat_id1`` ... in that order.

    Parameters
    ----------
    ensemble : fitted voting ensemble
        Must expose ``named_estimators_`` and ``estimators_``.

    Returns
    -------
    pandas.DataFrame
        Prediction rows with the metadata columns and ``indep_feat_id1`` to
        ``indep_feat_id10``; ids with no matching feature are filled with NaN.
    """
    def _tag(frame, split_label):
        # Adds the metadata columns to `frame` itself; 'match id' mirrors 'match_id'.
        frame['dataset_type'] = split_label
        frame['train_algorithm'] = ';'.join(list(ensemble.named_estimators_.keys()))
        frame['is_ensemble'] = isEnsemble(ensemble)
        frame['train_hps_trees'] = train_hps_trees(ensemble)
        frame['train_hps_depth'] = train_hps_depth(ensemble)
        frame['train_hps_lr'] = train_hps_lr(ensemble)
        frame['match id'] = frame['match_id']

    _tag(train_data, 'train')
    _tag(test_data, 'r1')

    base_columns = ['match id','dataset_type','win_pred_team_id','win_pred_score','train_algorithm', 'is_ensemble', 'train_hps_trees', 'train_hps_depth', 'train_hps_lr']
    top_features = list(feature_importance['feat_name'].head(10))
    selected = base_columns + top_features

    # Test rows first, then train rows.
    submission = pd.concat([test_data[selected], train_data[selected]])

    # Feature columns are named by their 1-based position in the ranking.
    id_by_feature = {feat: f'indep_feat_id{pos}' for pos, feat in enumerate(top_features, start=1)}
    submission = submission.rename(columns=id_by_feature)

    # Pad with empty columns when fewer than ten features were available.
    for slot in range(1, 11):
        slot_name = f'indep_feat_id{slot}'
        if slot_name not in submission.columns:
            submission[slot_name] = np.nan
    return submission

In [65]:
# NOTE(review): this rebinds the name `df_file1` from the builder function to
# the DataFrame it returns, so running this cell a second time raises
# TypeError ('DataFrame' object is not callable) until the cell defining the
# function is executed again.
df_file1 = df_file1(ensemble)

In [66]:
# File2 is the feature-importance table restricted to, and ordered by, the
# columns of the submission layout.
file2_columns = ['feat_id', 'feat_name', 'feat_description', 'model_feat_imp_train','feat_rank_train']
df_file2 = feature_importance[file2_columns]

In [67]:
# Write both submission tables without the index column. The target directory
# sub//3//ensemble// must already exist; note the file names contain a space.
df_file1.to_csv('sub//3//ensemble//2024_DS_Track_File1_SATOSHI NAKAMOTO.csv', index=False)
df_file2.to_csv('sub//3//ensemble//2024_DS_Track_File2_SATOSHI NAKAMOTO.csv', index=False)