In [2]:
# Importing libraries
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
# Helper function
def to_datetime(df):
    df['match_dt'] = pd.to_datetime(df['match_dt'], format='%Y-%m-%d')
    return df

def rm_blankspace(df):
    df.rename(columns=lambda x: x.replace(' ', '_'), inplace=True)
    return df

def unwrap_rosters(df):
    df['team1_roster_ids'] = df['team1_roster_ids'].apply(lambda x: x.split(':'))
    df['team2_roster_ids'] = df['team2_roster_ids'].apply(lambda x: x.split(':'))
    return df

def data_preprocessing(df):
    df = to_datetime(df)
    df = rm_blankspace(df)
    return df

In [4]:
# Importing datset
train_data = pd.read_csv('data//train_data.csv',)

batsman_data = pd.read_csv('data//batsman_level_data.csv')

bowler_data = pd.read_csv('data//bowler_level_data.csv')

match_data = pd.read_csv('data//match_level_data.csv')

test_data = pd.read_csv('data//round_1_sub_data.csv')

In [5]:
train_data = data_preprocessing(train_data)
train_data = unwrap_rosters(train_data)
match_data = data_preprocessing(match_data)
match_data = unwrap_rosters(match_data)
test_data = data_preprocessing(test_data)
test_data = unwrap_rosters(test_data)

batsman_data = data_preprocessing(batsman_data)
bowler_data = data_preprocessing(bowler_data)

In [6]:
# Reordering columns
match_data = match_data[['match_id', 'match_dt', 'team1', 'team1_id', 'team2', 'team2_id',
        'toss_winner', 'toss_decision', 'ground_id', 'lighting', 'series_name',
        'winner', 'winner_id', 'by', 'win_amount', 'player_of_the_match_id',
        'inning1_runs', 'inning1_wickets', 'inning1_balls', 'inning2_runs', 'inning2_wickets', 'inning2_balls',
        'team1_roster_ids', 'team2_roster_ids']]

train_data = train_data[['match_id', 'match_dt', 'team1', 'team1_id', 'team2', 'team2_id',
       'toss_winner', 'toss_decision', 'ground_id', 'lighting', 'series_name',
       'winner', 'winner_id', 'team1_roster_ids', 'team2_roster_ids', 'team_count_50runs_last15',
       'team_winp_last5', 'team1only_avg_runs_last15', 'team1_winp_team2_last15','ground_avg_runs_last15']]

In [7]:
# Cheatsheet features
def giveLastNgamesPlayer(player_id, date, n, bat_or_bowl):
    '''
    Function to get last n games stats of a player before an input date.
    
    Input-
    1. player_id: id of the player to get historical data.
    2. date: date to look-back and get n games. Stats returned are before this input date.
    3. n: Number of historical games stats to return.
    4. bat_or_bowl: Kind of stats to return. {'bat': batting stats to return, 'bowl': bowling stats to return}
    '''
    if bat_or_bowl == 'bat':
        df_topick = batsman_data
        id_col = 'batsman_id'
    else:
        df_topick = bowler_data
        id_col = 'bowler_id'
        
    return df_topick[(df_topick['match_dt']<date)&(df_topick[id_col]==float(player_id))]\
                .sort_values(by='match_dt', ascending=False).head(n)

def no50sLastn(player_list, date, n):
    '''
    Function to get total number of 50s scored by players in the roster of a team in last n games.
    
    Input-
    1. player_list: ':' separated list of player ids in the roster of a team.
    2. date: match date of the game to calculate this feature.
    3. n: Number of games to look-back and create this feature.
    
    '''
    res_list = []
    for player in player_list:
        df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat')
        df_rel['gte_50runs'] = np.where(df_rel['runs']>=50, 1, 0)
        res_list.append(np.nansum(df_rel['gte_50runs']))
    return np.nansum(res_list)

def avgRunsGround(ground_id, date, n):
    '''
    Function to calculate average runs scored in ground/venue.
    
    Input-
    1. ground_id: ID of the ground to calculate the feature for.
    2. date: match date of the current game to calculate the feature for.
    3. n: look-back window of games for the ground.
    '''

    df_rel = match_data[(match_data['match_dt']<date)&(match_data['ground_id']==ground_id)].sort_values(by='match_dt', ascending=False).head(n)
    df_rel['avg_runs_inn'] = (df_rel['inning1_runs']+df_rel['inning2_runs'])/2
    return df_rel['avg_runs_inn'].mean()

def teamAvgRunsLastn(team_id, date, n):
    '''
    Function to calculate a team's average runs in their last n games.
    
    Input-
    1. team_id: ID of the team to calculate average runs.
    2. date: match date of the current game for which the feature is calculated.
    3. n: look-back window of games for the team.
    '''
    match_data['team1_bat_inning'] = np.where( ((match_data['team1']==match_data['toss winner'])&(match_data['toss decision']=='bat'))|\
                                               ((match_data['team2']==match_data['toss winner'])&(match_data['toss decision']=='field')) , 1, 2)

    df_rel = match_data[(match_data['match_dt']<date)&\
                      ((match_data['team1_id']==team_id)|(match_data['team2_id']==team_id))]\
                        .sort_values(by='match_dt', ascending=False).head(n)
    
    df_rel = pd.concat([ df_rel[df_rel['team1_bat_inning']==1][['inning1_runs']].rename(columns={'inning1_runs':'runs'}), \
                         df_rel[df_rel['team1_bat_inning']==2][['inning2_runs']].rename(columns={'inning2_runs':'runs'}) ] )
    return df_rel['runs'].mean()


In [8]:
# Ratio agnostic features
def tossWinnerWins(team1_id, team2_id, date):
    '''
    Function to compute percent of games toss winner wins the game.
    
    Input-
    1. team1_id: ID of team1
    2. team2_id: ID of team2
    3: date: match date of the current game for which the feature is to be calculated.
    '''
    match_data['DtossWinnerWins'] = np.where(match_data['toss_winner'] == match_data['winner'], 1, 0)
    df_rel = match_data[(match_data['match_dt']<date)&\
                        (((match_data['team1_id']==team1_id)&(match_data['team2_id']==team2_id))|\
                         ((match_data['team1_id']==team2_id)&(match_data['team2_id']==team1_id)))]\
                        .sort_values(by='match_dt', ascending=False)
    
    res = df_rel['DtossWinnerWins'].values
    if len(res)==0:
        return 0
    else:
        return round(np.mean(df_rel['DtossWinnerWins'].values),2)

def teamBatsFirstWins(team1_id, team2_id, date):
    '''
    Function to compute percent of games team that bats first wins the game.
    
    Input-
    1. team1_id: ID of team1
    2. team2_id: ID of team2
    3: date: match date of the current game for which the feature is to be calculated.
    '''
    match_data['toss_decision_bats'] = np.where(match_data['toss_decision'] == 'bat', 1, 0)

    match_data['DteamBatsFirstWins'] = np.where(((match_data['DtossWinnerWins'] == 1) & (match_data['toss_decision_bats'] == 1))
                                          | ((match_data['DtossWinnerWins'] == 0) & (match_data['toss_decision_bats'] == 0)), 1, 0)
  
    df_rel = match_data[(match_data['match_dt']<date)&\
                        (((match_data['team1_id']==team1_id)&(match_data['team2_id']==team2_id))|\
                         ((match_data['team1_id']==team2_id)&(match_data['team2_id']==team1_id)))]\
                        .sort_values(by='match_dt', ascending=False)

    res = df_rel['DteamBatsFirstWins'].values
    if len(res)==0:
        return 0
    else:
        return round(np.mean(df_rel['DteamBatsFirstWins'].values),2)

def teamBatsFirstWinsAtGround(team1_id, team2_id, date, ground):
    '''
    Function to compute percent of games team that bats first wins the game.
    
    Input-
    1. team1_id: ID of team1
    2. team2_id: ID of team2
    3: date: match date of the current game for which the feature is to be calculated.
    4. ground: Ground ID of the current game.
    '''

    df_rel = match_data[(match_data['match_dt']<date)&\
                        (match_data['ground_id'] == ground)&\
                        (((match_data['team1_id']==team1_id)&(match_data['team2_id']==team2_id))|\
                         ((match_data['team1_id']==team2_id)&(match_data['team2_id']==team1_id)))]\
                        .sort_values(by='match_dt', ascending=False)

    res = df_rel['DteamBatsFirstWins'].values
    if len(res)==0:
        return 0
    else:
        return round(np.mean(df_rel['DteamBatsFirstWins'].values),2)


In [9]:
# Ratio agnostic features

# tossWinnerWins
train_data['tossWinnerWins'] = train_data.apply(lambda x: tossWinnerWins(x['team1_id'], x['team2_id'], x['match_dt']), axis=1)
test_data['tossWinnerWins'] = test_data.apply(lambda x: tossWinnerWins(x['team1_id'], x['team2_id'], x['match_dt']), axis=1)

# teamBatsFirstWins
train_data['teamBatsFirstWins'] = train_data.apply(lambda x: teamBatsFirstWins(x['team1_id'], x['team2_id'], x['match_dt']), axis=1)
test_data['teamBatsFirstWins'] = test_data.apply(lambda x: teamBatsFirstWins(x['team1_id'], x['team2_id'], x['match_dt']), axis=1)

# teamBatsFirstWins
train_data['teamBatsFirstWinsAtGround'] = train_data.apply(lambda x: teamBatsFirstWinsAtGround(x['team1_id'], x['team2_id'], x['match_dt'],x['ground_id']), axis=1)
test_data['teamBatsFirstWinsAtGround'] = test_data.apply(lambda x: teamBatsFirstWinsAtGround(x['team1_id'], x['team2_id'], x['match_dt'],x['ground_id']), axis=1)


In [10]:
# Team1 to Team2 ratio features
def team1WinpAtGround(team1_id, team2_id, date, ground):
    '''
    Function to compute team1's win% against team2 from the current game at the given ground.
    
    Input-
    1. team1_id: ID of team1 to calculate win% of.
    2. team2_id: ID of team2 to calculate win% against.
    3: date: match date of the current game for which the feature is to be calculated.
    4. ground: Ground ID of the current game.
    '''

    df_rel = match_data[(match_data['match_dt']<date)&\
                        (match_data['ground_id'] == ground)&\
                      (((match_data['team1_id']==team1_id)&(match_data['team2_id']==team2_id))|\
                       ((match_data['team1_id']==team2_id)&(match_data['team2_id']==team1_id)))]\
                        .sort_values(by='match_dt', ascending=False)

    win_count = df_rel[df_rel['winner_id']==team1_id].shape[0]
    if win_count == 0:
        return 0
    return round(win_count/df_rel.shape[0], 2)

def team1WinpLight(team1_id, team2_id, date, light):
    '''
    Function to compute team1's win% against team2 from the current game at the given lighting.
    
    Input-
    1. team1_id: ID of team1 to calculate win% of.
    2. team2_id: ID of team2 to calculate win% against.
    3: date: match date of the current game for which the feature is to be calculated.
    4. light: Lighting condition of the current game.
    '''

    df_rel = match_data[(match_data['match_dt']<date)&\
                        (match_data['lighting'] == light)&\
                      (((match_data['team1_id']==team1_id)&(match_data['team2_id']==team2_id))|\
                       ((match_data['team1_id']==team2_id)&(match_data['team2_id']==team1_id)))]\
                        .sort_values(by='match_dt', ascending=False)

    win_count = df_rel[df_rel['winner_id']==team1_id].shape[0]
    if win_count == 0:
        return 0
    return round(win_count/df_rel.shape[0], 2)

def team1WinpSeries(team1_id, team2_id, date, series):
    '''
    Function to compute team1's win% against team2 from the current game at the given lighting.
    
    Input-
    1. team1_id: ID of team1 to calculate win% of.
    2. team2_id: ID of team2 to calculate win% against.
    3: date: match date of the current game for which the feature is to be calculated.
    4. series: Series name of the current game.
    '''

    df_rel = match_data[(match_data['match_dt']<date)&\
                        (match_data['series_name'] == series)&\
                      (((match_data['team1_id']==team1_id)&(match_data['team2_id']==team2_id))|\
                       ((match_data['team1_id']==team2_id)&(match_data['team2_id']==team1_id)))]\
                        .sort_values(by='match_dt', ascending=False)

    win_count = df_rel[df_rel['winner_id']==team1_id].shape[0]
    if win_count == 0:
        return 0
    return round(win_count/df_rel.shape[0], 2)

def team1AvgRunsMargin(team1_id, team2_id, date, n):
    '''
    Function to calculate team1's average runs margin against team2 in last n games.
    
    Input-
    1. team1_id: ID of team1 to calculate average runs margin.
    2. team2_id: ID of team2 to calculate average runs margin against.
    3: date: match date of the current game for which the feature is to be calculated.
    4. n: look-back window of games for the team.
    '''
    df_rel1 = match_data[(match_data['match_dt']<date)&\
                      ((match_data['team1_id']==team1_id)|(match_data['team2_id']==team1_id))&\
                       (match_data['winner_id']==team1_id)&\
                        (match_data['by']=='runs')]\
                        .sort_values(by='match_dt', ascending=False).head(n)
    
    df_rel2 = match_data[(match_data['match_dt']<date)&\
                      ((match_data['team1_id']==team2_id)|(match_data['team2_id']==team2_id))&\
                       (match_data['winner_id']==team2_id)&\
                        (match_data['by']=='runs')]\
                        .sort_values(by='match_dt', ascending=False).head(n)

    df_gen= match_data[(match_data['match_dt']<date)&\
                      (match_data['by']=='runs')]
    if len(df_gen)==0 or len(df_rel1)==0 or len(df_rel2)==0:
        return 0
    else:
      team1MarginRatio = np.mean(df_rel1['win_amount'])/np.mean(df_gen['win_amount'])
      team2MarginRatio = np.mean(df_rel2['win_amount'])/np.mean(df_gen['win_amount'])
      
      return team1MarginRatio/team2MarginRatio

In [11]:
# Team1 to Team2 ratio features

# team1WinpAtGround
train_data['team1WinpAtGround'] = train_data.apply(lambda x: \
                                  team1WinpAtGround(x['team1_id'], x['team2_id'], x['match_dt'], x['ground_id']), axis=1)
test_data['team1WinpAtGround'] = test_data.apply(lambda x: \
                                  team1WinpAtGround(x['team1_id'], x['team2_id'], x['match_dt'], x['ground_id']), axis=1)

# team1WinpLight
train_data['team1WinpLight'] = train_data.apply(lambda x: \
                                  team1WinpLight(x['team1_id'], x['team2_id'], x['match_dt'], x['lighting']), axis=1)
test_data['team1WinpLight'] = test_data.apply(lambda x: \
                                  team1WinpLight(x['team1_id'], x['team2_id'], x['match_dt'], x['lighting']), axis=1)

# team1WinpSeries
train_data['team1WinpSeries'] = train_data.apply(lambda x: \
                                  team1WinpSeries(x['team1_id'], x['team2_id'], x['match_dt'], x['series_name']), axis=1)
test_data['team1WinpSeries'] = test_data.apply(lambda x: \
                                  team1WinpSeries(x['team1_id'], x['team2_id'], x['match_dt'], x['series_name']), axis=1)

# team1AvgRunsMargin
train_data['team1AvgRunsMargin'] = train_data.apply(lambda x: \
                                  team1AvgRunsMargin(x['team1_id'], x['team2_id'], x['match_dt'], 15), axis=1)
test_data['team1AvgRunsMargin'] = test_data.apply(lambda x: \
                                  team1AvgRunsMargin(x['team1_id'], x['team2_id'], x['match_dt'], 15), axis=1)

In [12]:
cols = ['team_count_50runs_last15', 'team_winp_last5',
'team1only_avg_runs_last15', 'team1_winp_team2_last15',
'ground_avg_runs_last15', 'tossWinnerWins', 'teamBatsFirstWins',
'teamBatsFirstWinsAtGround', 'team1WinpAtGround', 'team1WinpLight',
'team1WinpSeries', 'team1AvgRunsMargin']

In [13]:
train_data['winner_01'] = train_data.apply(lambda x: 1 if (x['team1']==x['winner']) else 0, axis=1)
train_data.fillna(0, inplace=True), test_data.fillna(0, inplace=True)

(None, None)

In [182]:
train_data_test = train_data.copy()
train_data_test.sort_values(by='match_dt', inplace=True)
X_train, y_train, X_test, y_test = train_data_test[cols][:670], train_data_test['winner_01'][:670], train_data_test[cols][670:], train_data_test['winner_01'][670:]

In [183]:
# XGBOOST
clf_xgb = xgb.XGBClassifier(random_state=0, njobs=-1, verbosity=0, eval_metric='error', objective='binary:logistic')
param_xgb = {'n_estimators':[50,100,200,500],
            'learning_rate':[0.01, 0.05, 0.1, 0.2],
            'max_depth':[3,5,7,9]}

gs_xgb = GridSearchCV(clf_xgb, param_xgb, cv=5, n_jobs=-1)
gs_xgb.fit(X_train, y_train)
print(f"Best parameters are: {gs_xgb.best_params_}")
y_pred = gs_xgb.predict(X_test)
acc = np.mean(y_pred == y_test)
print(f'Accuracy: {acc}')

Best parameters are: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 50}
Accuracy: 0.5287769784172662


In [184]:
a = list(zip(cols, gs_xgb.best_estimator_.feature_importances_))
a.sort(key=lambda x: x[1], reverse=True)
pd.DataFrame(a)

Unnamed: 0,0,1
0,team_count_50runs_last15,0.114219
1,team_winp_last5,0.110076
2,ground_avg_runs_last15,0.099206
3,team1WinpLight,0.094326
4,team1_winp_team2_last15,0.084615
5,team1AvgRunsMargin,0.084027
6,team1only_avg_runs_last15,0.083391
7,teamBatsFirstWins,0.080805
8,teamBatsFirstWinsAtGround,0.072386
9,tossWinnerWins,0.06148


In [140]:
# CATBOOST
clf_cat = cat.CatBoostClassifier(iterations= 100, random_state=0, verbose=0)
param_cat = {'learning_rate':[0.01, 0.05, 0.1, 0.2],
            'depth':[3,5,7,9],
            'subsample':[0.05, 0.1, 0.2, 0.5],
            'min_data_in_leaf':[10,40,70,100]}

clf_cat = GridSearchCV(clf_cat, param_cat, cv=5, n_jobs=-1)
clf_cat.fit(X_train, y_train)
print(f"Best parameters are: {clf_cat.best_params_}")
y_pred = clf_cat.predict(X_test)
acc = np.mean(y_pred == y_test)
print(f'Accuracy: {acc}')

Best parameters are: {'depth': 5, 'learning_rate': 0.01, 'min_data_in_leaf': 10, 'subsample': 0.2}
Accuracy: 0.5683453237410072


In [141]:
clf_cat.best_estimator_.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,team_count_50runs_last15,19.932483
1,team_winp_last5,14.483663
2,ground_avg_runs_last15,13.496552
3,team1AvgRunsMargin,10.955421
4,tossWinnerWins,8.882296
5,teamBatsFirstWins,8.290404
6,team1only_avg_runs_last15,7.918128
7,team1WinpSeries,5.154531
8,team1_winp_team2_last15,4.900081
9,team1WinpLight,3.251382


In [142]:
# GB
clf_gb = GradientBoostingClassifier(random_state=0)
param_gb = {'learning_rate':[0.01, 0.05, 0.1, 0.2],
            'max_depth':[3,5,7,9],
            'n_estimators':[50,100,200,500],
            'subsample':[0.05, 0.1, 0.2, 0.5],
            'min_samples_split':[2,5,10,15]}

clf_gb = GridSearchCV(clf_gb, param_gb, cv=5, n_jobs=-1)
clf_gb.fit(X_train, y_train)
print(f"Best parameters are: {clf_gb.best_params_}")
y_pred = clf_gb.predict(X_test)
acc = np.mean(y_pred == y_test)
print(f'Accuracy: {acc}')

Best parameters are: {'learning_rate': 0.01, 'max_depth': 5, 'min_samples_split': 15, 'n_estimators': 200, 'subsample': 0.05}
Accuracy: 0.5359712230215827


In [143]:
feature_importances = clf_gb.best_estimator_.feature_importances_
a = list(zip(cols, feature_importances))
a.sort(key=lambda x: x[1], reverse=True)
a

[('team1only_avg_runs_last15', 0.18297631338590617),
 ('team_count_50runs_last15', 0.1647814263835458),
 ('ground_avg_runs_last15', 0.15185736698936908),
 ('team1AvgRunsMargin', 0.14459407414173786),
 ('team_winp_last5', 0.1118761670634373),
 ('tossWinnerWins', 0.07084008482562533),
 ('team1_winp_team2_last15', 0.05733567230240002),
 ('teamBatsFirstWins', 0.04715559824897765),
 ('team1WinpSeries', 0.02930069659172903),
 ('team1WinpLight', 0.015183492627022933),
 ('team1WinpAtGround', 0.014293781401351372),
 ('teamBatsFirstWinsAtGround', 0.009805326038897427)]

In [185]:
# LGB
clf_lgb = lgb.LGBMClassifier(random_state=0, verbose=-1, num_leaves = 5)
param_lgb = {'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'min_data_in_leaf':[10,40,70,100],
            'max_depth':[3,5,7,9],}

clf_lgb = GridSearchCV(clf_lgb, param_lgb, cv=5, n_jobs=-1)
clf_lgb.fit(X_train, y_train)
print(f"Best parameters are: {clf_lgb.best_params_}")
y_pred = clf_lgb.predict(X_test)
acc = np.mean(y_pred == y_test)
print(f'Accuracy: {acc}')

Best parameters are: {'learning_rate': 0.01, 'max_depth': 3, 'min_data_in_leaf': 100}
Accuracy: 0.579136690647482


In [186]:
feature_importances = clf_lgb.best_estimator_.feature_importances_
a = list(zip(cols, feature_importances))
a.sort(key=lambda x: x[1], reverse=True)
a

[('team_count_50runs_last15', 115),
 ('team1AvgRunsMargin', 94),
 ('ground_avg_runs_last15', 66),
 ('team_winp_last5', 54),
 ('tossWinnerWins', 33),
 ('team1only_avg_runs_last15', 20),
 ('teamBatsFirstWins', 12),
 ('team1_winp_team2_last15', 0),
 ('teamBatsFirstWinsAtGround', 0),
 ('team1WinpAtGround', 0),
 ('team1WinpLight', 0),
 ('team1WinpSeries', 0)]

In [187]:
# Testing the model
train_data_test = train_data.copy()
train_data_test.sort_values(by='match_dt', inplace=True)
X_train, y_train, X_test, y_test = train_data_test[cols][:670], train_data_test['winner_01'][:670], train_data_test[cols][670:], train_data_test['winner_01'][670:]

clf_xgb = gs_xgb.best_estimator_
# clf_cat = clf_cat.best_estimator_
# clf_gb = clf_gb.best_estimator_
clf_lgbm = clf_lgb.best_estimator_

models = {
    'xgb': clf_xgb,
    # 'cat': clf_cat,
    # 'gb' : clf_gb
    'lgbm': clf_lgbm

}

trained_models = [(name, model) for name, model in models.items()]

ensemble = VotingClassifier(estimators=trained_models, voting='soft', verbose=False, n_jobs=-1)
ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)

acc = np.mean(y_pred == y_test)
print(f'Accuracy for ensemble: {acc}')

Accuracy for ensemble: 0.5503597122302158


In [188]:
feature_importance = np.mean(np.array([clf.feature_importances_ for clf in ensemble.estimators_]), axis=0)
a = list(zip(cols, feature_importance))
a.sort(key=lambda x: x[1], reverse=True)
feature_importance = pd.DataFrame(a, columns=['feat_name', 'model_feat_imp_train'])
feature_importance

Unnamed: 0,feat_name,model_feat_imp_train
0,team_count_50runs_last15,57.55711
1,team1AvgRunsMargin,47.042013
2,ground_avg_runs_last15,33.049603
3,team_winp_last5,27.055038
4,tossWinnerWins,16.53074
5,team1only_avg_runs_last15,10.041696
6,teamBatsFirstWins,6.040402
7,team1WinpLight,0.047163
8,team1_winp_team2_last15,0.042307
9,teamBatsFirstWinsAtGround,0.036193


## Actual

In [14]:
X_train, y_train, X_test = train_data[cols], train_data['winner_01'], test_data[cols]

In [192]:
# X_train, y_train, X_test = train_data[cols], train_data['winner_01'], test_data[cols]
# clf_xgb = gs_xgb.best_estimator_
# clf_lgbm = clf_lgb.best_estimator_

# models = {
#     'xgb': clf_xgb,
#     'lgbm': clf_lgbm

# }

# trained_models = [(name, model) for name, model in models.items()]

# ensemble = VotingClassifier(estimators=trained_models, voting='soft', verbose=False, n_jobs=-1)
# ensemble.fit(X_train, y_train)

In [191]:
# feature_importance = np.mean(np.array([clf.feature_importances_ for clf in ensemble.estimators_]), axis=0)
# a = list(zip(cols, feature_importance))
# a.sort(key=lambda x: x[1], reverse=True)
# feature_importance = pd.DataFrame(a, columns=['feat_name', 'model_feat_imp_train'])
# feature_importance

In [15]:
# LGB
clf_lgb = lgb.LGBMClassifier(random_state=0, verbose=-1, num_leaves = 5)
param_lgb = {'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'min_data_in_leaf':[10,40,70,100],
            'max_depth':[3,5,7,9],}

clf_lgb = GridSearchCV(clf_lgb, param_lgb, cv=5, n_jobs=-1)
clf_lgb.fit(X_train, y_train)
print(f"Best parameters are: {clf_lgb.best_params_}")

Best parameters are: {'learning_rate': 0.05, 'max_depth': 5, 'min_data_in_leaf': 100}


In [16]:
feature_desc = {'ground_avg_runs_last15':'average runs scored in the ground in last 15 games',
                'team_count_50runs_last15':'Ratio of number of 50s by players in team1 to number of 50s by players in team2 in last 15 games',
                'team1only_avg_runs_last15':'team1\'s avg inning runs in last 15 games',
                'team_winp_last5':'Ratio of team1\'s win % to team2\'s win % in last 5 games',
                'tossWinnerWins': 'Ratio of toss winner winning past matches',
                'teamBatsFirstWins': 'Ratio of team batting first winning past matches',
                'team1WinpSeries': 'Ratio of team1\'s win % to team2\'s win % in given series',
                'team1_winp_team2_last15':'Team1\'s win percentage against Team2 in last 15 games',
                'team1WinpAtGround': 'iRatio of team1\'s win % to team2\'s win % in given ground',
                'team1WinpLight': 'Ratio of team1\'s win % to team2\'s win % in given lighting',
                'team1AvgRunsMargin': 'Ratio of team1\'s average runs margin to team2\'s average runs margin in last 15 games',}

In [28]:
clf_lgb.best_estimator_.feature_importances_

array([50, 64, 62, 14, 73, 32, 22,  4, 13, 10, 15, 41])

In [32]:
feature_importance = clf_lgb.best_estimator_.feature_importances_
a = list(zip(cols, feature_importance))
a.sort(key=lambda x: x[1], reverse=True)
feature_importance = pd.DataFrame(a, columns=['feat_name', 'model_feat_imp_train']).head(10)
feature_importance['model_feat_imp_train'] = feature_importance['model_feat_imp_train']/feature_importance['model_feat_imp_train'].max()
feature_importance['feat_description'] = feature_importance['feat_name'].map(feature_desc)
feature_importance['feat_id'] = [i+1 for i in feature_importance.index]
feature_importance['feat_rank_train'] = [i+1 for i in feature_importance.index]

In [18]:
train_data['y_pred_01'] = clf_lgb.predict(X_train)
test_data['y_pred_01'] = clf_lgb.predict(X_test)

train_data['win_pred_score'] = clf_lgb.predict_proba(X_train)[:,1]
test_data['win_pred_score'] = clf_lgb.predict_proba(X_test)[:,1]

train_data['win_pred_score'] = np.where( (train_data['y_pred_01']==0), (1-train_data['win_pred_score']), train_data['win_pred_score'])
test_data['win_pred_score'] = np.where( (test_data['y_pred_01']==0), (1-test_data['win_pred_score']), test_data['win_pred_score'])

train_data['win_pred_team_id'] = np.where( (train_data['y_pred_01']==1), (train_data['team1_id']), train_data['team2_id'])
test_data['win_pred_team_id'] = np.where( (test_data['y_pred_01']==1), (test_data['team1_id']), test_data['team2_id'])

In [24]:
## refactor
train_data['dataset_type'] = 'train'
train_data['train_algorithm'] = 'lightgbm'
train_data['is_ensemble'] = 'no'
train_data['train_hps_trees'] = clf_lgb.best_estimator_.n_estimators_
train_data['train_hps_depth'] = clf_lgb.best_estimator_.max_depth
train_data['train_hps_lr'] = clf_lgb.best_estimator_.learning_rate
train_data['match id'] = train_data['match_id']

test_data['dataset_type'] = 'r1'
test_data['train_algorithm'] = 'lightgbm'
test_data['is_ensemble'] = 'no'
test_data['train_hps_trees'] = clf_lgb.best_estimator_.n_estimators_
test_data['train_hps_depth'] = clf_lgb.best_estimator_.max_depth
test_data['train_hps_lr'] = clf_lgb.best_estimator_.learning_rate
test_data['match id'] = test_data['match_id']

df_file1 = pd.concat([test_data[['match id','dataset_type','win_pred_team_id','win_pred_score','train_algorithm', 'is_ensemble', 'train_hps_trees', 'train_hps_depth', 'train_hps_lr'] + list(feature_importance['feat_name'].head(10))], \
                     train_data[['match id','dataset_type','win_pred_team_id','win_pred_score','train_algorithm', 'is_ensemble', 'train_hps_trees', 'train_hps_depth', 'train_hps_lr'] + list(feature_importance['feat_name'].head(10))]])

renaming_dict = {}
for i,col in enumerate(list(feature_importance['feat_name'].head(10))):
    renaming_dict[col] = f'indep_feat_id{i+1}'
df_file1.rename(columns=renaming_dict, inplace=True)

for i in range(1,11):
    if f'indep_feat_id{i}' not in df_file1.columns:
        df_file1[f'indep_feat_id{i}'] = np.nan

In [20]:
df_file1.shape

(1219, 19)

In [34]:
df_file2 = feature_importance[['feat_id', 'feat_name', 'feat_description', 'model_feat_imp_train','feat_rank_train']]

In [35]:
df_file2.shape

(10, 5)

In [36]:
df_file1.to_csv('sub/2/primary_submission.csv', index=False)
df_file2.to_csv('sub/2/secondary_submission.csv', index=False)