In [280]:
# Importing libraries
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [281]:
# Helper functions
def to_datetime(df):
    '''
    Parse the 'match_dt' column of df as dates written as YYYY-MM-DD.

    The column is overwritten on the frame that was passed in, and that
    same frame is returned so calls can be chained.
    '''
    parsed_dates = pd.to_datetime(df['match_dt'], format='%Y-%m-%d')
    df['match_dt'] = parsed_dates
    return df

def rm_blankspace(df):
    '''
    Replace every space in the column names of df with an underscore.

    The columns are renamed on the frame that was passed in, and that same
    frame is returned.
    '''
    underscored = {name: name.replace(' ', '_') for name in df.columns}
    df.rename(columns=underscored, inplace=True)
    return df

def unwrap_rosters(df):
    '''
    Turn the colon-separated roster strings of both teams into lists of ids.

    Input-
    1. df: DataFrame with 'team1_roster_ids' and 'team2_roster_ids' columns.

    Output-
    The same frame, with both roster columns overwritten in place.

    Only string values are split. Missing values and rosters that are
    already lists are left untouched, so calling this twice on the same
    frame (e.g. re-running the notebook cell) no longer raises
    AttributeError.
    '''
    def _split_roster(value):
        # 'id1:id2:...' -> ['id1', 'id2', ...]; anything that is not a string passes through.
        return value.split(':') if isinstance(value, str) else value

    for column in ('team1_roster_ids', 'team2_roster_ids'):
        df[column] = df[column].apply(_split_roster)
    return df

def data_preprocessing(df):
    '''
    Apply the shared clean-up steps to df: parse 'match_dt' as dates, then
    replace spaces in the column names with underscores.

    Returns whatever the last step returns.
    '''
    return rm_blankspace(to_datetime(df))

In [282]:
# Importing datasets
# Each CSV is read from the local data directory into its own DataFrame.
train_data = pd.read_csv('data//train_data.csv')
test_data = pd.read_csv('data//round_1_sub_data.csv')

# Historical tables: one at match level and two at player level.
match_data = pd.read_csv('data//match_level_data.csv')
batsman_data = pd.read_csv('data//batsman_level_data.csv')
bowler_data = pd.read_csv('data//bowler_level_data.csv')

In [283]:
# Frames that carry roster columns: shared clean-up first, then split the rosters.
train_data = unwrap_rosters(data_preprocessing(train_data))
match_data = unwrap_rosters(data_preprocessing(match_data))
test_data = unwrap_rosters(data_preprocessing(test_data))

# The player-level frames only get the shared clean-up.
batsman_data = data_preprocessing(batsman_data)
bowler_data = data_preprocessing(bowler_data)

In [284]:
# Selecting and reordering columns.
# .copy() gives each frame its own data: new columns are assigned to both
# frames later in the notebook, and assigning to a plain column selection
# makes pandas emit SettingWithCopyWarning.
match_data = match_data[[
    'match_id', 'match_dt', 'team1', 'team1_id', 'team2', 'team2_id',
    'toss_winner', 'toss_decision', 'venue', 'ground_id', 'city', 'lighting',
    'series_name', 'season', 'series_type',
    'winner', 'winner_id', 'by', 'win_amount', 'player_of_the_match_id',
    'inning1_runs', 'inning1_wickets', 'inning1_balls',
    'inning2_runs', 'inning2_wickets', 'inning2_balls',
    'umpire1', 'umpire2', 'team1_roster_ids', 'team2_roster_ids',
]].copy()

train_data = train_data[[
    'match_id', 'match_dt', 'team1', 'team1_id', 'team2', 'team2_id',
    'toss_winner', 'toss_decision', 'venue', 'ground_id', 'city', 'lighting',
    'series_name', 'season',
    'winner', 'winner_id', 'team1_roster_ids', 'team2_roster_ids',
    'team_count_50runs_last15', 'team_winp_last5', 'team1only_avg_runs_last15',
    'team1_winp_team2_last15', 'ground_avg_runs_last15',
]].copy()

In [285]:
# Feature Engineering Functions

def tossWinnerWins(team1_id, team2_id, date, matches=None):
    '''
    Function to compute the fraction of earlier meetings between two teams
    in which the toss winner also won the game.

    Input-
    1. team1_id: ID of team1
    2. team2_id: ID of team2
    3. date: match date of the current game for which the feature is to be
       calculated; only games strictly before this date are counted.
    4. matches: optional DataFrame of past games with the columns match_dt,
       team1_id, team2_id, toss_winner and winner. Defaults to the
       notebook-level match_data frame.

    Output-
    Fraction (rounded to 2 decimals) of the earlier meetings, in either
    team order, that were won by the toss winner; 0 when the two teams
    have no earlier meeting.

    Side effect-
    A 0/1 'tossWinnerWins' column is (re)written on the matches frame.
    '''
    if matches is None:
        matches = match_data

    # Stored on the frame rather than kept local: other code in this
    # notebook may read the 'tossWinnerWins' column.
    matches['tossWinnerWins'] = np.where(matches['toss_winner'] == matches['winner'], 1, 0)

    same_order = (matches['team1_id'] == team1_id) & (matches['team2_id'] == team2_id)
    swapped_order = (matches['team1_id'] == team2_id) & (matches['team2_id'] == team1_id)
    earlier_meetings = matches[(matches['match_dt'] < date) & (same_order | swapped_order)]

    # A mean does not depend on row order, so the rows are not sorted.
    flags = earlier_meetings['tossWinnerWins'].values
    if len(flags) == 0:
        return 0
    return round(np.mean(flags), 2)
    
def teamBatsFirstWins(team1_id, team2_id, date, matches=None):
    '''
    Function to compute the fraction of earlier meetings between two teams
    that were won by the team batting first.

    Input-
    1. team1_id: ID of team1
    2. team2_id: ID of team2
    3. date: match date of the current game for which the feature is to be
       calculated; only games strictly before this date are counted.
    4. matches: optional DataFrame of past games with the columns match_dt,
       team1_id, team2_id, toss_winner, toss_decision and winner. Defaults
       to the notebook-level match_data frame.

    Output-
    Fraction (rounded to 2 decimals) of the earlier meetings, in either
    team order, won by the side that batted first; 0 when the two teams
    have no earlier meeting.

    Side effect-
    0/1 columns 'toss_decision_bats' and 'teamBatsFirstWins' are
    (re)written on the matches frame.
    '''
    if matches is None:
        matches = match_data

    # Derived here instead of read from a 'tossWinnerWins' column, so this
    # function does not depend on tossWinnerWins() having been called first.
    toss_winner_won = matches['toss_winner'] == matches['winner']
    chose_to_bat = matches['toss_decision'] == 'bat'

    matches['toss_decision_bats'] = np.where(chose_to_bat, 1, 0)
    # The side batting first won when the toss winner batted and won, or when
    # the toss winner did not choose to bat and did not win.
    # NOTE(review): a game whose winner matches neither side's toss_winner
    # value (e.g. a missing winner) counts as "toss winner did not win" -- confirm.
    matches['teamBatsFirstWins'] = np.where(toss_winner_won == chose_to_bat, 1, 0)

    same_order = (matches['team1_id'] == team1_id) & (matches['team2_id'] == team2_id)
    swapped_order = (matches['team1_id'] == team2_id) & (matches['team2_id'] == team1_id)
    earlier_meetings = matches[(matches['match_dt'] < date) & (same_order | swapped_order)]

    # A mean does not depend on row order, so the rows are not sorted.
    flags = earlier_meetings['teamBatsFirstWins'].values
    if len(flags) == 0:
        return 0
    return round(np.mean(flags), 2)
    
def team1WinpAtGround(team1_id, team2_id, date, ground, matches=None, train=None):
    '''
    Function to compute team1's win fraction against team2 at the given
    ground, over the games played before the current game.

    Input-
    1. team1_id: ID of team1 to calculate win% of.
    2. team2_id: ID of team2 to calculate win% against.
    3. date: match date of the current game for which the feature is to be
       calculated; only games strictly before this date are counted.
    4. ground: Ground ID of the current game.
    5. matches: optional DataFrame of past games with the columns match_dt,
       ground_id, team1_id, team2_id and winner_id. Defaults to the
       notebook-level match_data frame.
    6. train: optional second DataFrame with the same columns. Defaults to
       the notebook-level train_data frame.

    Output-
    Wins of team1 divided by the number of earlier games between the two
    teams (either team order) at the ground, taken from both frames and
    rounded to 2 decimals; 0 when team1 has no such win, which includes
    the case of no earlier game.
    '''
    if matches is None:
        matches = match_data
    if train is None:
        train = train_data

    def _prior_winners(frame):
        # winner_id of the earlier games between the two teams at this ground.
        same_order = (frame['team1_id'] == team1_id) & (frame['team2_id'] == team2_id)
        swapped_order = (frame['team1_id'] == team2_id) & (frame['team2_id'] == team1_id)
        keep = (frame['match_dt'] < date) & (frame['ground_id'] == ground) & (same_order | swapped_order)
        return frame.loc[keep, 'winner_id']

    # NOTE(review): a game present in both frames is counted twice -- confirm
    # that the two frames do not overlap.
    winners = pd.concat([_prior_winners(matches), _prior_winners(train)])

    win_count = int((winners == team1_id).sum())
    if win_count == 0:
        # Also covers "no earlier game", so the division below is never by zero.
        return 0
    return round(win_count / len(winners), 2)

In [286]:
# Head-to-head features that only need the two team ids and the match date.
# Each feature is computed for the training frame first, then the test frame.
for _feature_name, _feature_fn in (('tossWinnerWins', tossWinnerWins),
                                   ('teamBatsFirstWins', teamBatsFirstWins)):
    for _frame in (train_data, test_data):
        _frame[_feature_name] = _frame.apply(
            lambda row: _feature_fn(row['team1_id'], row['team2_id'], row['match_dt']),
            axis=1,
        )

# team1WinpAtGround additionally needs the ground id of the match.
for _frame in (train_data, test_data):
    _frame['team1WinpAtGround'] = _frame.apply(
        lambda row: team1WinpAtGround(row['team1_id'], row['team2_id'], row['match_dt'], row['ground_id']),
        axis=1,
    )

In [290]:
# Binary target: 1 when team1 is the recorded winner, 0 otherwise.
# Vectorized comparison instead of a row-wise apply; it also yields a proper
# (empty) column when train_data has no rows.
train_data['winner_01'] = (train_data['team1'] == train_data['winner']).astype('int64')

In [291]:
# Names of the feature columns; the list order is the column order of any
# frame selected with it.
cols = [
    'ground_avg_runs_last15',
    'team1WinpAtGround',
    'team1_winp_team2_last15',
    'team1only_avg_runs_last15',
    'teamBatsFirstWins',
    'team_count_50runs_last15',
    'team_winp_last5',
    'tossWinnerWins',
]

In [292]:
# Training features and binary target, plus the matching feature columns of
# the test frame.
X = train_data[cols]
y = train_data['winner_01']
X_test = test_data[cols]