In [48]:
# Importing libraries
import pandas as pd
import numpy as np

In [49]:
# Helper function
def to_datetime(df):
    '''
    Parse the 'match_dt' column of a frame as dates written as YYYY-MM-DD.

    Input-
    1. df: DataFrame with a 'match_dt' column.

    Output-
    The same DataFrame object (modified in place) so that calls can be chained.
    '''
    parsed_dates = pd.to_datetime(df['match_dt'], format='%Y-%m-%d')
    df['match_dt'] = parsed_dates
    return df

def rm_blankspace(df):
    '''
    Replace every space in the column labels of a frame with an underscore.

    Input-
    1. df: DataFrame whose column labels are strings.

    Output-
    The same DataFrame object (columns renamed in place).
    '''
    underscored = {label: label.replace(' ', '_') for label in df.columns}
    df.rename(columns=underscored, inplace=True)
    return df

def unwrap_rosters(df):
    '''
    Split the ':'-separated roster strings of both teams into lists of player ids.

    Only string values are split. Values that are not strings (a missing roster
    stored as NaN, or a list produced by a previous call) are left untouched, so
    the function can be applied again to an already processed frame without
    raising AttributeError.

    Input-
    1. df: DataFrame with 'team1_roster_ids' and 'team2_roster_ids' columns.

    Output-
    The same DataFrame object (both roster columns replaced in place).
    '''
    def _split_ids(value):
        # Player ids stay strings; callers convert them when needed.
        return value.split(':') if isinstance(value, str) else value

    for roster_col in ('team1_roster_ids', 'team2_roster_ids'):
        df[roster_col] = df[roster_col].apply(_split_ids)
    return df

def data_preprocessing(df):
    '''
    Common clean-up applied to every raw table.

    Runs to_datetime first and then rm_blankspace on its result.

    Input-
    1. df: raw DataFrame to clean.

    Output-
    Whatever rm_blankspace returns for the date-parsed frame.
    '''
    return rm_blankspace(to_datetime(df))

In [50]:
# Importing datasets (paths are relative to the current working directory)
train_data, batsman_data, bowler_data, match_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data//train_data.csv',
        'data//batsman_level_data.csv',
        'data//bowler_level_data.csv',
        'data//match_level_data.csv',
        'data//round_1_sub_data.csv',
    )
)

In [51]:
# Every table gets the common clean-up (date parsing + column-name normalisation).
# The three match-level tables additionally get their roster strings split into lists.
train_data = unwrap_rosters(data_preprocessing(train_data))
match_data = unwrap_rosters(data_preprocessing(match_data))
test_data = unwrap_rosters(data_preprocessing(test_data))

batsman_data, bowler_data = data_preprocessing(batsman_data), data_preprocessing(bowler_data)

In [52]:
# Reordering columns
# match_data: resolve the toss winner's name to the matching team id, then keep the
# columns used downstream in a fixed order ('toss_winner' itself is not kept).
match_data['toss_winner_id'] = np.where(match_data['toss_winner'] == match_data['team1'],
                                        match_data['team1_id'], match_data['team2_id'])
match_data = match_data[['match_id', 'match_dt', 'team1', 'team1_id', 'team2', 'team2_id',
        'toss_winner_id', 'toss_decision', 'ground_id', 'lighting', 'series_name',
        'winner_id', 'by', 'win_amount', 'player_of_the_match_id',
        'inning1_runs', 'inning1_wickets', 'inning1_balls', 'inning2_runs', 'inning2_wickets', 'inning2_balls',
        'team1_roster_ids', 'team2_roster_ids']]

# train_data and test_data share one layout; only train_data carries 'winner_id'.
# 'toss_winner_id' is not derived for train_data: the selection below does not keep
# it, so computing it would be dead work.
_match_info_cols = ['match_id', 'match_dt', 'team1', 'team1_id', 'team2', 'team2_id',
                    'toss_winner', 'toss_decision', 'ground_id', 'lighting', 'series_name']
_roster_and_feature_cols = ['team1_roster_ids', 'team2_roster_ids', 'team_count_50runs_last15',
                            'team_winp_last5', 'team1only_avg_runs_last15',
                            'team1_winp_team2_last15', 'ground_avg_runs_last15']

# Single source of truth for the provided-feature names used in the rest of the notebook.
_feature_renames = {'team_count_50runs_last15': 'team1Num50Last15',
                    'team_winp_last5': 'team1WinpLast5',
                    'team1only_avg_runs_last15': 'team1OnlyAvgRunsLast15',
                    'team1_winp_team2_last15': 'team1WinpLast15F2F',
                    'ground_avg_runs_last15': 'groundAvgRunsLast15'}

train_data = train_data[_match_info_cols + ['winner_id'] + _roster_and_feature_cols]\
                .rename(columns=_feature_renames)

test_data = test_data[_match_info_cols + _roster_and_feature_cols]\
                .rename(columns=_feature_renames)

In [54]:
# Cheatsheet features
def giveLastNgamesPlayer(player_id, date, n, bat_or_bowl):
    '''
    Return the n most recent stat rows of a player strictly before a date.

    Input-
    1. player_id: id of the player; converted with float() before it is matched against the id column.
    2. date: cut-off date. Only rows with match_dt earlier than this date are considered.
    3. n: maximum number of rows to return.
    4. bat_or_bowl: 'bat' reads batsman_data (matching on batsman_id); any other value
       reads bowler_data (matching on bowler_id).

    Output-
    The matching rows sorted by match_dt, newest first, truncated to n rows.
    '''
    if bat_or_bowl == 'bat':
        source, key_col = batsman_data, 'batsman_id'
    else:
        source, key_col = bowler_data, 'bowler_id'

    is_earlier = source['match_dt'] < date
    is_player = source[key_col] == float(player_id)
    history = source[is_earlier & is_player]
    return history.sort_values(by='match_dt', ascending=False).head(n)

def no50sLastn(player_list, date, n):
    '''
    Total number of innings of 50 or more runs by the players of a roster,
    counted over each player's last n batting records before a date.

    Input-
    1. player_list: iterable of player ids (one team's roster); it is iterated element by element.
    2. date: match date of the game the feature is computed for; only earlier records count.
    3. n: number of most recent batting records to look at per player.

    Output-
    Sum over all players of the number of looked-up records with runs >= 50.
    '''
    fifties_per_player = [
        (giveLastNgamesPlayer(player_id=pid, date=date, n=n, bat_or_bowl='bat')['runs'] >= 50).sum()
        for pid in player_list
    ]
    return np.nansum(fifties_per_player)

def avgRunsGround(ground_id, date, n):
    '''
    Mean runs per innings over the n most recent matches at a ground before a date.

    Input-
    1. ground_id: id of the ground.
    2. date: only matches with match_dt earlier than this date are used.
    3. n: number of most recent matches to average over.

    Output-
    Mean of (inning1_runs + inning2_runs) / 2 over the selected matches.
    '''
    selector = (match_data['ground_id'] == ground_id) & (match_data['match_dt'] < date)
    recent = match_data[selector].sort_values(by='match_dt', ascending=False).head(n)
    per_innings_runs = (recent['inning1_runs'] + recent['inning2_runs']) / 2
    return per_innings_runs.mean()

def teamAvgRunsLastn(team_id, date, n):
    '''
    Function to calculate a team's average runs in their last n games.

    The innings the team batted in is derived per match from the toss: the team
    bats first when it won the toss and chose to bat, or when the opponent won
    the toss and chose to field; in every other case its runs are taken from
    the second innings.

    Uses the 'toss_winner_id' / 'toss_decision' columns of match_data (the
    space-separated column names no longer exist after preprocessing) and does
    not add helper columns to match_data.

    Input-
    1. team_id: ID of the team to calculate average runs.
    2. date: match date of the current game for which the feature is calculated.
    3. n: look-back window of games for the team.

    Output-
    Mean runs scored by the team over the selected games; NaN when there are none.
    '''
    involved = (match_data['team1_id'] == team_id) | (match_data['team2_id'] == team_id)
    recent = match_data[(match_data['match_dt'] < date) & involved]\
                .sort_values(by='match_dt', ascending=False).head(n)

    # Batting order is decided relative to the queried team, which can be
    # listed as either team1 or team2 in a match row.
    won_toss = recent['toss_winner_id'] == team_id
    batted_first = (won_toss & (recent['toss_decision'] == 'bat')) | \
                   (~won_toss & (recent['toss_decision'] == 'field'))

    # Take first-innings runs where the team batted first, second-innings runs otherwise.
    team_runs = recent['inning1_runs'].where(batted_first, recent['inning2_runs'])
    return team_runs.mean()


In [55]:
def avgWicketsGround(ground_id, date, n):
    '''
    Mean wickets per innings over the n most recent matches at a ground before a date.

    Input-
    1. ground_id: id of the ground.
    2. date: only matches with match_dt earlier than this date are used.
    3. n: number of most recent matches to average over.

    Output-
    Mean of (inning1_wickets + inning2_wickets) / 2 over the selected matches.
    '''
    selector = (match_data['ground_id'] == ground_id) & (match_data['match_dt'] < date)
    recent = match_data[selector].sort_values(by='match_dt', ascending=False).head(n)
    per_innings_wickets = (recent['inning1_wickets'] + recent['inning2_wickets']) / 2
    return per_innings_wickets.mean()

def lightAvgRunsLast15(light, date, n):
    '''
    Mean runs per innings over the n most recent matches played under a lighting
    condition before a date (despite the name, the window size is the n argument).

    Input-
    1. light: value of the 'lighting' column to match.
    2. date: only matches with match_dt earlier than this date are used.
    3. n: number of most recent matches to average over.

    Output-
    Mean of (inning1_runs + inning2_runs) / 2 over the selected matches.
    '''
    same_light = match_data['lighting'] == light
    earlier = match_data['match_dt'] < date
    recent = match_data.loc[earlier & same_light].sort_values(by='match_dt', ascending=False).head(n)
    return ((recent['inning1_runs'] + recent['inning2_runs']) / 2).mean()

def lightAvgWicketsLast15(light, date, n):
    '''
    Mean wickets per innings over the n most recent matches played under a lighting
    condition before a date (despite the name, the window size is the n argument).

    Input-
    1. light: value of the 'lighting' column to match.
    2. date: only matches with match_dt earlier than this date are used.
    3. n: number of most recent matches to average over.

    Output-
    Mean of (inning1_wickets + inning2_wickets) / 2 over the selected matches.
    '''
    same_light = match_data['lighting'] == light
    earlier = match_data['match_dt'] < date
    recent = match_data.loc[earlier & same_light].sort_values(by='match_dt', ascending=False).head(n)
    return ((recent['inning1_wickets'] + recent['inning2_wickets']) / 2).mean()

def seriesAvgRunsLast15(series, date, n):
    '''
    Mean runs per innings over the n most recent matches of a series before a date
    (despite the name, the window size is the n argument).

    Input-
    1. series: value of the 'series_name' column to match.
    2. date: only matches with match_dt earlier than this date are used.
    3. n: number of most recent matches to average over.

    Output-
    Mean of (inning1_runs + inning2_runs) / 2 over the selected matches.
    '''
    in_series_before = (match_data['series_name'] == series) & (match_data['match_dt'] < date)
    recent = match_data[in_series_before].sort_values(by='match_dt', ascending=False).head(n)
    innings_total = recent['inning1_runs'] + recent['inning2_runs']
    return (innings_total / 2).mean()

def seriesAvgWicketsLast15(series, date, n):
    '''
    Mean wickets per innings over the n most recent matches of a series before a date
    (despite the name, the window size is the n argument).

    Input-
    1. series: value of the 'series_name' column to match.
    2. date: only matches with match_dt earlier than this date are used.
    3. n: number of most recent matches to average over.

    Output-
    Mean of (inning1_wickets + inning2_wickets) / 2 over the selected matches.
    '''
    in_series_before = (match_data['series_name'] == series) & (match_data['match_dt'] < date)
    recent = match_data[in_series_before].sort_values(by='match_dt', ascending=False).head(n)
    innings_total = recent['inning1_wickets'] + recent['inning2_wickets']
    return (innings_total / 2).mean()

def inn1AvgRunsLast15(date, n):
    '''
    Mean first-innings runs over the n most recent matches in match_data before a date
    (despite the name, the window size is the n argument).

    Input-
    1. date: only matches with match_dt earlier than this date are used.
    2. n: number of most recent matches to average over.
    '''
    earlier = match_data.loc[match_data['match_dt'] < date]
    latest = earlier.sort_values(by='match_dt', ascending=False).head(n)
    return latest['inning1_runs'].mean()

def inn2AvgRunsLast15(date, n):
    '''
    Mean second-innings runs over the n most recent matches in match_data before a date
    (despite the name, the window size is the n argument).

    Input-
    1. date: only matches with match_dt earlier than this date are used.
    2. n: number of most recent matches to average over.
    '''
    earlier = match_data.loc[match_data['match_dt'] < date]
    latest = earlier.sort_values(by='match_dt', ascending=False).head(n)
    return latest['inning2_runs'].mean()

def inn1AvgWicketsLast15(date, n):
    '''
    Mean first-innings wickets over the n most recent matches in match_data before a date
    (despite the name, the window size is the n argument).

    Input-
    1. date: only matches with match_dt earlier than this date are used.
    2. n: number of most recent matches to average over.
    '''
    earlier = match_data.loc[match_data['match_dt'] < date]
    latest = earlier.sort_values(by='match_dt', ascending=False).head(n)
    return latest['inning1_wickets'].mean()

def inn2AvgWicketsLast15(date, n):
    '''
    Mean second-innings wickets over the n most recent matches in match_data before a date
    (despite the name, the window size is the n argument).

    Input-
    1. date: only matches with match_dt earlier than this date are used.
    2. n: number of most recent matches to average over.
    '''
    earlier = match_data.loc[match_data['match_dt'] < date]
    latest = earlier.sort_values(by='match_dt', ascending=False).head(n)
    return latest['inning2_wickets'].mean()

In [56]:
# Team1 to Team2 ratio features

def team1tossWinnerWins(team1_id, team2_id, date):
    '''
    Ratio (team1 over team2) of how often each team won a match in which the
    toss winner was also the match winner.

    For each team the rate is
        (matches before date flagged DtossWinnerWins and won by the team) /
        (all matches the team played before date).

    Side effect: (re)writes the 0/1 column 'DtossWinnerWins' on the global match_data.

    Input-
    1. team1_id: id of the team in the numerator.
    2. team2_id: id of the team in the denominator.
    3. date: only matches with match_dt earlier than this date are counted.

    Output-
    The ratio rounded to 2 decimals, or 1 when either team has no matches or no such wins.
    '''
    match_data['DtossWinnerWins'] = np.where(match_data['toss_winner_id'] == match_data['winner_id'], 1, 0)

    def _wins_and_games(team_id):
        # (qualifying wins, matches played) of one team before the cut-off date
        involved = (match_data['team1_id'] == team_id) | (match_data['team2_id'] == team_id)
        played = match_data[(match_data['match_dt'] < date) & involved]
        won = played[(played['DtossWinnerWins'] == 1) & (played['winner_id'] == team_id)]
        return len(won), len(played)

    wins1, games1 = _wins_and_games(team1_id)
    wins2, games2 = _wins_and_games(team2_id)

    # Any empty count makes the ratio undefined or zero; 1 is returned in that case.
    if min(games1, games2, wins1, wins2) == 0:
        return 1
    return round((wins1 / games1) / (wins2 / games2), 2)
  
def team1BatsFirstWins(team1_id, team2_id, date):
    '''
    Ratio (team1 over team2) of how often each team won a match that was won by
    the side batting first.

    For each team the rate is
        (matches before date won by the team in which the batting-first side won) /
        (all matches the team played before date).

    The batting-first side is taken to have won when the toss winner won after
    choosing to bat, or the toss winner lost after not choosing to bat.

    The toss-winner-won flag is derived here from 'toss_winner_id' and
    'winner_id', so the function does not rely on any other feature function
    having been called first.

    Side effect: (re)writes the 0/1 columns 'toss_decision_bats' and
    'DteamBatsFirstWins' on the global match_data.

    Input-
    1. team1_id: id of the team in the numerator.
    2. team2_id: id of the team in the denominator.
    3. date: only matches with match_dt earlier than this date are counted.

    Output-
    The ratio rounded to 2 decimals, or 1 when either team has no matches or no such wins.
    '''
    toss_winner_won = match_data['toss_winner_id'] == match_data['winner_id']
    toss_winner_batted = match_data['toss_decision'] == 'bat'

    match_data['toss_decision_bats'] = np.where(toss_winner_batted, 1, 0)
    match_data['DteamBatsFirstWins'] = np.where((toss_winner_won & toss_winner_batted)
                                                | (~toss_winner_won & ~toss_winner_batted), 1, 0)

    def _wins_and_games(team_id):
        # (qualifying wins, matches played) of one team before the cut-off date
        involved = (match_data['team1_id'] == team_id) | (match_data['team2_id'] == team_id)
        played = match_data[(match_data['match_dt'] < date) & involved]
        won = played[(played['DteamBatsFirstWins'] == 1) & (played['winner_id'] == team_id)]
        return len(won), len(played)

    wins1, games1 = _wins_and_games(team1_id)
    wins2, games2 = _wins_and_games(team2_id)

    # Any empty count makes the ratio undefined or zero; 1 is returned in that case.
    if min(games1, games2, wins1, wins2) == 0:
        return 1
    return round((wins1 / games1) / (wins2 / games2), 2)

def team1BatsSecondWins(team1_id, team2_id, date):
    '''
    Ratio (team1 over team2) of how often each team won a match that was won by
    the side batting second.

    For each team the rate is
        (matches before date won by the team in which the batting-second side won) /
        (all matches the team played before date).

    The batting-second side is taken to have won when the toss winner won after
    not choosing to bat, or the toss winner lost after choosing to bat.

    Both flags are derived here from 'toss_winner_id', 'winner_id' and
    'toss_decision', so the function does not rely on any other feature
    function having been called first.

    Side effect: (re)writes the 0/1 column 'DteamBatsSecondWins' on the global match_data.

    Input-
    1. team1_id: id of the team in the numerator.
    2. team2_id: id of the team in the denominator.
    3. date: only matches with match_dt earlier than this date are counted.

    Output-
    The ratio rounded to 2 decimals, or 1 when either team has no matches or no such wins.
    '''
    toss_winner_won = match_data['toss_winner_id'] == match_data['winner_id']
    toss_winner_batted = match_data['toss_decision'] == 'bat'

    match_data['DteamBatsSecondWins'] = np.where((toss_winner_won & ~toss_winner_batted)
                                                 | (~toss_winner_won & toss_winner_batted), 1, 0)

    def _wins_and_games(team_id):
        # (qualifying wins, matches played) of one team before the cut-off date
        involved = (match_data['team1_id'] == team_id) | (match_data['team2_id'] == team_id)
        played = match_data[(match_data['match_dt'] < date) & involved]
        won = played[(played['DteamBatsSecondWins'] == 1) & (played['winner_id'] == team_id)]
        return len(won), len(played)

    wins1, games1 = _wins_and_games(team1_id)
    wins2, games2 = _wins_and_games(team2_id)

    # Any empty count makes the ratio undefined or zero; 1 is returned in that case.
    if min(games1, games2, wins1, wins2) == 0:
        return 1
    return round((wins1 / games1) / (wins2 / games2), 2)

def team1WinpLight(team1_id, team2_id, date, light):
    '''
    Ratio (team1 over team2) of the teams' win rates in matches played under a
    given lighting condition before a date.

    Input-
    1. team1_id: id of the team in the numerator.
    2. team2_id: id of the team in the denominator.
    3. date: only matches with match_dt earlier than this date are counted.
    4. light: value of the 'lighting' column to restrict the matches to.

    Output-
    The ratio rounded to 2 decimals, or 1 when either team has no such matches or no wins in them.
    '''
    under_light_before = (match_data['match_dt'] < date) & (match_data['lighting'] == light)

    def _wins_and_games(team_id):
        # (wins, matches played) of one team within the restricted match set
        involved = (match_data['team1_id'] == team_id) | (match_data['team2_id'] == team_id)
        played = match_data[under_light_before & involved]
        return int((played['winner_id'] == team_id).sum()), len(played)

    wins1, games1 = _wins_and_games(team1_id)
    wins2, games2 = _wins_and_games(team2_id)

    if min(games1, games2, wins1, wins2) == 0:
        return 1
    return round((wins1 / games1) / (wins2 / games2), 2)

def team1WinpSeries(team1_id, team2_id, date, series):
    '''
    Ratio (team1 over team2) of the teams' win rates in matches of a given
    series played before a date.

    Input-
    1. team1_id: id of the team in the numerator.
    2. team2_id: id of the team in the denominator.
    3. date: only matches with match_dt earlier than this date are counted.
    4. series: value of the 'series_name' column to restrict the matches to.

    Output-
    The ratio rounded to 2 decimals, or 1 when either team has no such matches or no wins in them.
    '''
    in_series_before = (match_data['match_dt'] < date) & (match_data['series_name'] == series)

    def _wins_and_games(team_id):
        # (wins, matches played) of one team within the restricted match set
        involved = (match_data['team1_id'] == team_id) | (match_data['team2_id'] == team_id)
        played = match_data[in_series_before & involved]
        return int((played['winner_id'] == team_id).sum()), len(played)

    wins1, games1 = _wins_and_games(team1_id)
    wins2, games2 = _wins_and_games(team2_id)

    if min(games1, games2, wins1, wins2) == 0:
        return 1
    return round((wins1 / games1) / (wins2 / games2), 2)

def team1AvgRunsMargin(team1_id, team2_id, date, n):
    '''
    Ratio (team1 over team2) of the teams' average winning margin in runs,
    taken over each team's n most recent wins by runs before a date.

    Input-
    1. team1_id: id of the team in the numerator.
    2. team2_id: id of the team in the denominator.
    3. date: only matches with match_dt earlier than this date are counted.
    4. n: number of most recent wins by runs to average per team.

    Output-
    mean(win_amount of team1) / mean(win_amount of team2), or 1 when either team has no such win.
    '''
    def _recent_margins(team_id):
        # win_amount of the team's latest n wins where the 'by' column equals 'runs'
        involved = (match_data['team1_id'] == team_id) | (match_data['team2_id'] == team_id)
        won_by_runs = (match_data['winner_id'] == team_id) & (match_data['by'] == 'runs')
        wins = match_data[(match_data['match_dt'] < date) & involved & won_by_runs]
        return wins.sort_values(by='match_dt', ascending=False).head(n)['win_amount']

    margins1 = _recent_margins(team1_id)
    margins2 = _recent_margins(team2_id)

    if len(margins1) == 0 or len(margins2) == 0:
        return 1
    return np.mean(margins1) / np.mean(margins2)
    
def team1AvgWicketsMargin(team1_id, team2_id, date, n):
    '''
    Ratio (team1 over team2) of the teams' average winning margin in wickets,
    taken over each team's n most recent wins by wickets before a date.

    Input-
    1. team1_id: id of the team in the numerator.
    2. team2_id: id of the team in the denominator.
    3. date: only matches with match_dt earlier than this date are counted.
    4. n: number of most recent wins by wickets to average per team.

    Output-
    mean(win_amount of team1) / mean(win_amount of team2), or 1 when either team has no such win.
    '''
    def _recent_margins(team_id):
        # win_amount of the team's latest n wins where the 'by' column equals 'wickets'
        involved = (match_data['team1_id'] == team_id) | (match_data['team2_id'] == team_id)
        won_by_wickets = (match_data['winner_id'] == team_id) & (match_data['by'] == 'wickets')
        wins = match_data[(match_data['match_dt'] < date) & involved & won_by_wickets]
        return wins.sort_values(by='match_dt', ascending=False).head(n)['win_amount']

    margins1 = _recent_margins(team1_id)
    margins2 = _recent_margins(team2_id)

    if len(margins1) == 0 or len(margins2) == 0:
        return 1
    return np.mean(margins1) / np.mean(margins2)

def team1AvgWicketsLost(team1_id, team2_id, date, n):
    '''
    Ratio (team1 over team2) of the teams' average balls faced per wicket in
    first innings, over each team's n most recent matches before a date.

    For each team, its last n matches are taken and only those in which the
    team batted first are kept (won the toss and chose 'bat', or did not win
    the toss and the decision was 'field'). For these, balls per wicket is
    inning1_balls / (inning1_wickets + 1).
    NOTE(review): matches where the team batted second are not used at all -
    confirm this is intended.

    Input-
    1. team1_id: id of the team in the numerator.
    2. team2_id: id of the team in the denominator.
    3. date: only matches with match_dt earlier than this date are counted.
    4. n: number of most recent matches to look at per team.

    Output-
    Ratio of the two means, or 1 when either team has no batting-first match in its window.
    '''
    def _first_innings_balls_per_wicket(team_id):
        involved = (match_data['team1_id'] == team_id) | (match_data['team2_id'] == team_id)
        recent = match_data[(match_data['match_dt'] < date) & involved]\
                    .sort_values(by='match_dt', ascending=False).head(n)

        won_toss = recent['toss_winner_id'] == team_id
        batted_first = (won_toss & (recent['toss_decision'] == 'bat')) | \
                       (~won_toss & (recent['toss_decision'] == 'field'))

        first_innings = recent[batted_first]
        # +1 keeps the denominator non-zero when no wicket fell
        return first_innings['inning1_balls'] / (first_innings['inning1_wickets'] + 1)

    rate1 = _first_innings_balls_per_wicket(team1_id)
    rate2 = _first_innings_balls_per_wicket(team2_id)

    if len(rate1) == 0 or len(rate2) == 0:
        return 1
    return rate1.mean() / rate2.mean()

def team1AvgRR(team1_id, team2_id, date, n):
    '''
    Ratio (team1 over team2) of the teams' average run rate (runs per ball)
    over each team's n most recent matches before a date.

    For every match the team's own innings is used: the first innings when the
    team batted first (won the toss and chose 'bat', or did not win the toss
    and the decision was 'field'), the second innings otherwise. Both the runs
    and the balls are taken from that same innings, so second-innings matches
    contribute to the average as well.

    Input-
    1. team1_id: id of the team in the numerator.
    2. team2_id: id of the team in the denominator.
    3. date: only matches with match_dt earlier than this date are counted.
    4. n: number of most recent matches to look at per team.

    Output-
    mean(run rate of team1) / mean(run rate of team2), or 1 when either team has no match in its window.
    '''
    def _run_rates(team_id):
        involved = (match_data['team1_id'] == team_id) | (match_data['team2_id'] == team_id)
        recent = match_data[(match_data['match_dt'] < date) & involved]\
                    .sort_values(by='match_dt', ascending=False).head(n)

        won_toss = recent['toss_winner_id'] == team_id
        batted_first = (won_toss & (recent['toss_decision'] == 'bat')) | \
                       (~won_toss & (recent['toss_decision'] == 'field'))

        runs = recent['inning1_runs'].where(batted_first, recent['inning2_runs'])
        balls = recent['inning1_balls'].where(batted_first, recent['inning2_balls'])
        # An innings without any ball bowled has no defined run rate; turning the
        # zero into NaN makes mean() skip it instead of producing inf.
        balls = balls.where(balls > 0)
        return runs / balls

    rr1 = _run_rates(team1_id)
    rr2 = _run_rates(team2_id)

    if len(rr1) == 0 or len(rr2) == 0:
        return 1
    return rr1.mean() / rr2.mean()


In [57]:
# Context features (ground / lighting / series / overall innings averages).
# Each one is computed row by row from the 15 most recent matches before the row's
# match date, first for train_data and then for test_data.
_context_features = {
    'groundAvgWicketsLast15': lambda row: avgWicketsGround(row['ground_id'], row['match_dt'], 15),
    'lightAvgRunsLast15': lambda row: lightAvgRunsLast15(row['lighting'], row['match_dt'], 15),
    'lightAvgWicketsLast15': lambda row: lightAvgWicketsLast15(row['lighting'], row['match_dt'], 15),
    'seriesAvgRunsLast15': lambda row: seriesAvgRunsLast15(row['series_name'], row['match_dt'], 15),
    'seriesAvgWicketsLast15': lambda row: seriesAvgWicketsLast15(row['series_name'], row['match_dt'], 15),
    'inn1AvgRunsLast15': lambda row: inn1AvgRunsLast15(row['match_dt'], 15),
    'inn2AvgRunsLast15': lambda row: inn2AvgRunsLast15(row['match_dt'], 15),
    'inn1AvgWicketsLast15': lambda row: inn1AvgWicketsLast15(row['match_dt'], 15),
    'inn2AvgWicketsLast15': lambda row: inn2AvgWicketsLast15(row['match_dt'], 15),
}

for _feature_name, _compute in _context_features.items():
    for _frame in (train_data, test_data):
        _frame[_feature_name] = _frame.apply(_compute, axis=1)

In [58]:
# Team1 to Team2 ratio features
# Every feature is computed row by row, first for train_data and then for test_data.
# The order of the entries is kept as is: some of the feature functions write
# helper columns to match_data that the following ones read.
_ratio_features = {
    'team1tossWinnerWins': lambda row: team1tossWinnerWins(row['team1_id'], row['team2_id'], row['match_dt']),
    'team1BatsFirstWins': lambda row: team1BatsFirstWins(row['team1_id'], row['team2_id'], row['match_dt']),
    'team1BatsSecondWins': lambda row: team1BatsSecondWins(row['team1_id'], row['team2_id'], row['match_dt']),
    'team1WinpLight': lambda row: team1WinpLight(row['team1_id'], row['team2_id'], row['match_dt'], row['lighting']),
    'team1WinpSeries': lambda row: team1WinpSeries(row['team1_id'], row['team2_id'], row['match_dt'], row['series_name']),
    'team1AvgRunsMargin': lambda row: team1AvgRunsMargin(row['team1_id'], row['team2_id'], row['match_dt'], 15),
    'team1AvgWicketsMargin': lambda row: team1AvgWicketsMargin(row['team1_id'], row['team2_id'], row['match_dt'], 15),
    'team1AvgWicketsLost': lambda row: team1AvgWicketsLost(row['team1_id'], row['team2_id'], row['match_dt'],15),
    'team1AvgRR': lambda row: team1AvgRR(row['team1_id'], row['team2_id'], row['match_dt'],15),
}

for _feature_name, _compute in _ratio_features.items():
    for _frame in (train_data, test_data):
        _frame[_feature_name] = _frame.apply(_compute, axis=1)

In [131]:
# team1_id, team2_id, date = train_data.iloc[0][['team1_id', 'team2_id', 'match_dt']]
# team1_id, team2_id, date

In [132]:
# df_rel1 = match_data[(match_data['match_dt']<date)&\
#                       ((match_data['team1_id']==team1_id)|(match_data['team2_id']==team1_id))]\
#                         .sort_values(by='match_dt', ascending=False).head(15)
# df_rel1['team1'] = team1_id
# df_rel1['bat_inning'] = np.where( ((df_rel1['team1']==df_rel1['toss_winner_id'])&(df_rel1['toss_decision']=='bat'))|\
#                                         ((df_rel1['team1']!=df_rel1['toss_winner_id'])&(df_rel1['toss_decision']=='field')) , 1, 2)

# df_rel1 = df_rel1[df_rel1['bat_inning']==1]
# df_rel1[['team1_id', 'team2_id', 'toss_winner_id', 'toss_decision', 'bat_inning']]

In [60]:
# Inspect the column labels of train_data after feature creation
train_data.columns

Index(['match_id', 'match_dt', 'team1', 'team1_id', 'team2', 'team2_id',
       'toss_winner', 'toss_decision', 'ground_id', 'lighting', 'series_name',
       'winner_id', 'team1_roster_ids', 'team2_roster_ids', 'team1Num50Last15',
       'team1WinpLast5', 'team1OnlyAvgRunsLast15', 'team1WinpLast15F2F',
       'groundAvgRunsLast15', 'groundAvgWicketsLast15', 'lightAvgRunsLast15',
       'lightAvgWicketsLast15', 'seriesAvgRunsLast15',
       'seriesAvgWicketsLast15', 'inn1AvgRunsLast15', 'inn2AvgRunsLast15',
       'inn1AvgWicketsLast15', 'inn2AvgWicketsLast15', 'team1tossWinnerWins',
       'team1BatsFirstWins', 'team1BatsSecondWins', 'team1WinpLight',
       'team1WinpSeries', 'team1AvgRunsMargin', 'team1AvgWicketsMargin',
       'team1AvgWicketsLost', 'team1AvgRR'],
      dtype='object')

In [61]:
# Names of the feature columns: the five provided features (renamed earlier)
# followed by the features engineered in this notebook.
cols = [
    # provided features
    'team1Num50Last15',
    'team1WinpLast5',
    'team1OnlyAvgRunsLast15',
    'team1WinpLast15F2F',
    'groundAvgRunsLast15',
    # context averages
    'groundAvgWicketsLast15',
    'lightAvgRunsLast15',
    'lightAvgWicketsLast15',
    'seriesAvgRunsLast15',
    'seriesAvgWicketsLast15',
    'inn1AvgRunsLast15',
    'inn2AvgRunsLast15',
    'inn1AvgWicketsLast15',
    'inn2AvgWicketsLast15',
    # team1 / team2 ratios
    'team1tossWinnerWins',
    'team1BatsFirstWins',
    'team1BatsSecondWins',
    'team1WinpLight',
    'team1WinpSeries',
    'team1AvgRunsMargin',
    'team1AvgWicketsMargin',
    'team1AvgWicketsLost',
    'team1AvgRR',
]

In [62]:
# Binary target: 1 when team1 won the match, 0 otherwise. Vectorised comparison
# instead of a row-wise apply; a missing winner_id compares unequal and gives 0.
train_data['winner_01'] = (train_data['team1_id'] == train_data['winner_id']).astype('int64')

# Replace every remaining missing value (e.g. a mean over an empty history) with 0.
# Written as two plain statements so the cell does not display a meaningless (None, None).
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

(None, None)

In [66]:
# train_data.to_csv('newdata//1//train_data.csv', index=False)
# test_data.to_csv('newdata//1//test_data.csv', index=False)