In [189]:
# Importing libraries
import ast

import numpy as np
import pandas as pd
# Enable pandas Copy-on-Write mode for every DataFrame used below.
pd.options.mode.copy_on_write = True 

In [190]:
# Helper function
def to_datetime(df):
    """Convert the 'match_dt' column of `df` to datetimes, in place.

    The column is parsed with the fixed format '%Y-%m-%d'. The same frame
    object that was passed in is returned so calls can be chained.
    """
    parsed_dates = pd.to_datetime(df['match_dt'], format='%Y-%m-%d')
    df['match_dt'] = parsed_dates
    return df

def rm_blankspace(df):
    """Replace every space in the column labels of `df` with an underscore, in place.

    Returns the same frame object that was passed in.
    """
    def underscored(label):
        # 'match id' -> 'match_id'
        return label.replace(' ', '_')

    df.rename(columns=underscored, inplace=True)
    return df

def evaluate(l):
    """Parse every string in `l` as a Python literal.

    Parameters
    ----------
    l : iterable of str
        Text fragments such as '7907451.0' or '42'.

    Returns
    -------
    list
        The parsed values, in the same order as `l`.

    Raises
    ------
    ValueError, SyntaxError
        If a fragment is not a valid Python literal.
    """
    # NOTE(review): this used to call the built-in eval() on text that comes
    # straight out of a CSV file, which would execute any expression found
    # there. ast.literal_eval accepts only literals (numbers, strings,
    # containers, ...), which is all the callers in this notebook pass in.
    return [ast.literal_eval(x) for x in l]

def unwrap_rosters(df):
    """Turn the two roster columns of `df` from text into lists, in place.

    For 'team1_roster_ids' and 'team2_roster_ids', each cell is split on the
    single-quote character, every second piece (the text found between quotes)
    is kept, and those pieces are handed to `evaluate`. The same frame object
    is returned.
    """
    def parse_roster(raw):
        quoted_pieces = raw.split("'")[1::2]
        return evaluate(quoted_pieces)

    for roster_col in ('team1_roster_ids', 'team2_roster_ids'):
        df[roster_col] = df[roster_col].apply(parse_roster)
    return df

def data_preprocessing(df):
    """Run the shared clean-up steps on `df` and return the result.

    Applies `to_datetime` first and `rm_blankspace` second.
    """
    return rm_blankspace(to_datetime(df))

In [191]:
# Importing datasets
# The train/test match tables are read from the 'newdata//3' folder; the
# batsman-, bowler- and match-level tables are read from the 'data' folder.
train_data = pd.read_csv('newdata//3//train_data.csv',)

batsman_data = pd.read_csv('data//batsman_level_data.csv')

bowler_data = pd.read_csv('data//bowler_level_data.csv')

match_data = pd.read_csv('data//match_level_data.csv')

test_data = pd.read_csv('newdata//3//test_data.csv')

In [192]:
# Tables with roster columns: parse 'match_dt', replace spaces in the column
# names, then turn the roster text into lists.
train_data = data_preprocessing(train_data)
train_data = unwrap_rosters(train_data)
match_data = data_preprocessing(match_data)
match_data = unwrap_rosters(match_data)
test_data = data_preprocessing(test_data)
test_data = unwrap_rosters(test_data)

# The player-level tables only get the date / column-name clean-up here.
batsman_data = data_preprocessing(batsman_data)
bowler_data = data_preprocessing(bowler_data)

In [193]:
def _split_details(details):
    """Return fields 1 and 2 of a colon-separated details string as a list.

    Anything that is not a string (a missing value, or a list produced by an
    earlier run of this cell) is returned unchanged, so re-running the cell is
    harmless instead of raising on the already-split values.
    """
    if isinstance(details, str):
        return details.split(':')[1:3]
    return details


# Element-wise map over the single column: missing values stay missing and no
# row-wise DataFrame.apply over a filtered copy of the whole table is needed.
batsman_data['bowler_details'] = batsman_data['bowler_details'].map(_split_details)
batsman_data['batsman_details'] = batsman_data['batsman_details'].map(_split_details)
bowler_data['bowler_details'] = bowler_data['bowler_details'].map(_split_details)

In [269]:
# Drop the label column if the loaded training table already carries it; it is
# rebuilt after the feature columns have been added. errors='ignore' keeps the
# cell safe to re-run (or to run on data without the column) instead of
# raising KeyError.
train_data = train_data.drop(columns=['winner_01'], errors='ignore')

In [111]:
def ngiveLastNgamesPlayer(player_id, date, bat_or_bowl, n):
    """Return the `n` most recent rows for a player strictly before `date`.

    Parameters
    ----------
    player_id : value convertible with float()
        Identifier matched against 'batsman_id' or 'bowler_id'.
    date : datetime-like
        Only rows whose 'match_dt' is earlier than this are considered.
    bat_or_bowl : str
        'bat' selects the batsman table; any other value selects the bowler table.
    n : int
        Number of rows to keep, counted from the latest 'match_dt'.

    Returns
    -------
    pandas.DataFrame
        At most `n` rows, newest first.
    """
    # Same filter and ordering as giveLastNgamesPlayer, so reuse it instead of
    # keeping a second copy of the selection logic.
    full_history = giveLastNgamesPlayer(player_id=player_id, date=date, bat_or_bowl=bat_or_bowl)
    return full_history.head(n)

def giveLastNgamesPlayer(player_id, date, bat_or_bowl):
    """Return every row for a player strictly before `date`, newest first.

    `bat_or_bowl == 'bat'` reads the global `batsman_data` table and matches on
    'batsman_id'; any other value reads `bowler_data` and matches on
    'bowler_id'. `player_id` is converted with float() before the comparison.
    """
    use_batting = bat_or_bowl == 'bat'
    history = batsman_data if use_batting else bowler_data
    player_col = 'batsman_id' if use_batting else 'bowler_id'

    played_before = history['match_dt'] < date
    is_player = history[player_col] == float(player_id)
    selected = history[played_before & is_player]
    return selected.sort_values(by='match_dt', ascending=False)

def _batting_threshold_total(player_list, date, filter_col, threshold, value_cols):
    """Sum `value_cols` over the prior batting rows of `player_list` where `filter_col` > `threshold`."""
    per_player = []
    for player in player_list:
        history = giveLastNgamesPlayer(player_id=player, date=date, bat_or_bowl='bat')
        history = history[[filter_col, *value_cols]]
        qualifying = history[history[filter_col] > threshold]
        # Add the value columns row by row *before* summing, so a row with a
        # missing value in any of them contributes nothing (the sum skips NaN).
        row_values = qualifying[value_cols[0]]
        for extra_col in value_cols[1:]:
            row_values = row_values + qualifying[extra_col]
        per_player.append(np.sum(row_values))
    return np.sum(per_player)


def _batting_threshold_ratio(player_list1, player_list2, date, filter_col, threshold, value_cols):
    """Team-1 / team-2 ratio of `_batting_threshold_total`; 1 when either total is 0."""
    team1_total = _batting_threshold_total(player_list1, date, filter_col, threshold, value_cols)
    team2_total = _batting_threshold_total(player_list2, date, filter_col, threshold, value_cols)
    if team1_total == 0 or team2_total == 0:
        # Neutral value: avoids dividing by zero and avoids a 0 ratio.
        return 1
    return team1_total / team2_total


def team1DeathBoundaries(player_list1, player_list2, date, over_threshold=16):
    """Ratio of team 1 to team 2 prior boundaries in late-entry innings.

    For every player, batting rows before `date` with
    'over_faced_first' > `over_threshold` are kept and 'Fours' + 'Sixes' is
    summed over them.

    Parameters
    ----------
    player_list1, player_list2 : iterable
        Player ids of team 1 and team 2.
    date : datetime-like
        Only rows strictly before this date are used.
    over_threshold : number, default 16
        Lower bound (exclusive) on 'over_faced_first'.

    Returns
    -------
    number
        team1 total / team2 total, or 1 when either total is 0.
    """
    return _batting_threshold_ratio(player_list1, player_list2, date,
                                    'over_faced_first', over_threshold, ('Fours', 'Sixes'))


def team1DeathRuns(player_list1, player_list2, date, over_threshold=16):
    """Ratio of team 1 to team 2 prior runs in late-entry innings.

    Same row selection as `team1DeathBoundaries`
    ('over_faced_first' > `over_threshold`), but the 'runs' column is summed.
    Returns 1 when either team's total is 0.
    """
    return _batting_threshold_ratio(player_list1, player_list2, date,
                                    'over_faced_first', over_threshold, ('runs',))


def team1Dravidian(player_list1, player_list2, date, balls_threshold=15):
    """Ratio of team 1 to team 2 prior runs scored in long innings.

    For every player, batting rows before `date` with
    'balls_faced' > `balls_threshold` (default 15) are kept and 'runs' is
    summed. Returns 1 when either team's total is 0.
    """
    return _batting_threshold_ratio(player_list1, player_list2, date,
                                    'balls_faced', balls_threshold, ('runs',))


In [266]:
def _details_contains(details, position, keyword):
    """Return True when `details[position]` exists, is text, and contains `keyword`."""
    # Missing details arrive as NaN (a float) and short details lists have no
    # element at `position`; both used to raise, now they simply do not match.
    if not isinstance(details, (list, tuple)) or len(details) <= position:
        return False
    field = details[position]
    return isinstance(field, str) and keyword in field


def _keyword_total(player_list, date, bat_or_bowl, details_col, position, keyword, value_col):
    """Sum `value_col` over the prior rows of `player_list` whose details field matches `keyword`."""
    per_player = []
    for player in player_list:
        history = giveLastNgamesPlayer(player_id=player, date=date, bat_or_bowl=bat_or_bowl)
        history = history[[details_col, value_col]]
        if len(history) == 0:
            continue
        matches = [_details_contains(entry, position, keyword)
                   for entry in history[details_col].values]
        per_player.append(np.sum(history.loc[matches, value_col]))
    return np.sum(per_player)


def _keyword_ratio(player_list1, player_list2, date, bat_or_bowl,
                   details_col, position, keyword, value_col):
    """Team-1 / team-2 ratio of `_keyword_total`; 1 when either total is 0."""
    team1_total = _keyword_total(player_list1, date, bat_or_bowl,
                                 details_col, position, keyword, value_col)
    team2_total = _keyword_total(player_list2, date, bat_or_bowl,
                                 details_col, position, keyword, value_col)
    if team1_total == 0 or team2_total == 0:
        # Neutral value: avoids dividing by zero and avoids a 0 ratio.
        return 1
    return team1_total / team2_total


def team1RunsLeft(player_list1, player_list2, date):
    """Ratio of team 1 to team 2 prior runs on batting rows tagged 'Left'.

    A row counts when element 0 of its 'batsman_details' list contains 'Left'
    (presumably the batting hand; confirm against the data dictionary).
    Returns 1 when either team's total is 0.
    """
    return _keyword_ratio(player_list1, player_list2, date,
                          'bat', 'batsman_details', 0, 'Left', 'runs')


def team1RunsRight(player_list1, player_list2, date):
    """Ratio of team 1 to team 2 prior runs on batting rows tagged 'Right'.

    A row counts when element 0 of its 'batsman_details' list contains 'Right'.
    Returns 1 when either team's total is 0.
    """
    return _keyword_ratio(player_list1, player_list2, date,
                          'bat', 'batsman_details', 0, 'Right', 'runs')


def team1WicketsLeft(player_list1, player_list2, date):
    """Ratio of team 1 to team 2 prior wickets on bowling rows tagged 'Left'.

    A row counts when element 1 of its 'bowler_details' list contains 'Left'
    (presumably the bowling arm; confirm against the data dictionary).
    Returns 1 when either team's total is 0.
    """
    return _keyword_ratio(player_list1, player_list2, date,
                          'bowl', 'bowler_details', 1, 'Left', 'wicket_count')


def team1WicketsRight(player_list1, player_list2, date):
    """Ratio of team 1 to team 2 prior wickets on bowling rows tagged 'Right'.

    A row counts when element 1 of its 'bowler_details' list contains 'Right'.
    Returns 1 when either team's total is 0.
    """
    return _keyword_ratio(player_list1, player_list2, date,
                          'bowl', 'bowler_details', 1, 'Right', 'wicket_count')


def team1WicketsFast(player_list1, player_list2, date):
    """Ratio of team 1 to team 2 prior wickets on bowling rows tagged 'fast'.

    A row counts when element 1 of its 'bowler_details' list contains 'fast'.
    Returns 1 when either team's total is 0.
    """
    return _keyword_ratio(player_list1, player_list2, date,
                          'bowl', 'bowler_details', 1, 'fast', 'wicket_count')


def team1WicketsOffbreak(player_list1, player_list2, date):
    """Ratio of team 1 to team 2 prior wickets on bowling rows tagged 'offbreak'.

    A row counts when element 1 of its 'bowler_details' list contains 'offbreak'.
    Returns 1 when either team's total is 0.
    """
    return _keyword_ratio(player_list1, player_list2, date,
                          'bowl', 'bowler_details', 1, 'offbreak', 'wicket_count')


def team1WicketsOrthodox(player_list1, player_list2, date):
    """Ratio of team 1 to team 2 prior wickets on bowling rows tagged 'orthodox'.

    A row counts when element 1 of its 'bowler_details' list contains 'orthodox'.
    Returns 1 when either team's total is 0.
    """
    return _keyword_ratio(player_list1, player_list2, date,
                          'bowl', 'bowler_details', 1, 'orthodox', 'wicket_count')


def team1WicketsGoogly(player_list1, player_list2, date):
    """Ratio of team 1 to team 2 prior wickets on bowling rows tagged 'googly'.

    A row counts when element 1 of its 'bowler_details' list contains 'googly'.
    Returns 1 when either team's total is 0.
    """
    return _keyword_ratio(player_list1, player_list2, date,
                          'bowl', 'bowler_details', 1, 'googly', 'wicket_count')

def team1CaptainWickets(player_list1, player_list2, date):
    """Ratio of team 2 to team 1 counts of prior batting rows flagged 'is_bowler_captain'.

    For every player the batting history before `date` is fetched and the rows
    with 'is_bowler_captain' == 1 are counted. Unlike the other team ratios in
    this notebook, the result is team 2's count divided by team 1's.
    NOTE(review): presumably because these rows are dismissals suffered by the
    batter, so fewer is better for team 1 - confirm the inversion is intended.

    Returns 1 when either team's count is 0.
    """
    def captain_row_counts(roster):
        counts = []
        for member in roster:
            history = giveLastNgamesPlayer(player_id=member, date=date, bat_or_bowl='bat')
            flags = history[['is_bowler_captain']]
            counts.append(len(flags[flags['is_bowler_captain'] == 1]))
        return counts

    team1_total = np.sum(captain_row_counts(player_list1))
    team2_total = np.sum(captain_row_counts(player_list2))

    if team1_total == 0 or team2_total == 0:
        return 1
    return team2_total / team1_total

In [97]:
# Batting-history features: output column name -> function computing it from
# (team1 roster, team2 roster, match date). One table instead of a copy-pasted
# statement per feature and per frame.
_BATTING_FEATURES = {
    'team1DeathRuns': team1DeathRuns,
    'team1DeathBoundaries': team1DeathBoundaries,
    'team1Dravidian': team1Dravidian,
    'team1RunsLeft': team1RunsLeft,
    'team1RunsRight': team1RunsRight,
}

for _column, _feature in _BATTING_FEATURES.items():
    for _frame in (train_data, test_data):
        # `feature=_feature` binds the current function to the lambda.
        _frame[_column] = _frame.apply(
            lambda row, feature=_feature: feature(
                row['team1_roster_ids'], row['team2_roster_ids'], row['match_dt']
            ),
            axis=1,
        )

In [248]:
# Bowling-history features: output column name -> function computing it from
# (team1 roster, team2 roster, match date). One table instead of a copy-pasted
# statement per feature and per frame.
_BOWLING_FEATURES = {
    'team1WicketsLeft': team1WicketsLeft,
    'team1WicketsRight': team1WicketsRight,
    'team1WicketsFast': team1WicketsFast,
    'team1WicketsOffbreak': team1WicketsOffbreak,
    'team1WicketsOrthodox': team1WicketsOrthodox,
    'team1WicketsGoogly': team1WicketsGoogly,
    'team1CaptainWickets': team1CaptainWickets,
}

for _column, _feature in _BOWLING_FEATURES.items():
    for _frame in (train_data, test_data):
        # `feature=_feature` binds the current function to the lambda.
        _frame[_column] = _frame.apply(
            lambda row, feature=_feature: feature(
                row['team1_roster_ids'], row['team2_roster_ids'], row['match_dt']
            ),
            axis=1,
        )

In [279]:
# Label: 1 when team 1 is the recorded winner, 0 otherwise. A vectorised
# comparison replaces the row-by-row apply.
train_data['winner_01'] = (train_data['team1_id'] == train_data['winner_id']).astype(int)

# Replace the remaining missing values with 0 in both tables. Two plain
# reassignments, so the cell no longer evaluates to a stray (None, None) tuple.
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

(None, None)

In [281]:
# Write the feature-augmented tables to the 'newdata//4' folder; index=False
# keeps the DataFrame index out of the CSV files.
train_data.to_csv('newdata//4//train_data.csv', index=False)
test_data.to_csv('newdata//4//test_data.csv', index=False)