## Training and Testing Pipeline

In [669]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [670]:
train_data = pd.read_csv('testing/train_data.csv')
test_data = pd.read_csv('testing/test_data.csv')

In [671]:
# Display options
pd.set_option('display.max_columns',50)

In [672]:
train_data.shape
train_data.head(2)

(948, 23)

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,winner,winner_id,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15
0,9331181,Ba,11283,9373356.0:7857520.0:4232164.0:4566540.0:329940...,Hl Ph,12634,3500958.0:4231751.0:2735081.0:2035102.0:369833...,Hl Ph,12634,Hl Ph,field,Hr Ct Sm Ie,Indore,2022-10-20,day/night match,Sd Mq Ai Ty,2022/23,7398,1.666667,0.672131,139.0,100.0,157.178571
1,8797060,Ed,20,2089079.0:6139370.0:2076192.0:62432.0:2083409....,Wt Is,41,4690258.0:4069666.0:4230127.0:1942317.0:161392...,Ed,20,Wt Is,field,Kn Ol Bn Bs,Bridgetown,2022-01-23,day/night match,Ed tr of Wt Is,2021/22,1406,1.285714,1.952381,156.0,50.0,103.5


In [673]:
test_data.shape
test_data.head(2)

(271, 21)

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15
0,9250275,Jy,28594,7438955.0:8271969.0:8369661.0:3685247.0:259025...,Ud Ss of Aa,90,2436944.0:8117500.0:6703528.0:3010748.0:161387...,Ud Ss of Aa,field,Bo Ac Cb,Bulawayo,2022-07-11,day match,Ud Ss of Aa tr of Ze,2022,3226,0.0,0.019608,,0.0,
1,9262189,Ga An Ws,36084,3715690.0:6818622.0:4069666.0:393014.0:4690188...,Ja Ts,36098,4690258.0:3761246.0:8464385.0:5742470.0:315072...,Ja Ts,field,Pe Sm Ga,Providence,2022-09-21,night match,Cn Pr Le,2022,13915,0.615385,0.344262,151.285714,66.67,153.5


# Calculating batsmen features

In [674]:
batsmen=pd.read_csv('dataset/batsman_level_scorecard.csv')

In [675]:
batsmen.head(2)

Unnamed: 0,match id,batsman,batsman_id,batsman_details,is_batsman_captain,is_batsman_keeper,inning,runs,balls_faced,over_faced_first,wicket kind,out_by_bowler,out_by_fielder,bowler_id,bowler_details,is_bowler_keeper,is_bowler_captain,strike_rate,Fours,Sixes,match_dt
0,8638034,KD Ce,7907451.0,NZ:Right-hand bat:Right-arm medium-fast:,0.0,0.0,1,7,5,1.1,caught,JS Nm,TM Jn,2486896.0,NZ:Left-hand bat:Right-arm medium-fast:,0.0,0.0,140.0,1.0,,2021-01-01
1,8638034,TL St,4381761.0,NZ:Right-hand bat:None:,0.0,1.0,1,46,46,1.2,caught,R Ra,MG Bl,6718382.0,NZ:Left-hand bat:Slow left-arm orthodox:,0.0,0.0,100.0,4.0,1.0,2021-01-01


In [676]:
# calculating 2 dictionaries to store {batsman_id: batsman average } and {batsman_id: batsman strike rate}

total_runs = {}  
balls_faced = {}
matches_played = {}

for i in range(len(batsmen)):
    batsman_id = batsmen['batsman_id'][i]
    runs = batsmen['runs'][i]
    balls = batsmen['balls_faced'][i]
    match_id = batsmen['match id'][i]
    if batsman_id in total_runs:
        total_runs[batsman_id] += runs
        balls_faced[batsman_id] += balls
        matches_played[batsman_id] += 1
    else:
        total_runs[batsman_id] = runs
        balls_faced[batsman_id] = balls
        matches_played[batsman_id] = 1
        
batsman_avg = {}
batsman_sr = {}

for batsman_id in total_runs:
    batsman_avg[batsman_id] = total_runs[batsman_id] / matches_played[batsman_id]
    batsman_sr[batsman_id] = total_runs[batsman_id] / balls_faced[batsman_id] * 100
 

# Bowler level features

In [677]:
bowlers=pd.read_csv('dataset/bowler_level_scorecard.csv')

In [678]:
bowlers.head()

Unnamed: 0,match id,bowler,bowler_id,bowler_details,is_bowler_captain,is_bowler_keeper,inning,runs,wicket_count,balls_bowled,economy,maiden,dots,Fours,Sixes,wides,noballs,match_dt
0,8638034,BV Ss,6718396.0,NZ:Right-hand bat:Right-arm medium-fast:,0.0,0.0,1,21,0,12,10.5,0,3.0,1,1,3,1,2021-01-01
1,8638034,HK Bt,1585464.0,NZ:Left-hand bat:Right-arm medium-fast:,0.0,0.0,1,34,1,24,8.5,0,7.0,3,1,2,0,2021-01-01
2,8638034,JS Nm,2486896.0,NZ:Left-hand bat:Right-arm medium-fast:,0.0,0.0,1,23,2,24,5.75,0,12.0,1,1,0,0,2021-01-01
3,8638034,LV vn Bk,3083667.0,NED:Right-hand bat:Right-arm medium-fast:,0.0,0.0,1,31,1,24,7.75,0,9.0,4,0,2,0,2021-01-01
4,8638034,PF Yd,4950294.0,NZ:Right-hand bat:Legbreak googly:,0.0,0.0,1,21,1,24,5.25,0,8.0,0,0,1,0,2021-01-01


In [679]:
# calculating 2 dictionaries to store {bowler_id: bowler strike rate } , {bowler_id: bowler economy} , {bowler_id: bowler extras rate}

total_matches = {}
total_wickets = {}
total_ball_bowled = {}
total_extras = {}
total_economy = {}

for i in range(len(bowlers)):
    bowler_id = bowlers['bowler_id'][i]
    wickets = bowlers['wicket_count'][i]
    balls = bowlers['balls_bowled'][i]
    runs = bowlers['runs'][i]
    extras = bowlers['wides'][i] + bowlers['noballs'][i]
    match_id = bowlers['match id'][i]
    economy=bowlers['economy'][i]
    if bowler_id in total_matches:
        total_matches[bowler_id] += 1
        total_wickets[bowler_id] += wickets
        total_ball_bowled[bowler_id] += balls
        total_extras[bowler_id] += extras
        total_economy[bowler_id] += economy
    else:
        total_matches[bowler_id] = 1
        total_wickets[bowler_id] = wickets
        total_ball_bowled[bowler_id] = balls
        total_extras[bowler_id] = extras
        total_economy[bowler_id] = economy
        
bowler_sr = {}
bowler_economy = {}
bowler_extras = {}

for bowler_id in total_matches:
    bowler_economy[bowler_id] = total_economy[bowler_id] / total_matches[bowler_id]
    bowler_extras[bowler_id] = total_extras[bowler_id] / total_ball_bowled[bowler_id]
    if total_wickets[bowler_id] == 0:
        bowler_sr[bowler_id] = 0
    else:
        bowler_sr[bowler_id] = total_ball_bowled[bowler_id] /total_wickets[bowler_id]
    

## Match level features

In [680]:
match=pd.read_csv('dataset/match_level_scorecard.csv')

In [681]:
match.head()

Unnamed: 0,match id,team1,team2,winner,by,win amount,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,umpire1,umpire2,inning1_runs,inning1_wickets,inning1_balls,inning2_runs,inning2_wickets,inning2_balls,team1_id,team1_roster_ids,team2_id,team2_roster_ids,series_type,winner_id,player_of_the_match_id
0,8638034,Nn Ds,Wn,Wn,wickets,9.0,Wn,field,By Ol,Mount Maunganui,2021-01-01,night match,Sr Sh,2020/21,17681,Dn Mw,TJ Pe,148,6,130,152.0,1.0,97.0,17982,7907451.0:4381761.0:31464.0:258649.0:4949790.0...,18570,2653993.0:6718326.0:6718382.0:2486896.0:228878...,other_domestic,18570,
1,8588005,Me Rs,Sy Tr,Sy Tr,runs,7.0,Sy Tr,field,Ca Ol,Carrara,2021-01-01,night match,Bg Bh Le,2020/21,6663,D Tr,PJ Ge,166,6,106,117.0,2.0,74.0,33942,37351.0:46794.0:5406540.0:2231928.0:181404.0:1...,33963,1506098.0:1749075.0:36665.0:2083409.0:7534652....,other_domestic,33963,1749075.0
2,8587837,Sy Ss,Be Ht,Be Ht,wickets,4.0,Sy Ss,bat,Be Ct Gd,Brisbane,2021-01-02,night match,Bg Bh Le,2020/21,1476,GJ Dn,PJ Ge,165,8,126,171.0,6.0,119.0,33956,7869987.0:7620283.0:2076192.0:4002340.0:306369...,33921,7620269.0:2286437.0:87191.0:5786766.0:3114803....,other_domestic,33921,3890984.0
3,8638041,Nn Ds,Oo,Oo,wickets,2.0,Oo,field,By Ol,Mount Maunganui,2021-01-02,night match,Sr Sh,2020/21,17681,GA St,TJ Pe,153,7,128,156.0,8.0,126.0,17982,7907451.0:4381761.0:31464.0:4949790.0:258649.0...,18360,2319638.0:256080.0:7918280.0:3913447.0:2690498...,other_domestic,18360,
4,8587921,Ht Hs,Me Ss,Ht Hs,runs,21.0,Me Ss,field,Be Ol,Hobart,2021-01-02,day/night match,Bg Bh Le,2020/21,6348,GA Ad,NR Je,164,5,126,143.0,9.0,123.0,33928,4223883.0:2161599.0:1655436.0:5788418.0:319948...,33949,363047.0:2275097.0:3901078.0:2275195.0:4230127...,other_domestic,33928,3125849.0


In [682]:
# # calculating 2 dictionaries to store {team_id: team average }
#  if team1 has won toss and choose bat then inning1 score will be team1 score and inning2 score will be team2 score

total_runs = {}
total_matches={}
for i in range(len(match)):
    
    toss_winner = match['toss winner'][i]
    team2=toss_winner
    
    if toss_winner == match['team2_id'][i]:
        team1 = match['team1_id'][i]
    else:
        team1 = match['team2_id'][i]
    
    if match['toss decision'][i] == 'bat':
        team2_score=match['inning1_runs'][i]
        team1_score=match['inning2_runs'][i]
    
    else:
        team1_score=match['inning1_runs'][i]
        team2_score=match['inning2_runs'][i]
    
    if team1 in total_runs:
        total_runs[team1] += team1_score
        total_matches[team1] += 1
    
    else:
        total_runs[team1] = team1_score
        total_matches[team1] = 1
        
    if team2 in total_runs:
        total_runs[team2] += team2_score
        total_matches[team2] += 1
    
    else:
        total_runs[team2] = team2_score
        total_matches[team2] = 1
        
team_avg = {}

for team_id in total_runs:
    team_avg[team_id] = total_runs[team_id] / total_matches[team_id]

In [683]:
def winpCrossLastn(team1_id, team2_id, date, n):
    '''
    Function to compute team1's win% against team2 from the current game in their past n encounters.
    
    Input-
    1. team1_id: ID of team1 to calculate win% of.
    2. team2_id: ID of team2 to calculate win% against.
    3: date: match date of the current game for which the feature is to be calculated.
    4. n: look-back window of games for both these teams.
    
    Output- None
    
    Returns- Float value denoting team1's win% against team2 in their past n games against each other.
    '''
    # filter out games where either team1_id is input team1 and team2_id is input team2, or where team2_id is input team1 and team1_id is input team2.
    # Also, match date is less than current games's input date, sort desc by date and get top n rows (games)
    df_rel = match[(match['match_dt']<date)&\
                      (((match['team1_id']==team1_id)&(match['team2_id']==team2_id))|((match['team1_id']==team2_id)&(match['team2_id']==team1_id)))]\
                        .sort_values(by='match_dt', ascending=False).head(n)
    win_count = df_rel[df_rel['winner_id']==team1_id].shape[0] # Counting number of rows (games) where winner is input team1.
    if win_count == 0:
        return 0
    return round(win_count*100/df_rel.shape[0], 2) # return Float denoting team1's win% against team2 in past n games rounded to 2 decimal places.


In [684]:
def win_percentage_last_n_matches(team_id, date,n):
    '''
    Function to calculate a team's win percentage in their last n games.

    '''
    # Filter games where team1 or team2 is the input team_id and the match date is before the input date
    df_rel = match[(match['match_dt'] < date) &
                            ((match['team1_id'] == team_id) | (match['team2_id'] == team_id))]\
                            .sort_values(by='match_dt', ascending=False).head(n)
    
    # Calculate the number of wins
    wins = df_rel[(df_rel['winner_id'] == team_id)].shape[0]
    
    # Calculate the number of matches
    total_matches = df_rel.shape[0]
    
    if total_matches == 0:
        return 0.0  # Avoid division by zero
    
    # Calculate win percentage
    win_percentage = (wins / total_matches) * 100
    return win_percentage


In [685]:
# Function to get player IDs
def get_player_ids(roster_id_str):
    players_id = [int(float(x)) for x in roster_id_str.split(":")]
    return players_id

In [686]:
batsman_lvl_data=pd.read_csv('dataset/batsman_level_scorecard.csv')

In [687]:
# batsman_lvl_data.head(10)

In [688]:
def calculate_sixes_last15_games(team_roster_ids, match_dates, batsman_lvl_data):
    total_sixes = []

    for tm_roster_id, match_date in zip(team_roster_ids, match_dates):
        players = get_player_ids(tm_roster_id)
        bats_indices = [i for i, player in enumerate(players) if player in batsman_lvl_data['batsman_id'].values]
        sixes_count = 0

        for i in bats_indices:
            my_batsid = players[i]
            rdat = batsman_lvl_data[(batsman_lvl_data['batsman_id'] == my_batsid) & (batsman_lvl_data['match_dt'] < match_date)]
            rdat = rdat.sort_values('match_dt')

            if len(rdat) >= 15:
                rdat_last15 = rdat.tail(15)
            else:
                rdat_last15 = rdat

            sixes_count += rdat_last15['Sixes'].fillna(0).sum()

        total_sixes.append(sixes_count)

    return total_sixes

In [689]:


# Function to calculate team batsmen average strike rate of last 5 matches
def team_bats_avg_strike_last5(team_roster_ids, match_dates, batsman_lvl_data):
    team_bats_avg_strike = []

    for tm_roster_id, match_date in zip(team_roster_ids, match_dates):
        players = get_player_ids(tm_roster_id)
        bats_indices = [i for i, player in enumerate(players) if player in batsman_lvl_data['batsman_id'].values]
        bats_strike = []

        for i in bats_indices:
            my_batsid = players[i]
            rdat = batsman_lvl_data[(batsman_lvl_data['batsman_id'] == my_batsid) & (batsman_lvl_data['match_dt'] < match_date)]
            rdat = rdat.sort_values('match_dt')
            rdat_len = len(rdat)

            if rdat_len >= 5:
                rdat_last5 = rdat.tail(5)
            elif rdat_len == 0:
                bats_strike.append(0)
                continue
            else:
                rdat_last5 = rdat.tail(rdat_len)

            bats_strike.append(rdat_last5['strike_rate'].mean())

        team_bats_avg_strike.append(np.mean(bats_strike))

    return team_bats_avg_strike


In [690]:
bowler_lvl_data=pd.read_csv('dataset/bowler_level_scorecard.csv')

In [691]:


# Function to calculate performance score
def calculate_performance_score(runs, wicket_count, balls_bowled, economy, maiden, dots, Fours, Sixes, wides, noballs):
    weight_wickets = 25
    weight_maidens = 5
    weight_dots = 2
    weight_runs = -1
    weight_fours = -0.5
    weight_sixes = -0.8
    weight_wides = -0.2
    weight_noballs = -0.4

    performance_score = (wicket_count * weight_wickets) + (maiden * weight_maidens) + (dots * weight_dots) + (runs * weight_runs) + (Fours * weight_fours) + (Sixes * weight_sixes) + (wides * weight_wides) + (noballs * weight_noballs)

    return performance_score


In [692]:
def team_bowler_feat_last5(team_roster_ids, match_dates, bowler_lvl_data):
    team_bowlers_avg_perfor = []
    team_bowlers_avg_eco = []

    for tm_roster_id, match_date in zip(team_roster_ids, match_dates):
        players = get_player_ids(tm_roster_id)
        bowler_indices = [i for i, player in enumerate(players) if player in bowler_lvl_data['bowler_id'].values]
        bowler_summary = []
        bowler_economy = []

        for i in bowler_indices:
            my_bowlerid = players[i]
            rdat = bowler_lvl_data[(bowler_lvl_data['bowler_id'] == my_bowlerid) & (bowler_lvl_data['match_dt'] < match_date)]
            rdat = rdat.sort_values('match_dt')
            rdat_len = len(rdat)

            if rdat_len >= 5:
                rdat_last5 = rdat.tail(5)
            elif rdat_len == 0:
                bowler_summary.append(0)
                bowler_economy.append(0)
                continue
            else:
                rdat_last5 = rdat.tail(rdat_len)

            rdat_last5 = rdat_last5.copy()  # Create a copy to avoid the SettingWithCopyWarning
            rdat_last5['performance_score'] = rdat_last5.apply(lambda row: calculate_performance_score(row['runs'], row['wicket_count'], row['balls_bowled'], row['economy'], row['maiden'], row['dots'], row['Fours'], row['Sixes'], row['wides'], row['noballs']), axis=1)

            bowler_summary.append(rdat_last5['performance_score'].mean())
            bowler_economy.append(rdat_last5['economy'].mean())

        team_bowlers_avg_perfor.append(np.mean(bowler_summary) if bowler_summary else 0)
        team_bowlers_avg_eco.append(np.mean(bowler_economy) if bowler_economy else 0)

    overall_avg_perfor = np.mean([x for x in team_bowlers_avg_perfor if x != 0])
    overall_avg_eco = np.mean([x for x in team_bowlers_avg_eco if x != 0])

    team_bowlers_avg_perfor = [x if x != 0 else overall_avg_perfor for x in team_bowlers_avg_perfor]
    team_bowlers_avg_eco = [x if x != 0 else overall_avg_eco for x in team_bowlers_avg_eco]

    return team_bowlers_avg_perfor, team_bowlers_avg_eco

In [693]:
train_data.head(2)

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,winner,winner_id,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15
0,9331181,Ba,11283,9373356.0:7857520.0:4232164.0:4566540.0:329940...,Hl Ph,12634,3500958.0:4231751.0:2735081.0:2035102.0:369833...,Hl Ph,12634,Hl Ph,field,Hr Ct Sm Ie,Indore,2022-10-20,day/night match,Sd Mq Ai Ty,2022/23,7398,1.666667,0.672131,139.0,100.0,157.178571
1,8797060,Ed,20,2089079.0:6139370.0:2076192.0:62432.0:2083409....,Wt Is,41,4690258.0:4069666.0:4230127.0:1942317.0:161392...,Ed,20,Wt Is,field,Kn Ol Bn Bs,Bridgetown,2022-01-23,day/night match,Ed tr of Wt Is,2021/22,1406,1.285714,1.952381,156.0,50.0,103.5


In [694]:
test_data.head(2)

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15
0,9250275,Jy,28594,7438955.0:8271969.0:8369661.0:3685247.0:259025...,Ud Ss of Aa,90,2436944.0:8117500.0:6703528.0:3010748.0:161387...,Ud Ss of Aa,field,Bo Ac Cb,Bulawayo,2022-07-11,day match,Ud Ss of Aa tr of Ze,2022,3226,0.0,0.019608,,0.0,
1,9262189,Ga An Ws,36084,3715690.0:6818622.0:4069666.0:393014.0:4690188...,Ja Ts,36098,4690258.0:3761246.0:8464385.0:5742470.0:315072...,Ja Ts,field,Pe Sm Ga,Providence,2022-09-21,night match,Cn Pr Le,2022,13915,0.615385,0.344262,151.285714,66.67,153.5


In [695]:
def calculate_batsmen_above_fifty(team_roster_ids, match_dates, batsman_lvl_data):
    batsmen_above_fifty = []

    for tm_roster_id, match_date in zip(team_roster_ids, match_dates):
        players = get_player_ids(tm_roster_id)
        bats_indices = [i for i, player in enumerate(players) if player in batsman_lvl_data['batsman_id'].values]
        fifty_scores = 0
        total_matches = 0

        for i in bats_indices:
            my_batsid = players[i]
            rdat = batsman_lvl_data[(batsman_lvl_data['batsman_id'] == my_batsid) & (batsman_lvl_data['match_dt'] < match_date)]
            rdat = rdat.sort_values('match_dt')
            total_matches += len(rdat)
            fifty_scores += (rdat['runs'] > 50).sum()

        if total_matches == 0:
            batsmen_above_fifty.append(0)
        else:
            batsmen_above_fifty.append(fifty_scores / total_matches)

    return batsmen_above_fifty

In [696]:
def calculate_batsmen_above_seventy_five(team_roster_ids, match_dates, batsman_lvl_data):
    batsmen_above_fifty = []

    for tm_roster_id, match_date in zip(team_roster_ids, match_dates):
        players = get_player_ids(tm_roster_id)
        bats_indices = [i for i, player in enumerate(players) if player in batsman_lvl_data['batsman_id'].values]
        fifty_scores = 0
        total_matches = 0

        for i in bats_indices:
            my_batsid = players[i]
            rdat = batsman_lvl_data[(batsman_lvl_data['batsman_id'] == my_batsid) & (batsman_lvl_data['match_dt'] < match_date)]
            rdat = rdat.sort_values('match_dt')
            total_matches += len(rdat)
            fifty_scores += (rdat['runs'] > 75).sum()

        if total_matches == 0:
            batsmen_above_fifty.append(0)
        else:
            batsmen_above_fifty.append(fifty_scores / total_matches)

    return batsmen_above_fifty

# Combining the features of dataset

In [697]:
# from geopy.geocoders import Nominatim

# # Initialize Nominatim API
# geolocator = Nominatim(user_agent="my_geocoder")

# Creating new columns in train_data and initializing with 0
train_data['batsmens_averages_1'] = 0
train_data['batsmens_strike_rates_1'] = 0
train_data['bowlers_strike_rates_1'] = 0
train_data['bowlers_economy_1'] = 0
train_data['bowlers_extras_1'] = 0

train_data['batsmens_averages_2'] = 0
train_data['batsmens_strike_rates_2'] = 0
train_data['bowlers_strike_rates_2'] = 0
train_data['bowlers_economy_2'] = 0
train_data['bowlers_extras_2'] = 0

train_data['batsmens_averages_ratios'] = 0
train_data['batsmens_strike_rates_ratios'] = 0
train_data['bowlers_strike_rates_ratios'] = 0
train_data['bowlers_economy_ratios'] = 0
train_data['bowlers_extras_ratios'] = 0

train_data['team1_winp_team2_last5']=0
train_data['team1_winp_team2_last10']=0
train_data['team_average_score_ratios'] = 0
train_data['win_percentage_team1_last_15'] = 0
train_data['win_percentage_team2_last_15'] = 0

train_data['batsmen_performance_ratio_last5'] = 0
train_data['bowlers_performance_ratio_last5'] = 0
train_data['bowlers_economy_ratios_last5'] = 0
train_data['team_sixes_ratios_last15'] = 0
train_data['team_batsmen_above_fifty_ratios'] = 0
train_data['team_batsmen_above_seventy_five_ratios'] = 0

# train_data['location_humidity'] = 0
# train_data['location_temperature'] = 0

# Filling the values in the new columns
for i in range(len(train_data)):
    # Getting ids of batsmen and bowlers from team roster id separated by :
    team1_players_ids = train_data['team1_roster_ids'][i].split(':')
    team2_players_ids = train_data['team2_roster_ids'][i].split(':')
    # print(i,team1_players_ids,team2_players_ids)
    # For team1 check if id is present in the dictionary and add the values
    for player_id in team1_players_ids:
        player_id = int(float(player_id))
        if player_id in batsman_avg:
            train_data.at[i, 'batsmens_averages_1'] += batsman_avg[player_id]
            train_data.at[i, 'batsmens_strike_rates_1'] += batsman_sr[player_id]
        if player_id in bowler_sr:
            train_data.at[i, 'bowlers_strike_rates_1'] += bowler_sr[player_id]
            train_data.at[i, 'bowlers_economy_1'] += bowler_economy[player_id]
            train_data.at[i, 'bowlers_extras_1'] += bowler_extras[player_id]
            
    # For team2 check if id is present in the dictionary and add the values
    for player_id in team2_players_ids:
        player_id = int(float(player_id))
        if player_id in batsman_avg:
            train_data.at[i, 'batsmens_averages_2'] += batsman_avg[player_id]
            train_data.at[i, 'batsmens_strike_rates_2'] += batsman_sr[player_id]
        if player_id in bowler_sr:
            train_data.at[i, 'bowlers_strike_rates_2'] += bowler_sr[player_id]
            train_data.at[i, 'bowlers_economy_2'] += bowler_economy[player_id]
            train_data.at[i, 'bowlers_extras_2'] += bowler_extras[player_id]
    
    # filling ratios
    if train_data['batsmens_averages_2'][i] == 0:
        train_data['batsmens_averages_2'][i] = 1
        
    if train_data['batsmens_strike_rates_2'][i] == 0:
        train_data['batsmens_strike_rates_2'][i] = 1
    if train_data['bowlers_strike_rates_2'][i] == 0:
        train_data['bowlers_strike_rates_2'][i] = 1
        
    if train_data['bowlers_economy_2'][i] == 0:
        train_data['bowlers_economy_2'][i] = 1
        
    if train_data['bowlers_extras_2'][i] == 0:
        train_data['bowlers_extras_2'][i] = 1
        
    
    train_data.at[i, 'batsmens_averages_ratios'] = train_data['batsmens_averages_1'][i] / train_data['batsmens_averages_2'][i]
    train_data.at[i, 'batsmens_strike_rates_ratios'] = train_data['batsmens_strike_rates_1'][i] / train_data['batsmens_strike_rates_2'][i]
    train_data.at[i, 'bowlers_strike_rates_ratios'] = train_data['bowlers_strike_rates_1'][i] / train_data['bowlers_strike_rates_2'][i]
    train_data.at[i, 'bowlers_economy_ratios'] = train_data['bowlers_economy_1'][i] / train_data['bowlers_economy_2'][i]
    train_data.at[i, 'bowlers_extras_ratios'] = train_data['bowlers_extras_1'][i] / train_data['bowlers_extras_2'][i]
    
    
    # filling team win% against each other in last 10 games
    team1_id = train_data['team1_id'][i]
    team2_id = train_data['team2_id'][i]
    date = train_data['match_dt'][i]
    
    train_data.at[i, 'team1_winp_team2_last5'] = winpCrossLastn(team1_id, team2_id, date, 5)
    train_data.at[i, 'team1_winp_team2_last10'] = winpCrossLastn(team1_id, team2_id, date, 10)

            
    # filling team average scores
    team1_id = train_data['team1_id'][i]
    team2_id = train_data['team2_id'][i]
    
    if team1_id in team_avg:
        team1_average_score= team_avg[team1_id]
    else:
        team1_average_score= 0
        
    if team2_id in team_avg:
        team2_average_score = team_avg[team2_id]
    else:
        team2_average_score = 0
    
    if team2_average_score == 0:
        train_data.at[i, 'team_average_score_ratios'] = 1
    
    else :
        train_data.at[i, 'team_average_score_ratios'] = team1_average_score / team2_average_score
    # filling win% of team1 in last 15 games from matchscore_updated
    
    train_data.at[i, 'win_percentage_team1_last_15'] = win_percentage_last_n_matches(team1_id, date, 15)
    train_data.at[i, 'win_percentage_team2_last_15'] = win_percentage_last_n_matches(team2_id, date, 15)
    
    team1_roster_ids = train_data['team1_roster_ids'][i]
    team2_roster_ids = train_data['team2_roster_ids'][i]
    match_date = train_data['match_dt'][i]
    
    team1_bats_avg_strike_last5 = team_bats_avg_strike_last5([team1_roster_ids], [match_date], batsman_lvl_data)
    team2_bats_avg_strike_last5 = team_bats_avg_strike_last5([team2_roster_ids], [match_date], batsman_lvl_data)
    if team2_bats_avg_strike_last5[0] == 0:
        train_data.at[i, 'batsmen_performance_ratio_last5'] = 0
        
    else:
        train_data.at[i, 'batsmen_performance_ratio_last5'] = team1_bats_avg_strike_last5[0] / team2_bats_avg_strike_last5[0]
        
    team1_bowlers_avg_perfor_last5, team1_bowlers_avg_eco_last5 = team_bowler_feat_last5([team1_roster_ids], [match_date], bowler_lvl_data)
    team2_bowlers_avg_perfor_last5, team2_bowlers_avg_eco_last5 = team_bowler_feat_last5([team2_roster_ids], [match_date], bowler_lvl_data)
    
    if team2_bowlers_avg_perfor_last5[0] == 0:
        train_data.at[i, 'bowlers_performance_ratio_last5'] = 0
    else:
        train_data.at[i, 'bowlers_performance_ratio_last5'] = team1_bowlers_avg_perfor_last5[0] / team2_bowlers_avg_perfor_last5[0]
    
    if team2_bowlers_avg_eco_last5[0] == 0:
        train_data.at[i, 'bowlers_economy_ratios_last5'] = 0
    else:
        train_data.at[i, 'bowlers_economy_ratios_last5'] = team1_bowlers_avg_eco_last5[0] / team2_bowlers_avg_eco_last5[0]
        
    team1_sixes_last15 = calculate_sixes_last15_games([team1_roster_ids], [match_date], batsman_lvl_data)
    team2_sixes_last15 = calculate_sixes_last15_games([team2_roster_ids], [match_date], batsman_lvl_data)
    
    if team2_sixes_last15[0] == 0:  
        train_data.at[i, 'team_sixes_ratios_last15'] = 0
    else:
        train_data.at[i, 'team_sixes_ratios_last15'] = team1_sixes_last15[0] / team2_sixes_last15[0]
        
    team1_batsmen_above_fifty = calculate_batsmen_above_fifty([team1_roster_ids], [match_date], batsman_lvl_data)
    team2_batsmen_above_fifty = calculate_batsmen_above_fifty([team2_roster_ids], [match_date], batsman_lvl_data)
    
    if team2_batsmen_above_fifty[0] == 0:
        train_data.at[i, 'team_batsmen_above_fifty_ratios'] = 0
    else:
        train_data.at[i, 'team_batsmen_above_fifty_ratios'] = team1_batsmen_above_fifty[0] / team2_batsmen_above_fifty[0]
    
    team1_batsmen_above_seventy_five = calculate_batsmen_above_seventy_five([team1_roster_ids], [match_date], batsman_lvl_data)
    team2_batsmen_above_seventy_five = calculate_batsmen_above_seventy_five([team2_roster_ids], [match_date], batsman_lvl_data)
    
    if team2_batsmen_above_seventy_five[0] == 0:
        train_data.at[i, 'team_batsmen_above_seventy_five_ratios'] = 0
        
    else:
        train_data.at[i, 'team_batsmen_above_seventy_five_ratios'] = team1_batsmen_above_seventy_five[0] / team2_batsmen_above_seventy_five[0]

  train_data.at[i, 'batsmens_averages_1'] += batsman_avg[player_id]
  train_data.at[i, 'batsmens_strike_rates_1'] += batsman_sr[player_id]
  train_data.at[i, 'bowlers_strike_rates_1'] += bowler_sr[player_id]
  train_data.at[i, 'bowlers_economy_1'] += bowler_economy[player_id]
  train_data.at[i, 'bowlers_extras_1'] += bowler_extras[player_id]
  train_data.at[i, 'batsmens_averages_2'] += batsman_avg[player_id]
  train_data.at[i, 'batsmens_strike_rates_2'] += batsman_sr[player_id]
  train_data.at[i, 'bowlers_strike_rates_2'] += bowler_sr[player_id]
  train_data.at[i, 'bowlers_economy_2'] += bowler_economy[player_id]
  train_data.at[i, 'bowlers_extras_2'] += bowler_extras[player_id]
  train_data.at[i, 'batsmens_averages_ratios'] = train_data['batsmens_averages_1'][i] / train_data['batsmens_averages_2'][i]
  train_data.at[i, 'batsmens_strike_rates_ratios'] = train_data['batsmens_strike_rates_1'][i] / train_data['batsmens_strike_rates_2'][i]
  train_data.at[i, 'bowlers_strike_rates_ratios'] 

  train_data.at[i, 'win_percentage_team1_last_15'] = win_percentage_last_n_matches(team1_id, date, 15)
  train_data.at[i, 'team1_winp_team2_last5'] = winpCrossLastn(team1_id, team2_id, date, 5)
  train_data.at[i, 'team1_winp_team2_last10'] = winpCrossLastn(team1_id, team2_id, date, 10)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return 

In [698]:
# dropping team1_roster_ids , team2_roster_ids , winner , team1 , team2 ,  columns 
train_data = train_data.drop(['series_name','season','team1_roster_ids', 'team2_roster_ids', 'winner', 'team1', 'team2','batsmens_averages_1','batsmens_strike_rates_1','bowlers_strike_rates_1','bowlers_economy_1','bowlers_extras_1','batsmens_averages_2','batsmens_strike_rates_2','bowlers_strike_rates_2','bowlers_economy_2','bowlers_extras_2'], axis=1)

In [699]:
train_data.head(2)

Unnamed: 0,match id,team1_id,team2_id,winner_id,toss winner,toss decision,venue,city,match_dt,lighting,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last5,team1_winp_team2_last10,team_average_score_ratios,win_percentage_team1_last_15,win_percentage_team2_last_15,batsmen_performance_ratio_last5,bowlers_performance_ratio_last5,bowlers_economy_ratios_last5,team_sixes_ratios_last15,team_batsmen_above_fifty_ratios,team_batsmen_above_seventy_five_ratios
0,9331181,11283,12634,12634,Hl Ph,field,Hr Ct Sm Ie,Indore,2022-10-20,day/night match,7398,1.666667,0.672131,139.0,100.0,157.178571,2.028272,2.611098,4.638805,2.351068,1.710894,100.0,100.0,1.181688,60.0,63.636364,0.765906,0.792797,0.866174,1.692308,1.197368,0.0
1,8797060,20,41,20,Wt Is,field,Kn Ol Bn Bs,Bridgetown,2022-01-23,day/night match,1406,1.285714,1.952381,156.0,50.0,103.5,1.024794,0.966568,0.797703,0.80708,0.374242,50.0,50.0,0.999614,60.0,40.0,0.918417,2.808974,0.807064,0.590164,1.745868,2.585227


In [700]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# List of categorical columns to encode
categorical_columns = ['toss winner', 'toss decision', 'venue', 'match_dt', 'lighting',  'city']

# Initialize the label encoder
label_encoder = LabelEncoder()

# Apply label encoding to each categorical column
for col in categorical_columns:
    train_data[col] = label_encoder.fit_transform(train_data[col])

# Display the first 5 rows of the dataframe
train_data.head(5)


Unnamed: 0,match id,team1_id,team2_id,winner_id,toss winner,toss decision,venue,city,match_dt,lighting,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last5,team1_winp_team2_last10,team_average_score_ratios,win_percentage_team1_last_15,win_percentage_team2_last_15,batsmen_performance_ratio_last5,bowlers_performance_ratio_last5,bowlers_economy_ratios_last5,team_sixes_ratios_last15,team_batsmen_above_fifty_ratios,team_batsmen_above_seventy_five_ratios
0,9331181,11283,12634,12634,44,1,44,55,212,1,7398,1.666667,0.672131,139.0,100.0,157.178571,2.028272,2.611098,4.638805,2.351068,1.710894,100.0,100.0,1.181688,60.0,63.636364,0.765906,0.792797,0.866174,1.692308,1.197368,0.0
1,8797060,20,41,20,138,1,53,13,20,1,1406,1.285714,1.952381,156.0,50.0,103.5,1.024794,0.966568,0.797703,0.80708,0.374242,50.0,50.0,0.999614,60.0,40.0,0.918417,2.808974,0.807064,0.590164,1.745868,2.585227
2,9433269,10576,8987,10576,81,1,111,83,370,1,251,0.857143,0.672131,173.266667,0.0,154.333333,1.000138,0.909712,0.85019,0.717828,1.024538,0.0,0.0,1.116027,40.0,46.666667,0.894351,0.802434,1.069791,0.784314,0.770274,0.277903
3,9587073,36084,36070,36084,34,0,10,105,434,0,14300,2.166667,1.97561,164.266667,50.0,144.25,1.116102,1.052218,0.813452,0.878287,0.700727,50.0,50.0,0.882667,60.0,66.666667,1.424835,2.466747,0.840831,1.709677,0.922251,0.795991
4,9516457,30407,48341,48341,40,1,75,25,331,2,7118,0.818182,1.327869,164.666667,0.0,189.0,1.035394,1.240634,1.084338,1.168234,1.480742,0.0,0.0,1.0267,53.333333,69.230769,1.074782,0.646214,0.987516,1.261905,0.835192,1.461586


In [701]:
# filling nan values with 0

train_data.fillna(0, inplace=True)

In [702]:
# changing column names of match id to match_id , toss winner to toss_winner , toss decision to toss_decision

train_data['match_id'] = train_data['match id']
train_data['toss_winner'] = train_data['toss winner']
train_data['toss_decision'] = train_data['toss decision']

train_data = train_data.drop(['match id', 'toss winner', 'toss decision'], axis=1)

## For test data also

In [703]:
# from geopy.geocoders import Nominatim

# # Initialize Nominatim API
# geolocator = Nominatim(user_agent="my_geocoder")

# Creating new columns in test_data and initializing with 0
test_data['batsmens_averages_1'] = 0
test_data['batsmens_strike_rates_1'] = 0
test_data['bowlers_strike_rates_1'] = 0
test_data['bowlers_economy_1'] = 0
test_data['bowlers_extras_1'] = 0

test_data['batsmens_averages_2'] = 0
test_data['batsmens_strike_rates_2'] = 0
test_data['bowlers_strike_rates_2'] = 0
test_data['bowlers_economy_2'] = 0
test_data['bowlers_extras_2'] = 0

test_data['batsmens_averages_ratios'] = 0
test_data['batsmens_strike_rates_ratios'] = 0
test_data['bowlers_strike_rates_ratios'] = 0
test_data['bowlers_economy_ratios'] = 0
test_data['bowlers_extras_ratios'] = 0

test_data['team1_winp_team2_last5']=0
test_data['team1_winp_team2_last10']=0
test_data['team_average_score_ratios'] = 0
test_data['win_percentage_team1_last_15'] = 0
test_data['win_percentage_team2_last_15'] = 0

test_data['batsmen_performance_ratio_last5'] = 0
test_data['bowlers_performance_ratio_last5'] = 0
test_data['bowlers_economy_ratios_last5'] = 0
test_data['team_sixes_ratios_last15'] = 0
test_data['team_batsmen_above_fifty_ratios'] = 0
test_data['team_batsmen_above_seventy_five_ratios'] = 0

# test_data['location_humidity'] = 0
# test_data['location_temperature'] = 0

# Filling the values in the new columns
for i in range(len(test_data)):
    # Getting ids of batsmen and bowlers from team roster id separated by :
    team1_players_ids = test_data['team1_roster_ids'][i].split(':')
    team2_players_ids = test_data['team2_roster_ids'][i].split(':')
    # print(i,team1_players_ids,team2_players_ids)
    # For team1 check if id is present in the dictionary and add the values
    for player_id in team1_players_ids:
        player_id = int(float(player_id))
        if player_id in batsman_avg:
            test_data.at[i, 'batsmens_averages_1'] += batsman_avg[player_id]
            test_data.at[i, 'batsmens_strike_rates_1'] += batsman_sr[player_id]
        if player_id in bowler_sr:
            test_data.at[i, 'bowlers_strike_rates_1'] += bowler_sr[player_id]
            test_data.at[i, 'bowlers_economy_1'] += bowler_economy[player_id]
            test_data.at[i, 'bowlers_extras_1'] += bowler_extras[player_id]
            
    # For team2 check if id is present in the dictionary and add the values
    for player_id in team2_players_ids:
        player_id = int(float(player_id))
        if player_id in batsman_avg:
            test_data.at[i, 'batsmens_averages_2'] += batsman_avg[player_id]
            test_data.at[i, 'batsmens_strike_rates_2'] += batsman_sr[player_id]
        if player_id in bowler_sr:
            test_data.at[i, 'bowlers_strike_rates_2'] += bowler_sr[player_id]
            test_data.at[i, 'bowlers_economy_2'] += bowler_economy[player_id]
            test_data.at[i, 'bowlers_extras_2'] += bowler_extras[player_id]
    
    # filling ratios
    if test_data['batsmens_averages_2'][i] == 0:
        test_data['batsmens_averages_2'][i] = 1
        
    if test_data['batsmens_strike_rates_2'][i] == 0:
        test_data['batsmens_strike_rates_2'][i] = 1
    if test_data['bowlers_strike_rates_2'][i] == 0:
        test_data['bowlers_strike_rates_2'][i] = 1
        
    if test_data['bowlers_economy_2'][i] == 0:
        test_data['bowlers_economy_2'][i] = 1
        
    if test_data['bowlers_extras_2'][i] == 0:
        test_data['bowlers_extras_2'][i] = 1
        
    
    test_data.at[i, 'batsmens_averages_ratios'] = test_data['batsmens_averages_1'][i] / test_data['batsmens_averages_2'][i]
    test_data.at[i, 'batsmens_strike_rates_ratios'] = test_data['batsmens_strike_rates_1'][i] / test_data['batsmens_strike_rates_2'][i]
    test_data.at[i, 'bowlers_strike_rates_ratios'] = test_data['bowlers_strike_rates_1'][i] / test_data['bowlers_strike_rates_2'][i]
    test_data.at[i, 'bowlers_economy_ratios'] = test_data['bowlers_economy_1'][i] / test_data['bowlers_economy_2'][i]
    test_data.at[i, 'bowlers_extras_ratios'] = test_data['bowlers_extras_1'][i] / test_data['bowlers_extras_2'][i]
    
    
    # filling team win% against each other in last 10 games
    team1_id = test_data['team1_id'][i]
    team2_id = test_data['team2_id'][i]
    date = test_data['match_dt'][i]
    
    test_data.at[i, 'team1_winp_team2_last5'] = winpCrossLastn(team1_id, team2_id, date, 5)
    test_data.at[i, 'team1_winp_team2_last10'] = winpCrossLastn(team1_id, team2_id, date, 10)

            
    # filling team average scores
    team1_id = test_data['team1_id'][i]
    team2_id = test_data['team2_id'][i]
    
    if team1_id in team_avg:
        team1_average_score= team_avg[team1_id]
    else:
        team1_average_score= 0
        
    if team2_id in team_avg:
        team2_average_score = team_avg[team2_id]
    else:
        team2_average_score = 0
    
    if team2_average_score == 0:
        test_data.at[i, 'team_average_score_ratios'] = 1
    
    else :
        test_data.at[i, 'team_average_score_ratios'] = team1_average_score / team2_average_score
    # filling win% of team1 in last 15 games from matchscore_updated
    
    test_data.at[i, 'win_percentage_team1_last_15'] = win_percentage_last_n_matches(team1_id, date, 15)
    test_data.at[i, 'win_percentage_team2_last_15'] = win_percentage_last_n_matches(team2_id, date, 15)
    
    team1_roster_ids = test_data['team1_roster_ids'][i]
    team2_roster_ids = test_data['team2_roster_ids'][i]
    match_date = test_data['match_dt'][i]
    
    team1_bats_avg_strike_last5 = team_bats_avg_strike_last5([team1_roster_ids], [match_date], batsman_lvl_data)
    team2_bats_avg_strike_last5 = team_bats_avg_strike_last5([team2_roster_ids], [match_date], batsman_lvl_data)
    if team2_bats_avg_strike_last5[0] == 0:
        test_data.at[i, 'batsmen_performance_ratio_last5'] = 0
        
    else:
        test_data.at[i, 'batsmen_performance_ratio_last5'] = team1_bats_avg_strike_last5[0] / team2_bats_avg_strike_last5[0]
        
    team1_bowlers_avg_perfor_last5, team1_bowlers_avg_eco_last5 = team_bowler_feat_last5([team1_roster_ids], [match_date], bowler_lvl_data)
    team2_bowlers_avg_perfor_last5, team2_bowlers_avg_eco_last5 = team_bowler_feat_last5([team2_roster_ids], [match_date], bowler_lvl_data)
    
    if team2_bowlers_avg_perfor_last5[0] == 0:
        test_data.at[i, 'bowlers_performance_ratio_last5'] = 0
    else:
        test_data.at[i, 'bowlers_performance_ratio_last5'] = team1_bowlers_avg_perfor_last5[0] / team2_bowlers_avg_perfor_last5[0]
    
    if team2_bowlers_avg_eco_last5[0] == 0:
        test_data.at[i, 'bowlers_economy_ratios_last5'] = 0
    else:
        test_data.at[i, 'bowlers_economy_ratios_last5'] = team1_bowlers_avg_eco_last5[0] / team2_bowlers_avg_eco_last5[0]
        
    team1_sixes_last15 = calculate_sixes_last15_games([team1_roster_ids], [match_date], batsman_lvl_data)
    team2_sixes_last15 = calculate_sixes_last15_games([team2_roster_ids], [match_date], batsman_lvl_data)
    
    if team2_sixes_last15[0] == 0:  
        test_data.at[i, 'team_sixes_ratios_last15'] = 0
    else:
        test_data.at[i, 'team_sixes_ratios_last15'] = team1_sixes_last15[0] / team2_sixes_last15[0]
        
    team1_batsmen_above_fifty = calculate_batsmen_above_fifty([team1_roster_ids], [match_date], batsman_lvl_data)
    team2_batsmen_above_fifty = calculate_batsmen_above_fifty([team2_roster_ids], [match_date], batsman_lvl_data)
    
    if team2_batsmen_above_fifty[0] == 0:
        test_data.at[i, 'team_batsmen_above_fifty_ratios'] = 0
    else:
        test_data.at[i, 'team_batsmen_above_fifty_ratios'] = team1_batsmen_above_fifty[0] / team2_batsmen_above_fifty[0]
    
    team1_batsmen_above_seventy_five = calculate_batsmen_above_seventy_five([team1_roster_ids], [match_date], batsman_lvl_data)
    team2_batsmen_above_seventy_five = calculate_batsmen_above_seventy_five([team2_roster_ids], [match_date], batsman_lvl_data)
    
    if team2_batsmen_above_seventy_five[0] == 0:
        test_data.at[i, 'team_batsmen_above_seventy_five_ratios'] = 0
        
    else:
        test_data.at[i, 'team_batsmen_above_seventy_five_ratios'] = team1_batsmen_above_seventy_five[0] / team2_batsmen_above_seventy_five[0]

  test_data.at[i, 'batsmens_averages_1'] += batsman_avg[player_id]
  test_data.at[i, 'batsmens_strike_rates_1'] += batsman_sr[player_id]
  test_data.at[i, 'bowlers_economy_1'] += bowler_economy[player_id]
  test_data.at[i, 'bowlers_extras_1'] += bowler_extras[player_id]
  test_data.at[i, 'batsmens_averages_2'] += batsman_avg[player_id]
  test_data.at[i, 'batsmens_strike_rates_2'] += batsman_sr[player_id]
  test_data.at[i, 'bowlers_economy_2'] += bowler_economy[player_id]
  test_data.at[i, 'bowlers_strike_rates_2'] += bowler_sr[player_id]
  test_data.at[i, 'bowlers_extras_2'] += bowler_extras[player_id]
  test_data.at[i, 'batsmens_averages_ratios'] = test_data['batsmens_averages_1'][i] / test_data['batsmens_averages_2'][i]
  test_data.at[i, 'batsmens_strike_rates_ratios'] = test_data['batsmens_strike_rates_1'][i] / test_data['batsmens_strike_rates_2'][i]
  test_data.at[i, 'bowlers_strike_rates_ratios'] = test_data['bowlers_strike_rates_1'][i] / test_data['bowlers_strike_rates_2'][i]
  t

  test_data.at[i, 'team_batsmen_above_fifty_ratios'] = team1_batsmen_above_fifty[0] / team2_batsmen_above_fifty[0]
  test_data.at[i, 'team_batsmen_above_seventy_five_ratios'] = team1_batsmen_above_seventy_five[0] / team2_batsmen_above_seventy_five[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['bowlers_extras_2'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['bowlers_extras_2'][i] = 1
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rco

In [704]:
# dropping team1_roster_ids , team2_roster_ids , winner , team1 , team2 ,  columns 
test_data = test_data.drop(['series_name','season','team1_roster_ids', 'team2_roster_ids','team1', 'team2','batsmens_averages_1','batsmens_strike_rates_1','bowlers_strike_rates_1','bowlers_economy_1','bowlers_extras_1','batsmens_averages_2','batsmens_strike_rates_2','bowlers_strike_rates_2','bowlers_economy_2','bowlers_extras_2'], axis=1)

In [705]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# List of categorical columns to encode
categorical_columns = ['toss winner', 'toss decision', 'venue', 'match_dt', 'lighting', 'city']

# Initialize the label encoder
label_encoder = LabelEncoder()

# Apply label encoding to each categorical column
for col in categorical_columns:
    test_data[col] = label_encoder.fit_transform(test_data[col])

# Display the first 5 rows of the dataframe
test_data.head(5)


Unnamed: 0,match id,team1_id,team2_id,toss winner,toss decision,venue,city,match_dt,lighting,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last5,team1_winp_team2_last10,team_average_score_ratios,win_percentage_team1_last_15,win_percentage_team2_last_15,batsmen_performance_ratio_last5,bowlers_performance_ratio_last5,bowlers_economy_ratios_last5,team_sixes_ratios_last15,team_batsmen_above_fifty_ratios,team_batsmen_above_seventy_five_ratios
0,9250275,28594,90,104,1,11,13,58,0,3226,0.0,0.019608,,0.0,,0.580853,0.443698,0.962702,0.439479,1.313131,0.0,0.0,0.975779,0.0,50.0,0.300243,,,0.076923,0.0,0.0
1,9262189,36084,36098,40,1,59,70,89,2,13915,0.615385,0.344262,151.285714,66.67,153.5,1.089279,1.091648,0.840087,0.961859,0.787592,66.67,66.67,1.05519,42.857143,46.666667,1.017284,1.133359,0.928929,0.9375,1.01142,1.372642
2,9128776,30428,48334,82,0,8,58,40,2,2764,0.842105,0.753086,171.066667,100.0,179.625,1.080138,0.955992,0.771592,0.993761,0.900654,100.0,100.0,1.054849,46.666667,77.777778,0.906726,0.837286,1.123313,1.117647,1.02521,1.056277
3,9586919,36098,36112,92,1,23,38,195,2,11892,0.285714,1.487805,166.0,75.0,155.5,0.871249,0.848296,0.845522,0.870841,0.731058,75.0,75.0,0.88804,53.333333,40.0,0.648555,2.125587,0.960721,0.608696,0.309804,0.263333
4,9128538,30414,48341,31,1,46,71,34,2,18752,2.375,0.31033,169.933333,0.0,164.125,1.107382,1.166437,0.963643,1.001825,0.946688,0.0,0.0,0.984463,46.666667,66.666667,1.238929,0.393975,1.171551,2.0,1.730696,2.562762


In [706]:
# filling nan values with 0

test_data.fillna(0, inplace=True)

In [707]:
# changing column names of match id to match_id , toss winner to toss_winner , toss decision to toss_decision

test_data['match_id'] = test_data['match id']
test_data['toss_winner'] = test_data['toss winner']
test_data['toss_decision'] = test_data['toss decision']

test_data = test_data.drop(['match id', 'toss winner', 'toss decision'], axis=1)

In [708]:
test_data.head(15)

Unnamed: 0,team1_id,team2_id,venue,city,match_dt,lighting,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last5,team1_winp_team2_last10,team_average_score_ratios,win_percentage_team1_last_15,win_percentage_team2_last_15,batsmen_performance_ratio_last5,bowlers_performance_ratio_last5,bowlers_economy_ratios_last5,team_sixes_ratios_last15,team_batsmen_above_fifty_ratios,team_batsmen_above_seventy_five_ratios,match_id,toss_winner,toss_decision
0,28594,90,11,13,58,0,3226,0.0,0.019608,0.0,0.0,0.0,0.580853,0.443698,0.962702,0.439479,1.313131,0.0,0.0,0.975779,0.0,50.0,0.300243,0.0,0.0,0.076923,0.0,0.0,9250275,104,1
1,36084,36098,59,70,89,2,13915,0.615385,0.344262,151.285714,66.67,153.5,1.089279,1.091648,0.840087,0.961859,0.787592,66.67,66.67,1.05519,42.857143,46.666667,1.017284,1.133359,0.928929,0.9375,1.01142,1.372642,9262189,40,1
2,30428,48334,8,58,40,2,2764,0.842105,0.753086,171.066667,100.0,179.625,1.080138,0.955992,0.771592,0.993761,0.900654,100.0,100.0,1.054849,46.666667,77.777778,0.906726,0.837286,1.123313,1.117647,1.02521,1.056277,9128776,82,0
3,36098,36112,23,38,195,2,11892,0.285714,1.487805,166.0,75.0,155.5,0.871249,0.848296,0.845522,0.870841,0.731058,75.0,75.0,0.88804,53.333333,40.0,0.648555,2.125587,0.960721,0.608696,0.309804,0.263333,9586919,92,1
4,30414,48341,46,71,34,2,18752,2.375,0.31033,169.933333,0.0,164.125,1.107382,1.166437,0.963643,1.001825,0.946688,0.0,0.0,0.984463,46.666667,66.666667,1.238929,0.393975,1.171551,2.0,1.730696,2.562762,9128538,31,1
5,30407,30435,57,20,166,2,7118,0.4,0.672131,177.066667,50.0,184.5,0.941093,0.995272,1.001928,0.934268,0.73699,50.0,50.0,1.018264,46.666667,40.0,0.862513,-4.472477,0.901007,0.956522,0.579996,0.968807,9516653,57,1
6,36014,30414,45,23,161,2,2050,0.681818,0.344262,183.466667,25.0,156.35,0.921257,1.034135,0.921563,0.955612,0.797946,25.0,25.0,1.028385,40.0,53.333333,1.105788,0.822466,1.014703,0.822222,0.735842,0.634871,9516534,12,1
7,9701,6698,81,81,57,2,1042,3.2,0.60396,163.066667,0.0,177.321429,0.937888,1.114808,0.63588,0.685779,1.635887,0.0,0.0,0.961671,66.666667,66.666667,0.928353,6.45702,0.801164,1.727273,1.140684,1.267427,9085474,19,1
8,7727,7573,82,76,184,1,8301,2.428571,2.904762,167.0,100.0,151.6,1.301483,1.165019,1.046915,0.940972,1.415819,100.0,100.0,0.991761,80.0,40.0,1.060601,2.122972,0.825663,1.566667,1.912002,1.333955,9433752,30,1
9,30428,36014,67,42,153,1,13880,1.727273,21.0,171.066667,66.67,186.5,1.161314,1.008119,0.596095,1.011756,1.146592,66.67,66.67,0.989076,46.666667,33.333333,1.106032,-0.051787,1.335084,1.433333,1.523553,1.287834,9516359,89,1


In [709]:
train_data.shape
test_data.shape

(948, 32)

(271, 31)

In [710]:
new_test_data = test_data

In [711]:
new_train_data=train_data

In [712]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler

X_train = train_data.drop(['winner_id'], axis=1)
y_train = train_data['winner_id']

X_test = test_data

In [713]:
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

In [714]:
unique_classes = np.unique(y_train)
single_occurrence_classes = [c for c in unique_classes if np.sum(y_train == c) == 1]

# Oversample each single-occurrence class to 8 instances
oversampled_X_train = []

oversampled_y = []
for class_label in single_occurrence_classes:
    class_X_train = X_train[y_train == class_label]
    class_y = y_train[y_train == class_label]
    oversampled_X_train.extend([class_X_train] * (8 - len(class_X_train)))
    oversampled_y.extend([class_y] * (8 - len(class_y)))

# Concatenate the oversampled data with the original data
X_train_resampled = np.concatenate([X_train] + oversampled_X_train)
y_resampled = np.concatenate([y_train] + oversampled_y)

In [715]:
# i79 = np.where(y_test == 79)[0][0]
# i79

In [716]:
# Initialize SMOTE
smote = SMOTE(random_state=101, k_neighbors=1)

# Apply SMOTE to the entire dataset
X_train, y_train = smote.fit_resample(X_train_resampled, y_resampled)

In [717]:
# Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



In [718]:
# CatBoost
# cat_model = CatBoostClassifier()
# cat_model.fit(X_train, y_train)

# cat_accuracy = accuracy_score(y_test, cat_pred)
# cat_f1 = f1_score(y_test, cat_pred, average='macro')

In [719]:
test_data.shape

(271, 31)

In [720]:
X_test.shape

(271, 31)

In [721]:
# cat_model.predict(X_test)

In [722]:
new_test_data.head(5)

Unnamed: 0,team1_id,team2_id,venue,city,match_dt,lighting,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last5,team1_winp_team2_last10,team_average_score_ratios,win_percentage_team1_last_15,win_percentage_team2_last_15,batsmen_performance_ratio_last5,bowlers_performance_ratio_last5,bowlers_economy_ratios_last5,team_sixes_ratios_last15,team_batsmen_above_fifty_ratios,team_batsmen_above_seventy_five_ratios,match_id,toss_winner,toss_decision
0,28594,90,11,13,58,0,3226,0.0,0.019608,0.0,0.0,0.0,0.580853,0.443698,0.962702,0.439479,1.313131,0.0,0.0,0.975779,0.0,50.0,0.300243,0.0,0.0,0.076923,0.0,0.0,9250275,104,1
1,36084,36098,59,70,89,2,13915,0.615385,0.344262,151.285714,66.67,153.5,1.089279,1.091648,0.840087,0.961859,0.787592,66.67,66.67,1.05519,42.857143,46.666667,1.017284,1.133359,0.928929,0.9375,1.01142,1.372642,9262189,40,1
2,30428,48334,8,58,40,2,2764,0.842105,0.753086,171.066667,100.0,179.625,1.080138,0.955992,0.771592,0.993761,0.900654,100.0,100.0,1.054849,46.666667,77.777778,0.906726,0.837286,1.123313,1.117647,1.02521,1.056277,9128776,82,0
3,36098,36112,23,38,195,2,11892,0.285714,1.487805,166.0,75.0,155.5,0.871249,0.848296,0.845522,0.870841,0.731058,75.0,75.0,0.88804,53.333333,40.0,0.648555,2.125587,0.960721,0.608696,0.309804,0.263333,9586919,92,1
4,30414,48341,46,71,34,2,18752,2.375,0.31033,169.933333,0.0,164.125,1.107382,1.166437,0.963643,1.001825,0.946688,0.0,0.0,0.984463,46.666667,66.666667,1.238929,0.393975,1.171551,2.0,1.730696,2.562762,9128538,31,1


In [723]:
team1_ids = new_test_data['team1_id']
team2_ids = new_test_data['team2_id']

In [724]:
import xgboost as xgb
import numpy as np
import pandas as pd


# Ensure y_train contains zero-indexed integer labels
unique_classes = np.unique(y_train)
class_to_index = {label: idx for idx, label in enumerate(unique_classes)}
y_train_indexed = np.array([class_to_index[label] for label in y_train])

# Create DMatrix for training and testing
xgb_train = xgb.DMatrix(X_train, label=y_train_indexed, enable_categorical=True)
xgb_test = xgb.DMatrix(X_test, enable_categorical=True)  # No label for xgb_test

# Define parameters and add num_class
n = 139
params = {
    'objective': 'multi:softprob',
    'max_depth': 6,
    'learning_rate': 0.2, 'subsample': 0.7, 'colsample_bytree': 0.9,
    'num_class': len(set(y_train))  # Number of unique classes
    
}

# Train the model
xgb_model = xgb.train(params=params, dtrain=xgb_train, num_boost_round=n)

# Make predictions
xgb_preds = xgb_model.predict(xgb_test)

# Compare probabilities and assign the team with higher probability
y_pred_01 = []
win_prob=[]
for i in range(len(xgb_preds)):
    team1_id = team1_ids[i]
    team2_id = team2_ids[i]
    
    team1_index = class_to_index.get(team1_id, -1)
    team2_index = class_to_index.get(team2_id, -1)
    
    if team1_index == -1 and team2_index == -1:
        y_pred_01.append(team1_id)
        win_prob.append(0)
        
        continue
        # raise ValueError(f"Team ID {team1_id} or {team2_id} not found in class_to_index mapping.")
    
    team1_prob = xgb_preds[i][team1_index]  # Probability for team1_id
    team2_prob = xgb_preds[i][team2_index]  # Probability for team2_id
    
    if team1_prob > team2_prob:
        y_pred_01.append(team1_id)  # team1_id
        win_prob.append(team1_prob)
    else:
        y_pred_01.append(team2_id)  # team2_id
        win_prob.append(team2_prob)

# Print predictions
# print(f'Predicted class labels for X_test: {y_pred_01}')

In [725]:
# printing winning team along with their winning probability
for i in range(len(y_pred_01)):
    print(f"Match {i+1}: {y_pred_01[i]} with probability {win_prob[i]}")

Match 1: 90 with probability 0.0009046654449775815
Match 2: 36084 with probability 0.42845436930656433
Match 3: 30428 with probability 0.16140346229076385
Match 4: 36098 with probability 0.06657750904560089
Match 5: 48341 with probability 0.8800460696220398
Match 6: 30407 with probability 0.10263784974813461
Match 7: 30414 with probability 0.7234331965446472
Match 8: 6698 with probability 0.5003462433815002
Match 9: 7727 with probability 0.14492301642894745
Match 10: 30428 with probability 0.0011194198159500957
Match 11: 36098 with probability 0.2747907340526581
Match 12: 202 with probability 0.14626243710517883
Match 13: 47501 with probability 0.7284798622131348
Match 14: 48 with probability 0.6446639895439148
Match 15: 62 with probability 0.5444321036338806
Match 16: 33921 with probability 0.05497395247220993
Match 17: 6698 with probability 0.8874927163124084
Match 18: 20 with probability 0.8439624905586243
Match 19: 36070 with probability 0.7795150876045227
Match 20: 36070 with prob

In [726]:
# adding y_pred_01 to test_data
new_test_data['win_pred_team_id'] = y_pred_01

In [727]:
# adding winning probability to test_data
new_test_data['win_pred_score'] = win_prob

In [728]:
new_test_data.head(10)

Unnamed: 0,team1_id,team2_id,venue,city,match_dt,lighting,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last5,team1_winp_team2_last10,team_average_score_ratios,win_percentage_team1_last_15,win_percentage_team2_last_15,batsmen_performance_ratio_last5,bowlers_performance_ratio_last5,bowlers_economy_ratios_last5,team_sixes_ratios_last15,team_batsmen_above_fifty_ratios,team_batsmen_above_seventy_five_ratios,match_id,toss_winner,toss_decision,win_pred_team_id,win_pred_score
0,28594,90,11,13,58,0,3226,0.0,0.019608,0.0,0.0,0.0,0.580853,0.443698,0.962702,0.439479,1.313131,0.0,0.0,0.975779,0.0,50.0,0.300243,0.0,0.0,0.076923,0.0,0.0,9250275,104,1,90,0.000905
1,36084,36098,59,70,89,2,13915,0.615385,0.344262,151.285714,66.67,153.5,1.089279,1.091648,0.840087,0.961859,0.787592,66.67,66.67,1.05519,42.857143,46.666667,1.017284,1.133359,0.928929,0.9375,1.01142,1.372642,9262189,40,1,36084,0.428454
2,30428,48334,8,58,40,2,2764,0.842105,0.753086,171.066667,100.0,179.625,1.080138,0.955992,0.771592,0.993761,0.900654,100.0,100.0,1.054849,46.666667,77.777778,0.906726,0.837286,1.123313,1.117647,1.02521,1.056277,9128776,82,0,30428,0.161403
3,36098,36112,23,38,195,2,11892,0.285714,1.487805,166.0,75.0,155.5,0.871249,0.848296,0.845522,0.870841,0.731058,75.0,75.0,0.88804,53.333333,40.0,0.648555,2.125587,0.960721,0.608696,0.309804,0.263333,9586919,92,1,36098,0.066578
4,30414,48341,46,71,34,2,18752,2.375,0.31033,169.933333,0.0,164.125,1.107382,1.166437,0.963643,1.001825,0.946688,0.0,0.0,0.984463,46.666667,66.666667,1.238929,0.393975,1.171551,2.0,1.730696,2.562762,9128538,31,1,48341,0.880046
5,30407,30435,57,20,166,2,7118,0.4,0.672131,177.066667,50.0,184.5,0.941093,0.995272,1.001928,0.934268,0.73699,50.0,50.0,1.018264,46.666667,40.0,0.862513,-4.472477,0.901007,0.956522,0.579996,0.968807,9516653,57,1,30407,0.102638
6,36014,30414,45,23,161,2,2050,0.681818,0.344262,183.466667,25.0,156.35,0.921257,1.034135,0.921563,0.955612,0.797946,25.0,25.0,1.028385,40.0,53.333333,1.105788,0.822466,1.014703,0.822222,0.735842,0.634871,9516534,12,1,30414,0.723433
7,9701,6698,81,81,57,2,1042,3.2,0.60396,163.066667,0.0,177.321429,0.937888,1.114808,0.63588,0.685779,1.635887,0.0,0.0,0.961671,66.666667,66.666667,0.928353,6.45702,0.801164,1.727273,1.140684,1.267427,9085474,19,1,6698,0.500346
8,7727,7573,82,76,184,1,8301,2.428571,2.904762,167.0,100.0,151.6,1.301483,1.165019,1.046915,0.940972,1.415819,100.0,100.0,0.991761,80.0,40.0,1.060601,2.122972,0.825663,1.566667,1.912002,1.333955,9433752,30,1,7727,0.144923
9,30428,36014,67,42,153,1,13880,1.727273,21.0,171.066667,66.67,186.5,1.161314,1.008119,0.596095,1.011756,1.146592,66.67,66.67,0.989076,46.666667,33.333333,1.106032,-0.051787,1.335084,1.433333,1.523553,1.287834,9516359,89,1,30428,0.001119


## Filling submission 1 csv

In [729]:
submission1=pd.read_csv('sample_template/submission_template_file1.csv')

In [730]:
submission1.head(2)

Unnamed: 0,match id,dataset_type,win_pred_team_id,win_pred_score,train_algorithm,is_ensemble,train_hps_trees,train_hps_depth,train_hps_lr,indep_feat_id1,indep_feat_id2,indep_feat_id3,indep_feat_id4,indep_feat_id5,indep_feat_id6,indep_feat_id7,indep_feat_id8,indep_feat_id9,indep_feat_id10
0,9250275,r1,,,,,,,,,,,,,,,,,
1,9262189,r1,,,,,,,,,,,,,,,,,


In [731]:
# filling submisssion 1 win_pred_team_id, win_pred_score with new_test_data win_pred_team_id, win_pred_score according to match id

# also filling train_algorithm as xgboost , is_ensemble=no , train_hps_tree=150, train_hps_depth=6, train_hps_lr=0.1 for all rows

for i in range(len(submission1)):
    match_id = submission1['match id'][i]
    if match_id not in new_test_data['match_id'].values:
        continue
    new_test_data_row = new_test_data[new_test_data['match_id'] == match_id]
    submission1.at[i, 'win_pred_team_id'] = new_test_data_row['win_pred_team_id'].values[0]
    submission1.at[i, 'win_pred_score'] = new_test_data_row['win_pred_score'].values[0]
    submission1.at[i, 'train_algorithm'] = 'xgboost'
    submission1.at[i, 'is_ensemble'] = 'no'
    submission1.at[i, 'train_hps_trees'] = 150
    submission1.at[i, 'train_hps_depth'] = 6
    submission1.at[i, 'train_hps_lr'] = 0.1

  submission1.at[i, 'train_algorithm'] = 'xgboost'
  submission1.at[i, 'is_ensemble'] = 'no'


In [732]:
submission1.head(10)

Unnamed: 0,match id,dataset_type,win_pred_team_id,win_pred_score,train_algorithm,is_ensemble,train_hps_trees,train_hps_depth,train_hps_lr,indep_feat_id1,indep_feat_id2,indep_feat_id3,indep_feat_id4,indep_feat_id5,indep_feat_id6,indep_feat_id7,indep_feat_id8,indep_feat_id9,indep_feat_id10
0,9250275,r1,90.0,0.000905,xgboost,no,150.0,6.0,0.1,,,,,,,,,,
1,9262189,r1,36084.0,0.428454,xgboost,no,150.0,6.0,0.1,,,,,,,,,,
2,9128776,r1,30428.0,0.161403,xgboost,no,150.0,6.0,0.1,,,,,,,,,,
3,9586919,r1,36098.0,0.066578,xgboost,no,150.0,6.0,0.1,,,,,,,,,,
4,9128538,r1,48341.0,0.880046,xgboost,no,150.0,6.0,0.1,,,,,,,,,,
5,9516653,r1,30407.0,0.102638,xgboost,no,150.0,6.0,0.1,,,,,,,,,,
6,9516534,r1,30414.0,0.723433,xgboost,no,150.0,6.0,0.1,,,,,,,,,,
7,9085474,r1,6698.0,0.500346,xgboost,no,150.0,6.0,0.1,,,,,,,,,,
8,9433752,r1,7727.0,0.144923,xgboost,no,150.0,6.0,0.1,,,,,,,,,,
9,9516359,r1,30428.0,0.001119,xgboost,no,150.0,6.0,0.1,,,,,,,,,,


## Now genearting for train data

In [733]:
new_train_data.head(2)

Unnamed: 0,team1_id,team2_id,winner_id,venue,city,match_dt,lighting,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last5,team1_winp_team2_last10,team_average_score_ratios,win_percentage_team1_last_15,win_percentage_team2_last_15,batsmen_performance_ratio_last5,bowlers_performance_ratio_last5,bowlers_economy_ratios_last5,team_sixes_ratios_last15,team_batsmen_above_fifty_ratios,team_batsmen_above_seventy_five_ratios,match_id,toss_winner,toss_decision
0,11283,12634,12634,44,55,212,1,7398,1.666667,0.672131,139.0,100.0,157.178571,2.028272,2.611098,4.638805,2.351068,1.710894,100.0,100.0,1.181688,60.0,63.636364,0.765906,0.792797,0.866174,1.692308,1.197368,0.0,9331181,44,1
1,20,41,20,53,13,20,1,1406,1.285714,1.952381,156.0,50.0,103.5,1.024794,0.966568,0.797703,0.80708,0.374242,50.0,50.0,0.999614,60.0,40.0,0.918417,2.808974,0.807064,0.590164,1.745868,2.585227,8797060,138,1


In [734]:
train_team1_ids = new_train_data['team1_id']
train_team2_ids = new_train_data['team2_id']

In [735]:
# scalling and x train and y train
X_train = new_train_data.drop(['winner_id'], axis=1)
y_train = new_train_data['winner_id']

# scalling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)


In [736]:
# Create DMatrix for training and testing
xgb_train = xgb.DMatrix(X_train, enable_categorical=True)
# xgb_test = xgb.DMatrix(X_test, enable_categorical=True)  # No label for xgb_test

In [737]:
xgb_preds = xgb_model.predict(xgb_train)

In [738]:
xgb_preds.shape

(948, 139)

In [739]:


# Compare probabilities and assign the team with higher probability
y_pred_01 = []
win_prob=[]
for i in range(len(xgb_preds)):
    team1_id = train_team1_ids[i]
    team2_id = train_team2_ids[i]
    
    team1_index = class_to_index.get(team1_id, -1)
    team2_index = class_to_index.get(team2_id, -1)
    
    if team1_index == -1 and team2_index == -1:
        y_pred_01.append(team1_id)
        win_prob.append(0)
        
        continue
        # raise ValueError(f"Team ID {team1_id} or {team2_id} not found in class_to_index mapping.")
    
    team1_prob = xgb_preds[i][team1_index]  # Probability for team1_id
    team2_prob = xgb_preds[i][team2_index]  # Probability for team2_id
    
    if team1_prob > team2_prob:
        y_pred_01.append(team1_id)  # team1_id
        win_prob.append(team1_prob)
    else:
        y_pred_01.append(team2_id)  # team2_id
        win_prob.append(team2_prob)

In [740]:
# printing winning team along with their winning probability
for i in range(len(y_pred_01)):
    print(f"Match {i+1}: {y_pred_01[i]} with probability {win_prob[i]}")

Match 1: 12634 with probability 0.033639341592788696
Match 2: 41 with probability 0.0060182008892297745
Match 3: 10576 with probability 0.7894976139068604
Match 4: 36070 with probability 0.14261484146118164
Match 5: 48341 with probability 0.4648345708847046
Match 6: 40424 with probability 0.9321692585945129
Match 7: 55 with probability 0.008000718429684639
Match 8: 47501 with probability 0.14077581465244293
Match 9: 12634 with probability 0.3403950333595276
Match 10: 55 with probability 0.012842218391597271
Match 11: 30435 with probability 0.05316103994846344
Match 12: 30435 with probability 0.06903556734323502
Match 13: 36014 with probability 0.11568733304738998
Match 14: 8987 with probability 0.0043811132200062275
Match 15: 10618 with probability 0.32011091709136963
Match 16: 30393 with probability 0.3152371346950531
Match 17: 33956 with probability 0.07178231328725815
Match 18: 188 with probability 0.03424298018217087
Match 19: 7727 with probability 0.842010498046875
Match 20: 251 w

In [741]:
len(win_prob)

948

In [742]:
new_train_data['win_pred_team_id'] = y_pred_01

In [743]:
# adding winning probability to test_data
new_train_data['win_pred_score'] = win_prob

In [744]:
new_train_data.head(5)

Unnamed: 0,team1_id,team2_id,winner_id,venue,city,match_dt,lighting,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last5,team1_winp_team2_last10,team_average_score_ratios,win_percentage_team1_last_15,win_percentage_team2_last_15,batsmen_performance_ratio_last5,bowlers_performance_ratio_last5,bowlers_economy_ratios_last5,team_sixes_ratios_last15,team_batsmen_above_fifty_ratios,team_batsmen_above_seventy_five_ratios,match_id,toss_winner,toss_decision,win_pred_team_id,win_pred_score
0,11283,12634,12634,44,55,212,1,7398,1.666667,0.672131,139.0,100.0,157.178571,2.028272,2.611098,4.638805,2.351068,1.710894,100.0,100.0,1.181688,60.0,63.636364,0.765906,0.792797,0.866174,1.692308,1.197368,0.0,9331181,44,1,12634,0.033639
1,20,41,20,53,13,20,1,1406,1.285714,1.952381,156.0,50.0,103.5,1.024794,0.966568,0.797703,0.80708,0.374242,50.0,50.0,0.999614,60.0,40.0,0.918417,2.808974,0.807064,0.590164,1.745868,2.585227,8797060,138,1,41,0.006018
2,10576,8987,10576,111,83,370,1,251,0.857143,0.672131,173.266667,0.0,154.333333,1.000138,0.909712,0.85019,0.717828,1.024538,0.0,0.0,1.116027,40.0,46.666667,0.894351,0.802434,1.069791,0.784314,0.770274,0.277903,9433269,81,1,10576,0.789498
3,36084,36070,36084,10,105,434,0,14300,2.166667,1.97561,164.266667,50.0,144.25,1.116102,1.052218,0.813452,0.878287,0.700727,50.0,50.0,0.882667,60.0,66.666667,1.424835,2.466747,0.840831,1.709677,0.922251,0.795991,9587073,34,0,36070,0.142615
4,30407,48341,48341,75,25,331,2,7118,0.818182,1.327869,164.666667,0.0,189.0,1.035394,1.240634,1.084338,1.168234,1.480742,0.0,0.0,1.0267,53.333333,69.230769,1.074782,0.646214,0.987516,1.261905,0.835192,1.461586,9516457,40,1,48341,0.464835


In [745]:
for i in range(len(submission1)):
    match_id = submission1['match id'][i]
    if match_id not in new_train_data['match_id'].values:
        continue
    new_train_data_row = new_train_data[new_train_data['match_id'] == match_id]
    submission1.at[i, 'win_pred_team_id'] = new_train_data_row['win_pred_team_id'].values[0]
    submission1.at[i, 'win_pred_score'] = new_train_data_row['win_pred_score'].values[0]
    submission1.at[i, 'train_algorithm'] = 'xgboost'
    submission1.at[i, 'is_ensemble'] = 'no'
    submission1.at[i, 'train_hps_trees'] = 150
    submission1.at[i, 'train_hps_depth'] = 6
    submission1.at[i, 'train_hps_lr'] = 0.1

## CSV-2 Generation

In [746]:
cols = train_data.columns[-15:-5]

In [747]:
cols

Index(['team1_winp_team2_last10', 'team_average_score_ratios',
       'win_percentage_team1_last_15', 'win_percentage_team2_last_15',
       'batsmen_performance_ratio_last5', 'bowlers_performance_ratio_last5',
       'bowlers_economy_ratios_last5', 'team_sixes_ratios_last15',
       'team_batsmen_above_fifty_ratios',
       'team_batsmen_above_seventy_five_ratios'],
      dtype='object')

In [748]:
ft_dict = xgb_model.get_score(importance_type='gain')

In [749]:
ft_dict

{'f0': 4.953207969665527,
 'f1': 5.508023262023926,
 'f2': 3.0659382343292236,
 'f3': 4.967489719390869,
 'f4': 3.107064962387085,
 'f5': 3.2824478149414062,
 'f6': 4.2829508781433105,
 'f7': 2.180805206298828,
 'f8': 2.1459102630615234,
 'f9': 2.334364891052246,
 'f10': 1.4695738554000854,
 'f11': 2.6298089027404785,
 'f12': 3.1465866565704346,
 'f13': 1.8925256729125977,
 'f14': 2.5889670848846436,
 'f15': 1.8726139068603516,
 'f16': 2.021083354949951,
 'f17': 4.7671966552734375,
 'f18': 1.2847082614898682,
 'f19': 2.1661763191223145,
 'f20': 1.130432367324829,
 'f21': 2.4093706607818604,
 'f22': 1.2392815351486206,
 'f23': 1.7593015432357788,
 'f24': 1.1012986898422241,
 'f25': 1.5467344522476196,
 'f26': 2.1243104934692383,
 'f27': 0.9955365061759949,
 'f28': 3.4166128635406494,
 'f29': 2.269134044647217,
 'f30': 1.4888408184051514}

In [750]:
sorted_dict = {}
for key in sorted(ft_dict, key=ft_dict.get):
    sorted_dict[key] = ft_dict[key]
sorted_dict

{'f27': 0.9955365061759949,
 'f24': 1.1012986898422241,
 'f20': 1.130432367324829,
 'f22': 1.2392815351486206,
 'f18': 1.2847082614898682,
 'f10': 1.4695738554000854,
 'f30': 1.4888408184051514,
 'f25': 1.5467344522476196,
 'f23': 1.7593015432357788,
 'f15': 1.8726139068603516,
 'f13': 1.8925256729125977,
 'f16': 2.021083354949951,
 'f26': 2.1243104934692383,
 'f8': 2.1459102630615234,
 'f19': 2.1661763191223145,
 'f7': 2.180805206298828,
 'f29': 2.269134044647217,
 'f9': 2.334364891052246,
 'f21': 2.4093706607818604,
 'f14': 2.5889670848846436,
 'f11': 2.6298089027404785,
 'f2': 3.0659382343292236,
 'f4': 3.107064962387085,
 'f12': 3.1465866565704346,
 'f5': 3.2824478149414062,
 'f28': 3.4166128635406494,
 'f6': 4.2829508781433105,
 'f17': 4.7671966552734375,
 'f0': 4.953207969665527,
 'f3': 4.967489719390869,
 'f1': 5.508023262023926}

In [751]:
sorted_list = list(sorted_dict.keys())

In [752]:
feat = pd.read_csv('sample_template/submission_template_file2.csv')

In [753]:
feat['feat_id'] = np.arange(1, 11)

In [754]:
feat['feat_name'] = cols

In [755]:
feat

Unnamed: 0,feat_id,feat_name,feat_description,model_feat_imp_train,feat_rank_train
0,1,team1_winp_team2_last10,,,
1,2,team_average_score_ratios,,,
2,3,win_percentage_team1_last_15,,,
3,4,win_percentage_team2_last_15,,,
4,5,batsmen_performance_ratio_last5,,,
5,6,bowlers_performance_ratio_last5,,,
6,7,bowlers_economy_ratios_last5,,,
7,8,team_sixes_ratios_last15,,,
8,9,team_batsmen_above_fifty_ratios,,,
9,10,team_batsmen_above_seventy_five_ratios,,,


In [756]:
for i in range(10):
    feat['model_feat_imp_train'][i] = ft_dict[f'f{12+i}']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat['model_feat_imp_train'][i] = ft_dict[f'f{12+i}']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat['model_feat_imp_train'][i] = ft_dict[f'f{12+i}']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat['model_feat_imp_train'][i] = ft_dict[f'f{12+i}']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat['

In [757]:
filtered_feats = []

for i in range(10):
    filtered_feats.append(f'f{14+i}')

In [758]:
filtered_feats

['f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23']

In [759]:
for i in range(10):
    feat['feat_rank_train'][i] = sorted_list.index(filtered_feats[i]) + 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat['feat_rank_train'][i] = sorted_list.index(filtered_feats[i]) + 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat['feat_rank_train'][i] = sorted_list.index(filtered_feats[i]) + 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat['feat_rank_train'][i] = sorted_list.index(filtered_feats[i]) + 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/index

In [760]:
feat.to_csv('sample_template/submission_file2-2.csv',index=False)

In [761]:
submission1.to_csv('sample_template/submission_file1-2.csv', index=False)

In [762]:
# train_check=pd.read_csv('testing/train_data.csv')

In [763]:
# # checking count of matchings between y_pred_01 and winner_id
count=0
for i in range(len(y_pred_01)):
    if y_pred_01[i] == new_train_data['winner_id'][i]:
        count+=1
print(count)

799


In [481]:
# correlation with winner_id
correlation = new_train_data.corr()['winner_id'].sort_values(ascending=False)

correlation

winner_id                          1.000000
win_pred_team_id                   0.995377
team2_id                           0.979063
team1_id                           0.968495
win_pred_score                     0.410026
lighting                           0.344856
venue                              0.153232
ground_avg_runs_last15             0.111647
city                               0.074610
team_average_score_ratios          0.064217
toss_decision                      0.052883
bowlers_economy_ratios_last5       0.043910
ground_id                          0.017925
match_id                           0.009817
team1_winp_team2_last10            0.006720
batsmen_performance_ratio_last5    0.005853
team1only_avg_runs_last15          0.005800
team1_winp_team2_last5             0.002836
team1_winp_team2_last15            0.002836
win_percentage_team2_last_15      -0.003678
bowlers_economy_ratios            -0.010764
win_percentage_team1_last_15      -0.022032
bowlers_extras_ratios           