## Training and Testing Pipeline

In [405]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [406]:
train_data = pd.read_csv('testing/train_data.csv')
test_data = pd.read_csv('testing/test_data.csv')

In [407]:
# Display options
pd.set_option('display.max_columns',50)

In [408]:
train_data.shape
train_data.head(2)

(948, 23)

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,winner,winner_id,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15
0,9331181,Ba,11283,9373356.0:7857520.0:4232164.0:4566540.0:329940...,Hl Ph,12634,3500958.0:4231751.0:2735081.0:2035102.0:369833...,Hl Ph,12634,Hl Ph,field,Hr Ct Sm Ie,Indore,2022-10-20,day/night match,Sd Mq Ai Ty,2022/23,7398,1.666667,0.672131,139.0,100.0,157.178571
1,8797060,Ed,20,2089079.0:6139370.0:2076192.0:62432.0:2083409....,Wt Is,41,4690258.0:4069666.0:4230127.0:1942317.0:161392...,Ed,20,Wt Is,field,Kn Ol Bn Bs,Bridgetown,2022-01-23,day/night match,Ed tr of Wt Is,2021/22,1406,1.285714,1.952381,156.0,50.0,103.5


In [409]:
test_data.shape
test_data.head(2)

(271, 21)

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15
0,9250275,Jy,28594,7438955.0:8271969.0:8369661.0:3685247.0:259025...,Ud Ss of Aa,90,2436944.0:8117500.0:6703528.0:3010748.0:161387...,Ud Ss of Aa,field,Bo Ac Cb,Bulawayo,2022-07-11,day match,Ud Ss of Aa tr of Ze,2022,3226,0.0,0.019608,,0.0,
1,9262189,Ga An Ws,36084,3715690.0:6818622.0:4069666.0:393014.0:4690188...,Ja Ts,36098,4690258.0:3761246.0:8464385.0:5742470.0:315072...,Ja Ts,field,Pe Sm Ga,Providence,2022-09-21,night match,Cn Pr Le,2022,13915,0.615385,0.344262,151.285714,66.67,153.5


# Calculating batsmen features

In [410]:
batsmen=pd.read_csv('dataset/batsman_level_scorecard.csv')

In [411]:
batsmen.head(2)

Unnamed: 0,match id,batsman,batsman_id,batsman_details,is_batsman_captain,is_batsman_keeper,inning,runs,balls_faced,over_faced_first,wicket kind,out_by_bowler,out_by_fielder,bowler_id,bowler_details,is_bowler_keeper,is_bowler_captain,strike_rate,Fours,Sixes,match_dt
0,8638034,KD Ce,7907451.0,NZ:Right-hand bat:Right-arm medium-fast:,0.0,0.0,1,7,5,1.1,caught,JS Nm,TM Jn,2486896.0,NZ:Left-hand bat:Right-arm medium-fast:,0.0,0.0,140.0,1.0,,2021-01-01
1,8638034,TL St,4381761.0,NZ:Right-hand bat:None:,0.0,1.0,1,46,46,1.2,caught,R Ra,MG Bl,6718382.0,NZ:Left-hand bat:Slow left-arm orthodox:,0.0,0.0,100.0,4.0,1.0,2021-01-01


In [412]:
# calculating 2 dictionaries to store {batsman_id: batsman average } and {batsman_id: batsman strike rate}

total_runs = {}  
balls_faced = {}
matches_played = {}

for i in range(len(batsmen)):
    batsman_id = batsmen['batsman_id'][i]
    runs = batsmen['runs'][i]
    balls = batsmen['balls_faced'][i]
    match_id = batsmen['match id'][i]
    if batsman_id in total_runs:
        total_runs[batsman_id] += runs
        balls_faced[batsman_id] += balls
        matches_played[batsman_id] += 1
    else:
        total_runs[batsman_id] = runs
        balls_faced[batsman_id] = balls
        matches_played[batsman_id] = 1
        
batsman_avg = {}
batsman_sr = {}

for batsman_id in total_runs:
    batsman_avg[batsman_id] = total_runs[batsman_id] / matches_played[batsman_id]
    batsman_sr[batsman_id] = total_runs[batsman_id] / balls_faced[batsman_id] * 100
 

# Bowler level features

In [413]:
bowlers=pd.read_csv('dataset/bowler_level_scorecard.csv')

In [414]:
bowlers.head()

Unnamed: 0,match id,bowler,bowler_id,bowler_details,is_bowler_captain,is_bowler_keeper,inning,runs,wicket_count,balls_bowled,economy,maiden,dots,Fours,Sixes,wides,noballs,match_dt
0,8638034,BV Ss,6718396.0,NZ:Right-hand bat:Right-arm medium-fast:,0.0,0.0,1,21,0,12,10.5,0,3.0,1,1,3,1,2021-01-01
1,8638034,HK Bt,1585464.0,NZ:Left-hand bat:Right-arm medium-fast:,0.0,0.0,1,34,1,24,8.5,0,7.0,3,1,2,0,2021-01-01
2,8638034,JS Nm,2486896.0,NZ:Left-hand bat:Right-arm medium-fast:,0.0,0.0,1,23,2,24,5.75,0,12.0,1,1,0,0,2021-01-01
3,8638034,LV vn Bk,3083667.0,NED:Right-hand bat:Right-arm medium-fast:,0.0,0.0,1,31,1,24,7.75,0,9.0,4,0,2,0,2021-01-01
4,8638034,PF Yd,4950294.0,NZ:Right-hand bat:Legbreak googly:,0.0,0.0,1,21,1,24,5.25,0,8.0,0,0,1,0,2021-01-01


In [415]:
# calculating 2 dictionaries to store {bowler_id: bowler strike rate } , {bowler_id: bowler economy} , {bowler_id: bowler extras rate}

total_matches = {}
total_wickets = {}
total_ball_bowled = {}
total_extras = {}
total_economy = {}

for i in range(len(bowlers)):
    bowler_id = bowlers['bowler_id'][i]
    wickets = bowlers['wicket_count'][i]
    balls = bowlers['balls_bowled'][i]
    runs = bowlers['runs'][i]
    extras = bowlers['wides'][i] + bowlers['noballs'][i]
    match_id = bowlers['match id'][i]
    economy=bowlers['economy'][i]
    if bowler_id in total_matches:
        total_matches[bowler_id] += 1
        total_wickets[bowler_id] += wickets
        total_ball_bowled[bowler_id] += balls
        total_extras[bowler_id] += extras
        total_economy[bowler_id] += economy
    else:
        total_matches[bowler_id] = 1
        total_wickets[bowler_id] = wickets
        total_ball_bowled[bowler_id] = balls
        total_extras[bowler_id] = extras
        total_economy[bowler_id] = economy
        
bowler_sr = {}
bowler_economy = {}
bowler_extras = {}

for bowler_id in total_matches:
    bowler_economy[bowler_id] = total_economy[bowler_id] / total_matches[bowler_id]
    bowler_extras[bowler_id] = total_extras[bowler_id] / total_ball_bowled[bowler_id]
    if total_wickets[bowler_id] == 0:
        bowler_sr[bowler_id] = 0
    else:
        bowler_sr[bowler_id] = total_ball_bowled[bowler_id] /total_wickets[bowler_id]
    

## Match level features

In [416]:
match=pd.read_csv('dataset/match_level_scorecard.csv')

In [417]:
match.head()

Unnamed: 0,match id,team1,team2,winner,by,win amount,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,umpire1,umpire2,inning1_runs,inning1_wickets,inning1_balls,inning2_runs,inning2_wickets,inning2_balls,team1_id,team1_roster_ids,team2_id,team2_roster_ids,series_type,winner_id,player_of_the_match_id
0,8638034,Nn Ds,Wn,Wn,wickets,9.0,Wn,field,By Ol,Mount Maunganui,2021-01-01,night match,Sr Sh,2020/21,17681,Dn Mw,TJ Pe,148,6,130,152.0,1.0,97.0,17982,7907451.0:4381761.0:31464.0:258649.0:4949790.0...,18570,2653993.0:6718326.0:6718382.0:2486896.0:228878...,other_domestic,18570,
1,8588005,Me Rs,Sy Tr,Sy Tr,runs,7.0,Sy Tr,field,Ca Ol,Carrara,2021-01-01,night match,Bg Bh Le,2020/21,6663,D Tr,PJ Ge,166,6,106,117.0,2.0,74.0,33942,37351.0:46794.0:5406540.0:2231928.0:181404.0:1...,33963,1506098.0:1749075.0:36665.0:2083409.0:7534652....,other_domestic,33963,1749075.0
2,8587837,Sy Ss,Be Ht,Be Ht,wickets,4.0,Sy Ss,bat,Be Ct Gd,Brisbane,2021-01-02,night match,Bg Bh Le,2020/21,1476,GJ Dn,PJ Ge,165,8,126,171.0,6.0,119.0,33956,7869987.0:7620283.0:2076192.0:4002340.0:306369...,33921,7620269.0:2286437.0:87191.0:5786766.0:3114803....,other_domestic,33921,3890984.0
3,8638041,Nn Ds,Oo,Oo,wickets,2.0,Oo,field,By Ol,Mount Maunganui,2021-01-02,night match,Sr Sh,2020/21,17681,GA St,TJ Pe,153,7,128,156.0,8.0,126.0,17982,7907451.0:4381761.0:31464.0:4949790.0:258649.0...,18360,2319638.0:256080.0:7918280.0:3913447.0:2690498...,other_domestic,18360,
4,8587921,Ht Hs,Me Ss,Ht Hs,runs,21.0,Me Ss,field,Be Ol,Hobart,2021-01-02,day/night match,Bg Bh Le,2020/21,6348,GA Ad,NR Je,164,5,126,143.0,9.0,123.0,33928,4223883.0:2161599.0:1655436.0:5788418.0:319948...,33949,363047.0:2275097.0:3901078.0:2275195.0:4230127...,other_domestic,33928,3125849.0


In [418]:
# # calculating 2 dictionaries to store {team_id: team average }
#  if team1 has won toss and choose bat then inning1 score will be team1 score and inning2 score will be team2 score

total_runs = {}
total_matches={}
for i in range(len(match)):
    
    toss_winner = match['toss winner'][i]
    team2=toss_winner
    
    if toss_winner == match['team2_id'][i]:
        team1 = match['team1_id'][i]
    else:
        team1 = match['team2_id'][i]
    
    if match['toss decision'][i] == 'bat':
        team2_score=match['inning1_runs'][i]
        team1_score=match['inning2_runs'][i]
    
    else:
        team1_score=match['inning1_runs'][i]
        team2_score=match['inning2_runs'][i]
    
    if team1 in total_runs:
        total_runs[team1] += team1_score
        total_matches[team1] += 1
    
    else:
        total_runs[team1] = team1_score
        total_matches[team1] = 1
        
    if team2 in total_runs:
        total_runs[team2] += team2_score
        total_matches[team2] += 1
    
    else:
        total_runs[team2] = team2_score
        total_matches[team2] = 1
        
team_avg = {}

for team_id in total_runs:
    team_avg[team_id] = total_runs[team_id] / total_matches[team_id]

In [419]:
def winpCrossLastn(team1_id, team2_id, date, n):
    '''
    Function to compute team1's win% against team2 from the current game in their past n encounters.
    
    Input-
    1. team1_id: ID of team1 to calculate win% of.
    2. team2_id: ID of team2 to calculate win% against.
    3: date: match date of the current game for which the feature is to be calculated.
    4. n: look-back window of games for both these teams.
    
    Output- None
    
    Returns- Float value denoting team1's win% against team2 in their past n games against each other.
    '''
    # filter out games where either team1_id is input team1 and team2_id is input team2, or where team2_id is input team1 and team1_id is input team2.
    # Also, match date is less than current games's input date, sort desc by date and get top n rows (games)
    df_rel = match[(match['match_dt']<date)&\
                      (((match['team1_id']==team1_id)&(match['team2_id']==team2_id))|((match['team1_id']==team2_id)&(match['team2_id']==team1_id)))]\
                        .sort_values(by='match_dt', ascending=False).head(n)
    win_count = df_rel[df_rel['winner_id']==team1_id].shape[0] # Counting number of rows (games) where winner is input team1.
    if win_count == 0:
        return 0
    return round(win_count*100/df_rel.shape[0], 2) # return Float denoting team1's win% against team2 in past n games rounded to 2 decimal places.


In [420]:
def win_percentage_last_n_matches(team_id, date,n):
    '''
    Function to calculate a team's win percentage in their last n games.

    '''
    # Filter games where team1 or team2 is the input team_id and the match date is before the input date
    df_rel = match[(match['match_dt'] < date) &
                            ((match['team1_id'] == team_id) | (match['team2_id'] == team_id))]\
                            .sort_values(by='match_dt', ascending=False).head(n)
    
    # Calculate the number of wins
    wins = df_rel[(df_rel['winner_id'] == team_id)].shape[0]
    
    # Calculate the number of matches
    total_matches = df_rel.shape[0]
    
    if total_matches == 0:
        return 0.0  # Avoid division by zero
    
    # Calculate win percentage
    win_percentage = (wins / total_matches) * 100
    return win_percentage


In [421]:
train_data.head(2)

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,winner,winner_id,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15
0,9331181,Ba,11283,9373356.0:7857520.0:4232164.0:4566540.0:329940...,Hl Ph,12634,3500958.0:4231751.0:2735081.0:2035102.0:369833...,Hl Ph,12634,Hl Ph,field,Hr Ct Sm Ie,Indore,2022-10-20,day/night match,Sd Mq Ai Ty,2022/23,7398,1.666667,0.672131,139.0,100.0,157.178571
1,8797060,Ed,20,2089079.0:6139370.0:2076192.0:62432.0:2083409....,Wt Is,41,4690258.0:4069666.0:4230127.0:1942317.0:161392...,Ed,20,Wt Is,field,Kn Ol Bn Bs,Bridgetown,2022-01-23,day/night match,Ed tr of Wt Is,2021/22,1406,1.285714,1.952381,156.0,50.0,103.5


In [422]:
test_data.head(2)

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15
0,9250275,Jy,28594,7438955.0:8271969.0:8369661.0:3685247.0:259025...,Ud Ss of Aa,90,2436944.0:8117500.0:6703528.0:3010748.0:161387...,Ud Ss of Aa,field,Bo Ac Cb,Bulawayo,2022-07-11,day match,Ud Ss of Aa tr of Ze,2022,3226,0.0,0.019608,,0.0,
1,9262189,Ga An Ws,36084,3715690.0:6818622.0:4069666.0:393014.0:4690188...,Ja Ts,36098,4690258.0:3761246.0:8464385.0:5742470.0:315072...,Ja Ts,field,Pe Sm Ga,Providence,2022-09-21,night match,Cn Pr Le,2022,13915,0.615385,0.344262,151.285714,66.67,153.5


# Combining the features of dataset

In [423]:
# from geopy.geocoders import Nominatim

# # Initialize Nominatim API
# geolocator = Nominatim(user_agent="my_geocoder")

# Creating new columns in train_data and initializing with 0
train_data['batsmens_averages_1'] = 0
train_data['batsmens_strike_rates_1'] = 0
train_data['bowlers_strike_rates_1'] = 0
train_data['bowlers_economy_1'] = 0
train_data['bowlers_extras_1'] = 0

train_data['batsmens_averages_2'] = 0
train_data['batsmens_strike_rates_2'] = 0
train_data['bowlers_strike_rates_2'] = 0
train_data['bowlers_economy_2'] = 0
train_data['bowlers_extras_2'] = 0

train_data['batsmens_averages_ratios'] = 0
train_data['batsmens_strike_rates_ratios'] = 0
train_data['bowlers_strike_rates_ratios'] = 0
train_data['bowlers_economy_ratios'] = 0
train_data['bowlers_extras_ratios'] = 0

train_data['team1_winp_team2_last10']=0
train_data['team1_average_score'] = 0
train_data['team2_average_score'] = 0
train_data['win_percentage_team1_last_15'] = 0
train_data['win_percentage_team2_last_15'] = 0


# train_data['location_humidity'] = 0
# train_data['location_temperature'] = 0

# Filling the values in the new columns
for i in range(len(train_data)):
    # Getting ids of batsmen and bowlers from team roster id separated by :
    team1_players_ids = train_data['team1_roster_ids'][i].split(':')
    team2_players_ids = train_data['team2_roster_ids'][i].split(':')
    # print(i,team1_players_ids,team2_players_ids)
    # For team1 check if id is present in the dictionary and add the values
    for player_id in team1_players_ids:
        player_id = int(float(player_id))
        if player_id in batsman_avg:
            train_data.at[i, 'batsmens_averages_1'] += batsman_avg[player_id]
            train_data.at[i, 'batsmens_strike_rates_1'] += batsman_sr[player_id]
        if player_id in bowler_sr:
            train_data.at[i, 'bowlers_strike_rates_1'] += bowler_sr[player_id]
            train_data.at[i, 'bowlers_economy_1'] += bowler_economy[player_id]
            train_data.at[i, 'bowlers_extras_1'] += bowler_extras[player_id]
            
    # For team2 check if id is present in the dictionary and add the values
    for player_id in team2_players_ids:
        player_id = int(float(player_id))
        if player_id in batsman_avg:
            train_data.at[i, 'batsmens_averages_2'] += batsman_avg[player_id]
            train_data.at[i, 'batsmens_strike_rates_2'] += batsman_sr[player_id]
        if player_id in bowler_sr:
            train_data.at[i, 'bowlers_strike_rates_2'] += bowler_sr[player_id]
            train_data.at[i, 'bowlers_economy_2'] += bowler_economy[player_id]
            train_data.at[i, 'bowlers_extras_2'] += bowler_extras[player_id]
    
    # filling ratios
    if train_data['batsmens_averages_2'][i] == 0:
        train_data['batsmens_averages_2'][i] = 1
        
    if train_data['batsmens_strike_rates_2'][i] == 0:
        train_data['batsmens_strike_rates_2'][i] = 1
    if train_data['bowlers_strike_rates_2'][i] == 0:
        train_data['bowlers_strike_rates_2'][i] = 1
        
    if train_data['bowlers_economy_2'][i] == 0:
        train_data['bowlers_economy_2'][i] = 1
        
    if train_data['bowlers_extras_2'][i] == 0:
        train_data['bowlers_extras_2'][i] = 1
        
    
    train_data.at[i, 'batsmens_averages_ratios'] = train_data['batsmens_averages_1'][i] / train_data['batsmens_averages_2'][i]
    train_data.at[i, 'batsmens_strike_rates_ratios'] = train_data['batsmens_strike_rates_1'][i] / train_data['batsmens_strike_rates_2'][i]
    train_data.at[i, 'bowlers_strike_rates_ratios'] = train_data['bowlers_strike_rates_1'][i] / train_data['bowlers_strike_rates_2'][i]
    train_data.at[i, 'bowlers_economy_ratios'] = train_data['bowlers_economy_1'][i] / train_data['bowlers_economy_2'][i]
    train_data.at[i, 'bowlers_extras_ratios'] = train_data['bowlers_extras_1'][i] / train_data['bowlers_extras_2'][i]
    
    
    # filling team win% against each other in last 10 games
    team1_id = train_data['team1_id'][i]
    team2_id = train_data['team2_id'][i]
    date = train_data['match_dt'][i]
    
    train_data.at[i, 'team1_winp_team2_last10'] = winpCrossLastn(team1_id, team2_id, date, 10)

            
    # filling team average scores
    team1_id = train_data['team1_id'][i]
    team2_id = train_data['team2_id'][i]
    
    if team1_id in team_avg:
        train_data.at[i, 'team1_average_score'] = team_avg[team1_id]
    else:
        train_data.at[i, 'team1_average_score'] = 0
    if team2_id in team_avg:
        train_data.at[i, 'team2_average_score'] = team_avg[team2_id]
    else:
        train_data.at[i, 'team2_average_score'] = 0
        
    # filling win% of team1 in last 15 games from matchscore_updated
    
    train_data.at[i, 'win_percentage_team1_last_15'] = win_percentage_last_n_matches(team1_id, date, 15)
    train_data.at[i, 'win_percentage_team2_last_15'] = win_percentage_last_n_matches(team2_id, date, 15)
    



  train_data.at[i, 'batsmens_averages_1'] += batsman_avg[player_id]
  train_data.at[i, 'batsmens_strike_rates_1'] += batsman_sr[player_id]
  train_data.at[i, 'bowlers_strike_rates_1'] += bowler_sr[player_id]
  train_data.at[i, 'bowlers_economy_1'] += bowler_economy[player_id]
  train_data.at[i, 'bowlers_extras_1'] += bowler_extras[player_id]
  train_data.at[i, 'batsmens_averages_2'] += batsman_avg[player_id]
  train_data.at[i, 'batsmens_strike_rates_2'] += batsman_sr[player_id]
  train_data.at[i, 'bowlers_strike_rates_2'] += bowler_sr[player_id]
  train_data.at[i, 'bowlers_economy_2'] += bowler_economy[player_id]
  train_data.at[i, 'bowlers_extras_2'] += bowler_extras[player_id]
  train_data.at[i, 'batsmens_averages_ratios'] = train_data['batsmens_averages_1'][i] / train_data['batsmens_averages_2'][i]
  train_data.at[i, 'batsmens_strike_rates_ratios'] = train_data['batsmens_strike_rates_1'][i] / train_data['batsmens_strike_rates_2'][i]
  train_data.at[i, 'bowlers_strike_rates_ratios'] 

In [424]:
# dropping team1_roster_ids , team2_roster_ids , winner , team1 , team2 ,  columns 
train_data = train_data.drop(['team1_roster_ids', 'team2_roster_ids', 'winner', 'team1', 'team2','batsmens_averages_1','batsmens_strike_rates_1','bowlers_strike_rates_1','bowlers_economy_1','bowlers_extras_1','batsmens_averages_2','batsmens_strike_rates_2','bowlers_strike_rates_2','bowlers_economy_2','bowlers_extras_2'], axis=1)

In [425]:
train_data.head(2)

Unnamed: 0,match id,team1_id,team2_id,winner_id,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last10,team1_average_score,team2_average_score,win_percentage_team1_last_15,win_percentage_team2_last_15
0,9331181,11283,12634,12634,Hl Ph,field,Hr Ct Sm Ie,Indore,2022-10-20,day/night match,Sd Mq Ai Ty,2022/23,7398,1.666667,0.672131,139.0,100.0,157.178571,2.028272,2.611098,4.638805,2.351068,1.710894,100.0,137.666667,116.5,60.0,63.636364
1,8797060,20,41,20,Wt Is,field,Kn Ol Bn Bs,Bridgetown,2022-01-23,day/night match,Ed tr of Wt Is,2021/22,1406,1.285714,1.952381,156.0,50.0,103.5,1.024794,0.966568,0.797703,0.80708,0.374242,50.0,157.730769,157.791667,60.0,40.0


In [426]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# List of categorical columns to encode
categorical_columns = ['toss winner', 'toss decision', 'venue', 'match_dt', 'lighting', 'series_name', 'season', 'city']

# Initialize the label encoder
label_encoder = LabelEncoder()

# Apply label encoding to each categorical column
for col in categorical_columns:
    train_data[col] = label_encoder.fit_transform(train_data[col])

# Display the first 5 rows of the dataframe
train_data.head(5)


Unnamed: 0,match id,team1_id,team2_id,winner_id,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last10,team1_average_score,team2_average_score,win_percentage_team1_last_15,win_percentage_team2_last_15
0,9331181,11283,12634,12634,44,1,44,55,212,1,58,2,7398,1.666667,0.672131,139.0,100.0,157.178571,2.028272,2.611098,4.638805,2.351068,1.710894,100.0,137.666667,116.5,60.0,63.636364
1,8797060,20,41,20,138,1,53,13,20,1,22,0,1406,1.285714,1.952381,156.0,50.0,103.5,1.024794,0.966568,0.797703,0.80708,0.374242,50.0,157.730769,157.791667,60.0,40.0
2,9433269,10576,8987,10576,81,1,111,83,370,1,79,3,251,0.857143,0.672131,173.266667,0.0,154.333333,1.000138,0.909712,0.85019,0.717828,1.024538,0.0,166.944444,149.588235,40.0,46.666667
3,9587073,36084,36070,36084,34,0,10,105,434,0,18,3,14300,2.166667,1.97561,164.266667,50.0,144.25,1.116102,1.052218,0.813452,0.878287,0.700727,50.0,156.3,177.076923,60.0,66.666667
4,9516457,30407,48341,48341,40,1,75,25,331,2,35,3,7118,0.818182,1.327869,164.666667,0.0,189.0,1.035394,1.240634,1.084338,1.168234,1.480742,0.0,172.4,167.916667,53.333333,69.230769


In [427]:
# filling nan values with 0

train_data.fillna(0, inplace=True)

In [428]:
# changing column names of match id to match_id , toss winner to toss_winner , toss decision to toss_decision

train_data['match_id'] = train_data['match id']
train_data['toss_winner'] = train_data['toss winner']
train_data['toss_decision'] = train_data['toss decision']

train_data = train_data.drop(['match id', 'toss winner', 'toss decision'], axis=1)

## For test data also

In [429]:
# from geopy.geocoders import Nominatim

# # Initialize Nominatim API
# geolocator = Nominatim(user_agent="my_geocoder")

# Creating new columns in test_data and initializing with 0
test_data['batsmens_averages_1'] = 0
test_data['batsmens_strike_rates_1'] = 0
test_data['bowlers_strike_rates_1'] = 0
test_data['bowlers_economy_1'] = 0
test_data['bowlers_extras_1'] = 0

test_data['batsmens_averages_2'] = 0
test_data['batsmens_strike_rates_2'] = 0
test_data['bowlers_strike_rates_2'] = 0
test_data['bowlers_economy_2'] = 0
test_data['bowlers_extras_2'] = 0

test_data['batsmens_averages_ratios'] = 0
test_data['batsmens_strike_rates_ratios'] = 0
test_data['bowlers_strike_rates_ratios'] = 0
test_data['bowlers_economy_ratios'] = 0
test_data['bowlers_extras_ratios'] = 0

test_data['team1_winp_team2_last10']=0
test_data['team1_average_score'] = 0
test_data['team2_average_score'] = 0
test_data['win_percentage_team1_last_15'] = 0
test_data['win_percentage_team2_last_15'] = 0


# test_data['location_humidity'] = 0
# test_data['location_temperature'] = 0

# Filling the values in the new columns
for i in range(len(test_data)):
    # Getting ids of batsmen and bowlers from team roster id separated by :
    team1_players_ids = test_data['team1_roster_ids'][i].split(':')
    team2_players_ids = test_data['team2_roster_ids'][i].split(':')
    # print(i,team1_players_ids,team2_players_ids)
    # For team1 check if id is present in the dictionary and add the values
    for player_id in team1_players_ids:
        player_id = int(float(player_id))
        if player_id in batsman_avg:
            test_data.at[i, 'batsmens_averages_1'] += batsman_avg[player_id]
            test_data.at[i, 'batsmens_strike_rates_1'] += batsman_sr[player_id]
        if player_id in bowler_sr:
            test_data.at[i, 'bowlers_strike_rates_1'] += bowler_sr[player_id]
            test_data.at[i, 'bowlers_economy_1'] += bowler_economy[player_id]
            test_data.at[i, 'bowlers_extras_1'] += bowler_extras[player_id]
            
    # For team2 check if id is present in the dictionary and add the values
    for player_id in team2_players_ids:
        player_id = int(float(player_id))
        if player_id in batsman_avg:
            test_data.at[i, 'batsmens_averages_2'] += batsman_avg[player_id]
            test_data.at[i, 'batsmens_strike_rates_2'] += batsman_sr[player_id]
        if player_id in bowler_sr:
            test_data.at[i, 'bowlers_strike_rates_2'] += bowler_sr[player_id]
            test_data.at[i, 'bowlers_economy_2'] += bowler_economy[player_id]
            test_data.at[i, 'bowlers_extras_2'] += bowler_extras[player_id]
    
    # filling ratios
    if test_data['batsmens_averages_2'][i] == 0:
        test_data['batsmens_averages_2'][i] = 1
        
    if test_data['batsmens_strike_rates_2'][i] == 0:
        test_data['batsmens_strike_rates_2'][i] = 1
    if test_data['bowlers_strike_rates_2'][i] == 0:
        test_data['bowlers_strike_rates_2'][i] = 1
        
    if test_data['bowlers_economy_2'][i] == 0:
        test_data['bowlers_economy_2'][i] = 1
        
    if test_data['bowlers_extras_2'][i] == 0:
        test_data['bowlers_extras_2'][i] = 1
        
    
    test_data.at[i, 'batsmens_averages_ratios'] = test_data['batsmens_averages_1'][i] / test_data['batsmens_averages_2'][i]
    test_data.at[i, 'batsmens_strike_rates_ratios'] = test_data['batsmens_strike_rates_1'][i] / test_data['batsmens_strike_rates_2'][i]
    test_data.at[i, 'bowlers_strike_rates_ratios'] = test_data['bowlers_strike_rates_1'][i] / test_data['bowlers_strike_rates_2'][i]
    test_data.at[i, 'bowlers_economy_ratios'] = test_data['bowlers_economy_1'][i] / test_data['bowlers_economy_2'][i]
    test_data.at[i, 'bowlers_extras_ratios'] = test_data['bowlers_extras_1'][i] / test_data['bowlers_extras_2'][i]
    
    
    # filling team win% against each other in last 10 games
    team1_id = test_data['team1_id'][i]
    team2_id = test_data['team2_id'][i]
    date = test_data['match_dt'][i]
    
    test_data.at[i, 'team1_winp_team2_last10'] = winpCrossLastn(team1_id, team2_id, date, 10)

            
    # filling team average scores
    team1_id = test_data['team1_id'][i]
    team2_id = test_data['team2_id'][i]
    
    if team1_id in team_avg:
        test_data.at[i, 'team1_average_score'] = team_avg[team1_id]
    else:
        test_data.at[i, 'team1_average_score'] = 0
    if team2_id in team_avg:
        test_data.at[i, 'team2_average_score'] = team_avg[team2_id]
    else:
        test_data.at[i, 'team2_average_score'] = 0
        
    # filling win% of team1 in last 15 games from matchscore_updated
    
    test_data.at[i, 'win_percentage_team1_last_15'] = win_percentage_last_n_matches(team1_id, date, 15)
    test_data.at[i, 'win_percentage_team2_last_15'] = win_percentage_last_n_matches(team2_id, date, 15)
    

  test_data.at[i, 'batsmens_averages_1'] += batsman_avg[player_id]
  test_data.at[i, 'batsmens_strike_rates_1'] += batsman_sr[player_id]
  test_data.at[i, 'bowlers_economy_1'] += bowler_economy[player_id]
  test_data.at[i, 'bowlers_extras_1'] += bowler_extras[player_id]
  test_data.at[i, 'batsmens_averages_2'] += batsman_avg[player_id]
  test_data.at[i, 'batsmens_strike_rates_2'] += batsman_sr[player_id]
  test_data.at[i, 'bowlers_economy_2'] += bowler_economy[player_id]
  test_data.at[i, 'bowlers_strike_rates_2'] += bowler_sr[player_id]
  test_data.at[i, 'bowlers_extras_2'] += bowler_extras[player_id]
  test_data.at[i, 'batsmens_averages_ratios'] = test_data['batsmens_averages_1'][i] / test_data['batsmens_averages_2'][i]
  test_data.at[i, 'batsmens_strike_rates_ratios'] = test_data['batsmens_strike_rates_1'][i] / test_data['batsmens_strike_rates_2'][i]
  test_data.at[i, 'bowlers_strike_rates_ratios'] = test_data['bowlers_strike_rates_1'][i] / test_data['bowlers_strike_rates_2'][i]
  t

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['bowlers_extras_2'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['bowlers_extras_2'][i] = 1


In [430]:
# dropping team1_roster_ids , team2_roster_ids , winner , team1 , team2 ,  columns 
test_data = test_data.drop(['team1_roster_ids', 'team2_roster_ids','team1', 'team2','batsmens_averages_1','batsmens_strike_rates_1','bowlers_strike_rates_1','bowlers_economy_1','bowlers_extras_1','batsmens_averages_2','batsmens_strike_rates_2','bowlers_strike_rates_2','bowlers_economy_2','bowlers_extras_2'], axis=1)

In [431]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# List of categorical columns to encode
categorical_columns = ['toss winner', 'toss decision', 'venue', 'match_dt', 'lighting', 'series_name', 'season', 'city']

# Initialize the label encoder
label_encoder = LabelEncoder()

# Apply label encoding to each categorical column
for col in categorical_columns:
    test_data[col] = label_encoder.fit_transform(test_data[col])

# Display the first 5 rows of the dataframe
test_data.head(5)


Unnamed: 0,match id,team1_id,team2_id,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last10,team1_average_score,team2_average_score,win_percentage_team1_last_15,win_percentage_team2_last_15
0,9250275,28594,90,104,1,11,13,58,0,49,1,3226,0.0,0.019608,,0.0,,0.580853,0.443698,0.962702,0.439479,1.313131,0.0,141.0,144.5,0.0,50.0
1,9262189,36084,36098,40,1,59,70,89,2,11,1,13915,0.615385,0.344262,151.285714,66.67,153.5,1.089279,1.091648,0.840087,0.961859,0.787592,66.67,156.3,148.125,42.857143,46.666667
2,9128776,30428,48334,82,0,8,58,40,2,18,1,2764,0.842105,0.753086,171.066667,100.0,179.625,1.080138,0.955992,0.771592,0.993761,0.900654,100.0,168.142857,159.4,46.666667,77.777778
3,9586919,36098,36112,92,1,23,38,195,2,11,3,11892,0.285714,1.487805,166.0,75.0,155.5,0.871249,0.848296,0.845522,0.870841,0.731058,75.0,148.125,166.8,53.333333,40.0
4,9128538,30414,48341,31,1,46,71,34,2,18,1,18752,2.375,0.31033,169.933333,0.0,164.125,1.107382,1.166437,0.963643,1.001825,0.946688,0.0,165.307692,167.916667,46.666667,66.666667


In [432]:
# filling nan values with 0

test_data.fillna(0, inplace=True)

In [433]:
# changing column names of match id to match_id , toss winner to toss_winner , toss decision to toss_decision

test_data['match_id'] = test_data['match id']
test_data['toss_winner'] = test_data['toss winner']
test_data['toss_decision'] = test_data['toss decision']

test_data = test_data.drop(['match id', 'toss winner', 'toss decision'], axis=1)

In [434]:
test_data.head(15)

Unnamed: 0,team1_id,team2_id,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last10,team1_average_score,team2_average_score,win_percentage_team1_last_15,win_percentage_team2_last_15,match_id,toss_winner,toss_decision
0,28594,90,11,13,58,0,49,1,3226,0.0,0.019608,0.0,0.0,0.0,0.580853,0.443698,0.962702,0.439479,1.313131,0.0,141.0,144.5,0.0,50.0,9250275,104,1
1,36084,36098,59,70,89,2,11,1,13915,0.615385,0.344262,151.285714,66.67,153.5,1.089279,1.091648,0.840087,0.961859,0.787592,66.67,156.3,148.125,42.857143,46.666667,9262189,40,1
2,30428,48334,8,58,40,2,18,1,2764,0.842105,0.753086,171.066667,100.0,179.625,1.080138,0.955992,0.771592,0.993761,0.900654,100.0,168.142857,159.4,46.666667,77.777778,9128776,82,0
3,36098,36112,23,38,195,2,11,3,11892,0.285714,1.487805,166.0,75.0,155.5,0.871249,0.848296,0.845522,0.870841,0.731058,75.0,148.125,166.8,53.333333,40.0,9586919,92,1
4,30414,48341,46,71,34,2,18,1,18752,2.375,0.31033,169.933333,0.0,164.125,1.107382,1.166437,0.963643,1.001825,0.946688,0.0,165.307692,167.916667,46.666667,66.666667,9128538,31,1
5,30407,30435,57,20,166,2,18,3,7118,0.4,0.672131,177.066667,50.0,184.5,0.941093,0.995272,1.001928,0.934268,0.73699,50.0,172.4,169.307692,46.666667,40.0,9516653,57,1
6,36014,30414,45,23,161,2,18,3,2050,0.681818,0.344262,183.466667,25.0,156.35,0.921257,1.034135,0.921563,0.955612,0.797946,25.0,170.0,165.307692,40.0,53.333333,9516534,12,1
7,9701,6698,81,81,57,2,50,1,1042,3.2,0.60396,163.066667,0.0,177.321429,0.937888,1.114808,0.63588,0.685779,1.635887,0.0,159.5,165.857143,66.666667,66.666667,9085474,19,1
8,7727,7573,82,76,184,1,50,3,8301,2.428571,2.904762,167.0,100.0,151.6,1.301483,1.165019,1.046915,0.940972,1.415819,100.0,158.0,159.3125,80.0,40.0,9433752,30,1
9,30428,36014,67,42,153,1,18,3,13880,1.727273,21.0,171.066667,66.67,186.5,1.161314,1.008119,0.596095,1.011756,1.146592,66.67,168.142857,170.0,46.666667,33.333333,9516359,89,1


In [435]:
train_data.shape
test_data.shape

(948, 28)

(271, 27)

In [436]:
new_test_data = test_data

In [437]:
new_train_data=train_data

In [438]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler

X_train = train_data.drop(['winner_id'], axis=1)
y_train = train_data['winner_id']

X_test = test_data

In [439]:
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

In [440]:
unique_classes = np.unique(y_train)
single_occurrence_classes = [c for c in unique_classes if np.sum(y_train == c) == 1]

# Oversample each single-occurrence class to 8 instances
oversampled_X_train = []

oversampled_y = []
for class_label in single_occurrence_classes:
    class_X_train = X_train[y_train == class_label]
    class_y = y_train[y_train == class_label]
    oversampled_X_train.extend([class_X_train] * (8 - len(class_X_train)))
    oversampled_y.extend([class_y] * (8 - len(class_y)))

# Concatenate the oversampled data with the original data
X_train_resampled = np.concatenate([X_train] + oversampled_X_train)
y_resampled = np.concatenate([y_train] + oversampled_y)

In [441]:
# i79 = np.where(y_test == 79)[0][0]
# i79

In [442]:
# Initialize SMOTE
smote = SMOTE(random_state=101, k_neighbors=1)

# Apply SMOTE to the entire dataset
X_train, y_train = smote.fit_resample(X_train_resampled, y_resampled)

In [443]:
# Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



In [444]:
# CatBoost
# cat_model = CatBoostClassifier()
# cat_model.fit(X_train, y_train)

# cat_accuracy = accuracy_score(y_test, cat_pred)
# cat_f1 = f1_score(y_test, cat_pred, average='macro')

In [445]:
test_data.shape

(271, 27)

In [446]:
X_test.shape

(271, 27)

In [447]:
# cat_model.predict(X_test)

In [448]:
new_test_data.head(5)

Unnamed: 0,team1_id,team2_id,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last10,team1_average_score,team2_average_score,win_percentage_team1_last_15,win_percentage_team2_last_15,match_id,toss_winner,toss_decision
0,28594,90,11,13,58,0,49,1,3226,0.0,0.019608,0.0,0.0,0.0,0.580853,0.443698,0.962702,0.439479,1.313131,0.0,141.0,144.5,0.0,50.0,9250275,104,1
1,36084,36098,59,70,89,2,11,1,13915,0.615385,0.344262,151.285714,66.67,153.5,1.089279,1.091648,0.840087,0.961859,0.787592,66.67,156.3,148.125,42.857143,46.666667,9262189,40,1
2,30428,48334,8,58,40,2,18,1,2764,0.842105,0.753086,171.066667,100.0,179.625,1.080138,0.955992,0.771592,0.993761,0.900654,100.0,168.142857,159.4,46.666667,77.777778,9128776,82,0
3,36098,36112,23,38,195,2,11,3,11892,0.285714,1.487805,166.0,75.0,155.5,0.871249,0.848296,0.845522,0.870841,0.731058,75.0,148.125,166.8,53.333333,40.0,9586919,92,1
4,30414,48341,46,71,34,2,18,1,18752,2.375,0.31033,169.933333,0.0,164.125,1.107382,1.166437,0.963643,1.001825,0.946688,0.0,165.307692,167.916667,46.666667,66.666667,9128538,31,1


In [449]:
team1_ids = new_test_data['team1_id']
team2_ids = new_test_data['team2_id']

In [491]:
import catboost as cb
from sklearn.metrics import accuracy_score, f1_score

# Assuming you have the training and test data: X_train, y_train, X_test, y_test

# Initialize and train the CatBoost model
cat_model = cb.CatBoostClassifier()
cat_model.fit(X_train, y_train)

# Get the predicted probabilities
cat_pred_proba = cat_model.predict_proba(X_test)

# Determine the predicted class based on the higher probability
cat_pred = [1 if proba[1] > proba[0] else 0 for proba in cat_pred_proba]

# Store the probabilities of predicting each team in a list
probabilities_list = [proba[1] for proba in cat_pred_proba]



Learning rate set to 0.078912
0:	learn: 4.8885936	total: 341ms	remaining: 5m 40s
1:	learn: 4.8365020	total: 744ms	remaining: 6m 11s
2:	learn: 4.7954476	total: 1.19s	remaining: 6m 36s
3:	learn: 4.7444414	total: 1.59s	remaining: 6m 35s
4:	learn: 4.6641634	total: 1.94s	remaining: 6m 26s
5:	learn: 4.6180452	total: 2.31s	remaining: 6m 22s
6:	learn: 4.5646635	total: 2.67s	remaining: 6m 19s
7:	learn: 4.5107025	total: 3.03s	remaining: 6m 16s
8:	learn: 4.4631228	total: 3.41s	remaining: 6m 15s
9:	learn: 4.4025982	total: 3.76s	remaining: 6m 12s
10:	learn: 4.3438528	total: 4.09s	remaining: 6m 8s
11:	learn: 4.2909544	total: 4.46s	remaining: 6m 7s
12:	learn: 4.2536256	total: 4.85s	remaining: 6m 7s
13:	learn: 4.1822968	total: 5.19s	remaining: 6m 5s
14:	learn: 4.1386320	total: 5.55s	remaining: 6m 4s
15:	learn: 4.0932348	total: 5.89s	remaining: 6m 2s
16:	learn: 4.0471530	total: 6.25s	remaining: 6m 1s
17:	learn: 4.0060824	total: 6.6s	remaining: 6m
18:	learn: 3.9571875	total: 6.94s	remaining: 5m 58s
19:	

<catboost.core.CatBoostClassifier at 0x2b10fc09dd0>

In [492]:
y_pred_01=cat_pred
win_prob=cat_pred_proba

In [500]:
win_prob

array([[0.01080245, 0.01492656, 0.00359981, ..., 0.0037198 , 0.00251834,
        0.00179335],
       [0.0013344 , 0.00091311, 0.0003397 , ..., 0.00057201, 0.00100788,
        0.00083103],
       [0.00189854, 0.00125279, 0.00154479, ..., 0.00144368, 0.00075725,
        0.00064444],
       ...,
       [0.00429791, 0.00396044, 0.01870964, ..., 0.00109272, 0.00071363,
        0.00070531],
       [0.00161126, 0.00146627, 0.0005244 , ..., 0.00079363, 0.00067195,
        0.00065127],
       [0.00079243, 0.00071637, 0.00063809, ..., 0.00094064, 0.00045841,
        0.00036957]])

In [498]:
len(win_prob)

271

In [493]:
# printing winning team along with their winning probability
for i in range(len(y_pred_01)):
    print(f"Match {i+1}: {y_pred_01[i]} with probability {win_prob[i]}")

Match 1: 1 with probability [0.01080245 0.01492656 0.00359981 0.00819047 0.08066714 0.01413776
 0.00880202 0.02398228 0.01950681 0.00240888 0.0189671  0.01022165
 0.01666525 0.00886249 0.02533696 0.00927478 0.03164361 0.00798736
 0.04456415 0.05117148 0.00361244 0.00249634 0.01105608 0.00205534
 0.01247788 0.00526227 0.00440703 0.00974448 0.0034003  0.0023985
 0.00529054 0.00474949 0.00504144 0.00671908 0.00174225 0.00399195
 0.00191289 0.00136167 0.00226546 0.00353557 0.00333641 0.00239477
 0.00288861 0.00644385 0.00370762 0.00201633 0.00157896 0.00380112
 0.00639264 0.01974451 0.00518138 0.00651141 0.01417799 0.0015779
 0.00223369 0.00867333 0.00319253 0.00522045 0.01757332 0.0024539
 0.01931648 0.0070597  0.01958095 0.00420101 0.02251321 0.01490762
 0.01721883 0.00427607 0.00622976 0.00460176 0.01115387 0.0062367
 0.00754417 0.00505967 0.00362729 0.01446674 0.00613876 0.00265059
 0.00961149 0.00165074 0.00144998 0.00248073 0.00279879 0.00209277
 0.00435295 0.00309849 0.00116285 0.00

In [452]:
# adding y_pred_01 to test_data
new_test_data['win_pred_team_id'] = y_pred_01

In [453]:
# adding winning probability to test_data
new_test_data['win_pred_score'] = win_prob

In [454]:
new_test_data.head(10)

Unnamed: 0,team1_id,team2_id,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last10,team1_average_score,team2_average_score,win_percentage_team1_last_15,win_percentage_team2_last_15,match_id,toss_winner,toss_decision,win_pred_team_id,win_pred_score
0,28594,90,11,13,58,0,49,1,3226,0.0,0.019608,0.0,0.0,0.0,0.580853,0.443698,0.962702,0.439479,1.313131,0.0,141.0,144.5,0.0,50.0,9250275,104,1,90,0.002488
1,36084,36098,59,70,89,2,11,1,13915,0.615385,0.344262,151.285714,66.67,153.5,1.089279,1.091648,0.840087,0.961859,0.787592,66.67,156.3,148.125,42.857143,46.666667,9262189,40,1,36084,0.149814
2,30428,48334,8,58,40,2,18,1,2764,0.842105,0.753086,171.066667,100.0,179.625,1.080138,0.955992,0.771592,0.993761,0.900654,100.0,168.142857,159.4,46.666667,77.777778,9128776,82,0,30428,0.067051
3,36098,36112,23,38,195,2,11,3,11892,0.285714,1.487805,166.0,75.0,155.5,0.871249,0.848296,0.845522,0.870841,0.731058,75.0,148.125,166.8,53.333333,40.0,9586919,92,1,36112,0.075065
4,30414,48341,46,71,34,2,18,1,18752,2.375,0.31033,169.933333,0.0,164.125,1.107382,1.166437,0.963643,1.001825,0.946688,0.0,165.307692,167.916667,46.666667,66.666667,9128538,31,1,48341,0.395569
5,30407,30435,57,20,166,2,18,3,7118,0.4,0.672131,177.066667,50.0,184.5,0.941093,0.995272,1.001928,0.934268,0.73699,50.0,172.4,169.307692,46.666667,40.0,9516653,57,1,30407,0.15345
6,36014,30414,45,23,161,2,18,3,2050,0.681818,0.344262,183.466667,25.0,156.35,0.921257,1.034135,0.921563,0.955612,0.797946,25.0,170.0,165.307692,40.0,53.333333,9516534,12,1,30414,0.160355
7,9701,6698,81,81,57,2,50,1,1042,3.2,0.60396,163.066667,0.0,177.321429,0.937888,1.114808,0.63588,0.685779,1.635887,0.0,159.5,165.857143,66.666667,66.666667,9085474,19,1,9701,0.074076
8,7727,7573,82,76,184,1,50,3,8301,2.428571,2.904762,167.0,100.0,151.6,1.301483,1.165019,1.046915,0.940972,1.415819,100.0,158.0,159.3125,80.0,40.0,9433752,30,1,7727,0.058673
9,30428,36014,67,42,153,1,18,3,13880,1.727273,21.0,171.066667,66.67,186.5,1.161314,1.008119,0.596095,1.011756,1.146592,66.67,168.142857,170.0,46.666667,33.333333,9516359,89,1,36014,0.058964


## Filling submission 1 csv

In [455]:
submission1=pd.read_csv('sample_template/submission_template_file1.csv')

In [456]:
submission1.head(2)

Unnamed: 0,match id,dataset_type,win_pred_team_id,win_pred_score,train_algorithm,is_ensemble,train_hps_trees,train_hps_depth,train_hps_lr,indep_feat_id1,indep_feat_id2,indep_feat_id3,indep_feat_id4,indep_feat_id5,indep_feat_id6,indep_feat_id7,indep_feat_id8,indep_feat_id9,indep_feat_id10
0,9250275,r1,,,,,,,,,,,,,,,,,
1,9262189,r1,,,,,,,,,,,,,,,,,


In [457]:
# filling submisssion 1 win_pred_team_id, win_pred_score with new_test_data win_pred_team_id, win_pred_score according to match id

# also filling train_algorithm as xgboost , is_ensemble=no , train_hps_tree=150, train_hps_depth=6, train_hps_lr=0.1 for all rows

for i in range(len(submission1)):
    match_id = submission1['match id'][i]
    if match_id not in new_test_data['match_id'].values:
        continue
    new_test_data_row = new_test_data[new_test_data['match_id'] == match_id]
    submission1.at[i, 'win_pred_team_id'] = new_test_data_row['win_pred_team_id'].values[0]
    submission1.at[i, 'win_pred_score'] = new_test_data_row['win_pred_score'].values[0]
    submission1.at[i, 'train_algorithm'] = 'xgboost'
    submission1.at[i, 'is_ensemble'] = 'no'
    submission1.at[i, 'train_hps_trees'] = 150
    submission1.at[i, 'train_hps_depth'] = 6
    submission1.at[i, 'train_hps_lr'] = 0.1

  submission1.at[i, 'train_algorithm'] = 'xgboost'
  submission1.at[i, 'is_ensemble'] = 'no'


In [458]:
submission1.head(10)

Unnamed: 0,match id,dataset_type,win_pred_team_id,win_pred_score,train_algorithm,is_ensemble,train_hps_trees,train_hps_depth,train_hps_lr,indep_feat_id1,indep_feat_id2,indep_feat_id3,indep_feat_id4,indep_feat_id5,indep_feat_id6,indep_feat_id7,indep_feat_id8,indep_feat_id9,indep_feat_id10
0,9250275,r1,90.0,0.002488,xgboost,no,150.0,6.0,0.1,,,,,,,,,,
1,9262189,r1,36084.0,0.149814,xgboost,no,150.0,6.0,0.1,,,,,,,,,,
2,9128776,r1,30428.0,0.067051,xgboost,no,150.0,6.0,0.1,,,,,,,,,,
3,9586919,r1,36112.0,0.075065,xgboost,no,150.0,6.0,0.1,,,,,,,,,,
4,9128538,r1,48341.0,0.395569,xgboost,no,150.0,6.0,0.1,,,,,,,,,,
5,9516653,r1,30407.0,0.15345,xgboost,no,150.0,6.0,0.1,,,,,,,,,,
6,9516534,r1,30414.0,0.160355,xgboost,no,150.0,6.0,0.1,,,,,,,,,,
7,9085474,r1,9701.0,0.074076,xgboost,no,150.0,6.0,0.1,,,,,,,,,,
8,9433752,r1,7727.0,0.058673,xgboost,no,150.0,6.0,0.1,,,,,,,,,,
9,9516359,r1,36014.0,0.058964,xgboost,no,150.0,6.0,0.1,,,,,,,,,,


## Now genearting for train data

In [459]:
new_train_data.head(2)

Unnamed: 0,team1_id,team2_id,winner_id,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last10,team1_average_score,team2_average_score,win_percentage_team1_last_15,win_percentage_team2_last_15,match_id,toss_winner,toss_decision
0,11283,12634,12634,44,55,212,1,58,2,7398,1.666667,0.672131,139.0,100.0,157.178571,2.028272,2.611098,4.638805,2.351068,1.710894,100.0,137.666667,116.5,60.0,63.636364,9331181,44,1
1,20,41,20,53,13,20,1,22,0,1406,1.285714,1.952381,156.0,50.0,103.5,1.024794,0.966568,0.797703,0.80708,0.374242,50.0,157.730769,157.791667,60.0,40.0,8797060,138,1


In [460]:
train_team1_ids = new_train_data['team1_id']
train_team2_ids = new_train_data['team2_id']

In [461]:
# scalling and x train and y train
X_train = new_train_data.drop(['winner_id'], axis=1)
y_train = new_train_data['winner_id']

# scalling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)


In [464]:
# Get the predicted probabilities
cat_pred_proba = cat_model.predict_proba(X_train)

# Determine the predicted class based on the higher probability
cat_pred = [1 if proba[1] > proba[0] else 0 for proba in cat_pred_proba]

# Store the probabilities of predicting each team in a list
probabilities_list = [proba[1] for proba in cat_pred_proba]

In [465]:
cat_pred.shape

(948, 139)

In [None]:
y_pred_01=cat_pred
win_prob=cat_pred_proba

In [466]:


# # Compare probabilities and assign the team with higher probability
# y_pred_01 = []
# win_prob=[]
# for i in range(len(cat_preds)):
#     team1_id = train_team1_ids[i]
#     team2_id = train_team2_ids[i]
    
#     team1_index = class_to_index.get(team1_id, -1)
#     team2_index = class_to_index.get(team2_id, -1)
    
#     if team1_index == -1 and team2_index == -1:
#         y_pred_01.append(team1_id)
#         win_prob.append(0)
        
#         continue
#         # raise ValueError(f"Team ID {team1_id} or {team2_id} not found in class_to_index mapping.")
    
#     team1_prob = cat_preds[i][team1_index]  # Probability for team1_id
#     team2_prob = cat_preds[i][team2_index]  # Probability for team2_id
    
#     if team1_prob > team2_prob:
#         y_pred_01.append(team1_id)  # team1_id
#         win_prob.append(team1_prob)
#     else:
#         y_pred_01.append(team2_id)  # team2_id
#         win_prob.append(team2_prob)

In [467]:
# printing winning team along with their winning probability
for i in range(len(y_pred_01)):
    print(f"Match {i+1}: {y_pred_01[i]} with probability {win_prob[i]}")

Match 1: 12634 with probability 0.3842836684460101
Match 2: 41 with probability 0.061410893316625456
Match 3: 10576 with probability 0.17073534908399277
Match 4: 36084 with probability 0.3411835815321204
Match 5: 48341 with probability 0.5102771388722365
Match 6: 40424 with probability 0.27686528503899593
Match 7: 55 with probability 0.12077807178680723
Match 8: 47501 with probability 0.7433970273018495
Match 9: 12634 with probability 0.46755215367743075
Match 10: 55 with probability 0.10112473891293597
Match 11: 30435 with probability 0.18203712349967516
Match 12: 30435 with probability 0.29154511558552626
Match 13: 36014 with probability 0.4644284582133825
Match 14: 8987 with probability 0.2837689732908093
Match 15: 10618 with probability 0.539608788128553
Match 16: 30393 with probability 0.2674183469358147
Match 17: 33956 with probability 0.2863925099153736
Match 18: 188 with probability 0.7123914344942982
Match 19: 7727 with probability 0.6145194769312922
Match 20: 251 with probabi

In [468]:
len(win_prob)

948

In [469]:
new_train_data['win_pred_team_id'] = y_pred_01

In [470]:
# adding winning probability to test_data
new_train_data['win_pred_score'] = win_prob

In [471]:
new_train_data.head(5)

Unnamed: 0,team1_id,team2_id,winner_id,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last10,team1_average_score,team2_average_score,win_percentage_team1_last_15,win_percentage_team2_last_15,match_id,toss_winner,toss_decision,win_pred_team_id,win_pred_score
0,11283,12634,12634,44,55,212,1,58,2,7398,1.666667,0.672131,139.0,100.0,157.178571,2.028272,2.611098,4.638805,2.351068,1.710894,100.0,137.666667,116.5,60.0,63.636364,9331181,44,1,12634,0.384284
1,20,41,20,53,13,20,1,22,0,1406,1.285714,1.952381,156.0,50.0,103.5,1.024794,0.966568,0.797703,0.80708,0.374242,50.0,157.730769,157.791667,60.0,40.0,8797060,138,1,41,0.061411
2,10576,8987,10576,111,83,370,1,79,3,251,0.857143,0.672131,173.266667,0.0,154.333333,1.000138,0.909712,0.85019,0.717828,1.024538,0.0,166.944444,149.588235,40.0,46.666667,9433269,81,1,10576,0.170735
3,36084,36070,36084,10,105,434,0,18,3,14300,2.166667,1.97561,164.266667,50.0,144.25,1.116102,1.052218,0.813452,0.878287,0.700727,50.0,156.3,177.076923,60.0,66.666667,9587073,34,0,36084,0.341184
4,30407,48341,48341,75,25,331,2,35,3,7118,0.818182,1.327869,164.666667,0.0,189.0,1.035394,1.240634,1.084338,1.168234,1.480742,0.0,172.4,167.916667,53.333333,69.230769,9516457,40,1,48341,0.510277


In [472]:
for i in range(len(submission1)):
    match_id = submission1['match id'][i]
    if match_id not in new_train_data['match_id'].values:
        continue
    new_train_data_row = new_train_data[new_train_data['match_id'] == match_id]
    submission1.at[i, 'win_pred_team_id'] = new_train_data_row['win_pred_team_id'].values[0]
    submission1.at[i, 'win_pred_score'] = new_train_data_row['win_pred_score'].values[0]
    submission1.at[i, 'train_algorithm'] = 'xgboost'
    submission1.at[i, 'is_ensemble'] = 'no'
    submission1.at[i, 'train_hps_trees'] = 150
    submission1.at[i, 'train_hps_depth'] = 6
    submission1.at[i, 'train_hps_lr'] = 0.1

## CSV-2 Generation

In [474]:
cols = train_data.columns[-13:-3]

In [477]:
# Get feature importance
importance = cat_model.get_feature_importance(type='PredictionValuesChange')

# Create a dictionary of feature importances
ft_dict = {f'f{i}': importance[i] for i in range(len(importance))}

print(ft_dict)


{'f0': 8.738610713772788, 'f1': 6.533675918132046, 'f2': 4.5628178028630035, 'f3': 5.706677049777389, 'f4': 1.3362018680073653, 'f5': 3.076096750857818, 'f6': 11.38695873719113, 'f7': 3.77121348366895, 'f8': 2.577253238393578, 'f9': 2.1412905898622068, 'f10': 3.1201318002534526, 'f11': 3.6491102060880345, 'f12': 0.8679927689152669, 'f13': 3.46385201335951, 'f14': 1.152223466364283, 'f15': 1.075891463737371, 'f16': 2.9658326524841216, 'f17': 1.2767127718761329, 'f18': 1.6370708827740423, 'f19': 2.062825608707715, 'f20': 7.8967643148409215, 'f21': 8.733109813605978, 'f22': 2.4815169604145857, 'f23': 3.138713805889004, 'f24': 1.437612210779906, 'f25': 3.665316951730414, 'f26': 1.5445261556530194}


In [478]:
ft_dict

{'f0': 8.738610713772788,
 'f1': 6.533675918132046,
 'f2': 4.5628178028630035,
 'f3': 5.706677049777389,
 'f4': 1.3362018680073653,
 'f5': 3.076096750857818,
 'f6': 11.38695873719113,
 'f7': 3.77121348366895,
 'f8': 2.577253238393578,
 'f9': 2.1412905898622068,
 'f10': 3.1201318002534526,
 'f11': 3.6491102060880345,
 'f12': 0.8679927689152669,
 'f13': 3.46385201335951,
 'f14': 1.152223466364283,
 'f15': 1.075891463737371,
 'f16': 2.9658326524841216,
 'f17': 1.2767127718761329,
 'f18': 1.6370708827740423,
 'f19': 2.062825608707715,
 'f20': 7.8967643148409215,
 'f21': 8.733109813605978,
 'f22': 2.4815169604145857,
 'f23': 3.138713805889004,
 'f24': 1.437612210779906,
 'f25': 3.665316951730414,
 'f26': 1.5445261556530194}

In [479]:
sorted_dict = {}
for key in sorted(ft_dict, key=ft_dict.get):
    sorted_dict[key] = ft_dict[key]
sorted_dict

{'f12': 0.8679927689152669,
 'f15': 1.075891463737371,
 'f14': 1.152223466364283,
 'f17': 1.2767127718761329,
 'f4': 1.3362018680073653,
 'f24': 1.437612210779906,
 'f26': 1.5445261556530194,
 'f18': 1.6370708827740423,
 'f19': 2.062825608707715,
 'f9': 2.1412905898622068,
 'f22': 2.4815169604145857,
 'f8': 2.577253238393578,
 'f16': 2.9658326524841216,
 'f5': 3.076096750857818,
 'f10': 3.1201318002534526,
 'f23': 3.138713805889004,
 'f13': 3.46385201335951,
 'f11': 3.6491102060880345,
 'f25': 3.665316951730414,
 'f7': 3.77121348366895,
 'f2': 4.5628178028630035,
 'f3': 5.706677049777389,
 'f1': 6.533675918132046,
 'f20': 7.8967643148409215,
 'f21': 8.733109813605978,
 'f0': 8.738610713772788,
 'f6': 11.38695873719113}

In [480]:
sorted_list = list(sorted_dict.keys())

In [481]:
feat = pd.read_csv('sample_template/submission_template_file2.csv')

In [482]:
feat['feat_id'] = np.arange(1, 11)

In [483]:
feat['feat_name'] = cols

In [484]:
feat

Unnamed: 0,feat_id,feat_name,feat_description,model_feat_imp_train,feat_rank_train
0,1,bowlers_strike_rates_ratios,,,
1,2,bowlers_economy_ratios,,,
2,3,bowlers_extras_ratios,,,
3,4,team1_winp_team2_last10,,,
4,5,team1_average_score,,,
5,6,team2_average_score,,,
6,7,win_percentage_team1_last_15,,,
7,8,win_percentage_team2_last_15,,,
8,9,match_id,,,
9,10,toss_winner,,,


In [485]:
for i in range(10):
    feat['model_feat_imp_train'][i] = ft_dict[f'f{14+i}']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat['model_feat_imp_train'][i] = ft_dict[f'f{14+i}']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat['model_feat_imp_train'][i] = ft_dict[f'f{14+i}']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat['model_feat_imp_train'][i] = ft_dict[f'f{14+i}']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat['

In [486]:
filtered_feats = []

for i in range(10):
    filtered_feats.append(f'f{14+i}')

In [487]:
filtered_feats

['f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23']

In [488]:
for i in range(10):
    feat['feat_rank_train'][i] = sorted_list.index(filtered_feats[i]) + 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat['feat_rank_train'][i] = sorted_list.index(filtered_feats[i]) + 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat['feat_rank_train'][i] = sorted_list.index(filtered_feats[i]) + 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat['feat_rank_train'][i] = sorted_list.index(filtered_feats[i]) + 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/index

In [489]:
feat.to_csv('sample_template/submission_file2-2.csv')

In [490]:
submission1.to_csv('sample_template/submission_file1-2.csv', index=False)

In [310]:
# train_check=pd.read_csv('testing/train_data.csv')

In [311]:
# # checking count of matchings between y_pred_01 and winner_id
# count=0
# for i in range(len(y_pred_01)):
#     if y_pred_01[i] == train_check['winner_id'][i]:
#         count+=1
# print(count)

689
