## Training and Testing Pipeline

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
train_data = pd.read_csv('dataset/train_data.csv')
test_data = pd.read_csv('dataset/test_data.csv')

In [3]:
# Display options
pd.set_option('display.max_columns',50)

In [4]:
train_data.shape
train_data.head(2)

(948, 23)

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,winner,winner_id,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15
0,9331181,Ba,11283,9373356.0:7857520.0:4232164.0:4566540.0:329940...,Hl Ph,12634,3500958.0:4231751.0:2735081.0:2035102.0:369833...,Hl Ph,12634,Hl Ph,field,Hr Ct Sm Ie,Indore,2022-10-20,day/night match,Sd Mq Ai Ty,2022/23,7398,1.666667,0.672131,139.0,100.0,157.178571
1,8797060,Ed,20,2089079.0:6139370.0:2076192.0:62432.0:2083409....,Wt Is,41,4690258.0:4069666.0:4230127.0:1942317.0:161392...,Ed,20,Wt Is,field,Kn Ol Bn Bs,Bridgetown,2022-01-23,day/night match,Ed tr of Wt Is,2021/22,1406,1.285714,1.952381,156.0,50.0,103.5


In [5]:
test_data.shape
test_data.head(2)

(271, 21)

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15
0,9250275,Jy,28594,7438955.0:8271969.0:8369661.0:3685247.0:259025...,Ud Ss of Aa,90,2436944.0:8117500.0:6703528.0:3010748.0:161387...,Ud Ss of Aa,field,Bo Ac Cb,Bulawayo,11-07-22,day match,Ud Ss of Aa tr of Ze,2022,3226,0.0,0.019608,,0.0,
1,9262189,Ga An Ws,36084,3715690.0:6818622.0:4069666.0:393014.0:4690188...,Ja Ts,36098,4690258.0:3761246.0:8464385.0:5742470.0:315072...,Ja Ts,field,Pe Sm Ga,Providence,21-09-22,night match,Cn Pr Le,2022,13915,0.615385,0.344262,151.285714,66.67,153.5


# Calculating batsmen features

In [6]:
batsmen=pd.read_csv('dataset/batsman_level_scorecard.csv')

In [7]:
batsmen.head(2)

Unnamed: 0,match id,batsman,batsman_id,batsman_details,is_batsman_captain,is_batsman_keeper,inning,runs,balls_faced,over_faced_first,wicket kind,out_by_bowler,out_by_fielder,bowler_id,bowler_details,is_bowler_keeper,is_bowler_captain,strike_rate,Fours,Sixes,match_dt
0,8638034,KD Ce,7907451.0,NZ:Right-hand bat:Right-arm medium-fast:,0.0,0.0,1,7,5,1.1,caught,JS Nm,TM Jn,2486896.0,NZ:Left-hand bat:Right-arm medium-fast:,0.0,0.0,140.0,1.0,,2021-01-01
1,8638034,TL St,4381761.0,NZ:Right-hand bat:None:,0.0,1.0,1,46,46,1.2,caught,R Ra,MG Bl,6718382.0,NZ:Left-hand bat:Slow left-arm orthodox:,0.0,0.0,100.0,4.0,1.0,2021-01-01


In [8]:
# calculating 2 dictionaries to store {batsman_id: batsman average } and {batsman_id: batsman strike rate}

total_runs = {}  
balls_faced = {}
matches_played = {}

for i in range(len(batsmen)):
    batsman_id = batsmen['batsman_id'][i]
    runs = batsmen['runs'][i]
    balls = batsmen['balls_faced'][i]
    match_id = batsmen['match id'][i]
    if batsman_id in total_runs:
        total_runs[batsman_id] += runs
        balls_faced[batsman_id] += balls
        matches_played[batsman_id] += 1
    else:
        total_runs[batsman_id] = runs
        balls_faced[batsman_id] = balls
        matches_played[batsman_id] = 1
        
batsman_avg = {}
batsman_sr = {}

for batsman_id in total_runs:
    batsman_avg[batsman_id] = total_runs[batsman_id] / matches_played[batsman_id]
    batsman_sr[batsman_id] = total_runs[batsman_id] / balls_faced[batsman_id] * 100
 

# Bowler level features

In [9]:
bowlers=pd.read_csv('dataset/bowler_level_scorecard.csv')

In [10]:
bowlers.head()

Unnamed: 0,match id,bowler,bowler_id,bowler_details,is_bowler_captain,is_bowler_keeper,inning,runs,wicket_count,balls_bowled,economy,maiden,dots,Fours,Sixes,wides,noballs,match_dt
0,8638034,BV Ss,6718396.0,NZ:Right-hand bat:Right-arm medium-fast:,0.0,0.0,1,21,0,12,10.5,0,3.0,1,1,3,1,2021-01-01
1,8638034,HK Bt,1585464.0,NZ:Left-hand bat:Right-arm medium-fast:,0.0,0.0,1,34,1,24,8.5,0,7.0,3,1,2,0,2021-01-01
2,8638034,JS Nm,2486896.0,NZ:Left-hand bat:Right-arm medium-fast:,0.0,0.0,1,23,2,24,5.75,0,12.0,1,1,0,0,2021-01-01
3,8638034,LV vn Bk,3083667.0,NED:Right-hand bat:Right-arm medium-fast:,0.0,0.0,1,31,1,24,7.75,0,9.0,4,0,2,0,2021-01-01
4,8638034,PF Yd,4950294.0,NZ:Right-hand bat:Legbreak googly:,0.0,0.0,1,21,1,24,5.25,0,8.0,0,0,1,0,2021-01-01


In [11]:
# calculating 2 dictionaries to store {bowler_id: bowler strike rate } , {bowler_id: bowler economy} , {bowler_id: bowler extras rate}

total_matches = {}
total_wickets = {}
total_ball_bowled = {}
total_extras = {}
total_economy = {}

for i in range(len(bowlers)):
    bowler_id = bowlers['bowler_id'][i]
    wickets = bowlers['wicket_count'][i]
    balls = bowlers['balls_bowled'][i]
    runs = bowlers['runs'][i]
    extras = bowlers['wides'][i] + bowlers['noballs'][i]
    match_id = bowlers['match id'][i]
    economy=bowlers['economy'][i]
    if bowler_id in total_matches:
        total_matches[bowler_id] += 1
        total_wickets[bowler_id] += wickets
        total_ball_bowled[bowler_id] += balls
        total_extras[bowler_id] += extras
        total_economy[bowler_id] += economy
    else:
        total_matches[bowler_id] = 1
        total_wickets[bowler_id] = wickets
        total_ball_bowled[bowler_id] = balls
        total_extras[bowler_id] = extras
        total_economy[bowler_id] = economy
        
bowler_sr = {}
bowler_economy = {}
bowler_extras = {}

for bowler_id in total_matches:
    bowler_economy[bowler_id] = total_economy[bowler_id] / total_matches[bowler_id]
    bowler_extras[bowler_id] = total_extras[bowler_id] / total_ball_bowled[bowler_id]
    if total_wickets[bowler_id] == 0:
        bowler_sr[bowler_id] = 0
    else:
        bowler_sr[bowler_id] = total_ball_bowled[bowler_id] /total_wickets[bowler_id]
    

## Match level features

In [12]:
match=pd.read_csv('dataset/match_level_scorecard.csv')

In [13]:
match.head()

Unnamed: 0,match id,team1,team2,winner,by,win amount,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,umpire1,umpire2,inning1_runs,inning1_wickets,inning1_balls,inning2_runs,inning2_wickets,inning2_balls,team1_id,team1_roster_ids,team2_id,team2_roster_ids,series_type,winner_id,player_of_the_match_id
0,8638034,Nn Ds,Wn,Wn,wickets,9.0,Wn,field,By Ol,Mount Maunganui,2021-01-01,night match,Sr Sh,2020/21,17681,Dn Mw,TJ Pe,148,6,130,152.0,1.0,97.0,17982,7907451.0:4381761.0:31464.0:258649.0:4949790.0...,18570,2653993.0:6718326.0:6718382.0:2486896.0:228878...,other_domestic,18570,
1,8588005,Me Rs,Sy Tr,Sy Tr,runs,7.0,Sy Tr,field,Ca Ol,Carrara,2021-01-01,night match,Bg Bh Le,2020/21,6663,D Tr,PJ Ge,166,6,106,117.0,2.0,74.0,33942,37351.0:46794.0:5406540.0:2231928.0:181404.0:1...,33963,1506098.0:1749075.0:36665.0:2083409.0:7534652....,other_domestic,33963,1749075.0
2,8587837,Sy Ss,Be Ht,Be Ht,wickets,4.0,Sy Ss,bat,Be Ct Gd,Brisbane,2021-01-02,night match,Bg Bh Le,2020/21,1476,GJ Dn,PJ Ge,165,8,126,171.0,6.0,119.0,33956,7869987.0:7620283.0:2076192.0:4002340.0:306369...,33921,7620269.0:2286437.0:87191.0:5786766.0:3114803....,other_domestic,33921,3890984.0
3,8638041,Nn Ds,Oo,Oo,wickets,2.0,Oo,field,By Ol,Mount Maunganui,2021-01-02,night match,Sr Sh,2020/21,17681,GA St,TJ Pe,153,7,128,156.0,8.0,126.0,17982,7907451.0:4381761.0:31464.0:4949790.0:258649.0...,18360,2319638.0:256080.0:7918280.0:3913447.0:2690498...,other_domestic,18360,
4,8587921,Ht Hs,Me Ss,Ht Hs,runs,21.0,Me Ss,field,Be Ol,Hobart,2021-01-02,day/night match,Bg Bh Le,2020/21,6348,GA Ad,NR Je,164,5,126,143.0,9.0,123.0,33928,4223883.0:2161599.0:1655436.0:5788418.0:319948...,33949,363047.0:2275097.0:3901078.0:2275195.0:4230127...,other_domestic,33928,3125849.0


In [14]:
# # calculating 2 dictionaries to store {team_id: team average }
#  if team1 has won toss and choose bat then inning1 score will be team1 score and inning2 score will be team2 score

total_runs = {}
total_matches={}
for i in range(len(match)):
    
    toss_winner = match['toss winner'][i]
    team2=toss_winner
    
    if toss_winner == match['team2_id'][i]:
        team1 = match['team1_id'][i]
    else:
        team1 = match['team2_id'][i]
    
    if match['toss decision'][i] == 'bat':
        team2_score=match['inning1_runs'][i]
        team1_score=match['inning2_runs'][i]
    
    else:
        team1_score=match['inning1_runs'][i]
        team2_score=match['inning2_runs'][i]
    
    if team1 in total_runs:
        total_runs[team1] += team1_score
        total_matches[team1] += 1
    
    else:
        total_runs[team1] = team1_score
        total_matches[team1] = 1
        
    if team2 in total_runs:
        total_runs[team2] += team2_score
        total_matches[team2] += 1
    
    else:
        total_runs[team2] = team2_score
        total_matches[team2] = 1
        
team_avg = {}

for team_id in total_runs:
    team_avg[team_id] = total_runs[team_id] / total_matches[team_id]

In [15]:
def winpCrossLastn(team1_id, team2_id, date, n):
    '''
    Function to compute team1's win% against team2 from the current game in their past n encounters.
    
    Input-
    1. team1_id: ID of team1 to calculate win% of.
    2. team2_id: ID of team2 to calculate win% against.
    3: date: match date of the current game for which the feature is to be calculated.
    4. n: look-back window of games for both these teams.
    
    Output- None
    
    Returns- Float value denoting team1's win% against team2 in their past n games against each other.
    '''
    # filter out games where either team1_id is input team1 and team2_id is input team2, or where team2_id is input team1 and team1_id is input team2.
    # Also, match date is less than current games's input date, sort desc by date and get top n rows (games)
    df_rel = match[(match['match_dt']<date)&\
                      (((match['team1_id']==team1_id)&(match['team2_id']==team2_id))|((match['team1_id']==team2_id)&(match['team2_id']==team1_id)))]\
                        .sort_values(by='match_dt', ascending=False).head(n)
    win_count = df_rel[df_rel['winner_id']==team1_id].shape[0] # Counting number of rows (games) where winner is input team1.
    if win_count == 0:
        return 0
    return round(win_count*100/df_rel.shape[0], 2) # return Float denoting team1's win% against team2 in past n games rounded to 2 decimal places.


In [16]:
def win_percentage_last_n_matches(team_id, date,n):
    '''
    Function to calculate a team's win percentage in their last n games.

    '''
    # Filter games where team1 or team2 is the input team_id and the match date is before the input date
    df_rel = match[(match['match_dt'] < date) &
                            ((match['team1_id'] == team_id) | (match['team2_id'] == team_id))]\
                            .sort_values(by='match_dt', ascending=False).head(n)
    
    # Calculate the number of wins
    wins = df_rel[(df_rel['winner_id'] == team_id)].shape[0]
    
    # Calculate the number of matches
    total_matches = df_rel.shape[0]
    
    if total_matches == 0:
        return 0.0  # Avoid division by zero
    
    # Calculate win percentage
    win_percentage = (wins / total_matches) * 100
    return win_percentage


In [17]:
train_data.head(2)

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,winner,winner_id,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15
0,9331181,Ba,11283,9373356.0:7857520.0:4232164.0:4566540.0:329940...,Hl Ph,12634,3500958.0:4231751.0:2735081.0:2035102.0:369833...,Hl Ph,12634,Hl Ph,field,Hr Ct Sm Ie,Indore,2022-10-20,day/night match,Sd Mq Ai Ty,2022/23,7398,1.666667,0.672131,139.0,100.0,157.178571
1,8797060,Ed,20,2089079.0:6139370.0:2076192.0:62432.0:2083409....,Wt Is,41,4690258.0:4069666.0:4230127.0:1942317.0:161392...,Ed,20,Wt Is,field,Kn Ol Bn Bs,Bridgetown,2022-01-23,day/night match,Ed tr of Wt Is,2021/22,1406,1.285714,1.952381,156.0,50.0,103.5


In [18]:
test_data.head(2)

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15
0,9250275,Jy,28594,7438955.0:8271969.0:8369661.0:3685247.0:259025...,Ud Ss of Aa,90,2436944.0:8117500.0:6703528.0:3010748.0:161387...,Ud Ss of Aa,field,Bo Ac Cb,Bulawayo,11-07-22,day match,Ud Ss of Aa tr of Ze,2022,3226,0.0,0.019608,,0.0,
1,9262189,Ga An Ws,36084,3715690.0:6818622.0:4069666.0:393014.0:4690188...,Ja Ts,36098,4690258.0:3761246.0:8464385.0:5742470.0:315072...,Ja Ts,field,Pe Sm Ga,Providence,21-09-22,night match,Cn Pr Le,2022,13915,0.615385,0.344262,151.285714,66.67,153.5


# Combining the features of dataset

In [19]:
# from geopy.geocoders import Nominatim

# # Initialize Nominatim API
# geolocator = Nominatim(user_agent="my_geocoder")

# Creating new columns in train_data and initializing with 0
train_data['batsmens_averages_1'] = 0
train_data['batsmens_strike_rates_1'] = 0
train_data['bowlers_strike_rates_1'] = 0
train_data['bowlers_economy_1'] = 0
train_data['bowlers_extras_1'] = 0

train_data['batsmens_averages_2'] = 0
train_data['batsmens_strike_rates_2'] = 0
train_data['bowlers_strike_rates_2'] = 0
train_data['bowlers_economy_2'] = 0
train_data['bowlers_extras_2'] = 0

train_data['batsmens_averages_ratios'] = 0
train_data['batsmens_strike_rates_ratios'] = 0
train_data['bowlers_strike_rates_ratios'] = 0
train_data['bowlers_economy_ratios'] = 0
train_data['bowlers_extras_ratios'] = 0

train_data['team1_winp_team2_last10']=0
train_data['team1_average_score'] = 0
train_data['team2_average_score'] = 0
train_data['win_percentage_team1_last_15'] = 0
train_data['win_percentage_team2_last_15'] = 0


# train_data['location_humidity'] = 0
# train_data['location_temperature'] = 0

# Filling the values in the new columns
for i in range(len(train_data)):
    # Getting ids of batsmen and bowlers from team roster id separated by :
    team1_players_ids = train_data['team1_roster_ids'][i].split(':')
    team2_players_ids = train_data['team2_roster_ids'][i].split(':')
    # print(i,team1_players_ids,team2_players_ids)
    # For team1 check if id is present in the dictionary and add the values
    for player_id in team1_players_ids:
        player_id = int(float(player_id))
        if player_id in batsman_avg:
            train_data.at[i, 'batsmens_averages_1'] += batsman_avg[player_id]
            train_data.at[i, 'batsmens_strike_rates_1'] += batsman_sr[player_id]
        if player_id in bowler_sr:
            train_data.at[i, 'bowlers_strike_rates_1'] += bowler_sr[player_id]
            train_data.at[i, 'bowlers_economy_1'] += bowler_economy[player_id]
            train_data.at[i, 'bowlers_extras_1'] += bowler_extras[player_id]
            
    # For team2 check if id is present in the dictionary and add the values
    for player_id in team2_players_ids:
        player_id = int(float(player_id))
        if player_id in batsman_avg:
            train_data.at[i, 'batsmens_averages_2'] += batsman_avg[player_id]
            train_data.at[i, 'batsmens_strike_rates_2'] += batsman_sr[player_id]
        if player_id in bowler_sr:
            train_data.at[i, 'bowlers_strike_rates_2'] += bowler_sr[player_id]
            train_data.at[i, 'bowlers_economy_2'] += bowler_economy[player_id]
            train_data.at[i, 'bowlers_extras_2'] += bowler_extras[player_id]
    
    # filling ratios
    if train_data['batsmens_averages_2'][i] == 0:
        train_data['batsmens_averages_2'][i] = 1
        
    if train_data['batsmens_strike_rates_2'][i] == 0:
        train_data['batsmens_strike_rates_2'][i] = 1
    if train_data['bowlers_strike_rates_2'][i] == 0:
        train_data['bowlers_strike_rates_2'][i] = 1
        
    if train_data['bowlers_economy_2'][i] == 0:
        train_data['bowlers_economy_2'][i] = 1
        
    if train_data['bowlers_extras_2'][i] == 0:
        train_data['bowlers_extras_2'][i] = 1
        
    
    train_data.at[i, 'batsmens_averages_ratios'] = train_data['batsmens_averages_1'][i] / train_data['batsmens_averages_2'][i]
    train_data.at[i, 'batsmens_strike_rates_ratios'] = train_data['batsmens_strike_rates_1'][i] / train_data['batsmens_strike_rates_2'][i]
    train_data.at[i, 'bowlers_strike_rates_ratios'] = train_data['bowlers_strike_rates_1'][i] / train_data['bowlers_strike_rates_2'][i]
    train_data.at[i, 'bowlers_economy_ratios'] = train_data['bowlers_economy_1'][i] / train_data['bowlers_economy_2'][i]
    train_data.at[i, 'bowlers_extras_ratios'] = train_data['bowlers_extras_1'][i] / train_data['bowlers_extras_2'][i]
    
    
    # filling team win% against each other in last 10 games
    team1_id = train_data['team1_id'][i]
    team2_id = train_data['team2_id'][i]
    date = train_data['match_dt'][i]
    
    train_data.at[i, 'team1_winp_team2_last10'] = winpCrossLastn(team1_id, team2_id, date, 10)

            
    # filling team average scores
    team1_id = train_data['team1_id'][i]
    team2_id = train_data['team2_id'][i]
    
    if team1_id in team_avg:
        train_data.at[i, 'team1_average_score'] = team_avg[team1_id]
    else:
        train_data.at[i, 'team1_average_score'] = 0
    if team2_id in team_avg:
        train_data.at[i, 'team2_average_score'] = team_avg[team2_id]
    else:
        train_data.at[i, 'team2_average_score'] = 0
        
    # filling win% of team1 in last 15 games from matchscore_updated
    
    train_data.at[i, 'win_percentage_team1_last_15'] = win_percentage_last_n_matches(team1_id, date, 15)
    train_data.at[i, 'win_percentage_team2_last_15'] = win_percentage_last_n_matches(team2_id, date, 15)
    



  train_data.at[i, 'batsmens_averages_1'] += batsman_avg[player_id]
  train_data.at[i, 'batsmens_strike_rates_1'] += batsman_sr[player_id]
  train_data.at[i, 'bowlers_strike_rates_1'] += bowler_sr[player_id]
  train_data.at[i, 'bowlers_economy_1'] += bowler_economy[player_id]
  train_data.at[i, 'bowlers_extras_1'] += bowler_extras[player_id]
  train_data.at[i, 'batsmens_averages_2'] += batsman_avg[player_id]
  train_data.at[i, 'batsmens_strike_rates_2'] += batsman_sr[player_id]
  train_data.at[i, 'bowlers_strike_rates_2'] += bowler_sr[player_id]
  train_data.at[i, 'bowlers_economy_2'] += bowler_economy[player_id]
  train_data.at[i, 'bowlers_extras_2'] += bowler_extras[player_id]
  train_data.at[i, 'batsmens_averages_ratios'] = train_data['batsmens_averages_1'][i] / train_data['batsmens_averages_2'][i]
  train_data.at[i, 'batsmens_strike_rates_ratios'] = train_data['batsmens_strike_rates_1'][i] / train_data['batsmens_strike_rates_2'][i]
  train_data.at[i, 'bowlers_strike_rates_ratios'] 

In [20]:
# dropping team1_roster_ids , team2_roster_ids , winner , team1 , team2 ,  columns 
train_data = train_data.drop(['team1_roster_ids', 'team2_roster_ids', 'winner', 'team1', 'team2','batsmens_averages_1','batsmens_strike_rates_1','bowlers_strike_rates_1','bowlers_economy_1','bowlers_extras_1','batsmens_averages_2','batsmens_strike_rates_2','bowlers_strike_rates_2','bowlers_economy_2','bowlers_extras_2'], axis=1)

In [21]:
train_data.head(2)

Unnamed: 0,match id,team1_id,team2_id,winner_id,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last10,team1_average_score,team2_average_score,win_percentage_team1_last_15,win_percentage_team2_last_15
0,9331181,11283,12634,12634,Hl Ph,field,Hr Ct Sm Ie,Indore,2022-10-20,day/night match,Sd Mq Ai Ty,2022/23,7398,1.666667,0.672131,139.0,100.0,157.178571,2.028272,2.611098,4.638805,2.351068,1.710894,100.0,137.666667,116.5,60.0,63.636364
1,8797060,20,41,20,Wt Is,field,Kn Ol Bn Bs,Bridgetown,2022-01-23,day/night match,Ed tr of Wt Is,2021/22,1406,1.285714,1.952381,156.0,50.0,103.5,1.024794,0.966568,0.797703,0.80708,0.374242,50.0,157.730769,157.791667,60.0,40.0


In [22]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# List of categorical columns to encode
categorical_columns = ['toss winner', 'toss decision', 'venue', 'match_dt', 'lighting', 'series_name', 'season', 'city']

# Initialize the label encoder
label_encoder = LabelEncoder()

# Apply label encoding to each categorical column
for col in categorical_columns:
    train_data[col] = label_encoder.fit_transform(train_data[col])

# Display the first 5 rows of the dataframe
train_data.head(5)


Unnamed: 0,match id,team1_id,team2_id,winner_id,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last10,team1_average_score,team2_average_score,win_percentage_team1_last_15,win_percentage_team2_last_15
0,9331181,11283,12634,12634,44,1,44,55,212,1,58,2,7398,1.666667,0.672131,139.0,100.0,157.178571,2.028272,2.611098,4.638805,2.351068,1.710894,100.0,137.666667,116.5,60.0,63.636364
1,8797060,20,41,20,138,1,53,13,20,1,22,0,1406,1.285714,1.952381,156.0,50.0,103.5,1.024794,0.966568,0.797703,0.80708,0.374242,50.0,157.730769,157.791667,60.0,40.0
2,9433269,10576,8987,10576,81,1,111,83,370,1,79,3,251,0.857143,0.672131,173.266667,0.0,154.333333,1.000138,0.909712,0.85019,0.717828,1.024538,0.0,166.944444,149.588235,40.0,46.666667
3,9587073,36084,36070,36084,34,0,10,105,434,0,18,3,14300,2.166667,1.97561,164.266667,50.0,144.25,1.116102,1.052218,0.813452,0.878287,0.700727,50.0,156.3,177.076923,60.0,66.666667
4,9516457,30407,48341,48341,40,1,75,25,331,2,35,3,7118,0.818182,1.327869,164.666667,0.0,189.0,1.035394,1.240634,1.084338,1.168234,1.480742,0.0,172.4,167.916667,53.333333,69.230769


In [23]:
# filling nan values with 0

train_data.fillna(0, inplace=True)

In [24]:
# changing column names of match id to match_id , toss winner to toss_winner , toss decision to toss_decision

train_data['match_id'] = train_data['match id']
train_data['toss_winner'] = train_data['toss winner']
train_data['toss_decision'] = train_data['toss decision']

train_data = train_data.drop(['match id', 'toss winner', 'toss decision'], axis=1)

## For test data also

In [25]:
# from geopy.geocoders import Nominatim

# # Initialize Nominatim API
# geolocator = Nominatim(user_agent="my_geocoder")

# Creating new columns in test_data and initializing with 0
test_data['batsmens_averages_1'] = 0
test_data['batsmens_strike_rates_1'] = 0
test_data['bowlers_strike_rates_1'] = 0
test_data['bowlers_economy_1'] = 0
test_data['bowlers_extras_1'] = 0

test_data['batsmens_averages_2'] = 0
test_data['batsmens_strike_rates_2'] = 0
test_data['bowlers_strike_rates_2'] = 0
test_data['bowlers_economy_2'] = 0
test_data['bowlers_extras_2'] = 0

test_data['batsmens_averages_ratios'] = 0
test_data['batsmens_strike_rates_ratios'] = 0
test_data['bowlers_strike_rates_ratios'] = 0
test_data['bowlers_economy_ratios'] = 0
test_data['bowlers_extras_ratios'] = 0

test_data['team1_winp_team2_last10']=0
test_data['team1_average_score'] = 0
test_data['team2_average_score'] = 0
test_data['win_percentage_team1_last_15'] = 0
test_data['win_percentage_team2_last_15'] = 0


# test_data['location_humidity'] = 0
# test_data['location_temperature'] = 0

# Filling the values in the new columns
for i in range(len(test_data)):
    # Getting ids of batsmen and bowlers from team roster id separated by :
    team1_players_ids = test_data['team1_roster_ids'][i].split(':')
    team2_players_ids = test_data['team2_roster_ids'][i].split(':')
    # print(i,team1_players_ids,team2_players_ids)
    # For team1 check if id is present in the dictionary and add the values
    for player_id in team1_players_ids:
        player_id = int(float(player_id))
        if player_id in batsman_avg:
            test_data.at[i, 'batsmens_averages_1'] += batsman_avg[player_id]
            test_data.at[i, 'batsmens_strike_rates_1'] += batsman_sr[player_id]
        if player_id in bowler_sr:
            test_data.at[i, 'bowlers_strike_rates_1'] += bowler_sr[player_id]
            test_data.at[i, 'bowlers_economy_1'] += bowler_economy[player_id]
            test_data.at[i, 'bowlers_extras_1'] += bowler_extras[player_id]
            
    # For team2 check if id is present in the dictionary and add the values
    for player_id in team2_players_ids:
        player_id = int(float(player_id))
        if player_id in batsman_avg:
            test_data.at[i, 'batsmens_averages_2'] += batsman_avg[player_id]
            test_data.at[i, 'batsmens_strike_rates_2'] += batsman_sr[player_id]
        if player_id in bowler_sr:
            test_data.at[i, 'bowlers_strike_rates_2'] += bowler_sr[player_id]
            test_data.at[i, 'bowlers_economy_2'] += bowler_economy[player_id]
            test_data.at[i, 'bowlers_extras_2'] += bowler_extras[player_id]
    
    # filling ratios
    if test_data['batsmens_averages_2'][i] == 0:
        test_data['batsmens_averages_2'][i] = 1
        
    if test_data['batsmens_strike_rates_2'][i] == 0:
        test_data['batsmens_strike_rates_2'][i] = 1
    if test_data['bowlers_strike_rates_2'][i] == 0:
        test_data['bowlers_strike_rates_2'][i] = 1
        
    if test_data['bowlers_economy_2'][i] == 0:
        test_data['bowlers_economy_2'][i] = 1
        
    if test_data['bowlers_extras_2'][i] == 0:
        test_data['bowlers_extras_2'][i] = 1
        
    
    test_data.at[i, 'batsmens_averages_ratios'] = test_data['batsmens_averages_1'][i] / test_data['batsmens_averages_2'][i]
    test_data.at[i, 'batsmens_strike_rates_ratios'] = test_data['batsmens_strike_rates_1'][i] / test_data['batsmens_strike_rates_2'][i]
    test_data.at[i, 'bowlers_strike_rates_ratios'] = test_data['bowlers_strike_rates_1'][i] / test_data['bowlers_strike_rates_2'][i]
    test_data.at[i, 'bowlers_economy_ratios'] = test_data['bowlers_economy_1'][i] / test_data['bowlers_economy_2'][i]
    test_data.at[i, 'bowlers_extras_ratios'] = test_data['bowlers_extras_1'][i] / test_data['bowlers_extras_2'][i]
    
    
    # filling team win% against each other in last 10 games
    team1_id = test_data['team1_id'][i]
    team2_id = test_data['team2_id'][i]
    date = test_data['match_dt'][i]
    
    test_data.at[i, 'team1_winp_team2_last10'] = winpCrossLastn(team1_id, team2_id, date, 10)

            
    # filling team average scores
    team1_id = test_data['team1_id'][i]
    team2_id = test_data['team2_id'][i]
    
    if team1_id in team_avg:
        test_data.at[i, 'team1_average_score'] = team_avg[team1_id]
    else:
        test_data.at[i, 'team1_average_score'] = 0
    if team2_id in team_avg:
        test_data.at[i, 'team2_average_score'] = team_avg[team2_id]
    else:
        test_data.at[i, 'team2_average_score'] = 0
        
    # filling win% of team1 in last 15 games from matchscore_updated
    
    test_data.at[i, 'win_percentage_team1_last_15'] = win_percentage_last_n_matches(team1_id, date, 15)
    test_data.at[i, 'win_percentage_team2_last_15'] = win_percentage_last_n_matches(team2_id, date, 15)
    

  test_data.at[i, 'batsmens_averages_1'] += batsman_avg[player_id]
  test_data.at[i, 'batsmens_strike_rates_1'] += batsman_sr[player_id]
  test_data.at[i, 'bowlers_economy_1'] += bowler_economy[player_id]
  test_data.at[i, 'bowlers_extras_1'] += bowler_extras[player_id]
  test_data.at[i, 'batsmens_averages_2'] += batsman_avg[player_id]
  test_data.at[i, 'batsmens_strike_rates_2'] += batsman_sr[player_id]
  test_data.at[i, 'bowlers_economy_2'] += bowler_economy[player_id]
  test_data.at[i, 'bowlers_strike_rates_2'] += bowler_sr[player_id]
  test_data.at[i, 'bowlers_extras_2'] += bowler_extras[player_id]
  test_data.at[i, 'batsmens_averages_ratios'] = test_data['batsmens_averages_1'][i] / test_data['batsmens_averages_2'][i]
  test_data.at[i, 'batsmens_strike_rates_ratios'] = test_data['batsmens_strike_rates_1'][i] / test_data['batsmens_strike_rates_2'][i]
  test_data.at[i, 'bowlers_strike_rates_ratios'] = test_data['bowlers_strike_rates_1'][i] / test_data['bowlers_strike_rates_2'][i]
  t

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['bowlers_extras_2'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['bowlers_extras_2'][i] = 1


In [26]:
# dropping team1_roster_ids , team2_roster_ids , winner , team1 , team2 ,  columns 
test_data = test_data.drop(['team1_roster_ids', 'team2_roster_ids','team1', 'team2','batsmens_averages_1','batsmens_strike_rates_1','bowlers_strike_rates_1','bowlers_economy_1','bowlers_extras_1','batsmens_averages_2','batsmens_strike_rates_2','bowlers_strike_rates_2','bowlers_economy_2','bowlers_extras_2'], axis=1)

In [27]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# List of categorical columns to encode
categorical_columns = ['toss winner', 'toss decision', 'venue', 'match_dt', 'lighting', 'series_name', 'season', 'city']

# Initialize the label encoder
label_encoder = LabelEncoder()

# Apply label encoding to each categorical column
for col in categorical_columns:
    test_data[col] = label_encoder.fit_transform(test_data[col])

# Display the first 5 rows of the dataframe
test_data.head(5)


Unnamed: 0,match id,team1_id,team2_id,toss winner,toss decision,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last10,team1_average_score,team2_average_score,win_percentage_team1_last_15,win_percentage_team2_last_15
0,9250275,28594,90,104,1,11,13,69,0,49,1,3226,0.0,0.019608,,0.0,,0.580853,0.443698,0.962702,0.439479,1.313131,0.0,141.0,144.5,0.0,0.0
1,9262189,36084,36098,40,1,59,70,146,2,11,1,13915,0.615385,0.344262,151.285714,66.67,153.5,1.089279,1.091648,0.840087,0.961859,0.787592,66.67,156.3,148.125,60.0,40.0
2,9128776,30428,48334,82,0,8,58,97,2,18,1,2764,0.842105,0.753086,171.066667,100.0,179.625,1.080138,0.955992,0.771592,0.993761,0.900654,0.0,168.142857,159.4,0.0,0.0
3,9586919,36098,36112,92,1,23,38,107,2,11,3,11892,0.285714,1.487805,166.0,75.0,155.5,0.871249,0.848296,0.845522,0.870841,0.731058,0.0,148.125,166.8,0.0,0.0
4,9128538,30414,48341,31,1,46,71,113,2,18,1,18752,2.375,0.31033,169.933333,0.0,164.125,1.107382,1.166437,0.963643,1.001825,0.946688,0.0,165.307692,167.916667,0.0,0.0


In [28]:
# filling nan values with 0

test_data.fillna(0, inplace=True)

In [29]:
# changing column names of match id to match_id , toss winner to toss_winner , toss decision to toss_decision

test_data['match_id'] = test_data['match id']
test_data['toss_winner'] = test_data['toss winner']
test_data['toss_decision'] = test_data['toss decision']

test_data = test_data.drop(['match id', 'toss winner', 'toss decision'], axis=1)

In [30]:
test_data.head(15)

Unnamed: 0,team1_id,team2_id,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last10,team1_average_score,team2_average_score,win_percentage_team1_last_15,win_percentage_team2_last_15,match_id,toss_winner,toss_decision
0,28594,90,11,13,69,0,49,1,3226,0.0,0.019608,0.0,0.0,0.0,0.580853,0.443698,0.962702,0.439479,1.313131,0.0,141.0,144.5,0.0,0.0,9250275,104,1
1,36084,36098,59,70,146,2,11,1,13915,0.615385,0.344262,151.285714,66.67,153.5,1.089279,1.091648,0.840087,0.961859,0.787592,66.67,156.3,148.125,60.0,40.0,9262189,40,1
2,30428,48334,8,58,97,2,18,1,2764,0.842105,0.753086,171.066667,100.0,179.625,1.080138,0.955992,0.771592,0.993761,0.900654,0.0,168.142857,159.4,0.0,0.0,9128776,82,0
3,36098,36112,23,38,107,2,11,3,11892,0.285714,1.487805,166.0,75.0,155.5,0.871249,0.848296,0.845522,0.870841,0.731058,0.0,148.125,166.8,0.0,0.0,9586919,92,1
4,30414,48341,46,71,113,2,18,1,18752,2.375,0.31033,169.933333,0.0,164.125,1.107382,1.166437,0.963643,1.001825,0.946688,0.0,165.307692,167.916667,0.0,0.0,9128538,31,1
5,30407,30435,57,20,17,2,18,3,7118,0.4,0.672131,177.066667,50.0,184.5,0.941093,0.995272,1.001928,0.934268,0.73699,0.0,172.4,169.307692,0.0,0.0,9516653,57,1
6,36014,30414,45,23,142,2,18,3,2050,0.681818,0.344262,183.466667,25.0,156.35,0.921257,1.034135,0.921563,0.955612,0.797946,25.0,170.0,165.307692,33.333333,60.0,9516534,12,1
7,9701,6698,81,81,58,2,50,1,1042,3.2,0.60396,163.066667,0.0,177.321429,0.937888,1.114808,0.63588,0.685779,1.635887,0.0,159.5,165.857143,0.0,0.0,9085474,19,1
8,7727,7573,82,76,208,1,50,3,8301,2.428571,2.904762,167.0,100.0,151.6,1.301483,1.165019,1.046915,0.940972,1.415819,100.0,158.0,159.3125,80.0,40.0,9433752,30,1
9,30428,36014,67,42,8,1,18,3,13880,1.727273,21.0,171.066667,66.67,186.5,1.161314,1.008119,0.596095,1.011756,1.146592,0.0,168.142857,170.0,0.0,0.0,9516359,89,1


In [31]:
train_data.shape
test_data.shape

(948, 28)

(271, 27)

In [32]:
new_test_data = test_data

In [33]:
new_train_data=train_data

In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler

X_train = train_data.drop(['winner_id'], axis=1)
y_train = train_data['winner_id']

X_test = test_data

In [35]:
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

In [36]:
unique_classes = np.unique(y_train)
single_occurrence_classes = [c for c in unique_classes if np.sum(y_train == c) == 1]

# Oversample each single-occurrence class to 8 instances
oversampled_X_train = []

oversampled_y = []
for class_label in single_occurrence_classes:
    class_X_train = X_train[y_train == class_label]
    class_y = y_train[y_train == class_label]
    oversampled_X_train.extend([class_X_train] * (8 - len(class_X_train)))
    oversampled_y.extend([class_y] * (8 - len(class_y)))

# Concatenate the oversampled data with the original data
X_train_resampled = np.concatenate([X_train] + oversampled_X_train)
y_resampled = np.concatenate([y_train] + oversampled_y)

In [37]:
# i79 = np.where(y_test == 79)[0][0]
# i79

In [38]:
# Initialize SMOTE
smote = SMOTE(random_state=101, k_neighbors=1)

# Apply SMOTE to the entire dataset
X_train, y_train = smote.fit_resample(X_train_resampled, y_resampled)

In [39]:
# Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



In [40]:
# CatBoost
cat_model = CatBoostClassifier()
cat_model.fit(X_train, y_train)

# cat_accuracy = accuracy_score(y_test, cat_pred)
# cat_f1 = f1_score(y_test, cat_pred, average='macro')

Learning rate set to 0.084957
0:	learn: 4.7340856	total: 953ms	remaining: 15m 52s
1:	learn: 4.5765950	total: 1.75s	remaining: 14m 34s
2:	learn: 4.3849563	total: 2.47s	remaining: 13m 42s
3:	learn: 4.1641418	total: 3.19s	remaining: 13m 13s
4:	learn: 3.9643694	total: 3.89s	remaining: 12m 53s
5:	learn: 3.7966947	total: 4.64s	remaining: 12m 48s
6:	learn: 3.6283499	total: 5.39s	remaining: 12m 45s
7:	learn: 3.4414321	total: 6.11s	remaining: 12m 38s
8:	learn: 3.2603879	total: 6.84s	remaining: 12m 33s
9:	learn: 3.0939965	total: 7.54s	remaining: 12m 26s
10:	learn: 2.9584906	total: 8.4s	remaining: 12m 35s
11:	learn: 2.8240776	total: 9.16s	remaining: 12m 33s
12:	learn: 2.7192045	total: 9.9s	remaining: 12m 31s
13:	learn: 2.6205383	total: 10.6s	remaining: 12m 28s
14:	learn: 2.5140818	total: 11.4s	remaining: 12m 25s
15:	learn: 2.4144644	total: 12.2s	remaining: 12m 27s
16:	learn: 2.3048200	total: 12.9s	remaining: 12m 25s
17:	learn: 2.2112091	total: 13.7s	remaining: 12m 26s
18:	learn: 2.1317369	total: 

In [None]:
test_data.shape

(271, 29)

In [None]:
X_test.shape

(271, 27)

In [None]:
new_test_data.head(5)

Unnamed: 0,team1_id,team2_id,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last10,team1_average_score,team2_average_score,win_percentage_team1_last_15,win_percentage_team2_last_15,match_id,toss_winner,toss_decision,win_pred_team_id,win_pred_score
0,28594,90,11,13,69,0,49,1,3226,0.0,0.019608,0.0,0.0,0.0,0.580853,0.443698,0.962702,0.439479,1.313131,0.0,141.0,144.5,0.0,0.0,9250275,104,1,90,0.000871
1,36084,36098,59,70,146,2,11,1,13915,0.615385,0.344262,151.285714,66.67,153.5,1.089279,1.091648,0.840087,0.961859,0.787592,66.67,156.3,148.125,60.0,40.0,9262189,40,1,36084,0.134157
2,30428,48334,8,58,97,2,18,1,2764,0.842105,0.753086,171.066667,100.0,179.625,1.080138,0.955992,0.771592,0.993761,0.900654,0.0,168.142857,159.4,0.0,0.0,9128776,82,0,30428,0.062006
3,36098,36112,23,38,107,2,11,3,11892,0.285714,1.487805,166.0,75.0,155.5,0.871249,0.848296,0.845522,0.870841,0.731058,0.0,148.125,166.8,0.0,0.0,9586919,92,1,36112,0.034009
4,30414,48341,46,71,113,2,18,1,18752,2.375,0.31033,169.933333,0.0,164.125,1.107382,1.166437,0.963643,1.001825,0.946688,0.0,165.307692,167.916667,0.0,0.0,9128538,31,1,48341,0.272571


In [None]:
team1_ids = new_test_data['team1_id']
team2_ids = new_test_data['team2_id']

In [None]:
cat_preds = cat_model.predict_proba(X_test)

unique_classes = np.unique(y_train)
class_to_index = {label: idx for idx, label in enumerate(unique_classes)}
y_train_indexed = np.array([class_to_index[label] for label in y_train])

# Compare probabilities and assign the team with higher probability
cat_y_pred_01 = []
cat_win_prob = []
for i in range(len(cat_preds)):
    team1_id = team1_ids[i]
    team2_id = team2_ids[i]

    team1_index = class_to_index.get(team1_id, -1)
    team2_index = class_to_index.get(team2_id, -1)

    if team1_index == -1 and team2_index == -1:
        cat_y_pred_01.append(team1_id)
        cat_win_prob.append(0)

        continue
        # raise ValueError(f"Team ID {team1_id} or {team2_id} not found in class_to_index mapping.")

    team1_prob = cat_preds[i][team1_index]  # Probability for team1_id
    team2_prob = cat_preds[i][team2_index]  # Probability for team2_id

    if team1_prob > team2_prob:
        cat_y_pred_01.append(team1_id)  # team1_id
        cat_win_prob.append(team1_prob)
    else:
        cat_y_pred_01.append(team2_id)  # team2_id
        cat_win_prob.append(team2_prob)

In [None]:
# printing winning team along with their winning probability
for i in range(len(cat_y_pred_01)):
    print(f"Match {i+1}: {cat_y_pred_01[i]} with probability {cat_win_prob[i]}")

Match 1: 90 with probability 0.0013430374871913082
Match 2: 36084 with probability 0.13415652138673856
Match 3: 30428 with probability 0.06251607786198317
Match 4: 36112 with probability 0.05104069811402093
Match 5: 48341 with probability 0.2725709038449679
Match 6: 30407 with probability 0.5576064771718952
Match 7: 30414 with probability 0.21736723002828165
Match 8: 6698 with probability 0.1651782358901129
Match 9: 7727 with probability 0.07516110228872606
Match 10: 36014 with probability 0.020048325671559026
Match 11: 36098 with probability 0.23988541911740566
Match 12: 202 with probability 0.3557548977794818
Match 13: 47501 with probability 0.15600063863272026
Match 14: 48 with probability 0.319525427084894
Match 15: 62 with probability 0.5702257812613878
Match 16: 33963 with probability 0.06563858974347496
Match 17: 6698 with probability 0.26987828846549106
Match 18: 20 with probability 0.22733778115235784
Match 19: 36070 with probability 0.39124442980629953
Match 20: 36070 with pr

In [None]:
import xgboost as xgb
import numpy as np
import pandas as pd


# Ensure y_train contains zero-indexed integer labels
unique_classes = np.unique(y_train)
class_to_index = {label: idx for idx, label in enumerate(unique_classes)}
y_train_indexed = np.array([class_to_index[label] for label in y_train])

# Create DMatrix for training and testing
xgb_train = xgb.DMatrix(X_train, label=y_train_indexed, enable_categorical=True)
xgb_test = xgb.DMatrix(X_test, enable_categorical=True)  # No label for xgb_test

# Define parameters and add num_class
n = 150
params = {
    'objective': 'multi:softprob',
    'max_depth': 6,
    'learning_rate': 0.1,
    'num_class': len(unique_classes)  # Number of unique classes
}

# Train the model
xgb_model = xgb.train(params=params, dtrain=xgb_train, num_boost_round=n)

# Make predictions
xgb_preds = xgb_model.predict(xgb_test)

# Compare probabilities and assign the team with higher probability
y_pred_01 = []
win_prob=[]
for i in range(len(xgb_preds)):
    team1_id = team1_ids[i]
    team2_id = team2_ids[i]
    
    team1_index = class_to_index.get(team1_id, -1)
    team2_index = class_to_index.get(team2_id, -1)
    
    if team1_index == -1 and team2_index == -1:
        y_pred_01.append(team1_id)
        win_prob.append(0)
        
        continue
        # raise ValueError(f"Team ID {team1_id} or {team2_id} not found in class_to_index mapping.")
    
    team1_prob = xgb_preds[i][team1_index]  # Probability for team1_id
    team2_prob = xgb_preds[i][team2_index]  # Probability for team2_id
    
    if team1_prob > team2_prob:
        y_pred_01.append(team1_id)  # team1_id
        win_prob.append(team1_prob)
    else:
        y_pred_01.append(team2_id)  # team2_id
        win_prob.append(team2_prob)

# Print predictions
# print(f'Predicted class labels for X_test: {y_pred_01}')

In [None]:
# printing winning team along with their winning probability
for i in range(len(y_pred_01)):
    print(f"Match {i+1}: {y_pred_01[i]} with probability {win_prob[i]}")

Match 1: 28594 with probability 0.007757620420306921
Match 2: 36098 with probability 0.003110178280621767
Match 3: 30428 with probability 0.220680832862854
Match 4: 36112 with probability 0.004472097847610712
Match 5: 30414 with probability 0.04112886264920235
Match 6: 30407 with probability 0.06293463706970215
Match 7: 30414 with probability 0.1516619324684143
Match 8: 6698 with probability 0.034552719444036484
Match 9: 7727 with probability 0.010891024954617023
Match 10: 30428 with probability 0.26741883158683777
Match 11: 36098 with probability 0.01836247369647026
Match 12: 272 with probability 0.005053613800555468
Match 13: 47487 with probability 0.006391638424247503
Match 14: 48 with probability 0.04245441406965256
Match 15: 62 with probability 0.02649027481675148
Match 16: 33963 with probability 0.013009293004870415
Match 17: 6698 with probability 0.02539064921438694
Match 18: 41 with probability 0.6000406742095947
Match 19: 36070 with probability 0.03413557633757591
Match 20: 36

In [None]:
ensemble_pred = []
ensemble_prob = []

for i in range(len(y_pred_01)):
    if(y_pred_01[i] == cat_y_pred_01[i]):
        ensemble_pred.append(y_pred_01[i])
        ensemble_prob.append(0.5 * (cat_win_prob[i] + win_prob[i]))

    elif (cat_win_prob[i] > win_prob[i]):
        ensemble_pred.append(cat_y_pred_01[i])
        ensemble_prob.append(cat_win_prob[i])

    else:
        ensemble_pred.append(y_pred_01[i])
        ensemble_prob.append(win_prob[i])

In [None]:
ensemble_pred

[28594,
 36084,
 30428,
 36112,
 48341,
 30407,
 30414,
 6698,
 7727,
 30428,
 36098,
 202,
 47501,
 48,
 62,
 33963,
 6698,
 41,
 36070,
 36070,
 48341,
 10366,
 33942,
 42573,
 9701,
 30414,
 45926,
 251,
 8987,
 7258,
 188,
 47529,
 23750,
 47487,
 27,
 47529,
 9967,
 10576,
 11591,
 33963,
 41,
 48341,
 55,
 47529,
 40298,
 9701,
 40452,
 36084,
 33928,
 15301,
 11283,
 293,
 14454,
 7258,
 40578,
 47480,
 40564,
 48922,
 33942,
 62,
 55,
 55,
 6698,
 30421,
 6698,
 55,
 33935,
 14454,
 30393,
 48334,
 62,
 7727,
 12718,
 188,
 8056,
 216,
 46759,
 33921,
 62,
 30421,
 33928,
 188,
 48733,
 40298,
 44904,
 40550,
 30428,
 7608,
 47508,
 11591,
 30421,
 6698,
 30393,
 40298,
 10618,
 55,
 7258,
 40298,
 202,
 10576,
 216,
 35790,
 15497,
 33921,
 33942,
 17583,
 36084,
 27,
 48,
 7727,
 40606,
 8987,
 30407,
 20,
 8987,
 30428,
 35790,
 22784,
 48,
 45072,
 20,
 33914,
 47487,
 33921,
 40298,
 36098,
 23841,
 40550,
 90,
 14860,
 209,
 27,
 41,
 30407,
 23841,
 33963,
 46780,
 11157

In [None]:
y_pred_01=ensemble_pred
win_prob=ensemble_prob

In [None]:
# adding y_pred_01 to test_data
new_test_data['win_pred_team_id'] = y_pred_01

In [None]:
# adding winning probability to test_data
new_test_data['win_pred_score'] = win_prob

In [None]:
new_test_data.head(10)

Unnamed: 0,team1_id,team2_id,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last10,team1_average_score,team2_average_score,win_percentage_team1_last_15,win_percentage_team2_last_15,match_id,toss_winner,toss_decision,win_pred_team_id,win_pred_score
0,28594,90,11,13,69,0,49,1,3226,0.0,0.019608,0.0,0.0,0.0,0.580853,0.443698,0.962702,0.439479,1.313131,0.0,141.0,144.5,0.0,0.0,9250275,104,1,28594,0.007758
1,36084,36098,59,70,146,2,11,1,13915,0.615385,0.344262,151.285714,66.67,153.5,1.089279,1.091648,0.840087,0.961859,0.787592,66.67,156.3,148.125,60.0,40.0,9262189,40,1,36084,0.134157
2,30428,48334,8,58,97,2,18,1,2764,0.842105,0.753086,171.066667,100.0,179.625,1.080138,0.955992,0.771592,0.993761,0.900654,0.0,168.142857,159.4,0.0,0.0,9128776,82,0,30428,0.141598
3,36098,36112,23,38,107,2,11,3,11892,0.285714,1.487805,166.0,75.0,155.5,0.871249,0.848296,0.845522,0.870841,0.731058,0.0,148.125,166.8,0.0,0.0,9586919,92,1,36112,0.027756
4,30414,48341,46,71,113,2,18,1,18752,2.375,0.31033,169.933333,0.0,164.125,1.107382,1.166437,0.963643,1.001825,0.946688,0.0,165.307692,167.916667,0.0,0.0,9128538,31,1,48341,0.272571
5,30407,30435,57,20,17,2,18,3,7118,0.4,0.672131,177.066667,50.0,184.5,0.941093,0.995272,1.001928,0.934268,0.73699,0.0,172.4,169.307692,0.0,0.0,9516653,57,1,30407,0.310271
6,36014,30414,45,23,142,2,18,3,2050,0.681818,0.344262,183.466667,25.0,156.35,0.921257,1.034135,0.921563,0.955612,0.797946,25.0,170.0,165.307692,33.333333,60.0,9516534,12,1,30414,0.184515
7,9701,6698,81,81,58,2,50,1,1042,3.2,0.60396,163.066667,0.0,177.321429,0.937888,1.114808,0.63588,0.685779,1.635887,0.0,159.5,165.857143,0.0,0.0,9085474,19,1,6698,0.099865
8,7727,7573,82,76,208,1,50,3,8301,2.428571,2.904762,167.0,100.0,151.6,1.301483,1.165019,1.046915,0.940972,1.415819,100.0,158.0,159.3125,80.0,40.0,9433752,30,1,7727,0.043026
9,30428,36014,67,42,8,1,18,3,13880,1.727273,21.0,171.066667,66.67,186.5,1.161314,1.008119,0.596095,1.011756,1.146592,0.0,168.142857,170.0,0.0,0.0,9516359,89,1,30428,0.267419


## Filling submission 1 csv

In [None]:
submission1=pd.read_csv('sample_template/submission_template_file1.csv')

In [None]:
submission1.head(2)

Unnamed: 0,match id,dataset_type,win_pred_team_id,win_pred_score,train_algorithm,is_ensemble,train_hps_trees,train_hps_depth,train_hps_lr,indep_feat_id1,indep_feat_id2,indep_feat_id3,indep_feat_id4,indep_feat_id5,indep_feat_id6,indep_feat_id7,indep_feat_id8,indep_feat_id9,indep_feat_id10
0,9250275,r1,,,,,,,,,,,,,,,,,
1,9262189,r1,,,,,,,,,,,,,,,,,


In [None]:
# filling submisssion 1 win_pred_team_id, win_pred_score with new_test_data win_pred_team_id, win_pred_score according to match id

# also filling train_algorithm as xgboost , is_ensemble=no , train_hps_tree=150, train_hps_depth=6, train_hps_lr=0.1 for all rows

for i in range(len(submission1)):
    match_id = submission1['match id'][i]
    if match_id not in new_test_data['match_id'].values:
        continue
    new_test_data_row = new_test_data[new_test_data['match_id'] == match_id]
    submission1.at[i, 'win_pred_team_id'] = new_test_data_row['win_pred_team_id'].values[0]
    submission1.at[i, 'win_pred_score'] = new_test_data_row['win_pred_score'].values[0]
    submission1.at[i, 'train_algorithm'] = 'xgboost-catboost-ensemble'
    submission1.at[i, 'is_ensemble'] = 'yes'
    submission1.at[i, 'train_hps_trees'] = 150
    submission1.at[i, 'train_hps_depth'] = 6
    submission1.at[i, 'train_hps_lr'] = 0.1

  submission1.at[i, 'train_algorithm'] = 'xgboost-catboost-ensemble'
  submission1.at[i, 'is_ensemble'] = 'yes'


In [None]:
submission1.head(10)

Unnamed: 0,match id,dataset_type,win_pred_team_id,win_pred_score,train_algorithm,is_ensemble,train_hps_trees,train_hps_depth,train_hps_lr,indep_feat_id1,indep_feat_id2,indep_feat_id3,indep_feat_id4,indep_feat_id5,indep_feat_id6,indep_feat_id7,indep_feat_id8,indep_feat_id9,indep_feat_id10
0,9250275,r1,28594.0,0.007758,xgboost-catboost-ensemble,yes,150.0,6.0,0.1,,,,,,,,,,
1,9262189,r1,36084.0,0.134157,xgboost-catboost-ensemble,yes,150.0,6.0,0.1,,,,,,,,,,
2,9128776,r1,30428.0,0.141598,xgboost-catboost-ensemble,yes,150.0,6.0,0.1,,,,,,,,,,
3,9586919,r1,36112.0,0.027756,xgboost-catboost-ensemble,yes,150.0,6.0,0.1,,,,,,,,,,
4,9128538,r1,48341.0,0.272571,xgboost-catboost-ensemble,yes,150.0,6.0,0.1,,,,,,,,,,
5,9516653,r1,30407.0,0.310271,xgboost-catboost-ensemble,yes,150.0,6.0,0.1,,,,,,,,,,
6,9516534,r1,30414.0,0.184515,xgboost-catboost-ensemble,yes,150.0,6.0,0.1,,,,,,,,,,
7,9085474,r1,6698.0,0.099865,xgboost-catboost-ensemble,yes,150.0,6.0,0.1,,,,,,,,,,
8,9433752,r1,7727.0,0.043026,xgboost-catboost-ensemble,yes,150.0,6.0,0.1,,,,,,,,,,
9,9516359,r1,30428.0,0.267419,xgboost-catboost-ensemble,yes,150.0,6.0,0.1,,,,,,,,,,


## Now genearting for train data

In [None]:
new_train_data.head(2)

Unnamed: 0,team1_id,team2_id,winner_id,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last10,team1_average_score,team2_average_score,win_percentage_team1_last_15,win_percentage_team2_last_15,match_id,toss_winner,toss_decision
0,11283,12634,12634,44,55,212,1,58,2,7398,1.666667,0.672131,139.0,100.0,157.178571,2.028272,2.611098,4.638805,2.351068,1.710894,100.0,137.666667,116.5,60.0,63.636364,9331181,44,1
1,20,41,20,53,13,20,1,22,0,1406,1.285714,1.952381,156.0,50.0,103.5,1.024794,0.966568,0.797703,0.80708,0.374242,50.0,157.730769,157.791667,60.0,40.0,8797060,138,1


In [None]:
train_team1_ids = new_train_data['team1_id']
train_team2_ids = new_train_data['team2_id']

In [None]:
# scalling and x train and y train
X_train = new_train_data.drop(['winner_id'], axis=1)
y_train = new_train_data['winner_id']

# scalling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)


In [None]:
accuracy_score(cat_model.predict(X_train), y_train)

0.9156118143459916

In [None]:
cat_preds = cat_model.predict_proba(X_train)

In [None]:
cat_preds.shape

(948, 139)

In [None]:
X_train.shape

(948, 27)

In [None]:
# Create DMatrix for training and testing
xgb_train = xgb.DMatrix(X_train, enable_categorical=True)
# xgb_test = xgb.DMatrix(X_test, enable_categorical=True)  # No label for xgb_test

In [None]:
xgb_preds = xgb_model.predict(xgb_train)

In [None]:
xgb_preds.shape

(948, 139)

In [None]:
# Compare probabilities and assign the team with higher probability
y_pred_01 = []
win_prob=[]
for i in range(len(xgb_preds)):
    team1_id = train_team1_ids[i]
    team2_id = train_team2_ids[i]
    
    team1_index = class_to_index.get(team1_id, -1)
    team2_index = class_to_index.get(team2_id, -1)
    
    if team1_index == -1 and team2_index == -1:
        y_pred_01.append(team1_id)
        win_prob.append(0)
        
        continue
        # raise ValueError(f"Team ID {team1_id} or {team2_id} not found in class_to_index mapping.")
    
    team1_prob = xgb_preds[i][team1_index]  # Probability for team1_id
    team2_prob = xgb_preds[i][team2_index]  # Probability for team2_id
    
    if team1_prob > team2_prob:
        y_pred_01.append(team1_id)  # team1_id
        win_prob.append(team1_prob)
    else:
        y_pred_01.append(team2_id)  # team2_id
        win_prob.append(team2_prob)

In [None]:
# printing winning team along with their winning probability
for i in range(len(y_pred_01)):
    print(f"Match {i+1}: {y_pred_01[i]} with probability {win_prob[i]}")

Match 1: 12634 with probability 0.8592671751976013
Match 2: 20 with probability 0.8937444686889648
Match 3: 10576 with probability 0.8723538517951965
Match 4: 36084 with probability 0.8804664611816406
Match 5: 48341 with probability 0.948393702507019
Match 6: 40424 with probability 0.9025112986564636
Match 7: 55 with probability 0.988191545009613
Match 8: 47501 with probability 0.9442260265350342
Match 9: 12634 with probability 0.8743334412574768
Match 10: 55 with probability 0.8995677828788757
Match 11: 30414 with probability 0.9270097613334656
Match 12: 30435 with probability 0.9063557386398315
Match 13: 36014 with probability 0.8538640737533569
Match 14: 8301 with probability 0.899867594242096
Match 15: 10618 with probability 0.9181980490684509
Match 16: 30393 with probability 0.9548448324203491
Match 17: 33956 with probability 0.9375231266021729
Match 18: 188 with probability 0.9336394667625427
Match 19: 7727 with probability 0.9784219264984131
Match 20: 251 with probability 0.9742

In [None]:
# cat_preds = cat_model.predict_proba(X_train)
# Compare probabilities and assign the team with higher probability 
cat_y_pred_01 = []
cat_win_prob = []
for i in range(len(cat_preds)):
    team1_id = train_team1_ids[i]
    team2_id = train_team2_ids[i]

    team1_index = class_to_index.get(team1_id, -1)
    team2_index = class_to_index.get(team2_id, -1)

    if team1_index == -1 and team2_index == -1:
        cat_y_pred_01.append(team1_id)
        cat_win_prob.append(0)

        continue
        # raise ValueError(f"Team ID {team1_id} or {team2_id} not found in class_to_index mapping.")

    team1_prob = cat_preds[i][team1_index]  # Probability for team1_id
    team2_prob = cat_preds[i][team2_index]  # Probability for team2_id

    if team1_prob > team2_prob:
        cat_y_pred_01.append(team1_id)  # team1_id
        cat_win_prob.append(team1_prob)
    else:
        cat_y_pred_01.append(team2_id)  # team2_id
        cat_win_prob.append(team2_prob)

In [None]:
# printing winning team along with their winning probability
for i in range(len(cat_y_pred_01)):
    print(f"Match {i+1}: {cat_y_pred_01[i]} with probability {cat_win_prob[i]}")

Match 1: 12634 with probability 0.6908398996337619
Match 2: 20 with probability 0.0476049371912421
Match 3: 10576 with probability 0.5755352971693442
Match 4: 36084 with probability 0.6199478184181156
Match 5: 48341 with probability 0.8255967049867855
Match 6: 40424 with probability 0.7541206763530969
Match 7: 55 with probability 0.2618201860625222
Match 8: 47501 with probability 0.9126797179509518
Match 9: 12634 with probability 0.6348532718935135
Match 10: 55 with probability 0.37687424201695374
Match 11: 30414 with probability 0.29147749862837724
Match 12: 30435 with probability 0.7953689333240211
Match 13: 36014 with probability 0.8183062551143228
Match 14: 8301 with probability 0.4152372694749473
Match 15: 10618 with probability 0.8763027281619895
Match 16: 30393 with probability 0.704024979345876
Match 17: 33956 with probability 0.6412387049268528
Match 18: 188 with probability 0.9151190200332511
Match 19: 7727 with probability 0.8850838953078326
Match 20: 251 with probability 0.

In [None]:
len(win_prob)

948

In [None]:
ensemble_pred = []
ensemble_prob = []

for i in range(len(y_pred_01)):
    if(y_pred_01[i] == cat_y_pred_01[i]):
        ensemble_pred.append(y_pred_01[i])
        ensemble_prob.append(0.5 * (cat_win_prob[i] + win_prob[i]))

    elif (cat_win_prob[i] > win_prob[i]):
        ensemble_pred.append(cat_y_pred_01[i])
        ensemble_prob.append(cat_win_prob[i])

    else:
        ensemble_pred.append(y_pred_01[i])
        ensemble_prob.append(win_prob[i])

In [None]:
new_train_data['win_pred_team_id'] = ensemble_pred

In [None]:
# adding winning probability to test_data
new_train_data['win_pred_score'] = ensemble_prob

In [None]:
new_train_data.head(5)

Unnamed: 0,team1_id,team2_id,winner_id,venue,city,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15,batsmens_averages_ratios,batsmens_strike_rates_ratios,bowlers_strike_rates_ratios,bowlers_economy_ratios,bowlers_extras_ratios,team1_winp_team2_last10,team1_average_score,team2_average_score,win_percentage_team1_last_15,win_percentage_team2_last_15,match_id,toss_winner,toss_decision,win_pred_team_id,win_pred_score
0,11283,12634,12634,44,55,212,1,58,2,7398,1.666667,0.672131,139.0,100.0,157.178571,2.028272,2.611098,4.638805,2.351068,1.710894,100.0,137.666667,116.5,60.0,63.636364,9331181,44,1,12634,0.775054
1,20,41,20,53,13,20,1,22,0,1406,1.285714,1.952381,156.0,50.0,103.5,1.024794,0.966568,0.797703,0.80708,0.374242,50.0,157.730769,157.791667,60.0,40.0,8797060,138,1,20,0.470675
2,10576,8987,10576,111,83,370,1,79,3,251,0.857143,0.672131,173.266667,0.0,154.333333,1.000138,0.909712,0.85019,0.717828,1.024538,0.0,166.944444,149.588235,40.0,46.666667,9433269,81,1,10576,0.723945
3,36084,36070,36084,10,105,434,0,18,3,14300,2.166667,1.97561,164.266667,50.0,144.25,1.116102,1.052218,0.813452,0.878287,0.700727,50.0,156.3,177.076923,60.0,66.666667,9587073,34,0,36084,0.750207
4,30407,48341,48341,75,25,331,2,35,3,7118,0.818182,1.327869,164.666667,0.0,189.0,1.035394,1.240634,1.084338,1.168234,1.480742,0.0,172.4,167.916667,53.333333,69.230769,9516457,40,1,48341,0.886995


In [None]:
cat_model.get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'MultiClass',
 'iterations': 1000,
 'sampling_frequency': 'PerTree',
 'leaf_estimation_method': 'Newton',
 'random_score_type': 'NormalWithModelSizeDecrease',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'eval_fraction': 0,
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 3,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': False,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'use_best_model': False,
 'class_names': [20,
  27,
  34,
  41,
  48,
  55,
  62,
  69,
  76,
  90,
  118,
  146,
  188,
  195,
  202,
  209,
  216,
  223,
  237,
  251,
  272,
  279,
  293,
  300,
  769,
  825,
  958,
  1490,
  6698,
  6838,
  7258,
  7573,
  7608,
  7727,
  8056,
  8182,
  8301,
  8700,
  8917,
  8987,
  9701,
  9876,
  9967,
  10366,
  10576,
  10618,
  11157,
  112

In [None]:
for i in range(len(submission1)):
    match_id = submission1['match id'][i]
    if match_id not in new_train_data['match_id'].values:
        continue
    new_train_data_row = new_train_data[new_train_data['match_id'] == match_id]
    submission1.at[i, 'win_pred_team_id'] = new_train_data_row['win_pred_team_id'].values[0]
    submission1.at[i, 'win_pred_score'] = new_train_data_row['win_pred_score'].values[0]
    submission1.at[i, 'train_algorithm'] = 'xgboost-catboost-ensemble'
    submission1.at[i, 'is_ensemble'] = 'yes'
    submission1.at[i, 'train_hps_trees'] = 150
    submission1.at[i, 'train_hps_depth'] = 6
    submission1.at[i, 'train_hps_lr'] = 0.1

## CSV-2 Generation

In [None]:
train_data.columns

Index(['team1_id', 'team2_id', 'winner_id', 'venue', 'city', 'match_dt',
       'lighting', 'series_name', 'season', 'ground_id',
       'team_count_50runs_last15', 'team_winp_last5',
       'team1only_avg_runs_last15', 'team1_winp_team2_last15',
       'ground_avg_runs_last15', 'batsmens_averages_ratios',
       'batsmens_strike_rates_ratios', 'bowlers_strike_rates_ratios',
       'bowlers_economy_ratios', 'bowlers_extras_ratios',
       'team1_winp_team2_last10', 'team1_average_score', 'team2_average_score',
       'win_percentage_team1_last_15', 'win_percentage_team2_last_15',
       'match_id', 'toss_winner', 'toss_decision', 'win_pred_team_id',
       'win_pred_score'],
      dtype='object')

In [None]:
cols = train_data.columns[-15:-5]

In [None]:
cols

Index(['batsmens_averages_ratios', 'batsmens_strike_rates_ratios',
       'bowlers_strike_rates_ratios', 'bowlers_economy_ratios',
       'bowlers_extras_ratios', 'team1_winp_team2_last10',
       'team1_average_score', 'team2_average_score',
       'win_percentage_team1_last_15', 'win_percentage_team2_last_15'],
      dtype='object')

In [None]:
ft_dict = cat_model.get_feature_importance()

In [None]:
ft_dict

array([ 7.80900644,  6.08932603,  5.41663391,  5.47226815,  1.51087058,
        3.21027929, 10.8410908 ,  2.54966508,  3.2388673 ,  1.96549767,
        3.0865692 ,  4.37174384,  0.6776219 ,  3.05949681,  1.4476404 ,
        1.54685818,  2.96260578,  1.89513903,  1.98141025,  2.16928731,
        7.89146647,  8.46925044,  2.55744238,  3.33497025,  1.32843192,
        3.63979761,  1.476763  ])

In [None]:
ft_dict[-15:-5]

array([0.6776219 , 3.05949681, 1.4476404 , 1.54685818, 2.96260578,
       1.89513903, 1.98141025, 2.16928731, 7.89146647, 8.46925044])

In [None]:
sorted_dict = {}
for key in sorted(ft_dict, key=ft_dict.get):
    sorted_dict[key] = ft_dict[key]
sorted_dict

AttributeError: 'numpy.ndarray' object has no attribute 'get'

In [None]:
sorted_list = list(sorted_dict.keys())

In [None]:
feat = pd.read_csv('sample_template/submission_template_file2.csv')

In [None]:
feat['feat_id'] = np.arange(1, 11)

In [None]:
feat['feat_name'] = cols

In [None]:
feat

Unnamed: 0,feat_id,feat_name,feat_description,model_feat_imp_train,feat_rank_train
0,1,batsmens_averages_ratios,,,
1,2,batsmens_strike_rates_ratios,,,
2,3,bowlers_strike_rates_ratios,,,
3,4,bowlers_economy_ratios,,,
4,5,bowlers_extras_ratios,,,
5,6,team1_winp_team2_last10,,,
6,7,team1_average_score,,,
7,8,team2_average_score,,,
8,9,win_percentage_team1_last_15,,,
9,10,win_percentage_team2_last_15,,,


In [None]:
for i in range(10):
    feat['model_feat_imp_train'][i] = ft_dict[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat['model_feat_imp_train'][i] = ft_dict[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat['model_feat_imp_train'][i] = ft_dict[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat['model_feat_imp_train'][i] = ft_dict[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat['model_feat_imp_train'][i] =

In [None]:
filtered_feats = []
for i in range(10):
    filtered_feats.append(f'f{14+i}')

In [None]:
filtered_feats

['f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23']

In [None]:
for i in range(10):
    feat['feat_rank_train'][i] = sorted_list.index(filtered_feats[i]) + 1

ValueError: 'f14' is not in list

In [None]:
feat.to_csv('sample_template/submission_file2-2.csv')

In [None]:
submission1.to_csv('sample_template/submission_file1-2.csv', index=False)

In [None]:
train_check=pd.read_csv('dataset/train_data.csv')

In [188]:
# checking count of matchings between y_pred_01 and winner_id
count=0
for i in range(len(cat_y_pred_01)):
    if cat_y_pred_01[i] == train_check['winner_id'][i]:
        count+=1
        
print(count)
# print((count/948)*100)

96.51898734177216
