In [100]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [101]:
train_df = pd.read_csv('./Train.csv')
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2508 entries, 0 to 2507
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Team1          2508 non-null   int64 
 1   Team2          2508 non-null   int64 
 2   Stadium        2508 non-null   int64 
 3   HostCountry    2508 non-null   int64 
 4   Team1_Venue    2508 non-null   object
 5   Team2_Venue    2508 non-null   object
 6   Team1_Innings  2508 non-null   object
 7   Team2_Innings  2508 non-null   object
 8   MonthOfMatch   2508 non-null   object
 9   MatchWinner    2508 non-null   int64 
dtypes: int64(5), object(5)
memory usage: 196.1+ KB


In [102]:
orig_columns = train_df.columns
orig_columns

Index(['Team1', 'Team2', 'Stadium', 'HostCountry', 'Team1_Venue',
       'Team2_Venue', 'Team1_Innings', 'Team2_Innings', 'MonthOfMatch',
       'MatchWinner'],
      dtype='object')

In [103]:
dataset = train_df.copy(deep=True)
dataset['Team1_win']=np.where(dataset.Team1==dataset.MatchWinner, 1, 0)
dataset['Team2_win']=np.where(dataset.Team2==dataset.MatchWinner, 1, 0)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2508 entries, 0 to 2507
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Team1          2508 non-null   int64 
 1   Team2          2508 non-null   int64 
 2   Stadium        2508 non-null   int64 
 3   HostCountry    2508 non-null   int64 
 4   Team1_Venue    2508 non-null   object
 5   Team2_Venue    2508 non-null   object
 6   Team1_Innings  2508 non-null   object
 7   Team2_Innings  2508 non-null   object
 8   MonthOfMatch   2508 non-null   object
 9   MatchWinner    2508 non-null   int64 
 10  Team1_win      2508 non-null   int64 
 11  Team2_win      2508 non-null   int64 
dtypes: int64(7), object(5)
memory usage: 235.2+ KB


In [104]:
df_partition = dataset[['Team1','Team2']].drop_duplicates()
df_partition['group_id'] = np.arange(df_partition.shape[0])

In [105]:
dataset = dataset.merge(df_partition, on=['Team1','Team2'])
assert dataset.shape[0]==train_df.shape[0]
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2508 entries, 0 to 2507
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Team1          2508 non-null   int64 
 1   Team2          2508 non-null   int64 
 2   Stadium        2508 non-null   int64 
 3   HostCountry    2508 non-null   int64 
 4   Team1_Venue    2508 non-null   object
 5   Team2_Venue    2508 non-null   object
 6   Team1_Innings  2508 non-null   object
 7   Team2_Innings  2508 non-null   object
 8   MonthOfMatch   2508 non-null   object
 9   MatchWinner    2508 non-null   int64 
 10  Team1_win      2508 non-null   int64 
 11  Team2_win      2508 non-null   int64 
 12  group_id       2508 non-null   int64 
dtypes: int64(8), object(5)
memory usage: 274.3+ KB


In [106]:
team1_wins = dataset.groupby(['Team1']).agg({'Team1_win':['sum','count']}).reset_index()
team1_wins.columns = ['Team','wins','matches']
team2_wins = dataset.groupby(['Team2']).agg({'Team2_win':['sum','count']}).reset_index()
team2_wins.columns = ['Team','wins','matches']
team_winrate = team1_wins.merge(team2_wins, on = ['Team'])
team_winrate['wins'] = team_winrate['wins_x'] + team_winrate['wins_y']
team_winrate['matches'] = team_winrate['matches_x'] + team_winrate['matches_y']
team_winrate['win_rate'] = team_winrate['wins']/team_winrate['matches']
team_winrate = team_winrate[['Team','wins','matches','win_rate']]
team_winrate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16 entries, 0 to 15
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Team      16 non-null     int64  
 1   wins      16 non-null     int64  
 2   matches   16 non-null     int64  
 3   win_rate  16 non-null     float64
dtypes: float64(1), int64(3)
memory usage: 640.0 bytes


In [107]:
dataset = dataset.merge(team_winrate, left_on=['Team1'], right_on=['Team'])
dataset = dataset.merge(team_winrate, left_on=['Team2'], right_on=['Team'])
dataset['win_rate_diff'] = dataset['win_rate_x'] - dataset['win_rate_y']
dataset['match_diff'] = dataset['matches_x'] - dataset['matches_y']
assert dataset.shape[0]==train_df.shape[0]
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2508 entries, 0 to 2507
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Team1          2508 non-null   int64  
 1   Team2          2508 non-null   int64  
 2   Stadium        2508 non-null   int64  
 3   HostCountry    2508 non-null   int64  
 4   Team1_Venue    2508 non-null   object 
 5   Team2_Venue    2508 non-null   object 
 6   Team1_Innings  2508 non-null   object 
 7   Team2_Innings  2508 non-null   object 
 8   MonthOfMatch   2508 non-null   object 
 9   MatchWinner    2508 non-null   int64  
 10  Team1_win      2508 non-null   int64  
 11  Team2_win      2508 non-null   int64  
 12  group_id       2508 non-null   int64  
 13  Team_x         2508 non-null   int64  
 14  wins_x         2508 non-null   int64  
 15  matches_x      2508 non-null   int64  
 16  win_rate_x     2508 non-null   float64
 17  Team_y         2508 non-null   int64  
 18  wins_y  

In [108]:
team1_venue_wins = dataset.groupby(['Team1','Team1_Venue']).agg({'Team1_win':['sum','count']}).reset_index()
team1_venue_wins.columns = ['Team','Venue','venue_wins','venue_matches']
team2_venue_wins = dataset.groupby(['Team2','Team2_Venue']).agg({'Team2_win':['sum','count']}).reset_index()
team2_venue_wins.columns = ['Team','Venue','venue_wins','venue_matches']
team_venue_wins = pd.concat([team1_venue_wins, team2_venue_wins], axis=0)
team_venue_wins = team_venue_wins.groupby(['Team','Venue']).agg({'venue_wins':['sum'], 
                                                                 'venue_matches':['sum']}).reset_index()
team_venue_wins.columns = ['Team','Venue','venue_wins','venue_matches']
team_venue_wins['venue_win_rate'] = team_venue_wins['venue_wins']/team_venue_wins['venue_matches']
team_venue_wins.head()

Unnamed: 0,Team,Venue,venue_wins,venue_matches,venue_win_rate
0,0,Away,14,30,0.466667
1,0,Neutral,13,26,0.5
2,1,Away,110,202,0.544554
3,1,Home,193,286,0.674825
4,1,Neutral,76,102,0.745098


In [109]:
dataset = dataset.merge(team_venue_wins, left_on=['Team1','Team1_Venue'], right_on=['Team','Venue'])
dataset = dataset.merge(team_venue_wins, left_on=['Team2','Team2_Venue'], right_on=['Team','Venue'])
dataset['venue_win_rate_diff'] = dataset['venue_win_rate_x'] - dataset['venue_win_rate_y']
dataset['venue_match_diff'] = dataset['venue_matches_x'] - dataset['venue_matches_y']
assert dataset.shape[0] == train_df.shape[0]
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2508 entries, 0 to 2507
Data columns (total 35 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Team1                2508 non-null   int64  
 1   Team2                2508 non-null   int64  
 2   Stadium              2508 non-null   int64  
 3   HostCountry          2508 non-null   int64  
 4   Team1_Venue          2508 non-null   object 
 5   Team2_Venue          2508 non-null   object 
 6   Team1_Innings        2508 non-null   object 
 7   Team2_Innings        2508 non-null   object 
 8   MonthOfMatch         2508 non-null   object 
 9   MatchWinner          2508 non-null   int64  
 10  Team1_win            2508 non-null   int64  
 11  Team2_win            2508 non-null   int64  
 12  group_id             2508 non-null   int64  
 13  Team_x               2508 non-null   int64  
 14  wins_x               2508 non-null   int64  
 15  matches_x            2508 non-null   i

In [110]:
1028 - (round(1 - 944/1075, 2) * 1028)

904.64

In [111]:
team1_stadium_wins = dataset.groupby(['Team1','Stadium']).agg({'Team1_win':['sum','count']}).reset_index()
team1_stadium_wins.columns = ['Team','Stadium','stadium_wins','stadium_matches']
team2_stadium_wins = dataset.groupby(['Team2','Stadium']).agg({'Team2_win':['sum','count']}).reset_index()
team2_stadium_wins.columns = ['Team','Stadium','stadium_wins','stadium_matches']
team_stadium_wins = pd.concat([team1_stadium_wins, team2_stadium_wins], axis=0)
team_stadium_wins = team_stadium_wins.groupby(['Team','Stadium']).agg({'stadium_wins':['sum'], 
                                                                       'stadium_matches':['sum']}).reset_index()
team_stadium_wins.columns = ['Team','Stadium','stadium_wins','stadium_matches']
team_stadium_wins['stadium_win_rate'] = team_stadium_wins['stadium_wins']/team_stadium_wins['stadium_matches']
### Augmentation
# team_stadium_wins = team_stadium_wins.sample(750)
### Augmentation
team_stadium_wins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028 entries, 0 to 1027
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Team              1028 non-null   int64  
 1   Stadium           1028 non-null   int64  
 2   stadium_wins      1028 non-null   int64  
 3   stadium_matches   1028 non-null   int64  
 4   stadium_win_rate  1028 non-null   float64
dtypes: float64(1), int64(4)
memory usage: 40.3 KB


In [112]:
'''dataset = dataset.merge(team_stadium_wins, how='left',left_on=['Team1','Stadium'], right_on=['Team','Stadium'])
dataset = dataset.merge(team_stadium_wins, how='left',left_on=['Team2','Stadium'], right_on=['Team','Stadium'])
dataset['stadium_win_rate_diff'] = dataset['stadium_win_rate_x'] - dataset['stadium_win_rate_y']
dataset['stadium_match_diff'] = dataset['stadium_matches_x'] - dataset['stadium_matches_y']
assert dataset.shape[0] == train_df.shape[0]
dataset.info()'''

"dataset = dataset.merge(team_stadium_wins, how='left',left_on=['Team1','Stadium'], right_on=['Team','Stadium'])\ndataset = dataset.merge(team_stadium_wins, how='left',left_on=['Team2','Stadium'], right_on=['Team','Stadium'])\ndataset['stadium_win_rate_diff'] = dataset['stadium_win_rate_x'] - dataset['stadium_win_rate_y']\ndataset['stadium_match_diff'] = dataset['stadium_matches_x'] - dataset['stadium_matches_y']\nassert dataset.shape[0] == train_df.shape[0]\ndataset.info()"

In [116]:
team1_innings_wins = dataset.groupby(['Team1','Team1_Innings']).agg({'Team1_win':['sum','count']}).reset_index()
team1_innings_wins.columns = ['Team','Innings','innings_wins','innings_matches']
team2_innings_wins = dataset.groupby(['Team2','Team2_Innings']).agg({'Team2_win':['sum','count']}).reset_index()
team2_innings_wins.columns = ['Team','Innings','innings_wins','innings_matches']
team_innings_wins = pd.concat([team1_innings_wins, team2_innings_wins], axis=0)
team_innings_wins = team_innings_wins.groupby(['Team','Innings']).agg({'innings_wins':['sum'], 
                                                                       'innings_matches':['sum']}).reset_index()
team_innings_wins.columns = ['Team','Innings','innings_wins','innings_matches']
team_innings_wins['innings_win_rate'] = team_innings_wins['innings_wins']/team_innings_wins['innings_matches']
team_innings_wins.head()

Unnamed: 0,Team,Innings,innings_wins,innings_matches,innings_win_rate
0,0,First,10,26,0.384615
1,0,Second,17,30,0.566667
2,1,First,207,322,0.642857
3,1,Second,172,268,0.641791
4,2,First,33,114,0.289474


In [117]:
dataset = dataset.merge(team_innings_wins, left_on=['Team1','Team1_Innings'], right_on=['Team','Innings'])
dataset = dataset.merge(team_innings_wins, left_on=['Team2','Team2_Innings'], right_on=['Team','Innings'])
dataset['innings_win_rate_diff'] = dataset['innings_win_rate_x'] - dataset['innings_win_rate_y']
dataset['innings_match_diff'] = dataset['innings_matches_x'] - dataset['innings_matches_y']
assert dataset.shape[0] == train_df.shape[0]
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2508 entries, 0 to 2507
Data columns (total 47 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Team1                  2508 non-null   int64  
 1   Team2                  2508 non-null   int64  
 2   Stadium                2508 non-null   int64  
 3   HostCountry            2508 non-null   int64  
 4   Team1_Venue            2508 non-null   object 
 5   Team2_Venue            2508 non-null   object 
 6   Team1_Innings          2508 non-null   object 
 7   Team2_Innings          2508 non-null   object 
 8   MonthOfMatch           2508 non-null   object 
 9   MatchWinner            2508 non-null   int64  
 10  Team1_win              2508 non-null   int64  
 11  Team2_win              2508 non-null   int64  
 12  group_id               2508 non-null   int64  
 13  Team_x                 2508 non-null   int64  
 14  wins_x                 2508 non-null   int64  
 15  matc

In [118]:
# dataset.head()
dataset.columns

Index(['Team1', 'Team2', 'Stadium', 'HostCountry', 'Team1_Venue',
       'Team2_Venue', 'Team1_Innings', 'Team2_Innings', 'MonthOfMatch',
       'MatchWinner', 'Team1_win', 'Team2_win', 'group_id', 'Team_x', 'wins_x',
       'matches_x', 'win_rate_x', 'Team_y', 'wins_y', 'matches_y',
       'win_rate_y', 'win_rate_diff', 'match_diff', 'Team_x', 'Venue_x',
       'venue_wins_x', 'venue_matches_x', 'venue_win_rate_x', 'Team_y',
       'Venue_y', 'venue_wins_y', 'venue_matches_y', 'venue_win_rate_y',
       'venue_win_rate_diff', 'venue_match_diff', 'Team_x', 'Innings_x',
       'innings_wins_x', 'innings_matches_x', 'innings_win_rate_x', 'Team_y',
       'Innings_y', 'innings_wins_y', 'innings_matches_y',
       'innings_win_rate_y', 'innings_win_rate_diff', 'innings_match_diff'],
      dtype='object')

In [119]:
final_cols = ['Stadium', 'HostCountry', 
              # 'Team1', 'Team2', 
              'Team1_Venue', 'Team2_Venue', 'Team1_Innings', 
              'Team2_Innings', 'MonthOfMatch',
              'Team1_win', 'group_id', 'wins_x',
              'matches_x', 'win_rate_x', 'wins_y', 'matches_y',
              'win_rate_y', 'win_rate_diff', 'match_diff', 'venue_wins_x', 
              'venue_matches_x', 'venue_win_rate_x', 
              'venue_wins_y', 'venue_matches_y', 'venue_win_rate_y',
              'venue_win_rate_diff', 'venue_match_diff', 
              #'stadium_wins_x',
              #'stadium_matches_x', 'stadium_win_rate_x', 'stadium_wins_y',
              #'stadium_matches_y', 'stadium_win_rate_y', 'stadium_win_rate_diff', 'stadium_match_diff'
              'innings_wins_x', 'innings_matches_x', 'innings_win_rate_x',
              'innings_wins_y', 'innings_matches_y','innings_win_rate_y', 
              'innings_win_rate_diff', 'innings_match_diff'
             ]

In [120]:
dataset[final_cols].to_csv('./odi_dataset.csv',index=False)

In [127]:
test_df = pd.read_csv('./Test.csv')
test_df['id'] = np.arange(test_df.shape[0])
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1075 entries, 0 to 1074
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Team1          1075 non-null   int64 
 1   Team2          1075 non-null   int64 
 2   Stadium        1075 non-null   int64 
 3   HostCountry    1075 non-null   int64 
 4   Team1_Venue    1075 non-null   object
 5   Team2_Venue    1075 non-null   object
 6   Team1_Innings  1075 non-null   object
 7   Team2_Innings  1075 non-null   object
 8   MonthOfMatch   1075 non-null   object
 9   id             1075 non-null   int64 
dtypes: int64(5), object(5)
memory usage: 84.1+ KB


In [128]:
test_df.head()

Unnamed: 0,Team1,Team2,Stadium,HostCountry,Team1_Venue,Team2_Venue,Team1_Innings,Team2_Innings,MonthOfMatch,id
0,2,4,34,1,Home,Away,First,Second,Oct,0
1,14,1,19,15,Home,Away,First,Second,Mar,1
2,9,10,130,14,Neutral,Neutral,Second,First,Dec,2
3,9,10,8,9,Home,Away,First,Second,Dec,3
4,5,15,130,14,Neutral,Neutral,First,Second,Oct,4


In [129]:
dataset = test_df.copy(deep=True)
dataset = dataset.merge(team_winrate, left_on=['Team1'], right_on=['Team'])
dataset = dataset.merge(team_winrate, left_on=['Team2'], right_on=['Team'])
dataset['win_rate_diff'] = dataset['win_rate_x'] - dataset['win_rate_y']
dataset['match_diff'] = dataset['matches_x'] - dataset['matches_y']
assert dataset.shape[0]==test_df.shape[0]
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1075 entries, 0 to 1074
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Team1          1075 non-null   int64  
 1   Team2          1075 non-null   int64  
 2   Stadium        1075 non-null   int64  
 3   HostCountry    1075 non-null   int64  
 4   Team1_Venue    1075 non-null   object 
 5   Team2_Venue    1075 non-null   object 
 6   Team1_Innings  1075 non-null   object 
 7   Team2_Innings  1075 non-null   object 
 8   MonthOfMatch   1075 non-null   object 
 9   id             1075 non-null   int64  
 10  Team_x         1075 non-null   int64  
 11  wins_x         1075 non-null   int64  
 12  matches_x      1075 non-null   int64  
 13  win_rate_x     1075 non-null   float64
 14  Team_y         1075 non-null   int64  
 15  wins_y         1075 non-null   int64  
 16  matches_y      1075 non-null   int64  
 17  win_rate_y     1075 non-null   float64
 18  win_rate

In [130]:
dataset = dataset.merge(team_venue_wins, left_on=['Team1','Team1_Venue'], right_on=['Team','Venue'])
dataset = dataset.merge(team_venue_wins, left_on=['Team2','Team2_Venue'], right_on=['Team','Venue'])
dataset['venue_win_rate_diff'] = dataset['venue_win_rate_x'] - dataset['venue_win_rate_y']
dataset['venue_match_diff'] = dataset['venue_matches_x'] - dataset['venue_matches_y']
assert dataset.shape[0] == test_df.shape[0]
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1075 entries, 0 to 1074
Data columns (total 32 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Team1                1075 non-null   int64  
 1   Team2                1075 non-null   int64  
 2   Stadium              1075 non-null   int64  
 3   HostCountry          1075 non-null   int64  
 4   Team1_Venue          1075 non-null   object 
 5   Team2_Venue          1075 non-null   object 
 6   Team1_Innings        1075 non-null   object 
 7   Team2_Innings        1075 non-null   object 
 8   MonthOfMatch         1075 non-null   object 
 9   id                   1075 non-null   int64  
 10  Team_x               1075 non-null   int64  
 11  wins_x               1075 non-null   int64  
 12  matches_x            1075 non-null   int64  
 13  win_rate_x           1075 non-null   float64
 14  Team_y               1075 non-null   int64  
 15  wins_y               1075 non-null   i

In [131]:
# dataset = dataset.merge(team_stadium_wins, how = 'left' ,left_on=['Team1','Stadium'], right_on=['Team','Stadium'])
# dataset = dataset.merge(team_stadium_wins, how = 'left' ,left_on=['Team2','Stadium'], right_on=['Team','Stadium'])
# dataset['stadium_win_rate_diff'] = dataset['stadium_win_rate_x'] - dataset['stadium_win_rate_y']
# dataset['stadium_match_diff'] = dataset['stadium_matches_x'] - dataset['stadium_matches_y']
# assert dataset.shape[0] == test_df.shape[0]
# dataset.info()

In [132]:
dataset = dataset.merge(team_innings_wins, left_on=['Team1','Team1_Innings'], right_on=['Team','Innings'])
dataset = dataset.merge(team_innings_wins, left_on=['Team2','Team2_Innings'], right_on=['Team','Innings'])
dataset['innings_win_rate_diff'] = dataset['innings_win_rate_x'] - dataset['innings_win_rate_y']
dataset['innings_match_diff'] = dataset['innings_matches_x'] - dataset['innings_matches_y']
assert dataset.shape[0] == test_df.shape[0]
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1075 entries, 0 to 1074
Data columns (total 44 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Team1                  1075 non-null   int64  
 1   Team2                  1075 non-null   int64  
 2   Stadium                1075 non-null   int64  
 3   HostCountry            1075 non-null   int64  
 4   Team1_Venue            1075 non-null   object 
 5   Team2_Venue            1075 non-null   object 
 6   Team1_Innings          1075 non-null   object 
 7   Team2_Innings          1075 non-null   object 
 8   MonthOfMatch           1075 non-null   object 
 9   id                     1075 non-null   int64  
 10  Team_x                 1075 non-null   int64  
 11  wins_x                 1075 non-null   int64  
 12  matches_x              1075 non-null   int64  
 13  win_rate_x             1075 non-null   float64
 14  Team_y                 1075 non-null   int64  
 15  wins

In [133]:
dataset.drop(columns=['Team_x','Team_y']).to_csv('./test_dataset.csv', index = False)

In [159]:
# preds = pd.read_csv('./preds_v1.csv')
# preds = pd.read_csv('./preds_blender.csv') 
# preds = pd.read_csv('./preds_feat_subset.csv')
# preds = pd.read_csv('./preds_with_teams.csv')
# preds = pd.read_csv('./preds_enet.csv')
# preds = pd.read_csv('./preds_enet_best_proj.csv')
# preds = pd.read_csv('./preds_more_feats.csv')
# preds = pd.read_csv('./preds_stadium.csv')
# preds = pd.read_csv('./preds_aug_v1.csv')
# preds = pd.read_csv('./preds_aug_2.csv')
# preds = pd.read_csv('./pred_augs_v3.csv') 
# preds = pd.read_csv('./preds_innings.csv') # best public LB Score
# preds = pd.read_csv('./preds_inn_blender_e.csv')
# preds = pd.read_csv('./preds_inn_avg.csv')
preds = pd.read_csv('./preds_wo_interactions.csv')
preds.sort_values(['row_id'],inplace=True)
preds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1075 entries, 0 to 1074
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   row_id          1075 non-null   int64  
 1   Prediction      1075 non-null   float64
 2   PredictedLabel  1075 non-null   int64  
dtypes: float64(1), int64(2)
memory usage: 33.6 KB


In [160]:
preds.head()

Unnamed: 0,row_id,Prediction,PredictedLabel
0,0,0.217219,0
1,1,0.210104,0
2,2,0.213676,0
3,3,0.200724,0
4,4,0.210104,0


In [161]:
submission = dataset[['Team1', 'Team2', 'id']]
#submission['Team1_prob'] = np.round(preds['Prediction'],2)
#submission['Team2_prob'] = np.round(1 - submission['Team1_prob'],2)
submission['Team1_prob'] = preds['Prediction']
# submission['Team1_prob'] = np.where(submission['Team1_prob']>=0.4323,1,0)
submission['Team2_prob'] = 1 - submission['Team1_prob']
submission['row_id'] = preds['row_id']
submission.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,Team1,Team2,id,Team1_prob,Team2_prob,row_id
0,2,4,0,0.217219,0.782781,0
1,2,4,129,0.210104,0.789896,1
2,2,4,205,0.213676,0.786324,2
3,2,4,284,0.200724,0.799276,3
4,2,4,635,0.210104,0.789896,4


In [162]:
pred_mat = []
for index, row in submission.iterrows():
    row_pred = [0 for i in range(17)]
    row_pred[int(row['Team1'])] = row['Team1_prob']
    row_pred[int(row['Team2'])] = row['Team2_prob']
    row_pred[16] = row['id']
    pred_mat.append(row_pred)

In [163]:
'''preds_na = pd.read_csv('./preds_blender.csv')
preds_na.sort_values(['row_id'],inplace=True)

submission_na = dataset[['Team1', 'Team2', 'id']]
submission_na['Team1_prob'] = preds_na['Prediction']
submission_na['Team2_prob'] = 1 - submission_na['Team1_prob']
submission_na['row_id'] = preds_na['row_id']

pred_mat = []
for index, row in dataset.iterrows():
    row_pred = [0 for i in range(17)]
    id = row['id']
    if(dataset[dataset.id==id]['stadium_match_diff'].isna().values[0]):
        row_pred[int(row['Team1'])] = submission_na[submission_na.id==id]['Team1_prob'].values[0]
        row_pred[int(row['Team2'])] = submission_na[submission_na.id==id]['Team2_prob'].values[0]
    else:
        row_pred[int(row['Team1'])] = submission[submission.id==id]['Team1_prob'].values[0]
        row_pred[int(row['Team2'])] = submission[submission.id==id]['Team2_prob'].values[0]
    row_pred[16] = row['id']
    pred_mat.append(row_pred)'''

"preds_na = pd.read_csv('./preds_blender.csv')\npreds_na.sort_values(['row_id'],inplace=True)\n\nsubmission_na = dataset[['Team1', 'Team2', 'id']]\nsubmission_na['Team1_prob'] = preds_na['Prediction']\nsubmission_na['Team2_prob'] = 1 - submission_na['Team1_prob']\nsubmission_na['row_id'] = preds_na['row_id']\n\npred_mat = []\nfor index, row in dataset.iterrows():\n    row_pred = [0 for i in range(17)]\n    id = row['id']\n    if(dataset[dataset.id==id]['stadium_match_diff'].isna().values[0]):\n        row_pred[int(row['Team1'])] = submission_na[submission_na.id==id]['Team1_prob'].values[0]\n        row_pred[int(row['Team2'])] = submission_na[submission_na.id==id]['Team2_prob'].values[0]\n    else:\n        row_pred[int(row['Team1'])] = submission[submission.id==id]['Team1_prob'].values[0]\n        row_pred[int(row['Team2'])] = submission[submission.id==id]['Team2_prob'].values[0]\n    row_pred[16] = row['id']\n    pred_mat.append(row_pred)"

In [164]:
submission_df = pd.DataFrame(pred_mat)
submission_df.sort_values([16], inplace=True)
submission_df = submission_df.iloc[:,0:16]
submission_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1075 entries, 0 to 298
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       1075 non-null   float64
 1   1       1075 non-null   float64
 2   2       1075 non-null   float64
 3   3       1075 non-null   float64
 4   4       1075 non-null   float64
 5   5       1075 non-null   float64
 6   6       1075 non-null   float64
 7   7       1075 non-null   float64
 8   8       1075 non-null   float64
 9   9       1075 non-null   float64
 10  10      1075 non-null   float64
 11  11      1075 non-null   float64
 12  12      1075 non-null   float64
 13  13      1075 non-null   float64
 14  14      1075 non-null   float64
 15  15      1075 non-null   float64
dtypes: float64(16)
memory usage: 142.8 KB


In [165]:
submission_df.to_excel('./submission.xlsx',index=False)

In [166]:
submission_df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,0.0,0.217219,0.0,0.782781,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
340,0.0,0.600062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.399938,0.0
730,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333427,0.666573,0.0,0.0,0.0,0.0,0.0
215,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.555653,0.444347,0.0,0.0,0.0,0.0,0.0
133,0.0,0.0,0.0,0.0,0.0,0.827531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.172469
714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.547005,0.0,0.0,0.0,0.452995,0.0
1031,0.0,0.0,0.353763,0.0,0.0,0.0,0.646237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
239,0.0,0.616407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.383593,0.0,0.0,0.0,0.0,0.0
164,0.0,0.0,0.0,0.0,0.0,0.592444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.407556,0.0,0.0
914,0.0,0.720263,0.0,0.0,0.0,0.279737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
