In [181]:
# Importing libraries
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [141]:
# Helper function
def to_datetime(df):
    '''
    Parse the 'match_dt' column of df (YYYY-MM-DD text) into pandas datetimes.

    Input-
    1. df: dataframe holding a 'match_dt' column.

    Output-
    The same dataframe object; its 'match_dt' column is overwritten with the parsed values.
    '''
    parsed_dates = pd.to_datetime(df['match_dt'], format='%Y-%m-%d')
    df['match_dt'] = parsed_dates
    return df

def rm_blankspace(df):
    '''
    Replace every blank space in the column labels of df with an underscore.

    Input-
    1. df: dataframe whose column labels are strings.

    Output-
    The same dataframe object; its columns are renamed in place.
    '''
    underscored = {label: label.replace(' ', '_') for label in df.columns}
    df.rename(columns=underscored, inplace=True)
    return df

def unwrap_rosters(df):
    '''
    Split the ':' separated roster strings of both teams into lists of player ids.

    Input-
    1. df: dataframe holding 'team1_roster_ids' and 'team2_roster_ids' columns.

    Output-
    The same dataframe object; both roster columns are overwritten.

    Only string values are split. Values that are not strings (for example lists left by an
    earlier run of this function, or missing values) are passed through unchanged, so the
    function can be applied again to an already processed frame without raising.
    '''
    def _split_roster(roster):
        return roster.split(':') if isinstance(roster, str) else roster

    for roster_col in ('team1_roster_ids', 'team2_roster_ids'):
        df[roster_col] = df[roster_col].apply(_split_roster)
    return df

def data_preprocessing(df):
    '''
    Run the common clean-up steps on a raw table.

    The 'match_dt' column is parsed first (to_datetime), then blank spaces in the
    column names are replaced with underscores (rm_blankspace).

    Input-
    1. df: raw dataframe holding a 'match_dt' column.

    Output-
    The dataframe returned by the two helpers.
    '''
    return rm_blankspace(to_datetime(df))

In [142]:
# Importing datasets
# The five CSV files are read in the order listed; each name on the left receives
# the frame read from the path in the same position.
train_data, batsman_data, bowler_data, match_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data//train_data.csv',
        'data//batsman_level_data.csv',
        'data//bowler_level_data.csv',
        'data//match_level_data.csv',
        'data//round_1_sub_data.csv',
    )
)

In [143]:
# Tables with roster columns: common clean-up, then split the roster strings into lists
train_data = unwrap_rosters(data_preprocessing(train_data))
match_data = unwrap_rosters(data_preprocessing(match_data))
test_data = unwrap_rosters(data_preprocessing(test_data))

# Player-level tables only go through the common clean-up
batsman_data = data_preprocessing(batsman_data)
bowler_data = data_preprocessing(bowler_data)

In [144]:
# Reordering columns (columns that are not listed are dropped from the frame)
match_column_order = [
    'match_id', 'match_dt', 'team1', 'team1_id', 'team2', 'team2_id',
    'toss_winner', 'toss_decision', 'ground_id', 'lighting', 'series_name',
    'winner', 'winner_id', 'by', 'win_amount', 'player_of_the_match_id',
    'inning1_runs', 'inning1_wickets', 'inning1_balls',
    'inning2_runs', 'inning2_wickets', 'inning2_balls',
    'team1_roster_ids', 'team2_roster_ids',
]
train_column_order = [
    'match_id', 'match_dt', 'team1', 'team1_id', 'team2', 'team2_id',
    'toss_winner', 'toss_decision', 'ground_id', 'lighting', 'series_name',
    'winner', 'winner_id', 'team1_roster_ids', 'team2_roster_ids',
    'team_count_50runs_last15', 'team_winp_last5', 'team1only_avg_runs_last15',
    'team1_winp_team2_last15', 'ground_avg_runs_last15',
]

match_data = match_data[match_column_order]
train_data = train_data[train_column_order]

In [145]:
# Display the first row of match_data as a quick look at the reordered columns
match_data.iloc[0]

match_id                                                            8638034
match_dt                                                2021-01-01 00:00:00
team1                                                                 Nn Ds
team1_id                                                              17982
team2                                                                    Wn
team2_id                                                              18570
toss_winner                                                              Wn
toss_decision                                                         field
ground_id                                                             17681
lighting                                                        night match
series_name                                                           Sr Sh
winner                                                                   Wn
winner_id                                                             18570
by          

In [146]:
# Cheatsheet features
def giveLastNgamesPlayer(player_id, date, n, bat_or_bowl):
    '''
    Function to fetch the n most recent stat rows of a player from games played before a date.

    Input-
    1. player_id: id of the player; it is converted with float() before being compared to the id column.
    2. date: cut-off date. Only rows with match_dt strictly before this date are considered.
    3. n: maximum number of rows to return.
    4. bat_or_bowl: 'bat' reads batsman_data (id column 'batsman_id'); any other value reads
       bowler_data (id column 'bowler_id').

    Output-
    Dataframe with at most n rows, sorted by match_dt with the latest game first.
    '''
    wants_batting = bat_or_bowl == 'bat'
    stats = batsman_data if wants_batting else bowler_data
    player_col = 'batsman_id' if wants_batting else 'bowler_id'

    played_earlier = stats['match_dt'] < date
    is_player = stats[player_col] == float(player_id)
    history = stats[played_earlier & is_player]
    return history.sort_values(by='match_dt', ascending=False).head(n)

def no50sLastn(player_list, date, n):
    '''
    Function to get total number of 50s scored by players in the roster of a team in last n games.

    Input-
    1. player_list: roster of a team, either a list of player ids (as produced by unwrap_rosters)
       or the raw ':' separated string of player ids.
    2. date: match date of the game to calculate this feature.
    3. n: Number of games to look-back and create this feature.

    Output-
    Sum over the roster of the number of innings with runs >= 50 among each player's last n
    batting rows before date.
    '''
    # A raw roster string would otherwise be iterated character by character.
    if isinstance(player_list, str):
        player_list = player_list.split(':')

    res_list = []
    for player in player_list:
        df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat')
        # Count directly instead of writing a helper column onto the filtered slice;
        # missing runs compare as False and therefore count as 0.
        res_list.append((df_rel['runs'] >= 50).sum())
    return np.nansum(res_list)

def avgRunsGround(ground_id, date, n):
    '''
    Function to calculate average runs per innings scored at a ground/venue.

    Input-
    1. ground_id: ID of the ground to calculate the feature for.
    2. date: match date of the current game; only games strictly before it are used.
    3. n: look-back window of games for the ground.

    Output-
    Mean over the last n games at the ground of (inning1_runs + inning2_runs) / 2.
    '''
    played_earlier = match_data['match_dt'] < date
    at_ground = match_data['ground_id'] == ground_id
    recent_games = match_data[played_earlier & at_ground]\
                        .sort_values(by='match_dt', ascending=False).head(n)

    runs_per_innings = (recent_games['inning1_runs'] + recent_games['inning2_runs']) / 2
    return runs_per_innings.mean()

def teamAvgRunsLastn(team_id, date, n):
    '''
    Function to calculate a team's average runs in their last n games.

    Input-
    1. team_id: ID of the team to calculate average runs.
    2. date: match date of the current game for which the feature is calculated.
    3. n: look-back window of games for the team.

    Output-
    Mean of the runs scored by team_id in its last n games before date
    (NaN when the team has no earlier game in match_data).
    '''
    played = (match_data['team1_id'] == team_id) | (match_data['team2_id'] == team_id)
    df_rel = match_data[(match_data['match_dt'] < date) & played]\
                .sort_values(by='match_dt', ascending=False).head(n)

    # Column names use underscores because rm_blankspace has already been applied to match_data.
    # team1 bats first when it won the toss and chose to bat, or when team2 won the toss and
    # chose to field.
    team1_bats_first = ((df_rel['team1'] == df_rel['toss_winner']) & (df_rel['toss_decision'] == 'bat')) | \
                       ((df_rel['team2'] == df_rel['toss_winner']) & (df_rel['toss_decision'] == 'field'))

    # team_id may be listed as team1 or as team2 in a past game; it batted first exactly when
    # "team1 batted first" agrees with "team_id is team1".
    team_is_team1 = df_rel['team1_id'] == team_id
    team_bats_first = team1_bats_first == team_is_team1

    # Take the first innings total where the team batted first, the second innings total otherwise.
    team_runs = df_rel['inning1_runs'].where(team_bats_first, df_rel['inning2_runs'])
    return team_runs.mean()


In [147]:
# Ratio agnostic features
def tossWinnerWins(team1_id, team2_id, date):
    '''
    Function to compute the fraction of past games between two teams that the toss winner won.

    Input-
    1. team1_id: ID of team1
    2. team2_id: ID of team2
    3: date: match date of the current game; only games strictly before it are used.

    Output-
    Fraction rounded to 2 decimals, or 0 when the two teams have no earlier game.

    Side effect-
    The 0/1 column 'DtossWinnerWins' is (re)written on the global match_data on every call.
    '''
    match_data['DtossWinnerWins'] = np.where(match_data['toss_winner'] == match_data['winner'], 1, 0)

    played_earlier = match_data['match_dt'] < date
    same_order = (match_data['team1_id'] == team1_id) & (match_data['team2_id'] == team2_id)
    swapped_order = (match_data['team1_id'] == team2_id) & (match_data['team2_id'] == team1_id)
    past_meetings = match_data[played_earlier & (same_order | swapped_order)]\
                        .sort_values(by='match_dt', ascending=False)

    flags = past_meetings['DtossWinnerWins'].values
    if len(flags) == 0:
        return 0
    return round(np.mean(flags), 2)

def teamBatsFirstWins(team1_id, team2_id, date):
    '''
    Function to compute the fraction of past games between two teams won by the team that batted first.

    Input-
    1. team1_id: ID of team1
    2. team2_id: ID of team2
    3: date: match date of the current game; only games strictly before it are used.

    Output-
    Fraction rounded to 2 decimals, or 0 when the two teams have no earlier game.

    Side effect-
    The 0/1 columns 'toss_decision_bats' and 'DteamBatsFirstWins' are (re)written on the
    global match_data on every call.
    '''
    # Derived here rather than read from a 'DtossWinnerWins' column, so this function does not
    # depend on another feature function having been called first.
    toss_winner_won = match_data['toss_winner'] == match_data['winner']

    match_data['toss_decision_bats'] = np.where(match_data['toss_decision'] == 'bat', 1, 0)
    chose_to_bat = match_data['toss_decision_bats'] == 1

    # The side batting first won when the toss winner chose to bat and won,
    # or chose not to bat and lost.
    match_data['DteamBatsFirstWins'] = np.where(toss_winner_won == chose_to_bat, 1, 0)

    df_rel = match_data[(match_data['match_dt']<date)&\
                        (((match_data['team1_id']==team1_id)&(match_data['team2_id']==team2_id))|\
                         ((match_data['team1_id']==team2_id)&(match_data['team2_id']==team1_id)))]\
                        .sort_values(by='match_dt', ascending=False)

    res = df_rel['DteamBatsFirstWins'].values
    if len(res) == 0:
        return 0
    return round(np.mean(res), 2)

def teamBatsFirstWinsAtGround(team1_id, team2_id, date, ground):
    '''
    Function to compute the fraction of past games between two teams at a given ground
    that were won by the team that batted first.

    Input-
    1. team1_id: ID of team1
    2. team2_id: ID of team2
    3: date: match date of the current game; only games strictly before it are used.
    4. ground: Ground ID of the current game.

    Output-
    Fraction rounded to 2 decimals, or 0 when the two teams have no earlier game at the ground.
    '''

    df_rel = match_data[(match_data['match_dt']<date)&\
                        (match_data['ground_id'] == ground)&\
                        (((match_data['team1_id']==team1_id)&(match_data['team2_id']==team2_id))|\
                         ((match_data['team1_id']==team2_id)&(match_data['team2_id']==team1_id)))]

    if df_rel.shape[0] == 0:
        return 0

    # Computed on the filtered rows instead of reading a 'DteamBatsFirstWins' column, so this
    # function does not depend on another feature function having been called first.
    # The side batting first won when the toss winner chose to bat and won,
    # or chose not to bat and lost.
    toss_winner_won = df_rel['toss_winner'] == df_rel['winner']
    chose_to_bat = df_rel['toss_decision'] == 'bat'
    bats_first_won = (toss_winner_won == chose_to_bat).values
    return round(np.mean(bats_first_won), 2)


In [148]:
# Ratio agnostic features

# Feature column -> per-row computation. Features are processed in the listed order and each
# one is attached to train_data first, then to test_data.
ratio_agnostic_features = {
    'tossWinnerWins':
        lambda row: tossWinnerWins(row['team1_id'], row['team2_id'], row['match_dt']),
    'teamBatsFirstWins':
        lambda row: teamBatsFirstWins(row['team1_id'], row['team2_id'], row['match_dt']),
    'teamBatsFirstWinsAtGround':
        lambda row: teamBatsFirstWinsAtGround(row['team1_id'], row['team2_id'], row['match_dt'], row['ground_id']),
}

for feature_name, row_feature in ratio_agnostic_features.items():
    for frame in (train_data, test_data):
        frame[feature_name] = frame.apply(row_feature, axis=1)


In [149]:
# Team1 to Team2 ratio features
def _team1WinpFiltered(team1_id, team2_id, date, column, value):
    '''
    Shared implementation of the team1-vs-team2 win% features.

    Considers games in match_data played strictly before date between the two teams (in either
    team1/team2 order) whose match_data[column] equals value, and returns the share of those
    games won by team1_id, rounded to 2 decimals. Returns 0 when team1_id has no win among them
    (which includes the case of no such games).
    '''
    same_order = (match_data['team1_id'] == team1_id) & (match_data['team2_id'] == team2_id)
    swapped_order = (match_data['team1_id'] == team2_id) & (match_data['team2_id'] == team1_id)
    df_rel = match_data[(match_data['match_dt'] < date) &
                        (match_data[column] == value) &
                        (same_order | swapped_order)]

    win_count = int((df_rel['winner_id'] == team1_id).sum())
    if win_count == 0:
        return 0
    return round(win_count/df_rel.shape[0], 2)

def team1WinpAtGround(team1_id, team2_id, date, ground):
    '''
    Function to compute team1's win% against team2 in earlier games at the given ground.

    Input-
    1. team1_id: ID of team1 to calculate win% of.
    2. team2_id: ID of team2 to calculate win% against.
    3: date: match date of the current game for which the feature is to be calculated.
    4. ground: Ground ID of the current game.
    '''
    return _team1WinpFiltered(team1_id, team2_id, date, 'ground_id', ground)

def team1WinpLight(team1_id, team2_id, date, light):
    '''
    Function to compute team1's win% against team2 in earlier games with the given lighting.

    Input-
    1. team1_id: ID of team1 to calculate win% of.
    2. team2_id: ID of team2 to calculate win% against.
    3: date: match date of the current game for which the feature is to be calculated.
    4. light: Lighting condition of the current game.
    '''
    return _team1WinpFiltered(team1_id, team2_id, date, 'lighting', light)

def team1WinpSeries(team1_id, team2_id, date, series):
    '''
    Function to compute team1's win% against team2 in earlier games of the given series.

    Input-
    1. team1_id: ID of team1 to calculate win% of.
    2. team2_id: ID of team2 to calculate win% against.
    3: date: match date of the current game for which the feature is to be calculated.
    4. series: Series name of the current game.
    '''
    return _team1WinpFiltered(team1_id, team2_id, date, 'series_name', series)


In [150]:
# Team1 to Team2 ratio features

# (feature column, feature function, column of the current game passed as 4th argument).
# Features are processed in the listed order; each is attached to train_data, then test_data.
team1_vs_team2_features = (
    ('team1WinpAtGround', team1WinpAtGround, 'ground_id'),
    ('team1WinpLight', team1WinpLight, 'lighting'),
    ('team1WinpSeries', team1WinpSeries, 'series_name'),
)

for feature_name, feature_fn, context_col in team1_vs_team2_features:
    for frame in (train_data, test_data):
        frame[feature_name] = frame.apply(
            lambda row: feature_fn(row['team1_id'], row['team2_id'], row['match_dt'], row[context_col]),
            axis=1,
        )

In [152]:
# Feature columns used as model inputs, in the order they are passed to the classifiers
cols = [
    'team_count_50runs_last15',
    'team_winp_last5',
    'team1only_avg_runs_last15',
    'team1_winp_team2_last15',
    'ground_avg_runs_last15',
    'tossWinnerWins',
    'teamBatsFirstWins',
    'teamBatsFirstWinsAtGround',
    'team1WinpAtGround',
    'team1WinpLight',
    'team1WinpSeries',
]

In [151]:
# Binary target: 1 when team1 is the recorded winner, 0 otherwise.
# Vectorised comparison instead of a row-wise apply.
train_data['winner_01'] = (train_data['team1'] == train_data['winner']).astype(int)

# Replace missing values with 0 in both frames. Written as two statements so the cell
# does not display a (None, None) tuple.
train_data.fillna(0, inplace=True)
test_data.fillna(0, inplace=True)

In [180]:
# Testing the model
# Hold-out by date: after sorting by match_dt the first 670 rows are used for fitting
# and the remaining rows for evaluation.
train_data_test = train_data.sort_values(by='match_dt')
n_fit_rows = 670
features_sorted = train_data_test[cols]
target_sorted = train_data_test['winner_01']
X_train, X_test = features_sorted.iloc[:n_fit_rows], features_sorted.iloc[n_fit_rows:]
y_train, y_test = target_sorted.iloc[:n_fit_rows], target_sorted.iloc[n_fit_rows:]

# Only LightGBM is searched in this run; the other candidates are kept for reference.
models = {
    # 'xgb': xgb.XGBClassifier(random_state=0),
    'lgbm': lgb.LGBMClassifier(random_state=0, verbose=-1),
    # 'cat': cat.CatBoostClassifier(random_state=0, verbose=0),
    # 'gb' : GradientBoostingClassifier(random_state=0)
}

# Hyper-parameter grid per model name (keys must match `models`).
grid = {
    # 'xgb': {'n_estimators':[50,100,150,200,400,500],
    #         'learning_rate':[0.05, 0.1, 0.2, 0.5, 1,1.5,2],
    #         'max_depth':[3,4,5,6,7,8,9,10]},
    'lgbm': {
        'n_estimators': [50, 100, 150, 200, 400, 500],
        'learning_rate': [0.05, 0.1, 0.2, 0.5, 1, 1.5, 2],
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    },
    # 'cat': {'iterations': [50,100],
    #         'learning_rate':[0.05, 0.1, 0.2, 0.5, 1,1.5,2],
    #         'depth':[3,4,5,6,7,8,9,10]},
    # 'gb' : {'n_estimators':[50,100,150,200,400,500],
    #         'learning_rate':[0.05, 0.1, 0.2, 0.5, 1,1.5,2],
    #         'max_depth':[3,4,5,6,7,8,9,10]}
}

for model_name, model in models.items():
    # 5-fold grid search on the fitting rows, then accuracy on the held-out rows
    gs = GridSearchCV(model, grid[model_name], cv=5, n_jobs=-1)
    gs.fit(X_train, y_train)
    print(f"Best parameters for {model_name}: {gs.best_params_}")

    y_pred = gs.predict(X_test)
    acc = np.mean(y_pred == y_test)
    print(f'Accuracy for {model_name}: {acc}')

Best parameters for lgbm: {'learning_rate': 0.05, 'max_depth': 8, 'n_estimators': 400}
Accuracy for lgbm: 0.49640287769784175


In [187]:
# Testing the model
# Same hold-out by date as the grid-search cell: first 670 rows (sorted by match_dt) for
# fitting, the remaining rows for evaluation.
train_data_test = train_data.copy()
train_data_test.sort_values(by='match_dt', inplace=True)
X_train, y_train, X_test, y_test = train_data_test[cols][:670], train_data_test['winner_01'][:670], train_data_test[cols][670:], train_data_test['winner_01'][670:]

# random_state=0 is set on every estimator (as in the grid-search cell) so that the
# reported ensemble accuracy can be reproduced on a re-run.
clf_xgb = xgb.XGBClassifier(learning_rate=0.1, max_depth=4, n_estimators=100, random_state=0)
clf_cat = cat.CatBoostClassifier(depth=5, iterations=50, learning_rate=0.05, verbose=0, random_state=0)
clf_gb = GradientBoostingClassifier(learning_rate=0.05, max_depth=10, n_estimators=400, random_state=0)
clf_lgbm = lgb.LGBMClassifier(learning_rate=0.05, max_depth=8, n_estimators=400, verbose=-1, random_state=0)

models = {
    'xgb': clf_xgb,
    'lgbm': clf_lgbm,
    'cat': clf_cat,
    'gb' : clf_gb
}

# (name, estimator) pairs for VotingClassifier. Despite the variable name, the estimators
# are not fitted yet at this point; they are fitted by ensemble.fit below.
trained_models = list(models.items())

# Majority ('hard') vote over the four classifiers
ensemble = VotingClassifier(estimators=trained_models, voting='hard', verbose=False, n_jobs=-1)
ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)

acc = np.mean(y_pred == y_test)
print(f'Accuracy for ensemble: {acc}')

Accuracy for ensemble: 0.5251798561151079


In [154]:
# Final fit inputs: all rows of train_data for fitting, test_data features for prediction.
# These names replace the hold-out splits created by the model-testing cells.
X_train = train_data[cols]
y_train = train_data['winner_01']
X_test = test_data[cols]

In [155]:
# Final model with default hyper-parameters. random_state is fixed so the predictions and
# feature importances exported below can be reproduced on a re-run.
clf_gbm = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)

In [159]:
# Train accuracy (computed on the same rows the model was fitted on)
train_predictions = clf_gbm.predict(X_train)
print(classification_report(y_train, train_predictions, labels=[0, 1]))

              precision    recall  f1-score   support

           0       0.83      0.83      0.83       477
           1       0.83      0.83      0.83       471

    accuracy                           0.83       948
   macro avg       0.83      0.83      0.83       948
weighted avg       0.83      0.83      0.83       948



In [86]:
# Importance of each input feature in the fitted GBM, largest first, top 10 only
df_feat_importance = pd.DataFrame({
    'feat_name': X_train.columns.tolist(),
    'model_feat_imp_train': clf_gbm.feature_importances_,
})
df_feat_importance = df_feat_importance.sort_values(by='model_feat_imp_train', ascending=False)
df_feat_importance = df_feat_importance.reset_index(drop=True).head(10)
df_feat_importance

Unnamed: 0,feat_name,model_feat_imp_train
0,ground_avg_runs_last15,0.267694
1,team_count_50runs_last15,0.216294
2,team1only_avg_runs_last15,0.14785
3,team_winp_last5,0.138607
4,tossWinnerWins,0.062906
5,teamBatsFirstWins,0.056573
6,team1WinpSeries,0.037943
7,team1_winp_team2_last15,0.030083
8,team1WinpAtGround,0.020357
9,team1WinpLight,0.013531


In [87]:
# Attach the model output to both frames: predicted class, score of the predicted class,
# and the id of the team predicted to win.
for frame, features in ((train_data, X_train), (test_data, X_test)):
    frame['y_pred_01'] = clf_gbm.predict(features)

    # Probability of class 1 (team1 wins); flipped to 1 - p when class 0 is predicted,
    # so the stored score always refers to the predicted class.
    team1_win_prob = clf_gbm.predict_proba(features)[:, 1]
    frame['win_pred_score'] = np.where(frame['y_pred_01'] == 0, 1 - team1_win_prob, team1_win_prob)

    frame['win_pred_team_id'] = np.where(frame['y_pred_01'] == 1, frame['team1_id'], frame['team2_id'])

In [104]:
## refactor
# Bookkeeping columns describing the dataset split and the fitted model, added to both frames
for frame, dataset_label in ((train_data, 'train'), (test_data, 'r1')):
    frame['dataset_type'] = dataset_label
    frame['train_algorithm'] = 'GBM'
    frame['is_ensemble'] = 'no'
    frame['train_hps_trees'] = clf_gbm.n_estimators_
    frame['train_hps_depth'] = clf_gbm.max_depth
    frame['train_hps_lr'] = clf_gbm.learning_rate
    frame['match id'] = frame['match_id']

# Output columns: fixed bookkeeping columns followed by the top features by importance
top_features = list(df_feat_importance['feat_name'].head(10))
submission_cols = ['match id', 'dataset_type', 'win_pred_team_id', 'win_pred_score',
                   'train_algorithm', 'is_ensemble', 'train_hps_trees', 'train_hps_depth',
                   'train_hps_lr'] + top_features

# test rows first, then train rows
df_file1 = pd.concat([test_data[submission_cols], train_data[submission_cols]])

# Feature columns are renamed to indep_feat_id1, indep_feat_id2, ... in importance order
renaming_dict = {col: f'indep_feat_id{i+1}' for i, col in enumerate(top_features)}
df_file1.rename(columns=renaming_dict, inplace=True)

# Pad with empty columns so that indep_feat_id1..indep_feat_id10 always exist
for i in range(1, 11):
    slot_name = f'indep_feat_id{i}'
    if slot_name not in df_file1.columns:
        df_file1[slot_name] = np.nan

In [106]:
# (rows, columns) of df_file1, the frame later written to primary_submission.csv
df_file1.shape

(1219, 19)

In [110]:
# Human-readable description of each model feature, keyed by feature column name.
# These strings are mapped onto df_feat_importance['feat_name'] for the secondary file.
# The team1Winp* entries describe what the feature functions compute: team1's wins divided
# by the number of earlier team1-vs-team2 games under the given condition.
feature_desc = {'ground_avg_runs_last15':'average runs scored in the ground in last 15 games',
                'team_count_50runs_last15':'Ratio of number of 50s by players in team1 to number of 50s by players in team2 in last 15 games',
                'team1only_avg_runs_last15':'team1\'s avg inning runs in last 15 games',
                'team_winp_last5':'Ratio of team1\'s win % to team2\'s win % in last 5 games',
                'tossWinnerWins': 'Ratio of toss winner winning past matches',
                'teamBatsFirstWins': 'Ratio of team batting first winning past matches',
                'teamBatsFirstWinsAtGround': 'Ratio of team batting first winning past matches at the given ground',
                'team1WinpSeries': 'Team1\'s win percentage against Team2 in past matches of the given series',
                'team1_winp_team2_last15':'Team1\'s win percentage against Team2 in last 15 games',
                'team1WinpAtGround': 'Team1\'s win percentage against Team2 in past matches at the given ground',
                'team1WinpLight': 'Team1\'s win percentage against Team2 in past matches with the given lighting'}

In [116]:
df_feat_importance['feat_description'] = df_feat_importance['feat_name'].map(feature_desc)

# 'feat_id' and 'feat_rank_train' are selected below but are not created by any cell visible
# in this notebook, so a fresh Restart & Run All would fail with a KeyError. They are created
# here only when missing.
# NOTE(review): values are the 1-based row position, assuming df_feat_importance is still
# sorted by descending importance and that feat_id is meant to match the indep_feat_id<N>
# numbering used in the primary file - confirm against the submission spec.
row_positions = list(range(1, len(df_feat_importance) + 1))
if 'feat_id' not in df_feat_importance.columns:
    df_feat_importance['feat_id'] = row_positions
if 'feat_rank_train' not in df_feat_importance.columns:
    df_feat_importance['feat_rank_train'] = row_positions

df_file2 = df_feat_importance[['feat_id', 'feat_name', 'feat_description', 'model_feat_imp_train','feat_rank_train']]

In [118]:
# (rows, columns) of df_file2, the frame later written to secondary_submission.csv
df_file2.shape

(10, 5)

Index(['match_id', 'match_dt', 'team1', 'team1_id', 'team2', 'team2_id',
       'toss_winner', 'toss_decision', 'ground_id', 'lighting', 'series_name',
       'winner', 'winner_id', 'team1_roster_ids', 'team2_roster_ids',
       'team_count_50runs_last15', 'team_winp_last5',
       'team1only_avg_runs_last15', 'team1_winp_team2_last15',
       'ground_avg_runs_last15', 'tossWinnerWins', 'teamBatsFirstWins',
       'teamBatsFirstWinsAtGround', 'team1WinpAtGround', 'team1WinpLight',
       'team1WinpSeries', 'winner_01', 'y_pred_01', 'win_pred_score',
       'win_pred_team_id', 'dataset_type', 'train_algorithm', 'is_ensemble',
       'train_hps_trees', 'train_hps_depth', 'train_hps_lr', 'match id'],
      dtype='object')

In [137]:
# Recorded winner of every training match, keyed by match id and dataset label
dep_var_columns = ['match id', 'dataset_type', 'winner_id']
df_file3 = train_data[dep_var_columns]

In [138]:
# Display df_file3 (the frame later written to dep_var.csv)
df_file3

Unnamed: 0,match id,dataset_type,winner_id
0,9331181,train,12634
1,8797060,train,20
2,9433269,train,10576
3,9587073,train,36084
4,9516457,train,48341
...,...,...,...
943,9128601,train,30407
944,9433241,train,9701
945,9097227,train,23869
946,9516695,train,36014


In [121]:
# Write the primary (per-match predictions) and secondary (feature importance) files.
# index=False on both, consistent with the other exports, so the default integer row index
# is not written as an extra unnamed first column.
df_file1.to_csv('subtest/1/primary_submission.csv', index=False)
df_file2.to_csv('subtest/1/secondary_submission.csv', index=False)


In [139]:
# Write the dependent-variable frame to CSV without the row index
df_file3.to_csv('subtest/1/dep_var.csv', index=False)