- code refactoring to select features by rfecv

In [1]:
import scipy
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn import preprocessing
from sklearn.metrics import log_loss, mutual_info_score
from sklearn.decomposition import PCA
from tqdm._tqdm_notebook import tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
tqdm_notebook.pandas(desc="progress")
pd.set_option("max_columns", 1000)

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  
  from pandas import Panel


In [2]:
DIR = "/kaggle/input/lish-moa/"
train_feat = pd.read_csv(DIR+"train_features.csv")
test_feat = pd.read_csv(DIR+"test_features.csv")
#train_nonscore = pd.read_csv(DIR+"train_targets_nonscored.csv")
train_score = pd.read_csv(DIR+"train_targets_scored.csv")
sub = pd.read_csv(DIR+"sample_submission.csv")

In [3]:
# features after rfecv
df = pd.read_csv("/kaggle/input/moagcvariables/feats.csv", header=None, sep='\n')
df = df[0].str.split(',', expand=True)
df[0] = df[0].astype(int)
df = df.sort_values(0, ascending=True).reset_index(drop=True)
decreased_vars = df[0].values
df = df.set_index(0)

In [4]:
target_feats = [ i for i in train_score.columns if i != "sig_id"]
g_feats = [i for i in train_feat.columns if "g-" in i]
c_feats = [i for i in train_feat.columns if "c-" in i]

In [5]:
noncons_train_index = train_feat[train_feat.cp_type=="ctl_vehicle"].index
cons_train_index = train_feat[train_feat.cp_type!="ctl_vehicle"].index
noncons_test_index = test_feat[test_feat.cp_type=="ctl_vehicle"].index
cons_test_index = test_feat[test_feat.cp_type!="ctl_vehicle"].index

# preprocess

In [6]:
# normalization by ctl group
train_ctl = train_feat[train_feat.index.isin(noncons_train_index)].copy().reset_index(drop=True)
test_ctl = test_feat[test_feat.index.isin(noncons_test_index)].copy().reset_index(drop=True)
ctl_df = pd.concat([train_ctl, test_ctl])

ctl_group_data = ctl_df.groupby(["cp_dose", "cp_time"]).agg({"mean"}).reset_index()
mean_g_feats = ["mean-" + i for i in g_feats]
mean_c_feats = ["mean-" + i for i in c_feats]
columns = ["cp_dose", "cp_time"] + mean_g_feats + mean_c_feats
ctl_group_data.columns = columns

train_cons = train_feat[train_feat.index.isin(cons_train_index)].copy().reset_index(drop=True)
test_cons = test_feat[test_feat.index.isin(cons_test_index)].copy().reset_index(drop=True)
n_train_score = train_score[train_score.index.isin(cons_train_index)].copy().reset_index(drop=True)
#n_train_nonscore = train_nonscore[train_nonscore.index.isin(cons_train_index)].copy().reset_index(drop=True)

train_cons = pd.merge(train_cons, ctl_group_data, on=["cp_time", "cp_dose"], how="left")
test_cons = pd.merge(test_cons, ctl_group_data, on=["cp_time", "cp_dose"], how="left")

for i in range(len(g_feats)):
    train_cons["diff-g-"+str(i)] = train_cons["g-"+str(i)] - train_cons["mean-g-"+str(i)]
    test_cons["diff-g-"+str(i)] = test_cons["g-"+str(i)] - test_cons["mean-g-"+str(i)]
    
for i in range(len(c_feats)):
    train_cons["diff-c-"+str(i)] = train_cons["c-"+str(i)] - train_cons["mean-c-"+str(i)]
    test_cons["diff-c-"+str(i)] = test_cons["c-"+str(i)] - test_cons["mean-c-"+str(i)]

In [7]:
categoricals = ["cp_dose"]

def encoding(tr, te):
    for f in categoricals:
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(tr[f]))
        tr[f] = lbl.transform(list(tr[f]))
        te[f] = lbl.transform(list(te[f])) 
        
    return tr, te

n_train_feat, n_test_feat = encoding(train_cons, test_cons)

# feature engineering

In [8]:
def fe(df, remove_features):
    df.drop(remove_features, axis=1, inplace=True)
    return df

remove_features = ["cp_type"] + mean_g_feats + mean_c_feats + g_feats + c_feats 
for i in [i for i in n_train_feat.columns if i != "sig_id"]:
    if i not in remove_features and (n_train_feat[i].std() == 0):
        remove_features.append(i)
        
n_train_feat = fe(n_train_feat, remove_features)
n_test_feat = fe(n_test_feat, remove_features)
    
print(n_train_feat.shape, n_test_feat.shape)

(21948, 875) (3624, 875)


# Feature selection

In [9]:
from sklearn.feature_selection import RFECV
params = {'objective': 'binary', 'boosting_type': 'gbdt', 'tree_learner': 'serial', 
          'learning_rate': 0.01, "num_leaves": 10, 'random_seed':44, 'max_depth': 5}

lgbm_params = {'objective': 'binary', 'boosting_type': 'gbdt', 'tree_learner': 'serial', 'learning_rate': 0.01, 
               "num_leaves": 10, 'random_seed':44, 'max_depth': 5}

#diff_feats = ["acetylcholine_receptor_agonist","acetylcholine_receptor_antagonist","adrenergic_receptor_agonist","adrenergic_receptor_antagonist",
# "bacterial_cell_wall_synthesis_inhibitor","calcium_channel_blocker","cyclooxygenase_inhibitor","dna_inhibitor","dopamine_receptor_antagonist",
# "estrogen_receptor_agonist","glutamate_receptor_antagonist","histamine_receptor_antagonist","phosphodiesterase_inhibitor",
# "serotonin_receptor_agonist","serotonin_receptor_antagonist","sodium_channel_inhibitor","tubulin_inhibitor"]

diff_var = ["histamine_receptor_antagonist","phosphodiesterase_inhibitor",
 "serotonin_receptor_agonist","serotonin_receptor_antagonist","sodium_channel_inhibitor","tubulin_inhibitor"]

def check(new_train, target_train, target, selected_features):
    
    X_train = new_train.drop(['sig_id'],axis=1).copy()
    y_train = target_train[target].copy()        
        
    remove_features = []
    for i in X_train.columns: 
        if i not in selected_features:
            remove_features.append(i)
    X_train.drop(remove_features, axis=1, inplace=True)
        
    n_folds=4
    if target not in ["erbb2_inhibitor", "atp-sensitive_potassium_channel_antagonist"]:
        skf=StratifiedKFold(n_splits = n_folds, shuffle=True, random_state=0)
    else:
        skf=KFold(n_splits = n_folds, shuffle=True, random_state=0)

    valid = np.zeros([X_train.shape[0]])
    for i , (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        print("Fold "+str(i+1))
        X_train2 = X_train.iloc[train_index,:]
        y_train2 = y_train.iloc[train_index]

        X_valid2 = X_train.iloc[test_index,:]
        y_valid2 = y_train.iloc[test_index]
        
        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_valid2, y_valid2, reference=lgb_train)
        
        clf = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval], 
               num_boost_round=10000,early_stopping_rounds=50,verbose_eval = 1000) 

        valid_predict = clf.predict(X_valid2, num_iteration = clf.best_iteration)
        valid[test_index] = valid_predict
    
    score = log_loss(y_train, valid)
    
    return score

#for target in diff_var:
#    feature_selector = RFECV(lgb.LGBMClassifier(**params),
#                         step=10, min_features_to_select=200, scoring='neg_log_loss',
#                         cv=4, verbose=1, n_jobs=-1)

    #X_train = n_train_feat.drop(['sig_id'],axis=1).copy()
    #y_train = n_train_score[target].copy()

    #feature_selector.fit(X_train, y_train)
    #print('Features selected:', feature_selector.n_features_)
    #selected_features = [f for f in X_train.columns[feature_selector.ranking_ == 1]]

    #print(target, selected_features)

#    score = check(n_train_feat, n_train_score, target, selected_features)
#    print(target, score)

# parameter tuning

In [10]:
#import optuna.integration.lightgbm as lgb
#import json

def modelling_optuna(new_train, target_train, target):
    X_train = new_train.drop(['sig_id'],axis=1).copy()
    y_train = target_train[target].copy()
    
    n_folds=4
    skf=StratifiedKFold(n_splits = n_folds, shuffle=True, random_state=0)

    valid = np.zeros([X_train.shape[0]])
    best_params_list = []
    for i , (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        print("Fold "+str(i+1))
        X_train2 = X_train.iloc[train_index,:]
        y_train2 = y_train.iloc[train_index]
        
        X_valid2 = X_train.iloc[test_index,:]
        y_valid2 = y_train.iloc[test_index]
        
        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_valid2, y_valid2, reference=lgb_train)

        best_params, tuning_history = dict(), list()
        lgbm_params = {'objective': 'binary', 'boosting_type': 'gbdt', 'tree_learner': 'serial'}
        
        clf = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
           num_boost_round=10000,early_stopping_rounds=100,verbose_eval = 1000, 
                    best_params=best_params, tuning_history=tuning_history) 
        
        valid_predict = clf.predict(X_valid2, num_iteration = clf.best_iteration)
        valid[test_index] = valid_predict
            
        #pd.DataFrame(tuning_history).to_csv('./tuning_history.csv')
        best_params_list.append(best_params)
        
    #for j in range(n_folds):
    #    print('Fold: ' + str(j+1) + ' Best parameters: ' + json.dumps(best_params_list[j], indent=4))

    #print('Best parameters: ' + json.dumps(best_params, indent=4))

    score = log_loss(y_train, valid)
    print("score = {}".format(score))
    return best_params_list

#best_params_list = modelling_optuna(new_train,n_train_score, target)
#best_params_list

# modelling

In [11]:
lgbm_params = {'objective': 'binary', 'boosting_type': 'gbdt', 'tree_learner': 'serial', 'learning_rate': 0.01, 
               "num_leaves": 10, 'random_seed':44, 'max_depth': 5}

def modelling_lgb(new_train, target_train, new_test, target, ind):
    
    X_train = new_train.drop(['sig_id'],axis=1).copy()
    y_train = target_train[target].copy()
    X_test = new_test.copy()
    X_test = new_test.drop(['sig_id'],axis=1).copy()
        
    pred_value = np.zeros(X_test.shape[0])
        
    if ind in decreased_vars:
        selected_features = [i[1:-1] for i in df.loc[ind,:][df.loc[ind,:].notna()]]
        X_train = X_train[selected_features]
        X_test = X_test[selected_features]
        
    n_folds=4
    if target not in ["erbb2_inhibitor", "atp-sensitive_potassium_channel_antagonist"]:
        skf=StratifiedKFold(n_splits = n_folds, shuffle=True, random_state=0)
    else:
        skf=KFold(n_splits = n_folds, shuffle=True, random_state=0)

    models = []

    valid = np.zeros([X_train.shape[0]])
    for i , (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        print("Fold "+str(i+1))
        X_train2 = X_train.iloc[train_index,:]
        y_train2 = y_train.iloc[train_index]

        X_valid2 = X_train.iloc[test_index,:]
        y_valid2 = y_train.iloc[test_index]
        
        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_valid2, y_valid2, reference=lgb_train)
        
        clf = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval], 
               num_boost_round=10000,early_stopping_rounds=50,verbose_eval = 1000) 

        valid_predict = clf.predict(X_valid2, num_iteration = clf.best_iteration)
        valid[test_index] = valid_predict
        pred_value += clf.predict(X_test, num_iteration = clf.best_iteration) / n_folds

    score = log_loss(y_train, valid)
            
    return valid, pred_value, score

train_checkscore = train_score.copy()
target_list = []
log_loss_list = []

for ind, target in enumerate(target_feats):
    print(ind, target)
    valid, pred_value, score = modelling_lgb(n_train_feat, n_train_score, n_test_feat, target, ind)
    train_checkscore.loc[cons_train_index, target] = valid
    train_checkscore.loc[noncons_train_index, target] = 0
    print("oof log_loss= {} ".format(score))
    print("all log_loss= {} ".format(log_loss(train_score[target], train_checkscore[target])))
    target_list.append(target)
    log_loss_list.append(score)
    sub.loc[cons_test_index, target] = pred_value
    sub.loc[noncons_test_index, target] = 0

0 5-alpha_reductase_inhibitor
Fold 1
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's binary_logloss: 0.00595763	valid_1's binary_logloss: 0.0058469
Fold 2
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[56]	training's binary_logloss: 0.00127755	valid_1's binary_logloss: 0.00596482
Fold 3
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[80]	training's binary_logloss: 0.00100471	valid_1's binary_logloss: 0.00537319
Fold 4
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[109]	training's binary_logloss: 0.000664927	valid_1's binary_logloss: 0.0071177
oof log_loss= 0.006075651377849975 
all log_loss= 0.005599579929497485 
1 11-beta-hsd1_inhibitor
Fold 1
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[41]	training's binary_logloss: 0.00196

KeyError: '["\'diff-g-669", "\'diff-g-182", "\'diff-g-679", "\'diff-g-494", "\'diff-g-680", "\'diff-g-71", "\'diff-g-509", "\'diff-g-555", "\'diff-g-686", "\'diff-g-8", "\'diff-g-171", "\'diff-g-191", "\'diff-g-368", "\'diff-g-54", "\'diff-g-309", "\'diff-g-698", "\'diff-g-263", "\'diff-g-159", "\'diff-c-67", "\'diff-c-31", "\'diff-g-137", "\'diff-c-50", "\'diff-g-436", "\'diff-g-57", "\'diff-g-535", "\'diff-g-156", "\'diff-g-154", "\'diff-g-400", "\'diff-c-51", "\'diff-c-98", "\'diff-g-81", "\'diff-g-750", "\'diff-g-515", "\'diff-g-643", "\'diff-g-702", "\'diff-g-756", "\'diff-g-729", "\'diff-g-622", "\'diff-g-344", "\'diff-g-599", "\'diff-g-533", "\'diff-g-505", "\'diff-g-604", "\'diff-g-695", "\'diff-g-66", "\'diff-g-160", "\'diff-g-26", "\'diff-g-284", "\'diff-c-57", "\'diff-g-407", "\'diff-g-32", "\'diff-g-84", "\'diff-g-501", "\'diff-g-435", "\'diff-g-138", "\'diff-g-397", "\'diff-g-72", "\'diff-c-35", "\'diff-g-247", "\'diff-g-402", "\'diff-g-406", "\'diff-g-468", "\'diff-g-168", "\'diff-g-596", "\'diff-g-86", "\'diff-g-80", "\'diff-g-249", "\'diff-g-43", "\'diff-g-538", "\'diff-g-629", "\'diff-g-697", "\'diff-c-49", "\'diff-g-44", "\'diff-g-496", "\'diff-g-266", "\'diff-g-614", "\'diff-g-282", "\'diff-g-508", "\'diff-g-97", "\'diff-g-712", "\'diff-g-645", "\'diff-g-513", "\'diff-g-299", "\'diff-c-27", "\'diff-g-116", "\'diff-g-521", "\'diff-g-577", "\'diff-c-46", "\'diff-g-375", "\'diff-g-130", "\'diff-g-211", "\'diff-g-236", "\'diff-g-317", "\'diff-g-261", "\'diff-g-611", "\'diff-g-625", "\'diff-c-81", "\'diff-g-552", "\'diff-g-34", "\'diff-g-189", "\'diff-g-206", "\'diff-g-310", "\'diff-g-144", "\'diff-g-148", "\'diff-g-128", "\'diff-g-126", "\'diff-g-564", "\'diff-g-360", "\'diff-g-581", "\'diff-g-157", "\'diff-g-745", "\'diff-g-264", "\'diff-c-68", "\'diff-g-655", "\'diff-g-91", "\'diff-g-771", "\'diff-c-38", "\'diff-g-664", "\'diff-g-466", "\'diff-c-20", "\'diff-c-77", "\'diff-g-382", "\'diff-g-709", "\'diff-g-219", "\'diff-g-145", "\'diff-g-619", "\'diff-g-183", "\'diff-g-141", "\'diff-g-320", "\'diff-g-730", "\'diff-g-759", "\'diff-g-361", "\'diff-g-429", "\'diff-g-38", "\'diff-g-395", "\'diff-g-165", "\'diff-g-225", "\'diff-g-232", "\'diff-g-631", "\'diff-g-82", "\'diff-g-503", "\'diff-g-95", "\'diff-g-248", "\'diff-g-527", "\'diff-g-202", "\'diff-g-274", "\'diff-g-300", "\'diff-g-660", "\'diff-g-200", "\'diff-g-448", "\'diff-g-401", "\'diff-g-499", "\'diff-g-651", "\'diff-g-760", "\'diff-g-403", "\'diff-g-70", "\'diff-g-204", "\'diff-g-327", "\'diff-g-725", "\'diff-c-63", "\'diff-g-502", "\'diff-g-388", "\'diff-g-276", "\'diff-g-589", "\'diff-g-140", "\'diff-g-713", "\'diff-g-173", "\'diff-g-147", "\'diff-g-591", "\'diff-g-100", "\'diff-g-357", "\'diff-g-409", "\'diff-g-470", "\'diff-g-14", "\'diff-g-307", "\'diff-g-743", "\'diff-g-765", "\'diff-g-453", "\'diff-g-737", "\'diff-g-656", "\'diff-c-37", "\'diff-g-220", "\'diff-g-511", "\'diff-g-294", "\'diff-g-203", "\'diff-g-230", "\'diff-g-223", "\'diff-c-2", "\'diff-c-76", "\'diff-g-358", "\'diff-c-83", "\'diff-g-363", "\'diff-c-62", "\'diff-g-323", "\'diff-c-92", "\'diff-g-37", "\'diff-g-231", "\'diff-g-371", "\'diff-g-681", "\'diff-g-703", "\'diff-g-259", "\'diff-g-155", "\'diff-g-544", "\'diff-g-178", "\'diff-g-352", "\'diff-g-767", "\'diff-g-444", "\'diff-g-617", "\'diff-g-50", "\'diff-c-58", "\'diff-c-78", "\'diff-c-10", "\'diff-c-23", "\'diff-g-378", "\'diff-g-65", "\'diff-g-271", "\'diff-g-356", "\'diff-g-723", "\'diff-g-605", "\'diff-g-135", "\'diff-g-716", "\'diff-g-641", "\'diff-g-711", "\'diff-g-736", "\'diff-g-254", "\'diff-g-404", "\'diff-g-582", "\'diff-g-567", "\'diff-g-224", "\'diff-c-89", "\'diff-g-446", "\'diff-c-61", "\'diff-g-3", "\'diff-g-682", "\'diff-g-318", "\'diff-g-325", "\'diff-g-93", "\'diff-g-657", "\'diff-g-608", "\'diff-c-7", "\'diff-g-504", "\'diff-g-497", "\'diff-g-253", "\'diff-g-440", "\'diff-g-414", "\'diff-g-11", "\'diff-g-488", "\'diff-g-531", "\'diff-g-355", "\'diff-c-54", "\'diff-g-291", "\'diff-c-3", "\'diff-g-226", "\'diff-g-376", "\'diff-g-719", "\'diff-g-728", "\'diff-g-733", "\'diff-g-460", "\'diff-g-667", "\'diff-g-613", "\'diff-g-442", "\'diff-g-672", "\'diff-g-749"] not in index'

In [12]:
# local score
scores = []
for target_col in target_feats:
    scores.append(log_loss(train_score[target_col], train_checkscore[target_col]))
print(np.mean(scores))

0.008911002480320321


In [13]:
difficult_list = pd.DataFrame(target_list, columns=["Target"])
difficult_list["score"] = log_loss_list
np.transpose(difficult_list)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104
Target,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,adrenergic_receptor_antagonist,akt_inhibitor,aldehyde_dehydrogenase_inhibitor,alk_inhibitor,ampk_activator,analgesic,androgen_receptor_agonist,androgen_receptor_antagonist,anesthetic_-_local,angiogenesis_inhibitor,angiotensin_receptor_antagonist,anti-inflammatory,antiarrhythmic,antibiotic,anticonvulsant,antifungal,antihistamine,antimalarial,antioxidant,antiprotozoal,antiviral,apoptosis_stimulant,aromatase_inhibitor,atm_kinase_inhibitor,atp-sensitive_potassium_channel_antagonist,atp_synthase_inhibitor,atpase_inhibitor,atr_kinase_inhibitor,aurora_kinase_inhibitor,autotaxin_inhibitor,bacterial_30s_ribosomal_subunit_inhibitor,bacterial_50s_ribosomal_subunit_inhibitor,bacterial_antifolate,bacterial_cell_wall_synthesis_inhibitor,bacterial_dna_gyrase_inhibitor,bacterial_dna_inhibitor,bacterial_membrane_integrity_inhibitor,bcl_inhibitor,bcr-abl_inhibitor,benzodiazepine_receptor_agonist,beta_amyloid_inhibitor,bromodomain_inhibitor,btk_inhibitor,calcineurin_inhibitor,calcium_channel_blocker,cannabinoid_receptor_agonist,cannabinoid_receptor_antagonist,carbonic_anhydrase_inhibitor,casein_kinase_inhibitor,caspase_activator,catechol_o_methyltransferase_inhibitor,cc_chemokine_receptor_antagonist,cck_receptor_antagonist,cdk_inhibitor,chelating_agent,chk_inhibitor,chloride_channel_blocker,cholesterol_inhibitor,cholinergic_receptor_antagonist,coagulation_factor_inhibitor,corticosteroid_agonist,cyclooxygenase_inhibitor,cytochrome_p450_inhibitor,dihydrofolate_reductase_inhibitor,dipeptidyl_peptidase_inhibitor,diuretic,dna_alkylating_agent,dna_inhibitor,dopamine_receptor_agonist,dopamine_receptor_antagonist,egfr_inhibitor,elastase_inhibitor,erbb2_inhibitor,estrogen_receptor_agonist,estrogen_receptor_antagonist,faah_inhibitor,farnesyltransferase_inhibitor,fatty_acid_receptor_agonist,fgfr_inhibitor,flt3_inhibitor,focal_adhesion_kinase_inhibitor,free_radical_scavenger,fungal_squalene_epoxidase_inhibitor,gaba_receptor_agonist,gaba_receptor_antagonist,gamma_secretase_inhibitor,glucocorticoid_receptor_agonist,glutamate_inhibitor,glutamate_receptor_agonist,glutamate_receptor_antagonist,gonadotropin_receptor_agonist,gsk_inhibitor,hcv_inhibitor,hdac_inhibitor,histamine_receptor_agonist
score,0.00607565,0.00662189,0.00843873,0.0471292,0.0690841,0.021901,0.0171832,0.0277751,0.00383652,0.0593518,0.0785539,0.0155387,0.00233972,0.0118439,0.00458633,0.00452143,0.0154159,0.0263142,0.0239117,0.0120969,0.0124162,0.0215702,0.00252711,0.0133475,0.00466088,0.00487605,0.00466274,0.00648111,0.0220385,0.0120665,0.00822767,0.0154739,0.0152612,0.00252552,0.00157376,0.00331201,0.0231797,0.00569297,0.01612,0.00254499,0.0187594,0.0238711,0.0120284,0.0482364,0.0261593,0.0318622,0.0027503,0.0096134,0.0113172,0.0204287,0.00855626,0.0149623,0.00960127,0.0025256,0.0650959,0.0138538,0.0171171,0.0121433,0.0117716,0.00659744,0.00466106,0.0292509,0.0066586,0.021628,0.0172165,0.00539583,0.0137462,0.0155199,0.0171519,0.0025279,0.00963845,0.0924955,0.0295469,0.0103305,0.00865342,0.00244157,0.0152817,0.0834787,0.0335795,0.0863835,0.0263916,0.0025343,0.00157447,0.0402608,0.0151534,0.0118792,0.0060714,0.00881815,0.0121519,0.029373,0.00484212,0.00666289,0.00822956,0.0302241,0.0428157,0.012967,0.0152137,0.0050084,0.0222849,0.0814973,0.00665608,0.0134728,0.0219699,0.0134376,0.0183271


In [14]:
sub.to_csv("submission.csv", index=False)