In [31]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from tqdm import tqdm_notebook
import lightgbm as lgb
from catboost import Pool, CatBoostClassifier
import xgboost as xgb
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

import seaborn as sns



# Apply the 'seaborn' Matplotlib style sheet to every figure drawn in this notebook.
# NOTE(review): newer Matplotlib releases are reported to expose this style under the
# name 'seaborn-v0_8' — confirm against the installed version.
plt.style.use('seaborn')
# Seaborn defaults, with the font scale left at 1.
sns.set(font_scale=1)

import gc

In [32]:
# Training data, test data and the sample submission, read in that order.
csv_paths = [
    "../input/landslide-prevention-and-innovation-challenge/Train (12).csv",
    "../input/landslide-prevention-and-innovation-challenge/Test (13).csv",
    "../input/landslide-prevention-and-innovation-challenge/SampleSubmission (7).csv",
]
train, test, sample = map(pd.read_csv, csv_paths)

In [33]:
train["Label"].value_counts()

0    8148
1    2716
Name: Label, dtype: int64

In [34]:
train

Unnamed: 0,Sample_ID,1_elevation,2_elevation,3_elevation,4_elevation,5_elevation,6_elevation,7_elevation,8_elevation,9_elevation,...,17_sdoif,18_sdoif,19_sdoif,20_sdoif,21_sdoif,22_sdoif,23_sdoif,24_sdoif,25_sdoif,Label
0,1,130,129,127,126,123,126,125,124,122,...,1.281779,1.281743,1.281720,1.281684,1.281811,1.281788,1.281752,1.281729,1.281693,0
1,2,161,158,155,153,151,162,159,155,153,...,1.359639,1.359608,1.359587,1.359556,1.359683,1.359662,1.359631,1.359610,1.359579,1
2,3,149,151,154,156,158,154,157,158,160,...,1.365005,1.365025,1.365055,1.365075,1.364937,1.364967,1.364988,1.365018,1.365038,0
3,4,80,78,77,75,73,80,78,77,75,...,1.100708,1.100738,1.100759,1.100789,1.100630,1.100650,1.100680,1.100700,1.100731,0
4,5,117,115,114,112,110,115,113,111,110,...,1.284180,1.284130,1.284056,1.284006,1.284125,1.284050,1.284001,1.283926,1.283876,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10859,10860,203,202,201,200,199,202,201,201,199,...,1.271912,1.272011,1.272076,1.272174,1.271720,1.271786,1.271884,1.271950,1.272048,0
10860,10861,432,430,427,423,421,431,429,426,422,...,1.331153,1.331137,1.331113,1.331096,1.331207,1.331183,1.331167,1.331142,1.331126,1
10861,10862,348,345,342,339,335,344,342,339,336,...,1.298362,1.298337,1.298321,1.298297,1.298392,1.298376,1.298352,1.298336,1.298311,1
10862,10863,93,91,90,88,86,96,94,93,92,...,1.355035,1.355061,1.355099,1.355125,1.354937,1.354976,1.355002,1.355041,1.355067,0


In [35]:
# Keep the test identifiers for the submission file and remove the identifier column
# from both frames so it is not used as a feature.
id_ = test.pop("Sample_ID")
del train["Sample_ID"]

In [36]:
# Every column of `train` except the last one. `Label` is the last column at this point,
# so `col` holds the raw feature columns; later cells drop them from both frames once the
# aggregate features have been built.
col=train.columns[:-1]

In [37]:
col

Index(['1_elevation', '2_elevation', '3_elevation', '4_elevation',
       '5_elevation', '6_elevation', '7_elevation', '8_elevation',
       '9_elevation', '10_elevation',
       ...
       '16_sdoif', '17_sdoif', '18_sdoif', '19_sdoif', '20_sdoif', '21_sdoif',
       '22_sdoif', '23_sdoif', '24_sdoif', '25_sdoif'],
      dtype='object', length=225)

In [38]:
def calc_total(name,DataFrame):
    """Add a ``total_<name>`` column holding the sum of ``1_<name>`` ... ``25_<name>``.

    Parameters
    ----------
    name : str
        Suffix shared by the 25 columns to add up (for example ``'slope'``).
    DataFrame : pandas.DataFrame
        Frame containing the 25 columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the new ``total_<name>`` column appended.
    """
    total_column = f"total_{name}"
    DataFrame[total_column] = 0
    # Accumulate left to right, starting from zero.
    for part in (f"{index}_{name}" for index in range(1, 26)):
        DataFrame[total_column] = DataFrame[total_column] + DataFrame[part]
    return DataFrame
# Add the per-sample totals to both frames, one feature group at a time.
# Aspect and geology get no total.
for group_name in ('slope', 'elevation', 'sdoif', 'lsfactor', 'twi', 'placurv', 'procurv'):
    train = calc_total(group_name, train)
    test = calc_total(group_name, test)

train.head(10)

Unnamed: 0,1_elevation,2_elevation,3_elevation,4_elevation,5_elevation,6_elevation,7_elevation,8_elevation,9_elevation,10_elevation,...,24_sdoif,25_sdoif,Label,total_slope,total_elevation,total_sdoif,total_lsfactor,total_twi,total_placurv,total_procurv
0,130,129,127,126,123,126,125,124,122,119,...,1.281729,1.281693,0,920.75922,2986,32.043332,225.342353,86.662827,0.316465,-0.222866
1,161,158,155,153,151,162,159,155,153,151,...,1.35961,1.359579,1,713.11667,3905,33.989444,200.345615,116.513413,0.266576,-0.095545
2,149,151,154,156,158,154,157,158,160,161,...,1.365018,1.365038,0,958.40645,4064,34.126354,273.950444,105.694982,0.702151,0.561299
3,80,78,77,75,73,80,78,77,75,73,...,1.1007,1.100731,0,481.78297,1910,27.520452,94.642992,93.446023,0.726599,0.222366
4,117,115,114,112,110,115,113,111,110,108,...,1.283926,1.283876,0,706.44844,2729,32.105513,193.563024,113.551492,-0.114302,-0.19198
5,308,310,314,317,319,303,307,310,313,315,...,1.325647,1.325583,1,902.15037,7670,33.145168,232.224687,95.473527,-0.739196,-0.378038
6,130,130,130,130,129,132,132,132,132,132,...,1.321152,1.321097,0,400.763856,3276,33.035228,65.038096,105.237826,0.266396,0.079556
7,141,139,137,135,132,143,141,139,136,133,...,1.287928,1.287892,0,721.34064,3491,32.199448,158.709341,83.868949,0.378994,0.327065
8,100,102,104,106,109,100,102,104,107,109,...,1.303178,1.303169,0,583.6896,2602,32.577881,221.85785,164.899133,-0.161406,-0.001375
9,141,140,140,140,140,143,143,143,144,144,...,1.299168,1.299124,0,833.93162,3668,32.479377,227.285501,104.631095,-0.246812,0.012332


In [39]:
# Row-wise summary statistics over each run of 25 consecutive raw columns of `train`.
# The positional slices assume the raw columns come grouped in this order, 25 per group
# (the column listing shown earlier starts with elevation and ends with sdoif).
feature_groups = ["elevation", "slope", "aspect", "placurv", "procurv",
                  "lsfactor", "twi", "geology", "sdoif"]

# (column suffix, row-wise reducer) pairs, in the order the new columns are appended.
# The quantile suffixes are historical: "q7.5" is the 0.75 quantile, "q5" the 0.5
# quantile and "q2.5" the 0.25 quantile.
row_statistics = [
    ("median", lambda block: block.median(axis=1)),
    ("mean", lambda block: block.mean(axis=1)),
    ("count", lambda block: block.count(axis=1)),
    ("std", lambda block: block.std(axis=1)),
    ("min", lambda block: block.min(axis=1)),
    ("max", lambda block: block.max(axis=1)),
    ("kurt", lambda block: block.kurt(axis=1)),
    ("skew", lambda block: block.skew(axis=1)),
    ("q7.5", lambda block: block.quantile(q=0.75, axis=1)),
    ("q5", lambda block: block.quantile(q=0.5, axis=1)),
    ("q2.5", lambda block: block.quantile(q=0.25, axis=1)),
]

for suffix, summarise in row_statistics:
    for position, group in enumerate(feature_groups):
        start = 25 * position
        # New columns are appended at the end, so the first 225 positions keep
        # pointing at the raw columns throughout the loop.
        block_columns = train.columns[start:start + 25].to_list()
        train[group + "_" + suffix] = summarise(train[block_columns])

# Only the aggregates are kept; the raw per-cell columns are removed in place.
train.drop(col, axis=1, inplace=True)

In [40]:
# Row-wise summary statistics over each run of 25 consecutive raw columns of `test`.
# Every statistic, the median included, is computed from `test`'s own columns: reading
# them from `train` here would pick up unrelated columns (its raw columns are already
# dropped by the time this cell runs) and the wrong rows.
# The positional slices assume the raw columns come grouped in this order, 25 per group,
# matching the layout used for `train`.
feature_groups = ["elevation", "slope", "aspect", "placurv", "procurv",
                  "lsfactor", "twi", "geology", "sdoif"]

# (column suffix, row-wise reducer) pairs, in the order the new columns are appended.
# This order must match the one used for `train` so both frames end up with identically
# ordered feature columns. The quantile suffixes are historical: "q7.5" is the 0.75
# quantile, "q5" the 0.5 quantile and "q2.5" the 0.25 quantile.
row_statistics = [
    ("median", lambda block: block.median(axis=1)),
    ("mean", lambda block: block.mean(axis=1)),
    ("count", lambda block: block.count(axis=1)),
    ("std", lambda block: block.std(axis=1)),
    ("min", lambda block: block.min(axis=1)),
    ("max", lambda block: block.max(axis=1)),
    ("kurt", lambda block: block.kurt(axis=1)),
    ("skew", lambda block: block.skew(axis=1)),
    ("q7.5", lambda block: block.quantile(q=0.75, axis=1)),
    ("q5", lambda block: block.quantile(q=0.5, axis=1)),
    ("q2.5", lambda block: block.quantile(q=0.25, axis=1)),
]

for suffix, summarise in row_statistics:
    for position, group in enumerate(feature_groups):
        start = 25 * position
        # New columns are appended at the end, so the first 225 positions keep
        # pointing at the raw columns throughout the loop.
        block_columns = test.columns[start:start + 25].to_list()
        test[group + "_" + suffix] = summarise(test[block_columns])

# Only the aggregates are kept; the raw per-cell columns are removed in place.
test.drop(col, axis=1, inplace=True)

In [41]:
train.shape , test.shape

((10864, 107), (5430, 106))

In [42]:
# Model inputs are everything except the target; the target is kept as a one-column frame.
df_train = train.drop(columns=['Label'])
y_train = train[['Label']]
cols = df_train.columns

In [43]:
# from sklearn.decomposition import PCA
# pca = PCA()
# df_train = pca.fit_transform(df_train)
# test = pca.fit_transform(test)
# df_train = df_train[:, :15]
# test = test[:, :15]
# df_train = pd.DataFrame(df_train, columns=train.columns[:15])
# test = pd.DataFrame(test, columns=train.columns[:15])

In [44]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's minimum and maximum from the training frame only, then apply the
# same scaling to both frames, keeping their column names.
ss = MinMaxScaler()
ss.fit(df_train)
df_train = pd.DataFrame(ss.transform(df_train), columns=df_train.columns)
test = pd.DataFrame(ss.transform(test), columns=test.columns)


In [45]:
# Resampling strategy for the imbalanced labels (8148 zeros vs 2716 ones in `train`).
# The over-sampling alternatives below are left commented out; the active choice is
# random under-sampling with a fixed seed.
# from imblearn.over_sampling import RandomOverSampler
# sampler = RandomOverSampler(random_state=11)
# from imblearn.over_sampling import SMOTE
# # sampler = SMOTE(k_neighbors=5, random_state=11)
# from imblearn.over_sampling import BorderlineSMOTE
# sampler = BorderlineSMOTE(k_neighbors=5, random_state=11)
from imblearn.under_sampling import RandomUnderSampler
sampler = RandomUnderSampler(random_state=11)

In [46]:
df_train.shape

(10864, 106)

In [None]:
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn import ensemble
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold, TimeSeriesSplit

# Leftover bookkeeping: the training loop in this cell builds its own 15-fold splitter
# (`folds`) and does not read any of these names.
errcb2, y_pred_totcb2 = [], []
fold = StratifiedKFold(n_splits=5)
i = 1
def get_model(Name='lgbm') :
    """Build an unfitted classifier for the requested model name.

    Parameters
    ----------
    Name : str, default 'lgbm'
        Model identifier. Only ``'lgbm'`` is supported.

    Returns
    -------
    LGBMClassifier
        A new, unfitted LightGBM classifier with the notebook's fixed hyper-parameters.

    Raises
    ------
    ValueError
        If ``Name`` is not a supported identifier. Failing here gives a clear message
        instead of handing ``None`` to the caller, which would only break later at
        ``.fit``.
    """
    if Name != 'lgbm':
        raise ValueError(f"Unknown model name {Name!r}; supported names: 'lgbm'")
    # NOTE(review): 'n_estimators' and 'num_iterations' are both set, and
    # 'early_stopping_rounds' is also passed again at fit time by the training loop.
    # Which value wins is decided by LightGBM — confirm against the installed version.
    return LGBMClassifier(**{ 'boosting_type': 'gbdt','objective': 'binary','metric': 'auc',
                              'n_estimators': 500,'colsample_bytree' : 0.8,
                              'seed': 42,'silent':False,
                              'early_stopping_rounds': 100,
                              'learning_rate' : 0.05,'num_iterations': 1500,
                              'max_depth' :8 ,'num_leaves' : 150,
                              'max_bins': 85,'min_data_in_leaf':30 })
# Cross-validated training: fit one model per fold, collect out-of-fold predictions for
# the training rows and average the per-fold predictions for the test rows.
# Train only on the columns that are present in `test`.
cols=test.columns
test_ = test[cols]
Model_Name = "lgbm"
# 15 stratified, shuffled folds with a fixed seed.
folds = StratifiedKFold(n_splits=15, shuffle=True, random_state=5168)
# Out-of-fold predictions, one slot per training row.
oofss  = np.zeros((len(df_train[cols])))
# Running average of the per-fold test predictions.
tstpred = np.zeros((len(test)))
# NOTE(review): tst_predd is never read in the cells visible here.
tst_predd =  np.zeros((len(test)))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train, train[['Label']])):
    # Features come from the scaled frame, labels from the original `train` frame;
    # both are indexed by position.
    # NOTE(review): `y_train` rebinds the name assigned in an earlier cell.
    X_train, y_train = df_train.iloc[trn_idx][cols], train.iloc[trn_idx]['Label']
    X_valid, y_valid = df_train.iloc[val_idx][cols], train.iloc[val_idx]['Label']
#     X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
#     X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    # Resample the training part of the fold only; the validation part is left untouched.
    X_train ,y_train = sampler.fit_resample(X_train ,y_train)
    
    # A fresh model per fold, evaluated on the held-out part while it trains.
    # NOTE(review): early stopping is configured here (500) and in get_model's
    # parameters (100) — confirm which one the installed LightGBM applies.
    clf = get_model(Name=Model_Name)
    clf.fit(X_train, y_train, eval_set = [(X_valid, y_valid)],
            verbose =100, early_stopping_rounds = 500)
#     m2  = CatBoostClassifier(n_estimators=800,eval_metric='Accuracy',od_wait=200, learning_rate=0.1,
#                               bootstrap_type='Bernoulli',metric_period=100,max_depth = 8,
#                      use_best_model=True)
#     m2.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_valid, y_valid)], early_stopping_rounds=200,verbose=100,)

    
    # `predict` is used rather than `predict_proba`; for an LGBMClassifier that is
    # presumably the predicted class label, not a probability — verify.
    vp = clf.predict(X_valid)
    oofss[val_idx] = vp
    # Mean squared error between predictions and labels; if both are 0/1 values this
    # equals the fraction of misclassified validation rows.
    val_score = mean_squared_error((vp), (y_valid),squared=True)
    print(4*'-- -- -- --')
    print(f'Fold {fold_+1} Val score: {val_score}')
    print(4*'-- -- -- --')
    
    # Each fold contributes 1/n_splits of its test prediction, so after the loop
    # `tstpred` is the mean of the per-fold predictions.
    tp = clf.predict(test_)
    tstpred += tp / folds.n_splits

  
print()
print(3*'###',10*"^",3*'###')
# Same score computed over all out-of-fold predictions.
print(mean_squared_error(train["Label"], oofss,squared=True))
print("Model training")
# clf.fit(X_train, y_train )

[LightGBM] [Info] Number of positive: 2535, number of negative: 2535
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7498
[LightGBM] [Info] Number of data points in the train set: 5070, number of used features: 97
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[100]	valid_0's auc: 0.850758
[200]	valid_0's auc: 0.859857
[300]	valid_0's auc: 0.864661
[400]	valid_0's auc: 0.868256
[500]	valid_0's auc: 0.867759
[600]	valid_0's auc: 0.868358
[700]	valid_0's auc: 0.867992
[800]	valid_0's auc: 0.866662
[900]	valid_0's auc: 0.867708
[1000]	valid_0's auc: 0.867444
[1100]	valid_0's auc: 0.868063
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 1 Val score: 0.21379310344827587
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
[LightGBM] [Info] Number of positive: 2535, number of negative: 2535
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7496
[LightGBM] [Info] Number of data points in the 

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the out-of-fold predictions against the labels.
oof_report = classification_report(train["Label"], oofss)
print(oof_report)

In [None]:
np.unique(tstpred)

In [None]:
# Binarise the fold-averaged test predictions in place: values of at least 0.4 become 1,
# everything else becomes 0.
tstpred[:] = np.where(tstpred >= 0.4, 1, 0)

In [None]:
# One row per test sample: its identifier and the predicted label.
sub = pd.DataFrame({"Sample_ID": id_, "Label": tstpred})
sub



In [None]:
sub.Label.value_counts()

In [None]:
from IPython.display import FileLink


def create_submission(submission_file, submission_name):
    """Write a frame to ``<submission_name>.csv`` and return a link to the file.

    Parameters
    ----------
    submission_file : pandas.DataFrame
        Frame to save; the index is not written.
    submission_name : str
        File name without the ``.csv`` extension.

    Returns
    -------
    IPython.display.FileLink
        Link object pointing at the written CSV.
    """
    csv_path = submission_name + ".csv"
    submission_file.to_csv(csv_path, index=False)
    return FileLink(csv_path)


create_submission(sub, "sol")

In [None]:
# Feature importances of `clf`, i.e. the model left behind by the last fold of the
# training loop, paired with the column names of that fold's training frame.
feature_importance_df = pd.DataFrame(clf.feature_importances_, columns=['importance'])
feature_importance_df['feature'] = X_train.columns

# Horizontal bars for the 50 most important features, highest first.
plt.figure(figsize=(20, 12));
sns.barplot(x="importance", y="feature", data=feature_importance_df.sort_values(by = ['importance'], ascending = False).head(50))
# The plotted model is the LightGBM classifier built by get_model, so the title names it.
plt.title('LGBMClassifier features importance (top 50):');

In [None]:
# Candidates for removal: the 75 features with the lowest importance.
# The frame is sorted from most to least important, so the tail of the full ranking holds
# the weakest features. Truncating to the top 50 first would turn `.tail(75)` into a
# no-op and list the strongest features instead.
GonnaBeDeleted = feature_importance_df.sort_values(by = ['importance'], ascending = False)
GonnaBeDeleted = GonnaBeDeleted['feature'].tail(75)
GonnaBeDeleted.tolist()

In [None]:
# predictions = pd.DataFrame(y_pred_totcb2)

In [None]:
# predictions = np.where(np.mean(predictions, axis=0)> 0.30, 1, 0)

In [None]:
# submission = pd.DataFrame()
# submission['Sample_ID'] = id_
# submission['Label'] = pd.DataFrame(predictions.T)
# submission.to_csv('submission.csv', index=False)

In [None]:
# from sklearn.metrics import classification_report
# print(classification_report(train["Label"], y_pred_totcb2))

              precision    recall  f1-score   support

           0       0.94      0.80      0.87      8148
           1       0.59      0.85      0.70      2716

    accuracy                           0.81     10864
   macro avg       0.77      0.83      0.78     10864
weighted avg       0.85      0.81      0.82     10864

              precision    recall  f1-score   support

           0       0.94      0.80      0.86      8148
           1       0.58      0.85      0.69      2716

    accuracy                           0.81     10864
   macro avg       0.76      0.82      0.78     10864
weighted avg       0.85      0.81      0.82     10864


In [None]:
from IPython.display import FileLink

def create_submission(submission_file, submission_name):
    """Write ``submission_file`` to ``<submission_name>.csv`` without the index and
    return a FileLink to the written file."""
    submission_file.to_csv(submission_name+".csv",index=False)
    return FileLink(submission_name+".csv")

# Rebuild the submission frame from the fold-averaged test predictions, rounded to the
# nearest integer label.
# NOTE(review): if the 0.4-threshold cell has already run, `tstpred` is already 0/1 and
# the rounding changes nothing — confirm which cut-off is intended.
sub=pd.DataFrame({"Sample_ID":id_})
sub['Label']=(np.round((tstpred)))
# Save the frame built above. The only submission frame defined in this notebook is `sub`.
# This overwrites any "sol.csv" written by an earlier cell.
create_submission(sub, "sol")

#### 