### Import Packages

In [1]:
import pandas as pd
import numpy as np
import re
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC

### Load Data

In [2]:
train = pd.read_csv('/nfsroot/data/home/2359B48/Santander Competition/datasets/train.csv')
test = pd.read_csv('/nfsroot/data/home/2359B48/Santander Competition/datasets/test.csv')

In [3]:
print(train.shape)
test.shape

(200000, 202)


(200000, 201)

In [4]:
train.head(3)

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965


In [5]:
train.target.value_counts()

0    179902
1     20098
Name: target, dtype: int64

In [6]:
X = train.iloc[:, 2:]
Y = train[['target']]

In [21]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
skf.get_n_splits(X, Y)

5

In [22]:
print(skf)

StratifiedKFold(n_splits=5, random_state=42, shuffle=True)


In [23]:
for train_index, test_index in skf.split(X, Y):
    print('TRAIN:', train_index, 'TEST:', test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

TRAIN: [     1      2      3 ... 199996 199997 199999] TEST: [     0     11     12 ... 199988 199992 199998]


KeyError: '[     1      2      3 ... 199996 199997 199999] not in index'

In [None]:
####################################################################

In [9]:
param = {'num_leaves': 50,
         'min_data_in_leaf': 30, 
         'objective':'binary',
         'max_depth': 5,
         'learning_rate': 0.006,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 27,
         "metric": 'auc',
         "verbosity": -1,
        'is_unbalance' : True}

In [7]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0)

### Model Building

In [26]:
d_train = lgb.Dataset(x_train, label=y_train)
clf = lgb.train(param, d_train, 100)
y_pred = clf.predict(x_test)
len(y_pred)

### Predict On Test Data

In [46]:
test_pred = clf.predict(test.iloc[:, 1:])

test_predictions = pd.DataFrame()
test_predictions[['ID_code']] = test[['ID_code']] 
print(test_predictions.shape)

df = pd.DataFrame({'target':test_pred})
print(df.shape)

submission_lgbm = pd.concat([test_predictions, df], axis=1)
print(submission_lgbm.shape)

submission_lgbm.head(2)

In [50]:
submission_lgbm.to_csv('/nfsroot/data/home/2359B48/Santander Competition/submissions/submission_lgbm.csv', index=False)

Unnamed: 0,ID_code,target
0,test_0,0.11788
1,test_1,0.085708


################## LGBM With IMP Features #########################

### Taking Feature Importance from Random Forest

In [53]:
rfc1 = RandomForestClassifier(bootstrap=True, max_depth=40, min_samples_split=12, n_estimators=10)
rfc1.fit(x_train, y_train)
y_pred = rfc1.predict(x_test)

train_accuracy = accuracy_score(y_train, rfc1.predict(x_train))
val_accuracy = accuracy_score(y_test, y_pred)

  


In [54]:
print(train_accuracy)
print(val_accuracy)

0.9541
0.89878


In [58]:
feature_imp_RFC = pd.DataFrame({"imp_features":rfc1.feature_importances_, "columns":X.columns})
feature_imp_RFC.sort_values(by = 'imp_features', ascending=False)

Unnamed: 0,imp_features,columns
81,0.015073,var_81
139,0.011707,var_139
53,0.011150,var_53
12,0.010785,var_12
110,0.010611,var_110
26,0.010027,var_26
109,0.009435,var_109
174,0.009412,var_174
166,0.009201,var_166
6,0.009009,var_6


In [74]:
top_features = feature_imp_RFC[feature_imp_RFC['imp_features'] >= 0.01]
top_features.head()

Unnamed: 0,imp_features,columns
12,0.010785,var_12
26,0.010027,var_26
53,0.01115,var_53
81,0.015073,var_81
110,0.010611,var_110


In [75]:
## Select Imp_columns For Train Data

imp_columns_train = pd.DataFrame()
for i in top_features['columns'].values:
    imp_columns_train[i] = x_train[i]
    
    
## Select Imp_columns For Validation Data
imp_columns_val = pd.DataFrame()
for i in top_features['columns'].values:
    imp_columns_val[i] = x_test[i]

    
## Select Imp_columns For Test Data
imp_columns_test = pd.DataFrame()
for i in top_features['columns'].values:
    imp_columns_test[i] = test[i]

In [76]:
d_train1 = lgb.Dataset(imp_columns_train, y_train)
clf = lgb.train(param, d_train1, 100)
y_pred1 = clf.predict(imp_columns_val)
len(y_pred)

50000

In [77]:
test_pred1 = clf.predict(imp_columns_test)

test_predictions1 = pd.DataFrame()
test_predictions1[['ID_code']] = test[['ID_code']] 
print(test_predictions1.shape)

df1 = pd.DataFrame({'target':test_pred})
print(df1.shape)

submission_lgbm_rf_imp = pd.concat([test_predictions1, df1], axis=1)
print(submission_lgbm_rf_imp.shape)

submission_lgbm_rf_imp.head(5)

(200000, 1)
(200000, 1)
(200000, 2)


Unnamed: 0,ID_code,target
0,test_0,0.11788
1,test_1,0.085708
2,test_2,0.125834
3,test_3,0.093452
4,test_4,0.096944


In [78]:
submission_lgbm_rf_imp.to_csv('/nfsroot/data/home/2359B48/Santander Competition/submissions/submission_lgbm_rf_imp_0.01.csv', index=False)

In [None]:
###################################################################################################

In [10]:
d_train = lgb.Dataset(x_train, label=y_train)
clf = lgb.train(param, d_train, 100)
y_pred = clf.predict(x_test)
len(y_pred)

50000

In [11]:
test_pred = clf.predict(test.iloc[:, 1:])

test_predictions = pd.DataFrame()
test_predictions[['ID_code']] = test[['ID_code']] 
print(test_predictions.shape)

df = pd.DataFrame({'target':test_pred})
print(df.shape)

submission_lgbm = pd.concat([test_predictions, df], axis=1)
print(submission_lgbm.shape)

submission_lgbm.head(2)

(200000, 1)
(200000, 1)
(200000, 2)


Unnamed: 0,ID_code,target
0,test_0,0.300385
1,test_1,0.256588


In [13]:
submission_lgbm.to_csv('/nfsroot/data/home/2359B48/Santander Competition/submissions/submission_lgbm_unbalance.csv', index=False)

In [None]:
#################################################################################################################

In [None]:
trainX_fold, valX_fold, trainY_fold, valY_fold =  train_test_split(X, Y1, test_size=0.25, random_state=42, shuffle=False)

y_train, y_val = Y.loc[trainY_fold.index], Y.loc[valY_fold.index]