In [1]:
import datetime
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image


In [2]:
train = pd.read_csv('train.csv', index_col=0)
test = pd.read_csv('test.csv', index_col=0)
sample = pd.read_csv('submit_sample.csv', header=None)

In [3]:
train

Unnamed: 0_level_0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,31,services,married,secondary,no,12294,yes,no,cellular,21,nov,101,3,498,0,other,0
1,29,entrepreneur,single,tertiary,no,43027,no,no,cellular,22,aug,158,2,702,0,unknown,1
2,35,management,married,tertiary,no,12252,yes,no,cellular,11,nov,351,1,826,0,failure,0
3,31,technician,married,secondary,no,99121,yes,yes,unknown,16,may,658,2,120,0,failure,0
4,48,unemployed,married,primary,no,42005,yes,no,telephone,3,apr,177,1,273,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27095,37,blue-collar,married,secondary,no,26661,yes,no,cellular,27,may,345,4,425,0,unknown,0
27096,35,services,married,secondary,no,42150,yes,no,cellular,27,may,121,1,719,0,unknown,0
27097,35,services,married,unknown,no,34531,no,no,cellular,28,jun,177,2,121,0,unknown,0
27098,30,admin.,single,secondary,no,99621,yes,no,cellular,27,may,121,1,100,0,unknown,0


In [4]:
cols_o = train.select_dtypes(include=object).columns.tolist()
cols_o

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [5]:
cols_i = train.select_dtypes(exclude=object).columns.tolist()
cols_i

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous', 'y']

In [6]:
test['y'] = -999

In [7]:
all_df = pd.concat([train, test], axis=0)
all_df

Unnamed: 0_level_0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,31,services,married,secondary,no,12294,yes,no,cellular,21,nov,101,3,498,0,other,0
1,29,entrepreneur,single,tertiary,no,43027,no,no,cellular,22,aug,158,2,702,0,unknown,1
2,35,management,married,tertiary,no,12252,yes,no,cellular,11,nov,351,1,826,0,failure,0
3,31,technician,married,secondary,no,99121,yes,yes,unknown,16,may,658,2,120,0,failure,0
4,48,unemployed,married,primary,no,42005,yes,no,telephone,3,apr,177,1,273,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18045,49,self-employed,married,tertiary,no,98357,yes,no,cellular,6,jul,101,2,417,0,failure,-999
18046,34,blue-collar,married,secondary,no,29621,yes,no,cellular,12,may,345,1,815,0,unknown,-999
18047,34,admin.,single,secondary,no,94260,yes,no,unknown,16,may,121,2,370,0,unknown,-999
18048,31,technician,single,secondary,no,65483,yes,no,unknown,15,may,345,2,41,0,unknown,-999


In [8]:
le = LabelEncoder()

for col in all_df.columns.drop('y'):
    all_df[col] = le.fit_transform(all_df[col])

all_df

Unnamed: 0_level_0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,9,7,1,1,0,6616,1,0,0,20,8,35,2,499,0,1,0
1,7,2,2,2,0,17047,0,0,0,21,1,66,1,703,0,3,1
2,13,4,1,2,0,6602,1,0,0,10,8,103,0,827,0,0,0
3,9,9,1,1,0,35951,1,1,2,15,7,124,1,121,0,0,0
4,26,10,1,0,0,16689,1,0,1,2,0,71,0,274,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18045,27,6,1,2,0,35680,1,0,0,5,4,35,1,418,0,0,-999
18046,12,1,1,1,0,12474,1,0,0,11,7,101,0,816,0,3,-999
18047,12,0,2,1,0,34245,1,0,2,15,7,42,1,371,0,3,-999
18048,9,9,2,1,0,24513,1,0,2,14,7,101,1,42,0,3,-999


In [9]:
std = StandardScaler()
all_df_std = std.fit_transform(all_df.drop('y', axis=1))
all_df_std

array([[-0.65111009,  0.82964952, -0.30302637, ...,  0.25429016,
        -0.23730182, -1.71840175],
       [-0.9075186 , -0.6915644 ,  1.45599146, ...,  1.06430014,
        -0.23730182,  0.39816254],
       [-0.13829306, -0.08307883, -0.30302637, ...,  1.55665915,
        -0.23730182, -2.7766839 ],
       ...,
       [-0.26649732, -1.30004997,  1.45599146, ..., -0.2539514 ,
        -0.23730182,  0.39816254],
       [-0.65111009,  1.43813509,  1.45599146, ..., -1.56029103,
        -0.23730182,  0.39816254],
       [-0.77931434,  0.22116395, -0.30302637, ...,  1.05238823,
        -0.23730182,  0.39816254]])

In [10]:
all_df_std[:27100]

array([[-0.65111009,  0.82964952, -0.30302637, ...,  0.25429016,
        -0.23730182, -1.71840175],
       [-0.9075186 , -0.6915644 ,  1.45599146, ...,  1.06430014,
        -0.23730182,  0.39816254],
       [-0.13829306, -0.08307883, -0.30302637, ...,  1.55665915,
        -0.23730182, -2.7766839 ],
       ...,
       [-0.13829306,  0.82964952, -0.30302637, ..., -1.24264006,
        -0.23730182,  0.39816254],
       [-0.77931434, -1.30004997,  1.45599146, ..., -1.32602344,
        -0.23730182,  0.39816254],
       [-0.26649732, -0.08307883, -0.30302637, ..., -0.44851262,
        -0.23730182,  0.39816254]])

In [11]:
all_df['y']

id
0          0
1          1
2          0
3          0
4          0
        ... 
18045   -999
18046   -999
18047   -999
18048   -999
18049   -999
Name: y, Length: 45150, dtype: int64

In [12]:
X, X_test, y, y_test = train_test_split(all_df_std[:27100], all_df['y'][:27100], test_size=0.3, random_state=0)

In [13]:
# X = all_df_std[:27100]
# y = all_df['y'][:27100]
X.shape, y.shape

((18970, 16), (18970,))

In [14]:
X

array([[-0.52290583, -1.30004997, -0.30302637, ...,  0.60767687,
        -0.23730182,  0.39816254],
       [ 1.400158  ,  0.82964952, -0.30302637, ...,  1.19533117,
        -0.23730182,  0.39816254],
       [ 2.16938353,  1.74237788,  1.45599146, ...,  1.49312896,
        -0.23730182,  0.39816254],
       ...,
       [ 3.06681332, -0.6915644 ,  1.45599146, ..., -0.45248326,
        -0.23730182,  0.39816254],
       [-0.52290583, -0.6915644 , -0.30302637, ..., -1.58411485,
        -0.23730182,  0.39816254],
       [-0.77931434, -0.99580719,  1.45599146, ..., -1.00440183,
        -0.23730182,  0.39816254]])

In [15]:
y

id
16404    0
2185     0
10080    1
4211     0
22914    0
        ..
13123    0
19648    0
9845     0
10799    0
2732     0
Name: y, Length: 18970, dtype: int64

## LightGBM

In [54]:
parameters = {
    'max_depth': list(range(1, 20)),
    'learning_rate': [1, 0.5, 0.1, 0.05, 0.01],
}

In [55]:
lgb = LGBMClassifier()

gcv = GridSearchCV(lgb, parameters, cv=5, scoring="roc_auc", n_jobs=-1, return_train_score=True)
gcv.fit(X, y)


GridSearchCV(cv=5, estimator=LGBMClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [1, 0.5, 0.1, 0.05, 0.01],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19]},
             return_train_score=True, scoring='roc_auc')

In [56]:
gcv.best_params_

{'learning_rate': 0.1, 'max_depth': 5}

In [57]:
gcv.cv_results_

{'mean_fit_time': array([0.18888655, 0.24255123, 0.26050234, 0.33430619, 0.37499633,
        0.4298491 , 0.44460974, 0.44480982, 0.4529881 , 0.47851915,
        0.4404212 , 0.45976925, 0.4326426 , 0.43264236, 0.44899888,
        0.44919758, 0.42147055, 0.43443775, 0.41927671, 0.1904901 ,
        0.21681933, 0.25471797, 0.29939823, 0.36223049, 0.39653759,
        0.40890603, 0.39633994, 0.40411863, 0.39354701, 0.39673944,
        0.40631356, 0.39613929, 0.40112538, 0.40192499, 0.4049159 ,
        0.39793491, 0.40810785, 0.39394493, 0.18350883, 0.22280402,
        0.25790973, 0.31176562, 0.37419896, 0.43463693, 0.44181757,
        0.43583341, 0.44819951, 0.44221559, 0.44241695, 0.46455636,
        0.44480944, 0.43862643, 0.44780169, 0.43762898, 0.45179081,
        0.45578098, 0.44441066, 0.1819128 , 0.2287868 , 0.26050296,
        0.31336155, 0.3911531 , 0.45717659, 0.46355958, 0.4963099 ,
        0.47833056, 0.49507513, 0.49308043, 0.48629775, 0.4958734 ,
        0.49208193, 0.48829393,

In [58]:
# train_score = gcv.cv_results_['mean_train_score']
# test_score = gcv.cv_results_['mean_test_score']


In [59]:
# plt.plot(train_score)
# plt.plot(test_score)
# plt.xticks(range(0, len(gcv.param_grid['max_depth'])), list(gcv.param_grid['max_depth']))

In [60]:
pred_list = {}
pred_list['lgb'] = gcv.predict_proba(X)[:,1]


In [61]:
y_pred = gcv.predict_proba(X_test)[:,1]
y_pred

array([0.04863522, 0.00676828, 0.06531124, ..., 0.02227663, 0.04931018,
       0.67114398])

In [62]:
roc_auc_score(y_test, y_pred)
# 0.8456841121038058

0.8456841121038058

In [None]:
pred_list_test = {}
pred_list_test['lgb'] = y_pred

## XGBoost

In [None]:
parameters = {
    "learning_rate":[0.1,0.3,0.5],
    "max_depth": [2,3,5,10],
     "subsample":[0.5,0.8,0.9,1],
     "colsample_bytree": [0.5,1.0],    
}

In [None]:
xgb = XGBClassifier()

gcv2 = GridSearchCV(xgb, parameters, cv=5, scoring="roc_auc", n_jobs=-1, return_train_score=True)
gcv2.fit(X, y)


In [None]:
gcv2.best_params_

In [None]:
train_score = gcv2.cv_results_['mean_train_score']
test_score = gcv2.cv_results_['mean_test_score']

plt.plot(train_score)
plt.plot(test_score)
plt.xticks(range(0, len(gcv2.param_grid['max_depth'])), list(gcv2.param_grid['max_depth']))

In [None]:
pred_list['xgb'] = gcv2.predict_proba(X)[:,1]

In [None]:
y_pred2 = gcv2.predict_proba(X_test)[:,1]
y_pred2

In [None]:
roc_auc_score(y_test, y_pred2)
# 0.8465594251123711

In [None]:
pred_list_test['xgb'] = y_pred2

## LogisticRegression

In [34]:
gcv3 = LogisticRegression()

gcv3.fit(X, y)


LogisticRegression()

In [35]:
pred_list['logistic'] = gcv3.predict_proba(X)[:,1]
pred_list

{'lgb': array([0.0028839 , 0.01606278, 0.41616227, ..., 0.04773298, 0.02606361,
        0.03094729]),
 'xgb': array([0.00620427, 0.01752201, 0.21365789, ..., 0.17379753, 0.04019759,
        0.02910096], dtype=float32),
 'logistic': array([0.03765994, 0.03348983, 0.27168502, ..., 0.17384506, 0.01905472,
        0.04958183])}

In [36]:
y_pred3 = gcv3.predict_proba(X_test)[:,1]
y_pred3

array([0.07579774, 0.031406  , 0.03865054, ..., 0.02315975, 0.02580815,
       0.04051947])

In [37]:
roc_auc_score(y_test, y_pred3)
# 0.7605951589804085

0.7605951589804085

In [38]:
pred_list_test['logistic'] = y_pred3

## Random Forest

In [39]:
parameters = {
    'max_depth': list(range(1, 20)),
    'max_features': [1, 2, 3, 4, 5, 7, 10],
}

In [40]:
rf = RandomForestClassifier()

gcv4 = GridSearchCV(rf, parameters, cv=5, scoring="roc_auc", n_jobs=-1, return_train_score=True)
gcv4.fit(X, y)


KeyboardInterrupt: 

In [None]:
gcv4.best_params_

In [None]:
train_score = gcv4.cv_results_['mean_train_score']
test_score = gcv4.cv_results_['mean_test_score']

plt.plot(train_score)
plt.plot(test_score)
plt.xticks(range(0, len(gcv4.param_grid['max_depth'])), list(gcv4.param_grid['max_depth']))

In [None]:
pred_list['rf'] = gcv4.predict_proba(X)[:,1]

In [None]:
y_pred4 = gcv4.predict_proba(X_test)[:,1]
y_pred4

In [None]:
roc_auc_score(y_test, y_pred4)
# 0.8195972033973815

In [None]:
pred_list_test['rf'] = y_pred4

## Desicion Tree

In [None]:
parameters = {
    'max_depth': list(range(1, 20)),
}

In [None]:
dt = DecisionTreeClassifier()

gcv5 = GridSearchCV(dt, parameters, cv=5, scoring="roc_auc", n_jobs=-1, return_train_score=True)
gcv5.fit(X, y)


In [None]:
gcv5.best_params_

In [None]:
train_score = gcv5.cv_results_['mean_train_score']
test_score = gcv5.cv_results_['mean_test_score']

plt.plot(train_score)
plt.plot(test_score)
plt.xticks(range(0, len(gcv5.param_grid['max_depth'])), list(gcv5.param_grid['max_depth']))

In [None]:
pred_list['dt'] = gcv5.predict_proba(X)[:,1]

In [None]:
y_pred5 = gcv5.predict_proba(X_test)[:,1]
y_pred5

In [None]:
roc_auc_score(y_test, y_pred5)
# 0.765314891610439

In [None]:
pred_list_test['dt'] = y_pred5

## Stacking(Random Forest)

In [None]:
# sel_col = ['lgb', 'xgb', 'logistic', 'rf', 'dt']

sel_col = ['lgb', 'xgb', 'rf']

In [None]:
pred_df = pd.DataFrame(pred_list)
pred_df[sel_col]

In [None]:
pred_df_test = pd.DataFrame(pred_list_test)
pred_df_test[sel_col]

### RandomForest

In [None]:
parameters = {
    'max_depth': list(range(1, 20)),
    'max_features': [1, 2, 3, 4, 5, 7, 10],
}

In [None]:
gcv_st1 = GridSearchCV(rf, parameters, cv=5, scoring="roc_auc", n_jobs=-1, return_train_score=True)
gcv_st1.fit(pred_df[sel_col], y)

In [None]:
y_pred_st1 = gcv_st1.predict_proba(pred_df_test[sel_col])[:,1]
y_pred_st1

In [None]:
roc_auc_score(y_test, y_pred_st1)

### LightGBM

In [None]:
parameters = {
    'max_depth': list(range(1, 20)),
    'feature_fraction': [1, 2, 3, 4, 5, 7, 10],
}

In [None]:
gcv_st2 = GridSearchCV(lgb, parameters, cv=5, scoring="roc_auc", n_jobs=-1, return_train_score=True)
gcv_st2.fit(pred_df[sel_col], y)

In [None]:
y_pred_st2 = gcv_st2.predict_proba(pred_df_test[sel_col])[:,1]
y_pred_st2

In [None]:
roc_auc_score(y_test, y_pred_st2)

### XGBoost

In [None]:
parameters = {
    'max_depth': list(range(1, 20)),
    'colsample_bytree': [1, 2, 3, 4, 5, 7, 10],
}

In [None]:
gcv_st3 = GridSearchCV(xgb, parameters, cv=5, scoring="roc_auc", n_jobs=-1, return_train_score=True)
gcv_st3.fit(pred_df[sel_col], y)

In [None]:
y_pred_st3 = gcv_st3.predict_proba(pred_df_test[sel_col])[:,1]
y_pred_st3

In [None]:
roc_auc_score(y_test, y_pred_st3)

# Test Data Predict