In [1]:
import os
import pandas as pd
import copy
import numpy as np

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve

In [2]:
file = os.path.expanduser('~/Desktop/titanic_train.csv')
df = pd.read_csv(file)

In [3]:
train_y = df['Survived']
df = df.drop(['PassengerId','Survived'], axis=1)

In [5]:
LEncoder = LabelEncoder()
MMEncoder = MinMaxScaler()

for c in df.columns:
    df[c] = df[c].fillna(-1)
    if df[c].dtype == 'object':
        df[c] = LEncoder.fit_transform(list(df[c].values))
    df[c] = MMEncoder.fit_transform(df[c].values.reshape(-1,1))

In [8]:
train_x = df.values

train_x,test_x,train_y,test_y = train_test_split(train_x,train_y,test_size = 0.5)
train_x,val_x,train_y,val_y = train_test_split(train_x,train_y,test_size=0.5)

In [9]:
gdbt = GradientBoostingClassifier(subsample=0.93, n_estimators=320, min_samples_split=0.1, min_samples_leaf=0.3, 
                                  max_features=4, max_depth=4, learning_rate=0.16)
onehot = OneHotEncoder()
lr = LogisticRegression(solver='lbfgs', max_iter=1000)

gdbt.fit(train_x,train_y)
onehot.fit(gdbt.apply(train_x)[:,:,0])
lr.fit(onehot.transform(gdbt.apply(val_x)[:,:,0]),val_y)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
gdbt.apply(train_x)[:,:,0]

array([[2., 2., 2., ..., 2., 1., 1.],
       [2., 2., 2., ..., 1., 1., 1.],
       [2., 2., 2., ..., 2., 2., 2.],
       ...,
       [1., 1., 1., ..., 2., 2., 2.],
       [1., 1., 1., ..., 1., 2., 2.],
       [2., 2., 2., ..., 2., 2., 2.]])

In [13]:
pred_gdbt_lr = lr.predict_proba(onehot.transform(gdbt.apply(test_x)[:, :, 0]))[:, 1]
fpr_gdbt_lr, tpr_gdbt_lr, _ = roc_curve(test_y, pred_gdbt_lr)
# 將梯度提升樹結果輸出
pred_gdbt = gdbt.predict_proba(test_x)[:, 1]
fpr_gdbt, tpr_gdbt, _ = roc_curve(test_y, pred_gdbt)

In [12]:
import matplotlib.pyplot as plt
# 將結果繪圖
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_gdbt, tpr_gdbt, label='GDBT')
plt.plot(fpr_gdbt_lr, tpr_gdbt_lr, label='GDBT + LR')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

<Figure size 640x480 with 1 Axes>