## Default prediction from customer spending habit

1. Create base functions for the project.
2. Initial Feature Engineering(Brainstorming)
3. Fit the model to the new features and test the results

In [1]:
# import the relevant computational modules

# data manipulation
import pandas as pd #data processing
import numpy as np #linear algebra

# Models Packages
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Gradient Boosting
import lightgbm as lgb
from sklearn.cross_validation import KFold



### Data Exploration and Feature Engineering --- manipulate and extract features from each table

In [301]:
transaction_df = pd.read_csv('../raw_data/transactions_train.csv')
payment_df = pd.read_csv('../raw_data/paiements_train.csv')
billing_df = pd.read_csv('../raw_data/facturation_train.csv')
performance_df = pd.read_csv('../raw_data/performance_train.csv')

### Transaction table

In [302]:
transaction_df.head()

Unnamed: 0,ID_CPTE,MERCHANT_CATEGORY_XCD,MERCHANT_CITY_NAME,MERCHANT_COUNTRY_XCD,DECISION_XCD,PRIOR_CREDIT_LIMIT_AMT,TRANSACTION_AMT,TRANSACTION_CATEGORY_XCD,TRANSACTION_DTTM,TRANSACTION_TYPE_XCD,SICGROUP
0,99690111,A,365767,DP,C,5927.0,52.53,E,2015-06-20 12:00:00,F,AN
1,99690111,L,2635650,DP,C,13343.0,28.35,B,2015-01-25 12:00:00,F,AN
2,99690111,L,2635650,DP,C,13343.0,0.0,A,2015-01-26 12:00:00,G,AN
3,99690111,J,680536,AF,C,9430.0,0.0,A,2015-03-25 08:00:00,G,AW
4,99690111,J,680536,AF,C,10600.0,0.0,A,2015-03-03 08:00:00,G,AW


In [303]:
# the type of columns
transaction_df.dtypes

ID_CPTE                       int64
MERCHANT_CATEGORY_XCD        object
MERCHANT_CITY_NAME            int64
MERCHANT_COUNTRY_XCD         object
DECISION_XCD                 object
PRIOR_CREDIT_LIMIT_AMT      float64
TRANSACTION_AMT             float64
TRANSACTION_CATEGORY_XCD     object
TRANSACTION_DTTM             object
TRANSACTION_TYPE_XCD         object
SICGROUP                     object
dtype: object

In [304]:
# convert and select categorical columnst
categorical_columns = ['MERCHANT_CATEGORY_XCD', 'MERCHANT_CITY_NAME', 'MERCHANT_COUNTRY_XCD', 'DECISION_XCD', 
                       'TRANSACTION_CATEGORY_XCD', 'TRANSACTION_TYPE_XCD', 'SICGROUP'] 

In [305]:
# encode the categorical features
lbl = preprocessing.LabelEncoder()
for col in categorical_columns:
    transaction_df[col].fillna('unknown')
    transaction_df[col] = lbl.fit_transform(transaction_df[col].astype(str))

In [306]:
transaction_df.head()

Unnamed: 0,ID_CPTE,MERCHANT_CATEGORY_XCD,MERCHANT_CITY_NAME,MERCHANT_COUNTRY_XCD,DECISION_XCD,PRIOR_CREDIT_LIMIT_AMT,TRANSACTION_AMT,TRANSACTION_CATEGORY_XCD,TRANSACTION_DTTM,TRANSACTION_TYPE_XCD,SICGROUP
0,99690111,0,8516,90,2,5927.0,52.53,4,2015-06-20 12:00:00,5,13
1,99690111,27,6681,90,2,13343.0,28.35,1,2015-01-25 12:00:00,5,13
2,99690111,27,6681,90,2,13343.0,0.0,0,2015-01-26 12:00:00,6,13
3,99690111,24,9798,5,2,9430.0,0.0,0,2015-03-25 08:00:00,6,22
4,99690111,24,9798,5,2,10600.0,0.0,0,2015-03-03 08:00:00,6,22


In [307]:
len(set(transaction_df['ID_CPTE']))

3769

In [308]:
transaction_df[transaction_df['ID_CPTE'] == 99690111].head(10)

Unnamed: 0,ID_CPTE,MERCHANT_CATEGORY_XCD,MERCHANT_CITY_NAME,MERCHANT_COUNTRY_XCD,DECISION_XCD,PRIOR_CREDIT_LIMIT_AMT,TRANSACTION_AMT,TRANSACTION_CATEGORY_XCD,TRANSACTION_DTTM,TRANSACTION_TYPE_XCD,SICGROUP
0,99690111,0,8516,90,2,5927.0,52.53,4,2015-06-20 12:00:00,5,13
1,99690111,27,6681,90,2,13343.0,28.35,1,2015-01-25 12:00:00,5,13
2,99690111,27,6681,90,2,13343.0,0.0,0,2015-01-26 12:00:00,6,13
3,99690111,24,9798,5,2,9430.0,0.0,0,2015-03-25 08:00:00,6,22
4,99690111,24,9798,5,2,10600.0,0.0,0,2015-03-03 08:00:00,6,22
5,99690111,24,9798,5,2,12203.0,7.28,2,2015-02-16 16:00:00,1,22
6,99690111,24,9798,5,2,13314.0,7.0,2,2015-01-20 12:00:00,1,22
7,99690111,24,9798,5,2,9696.0,7.14,2,2015-01-11 08:00:00,1,22
8,99690111,24,9798,5,2,12203.0,0.0,0,2015-02-20 16:00:00,6,22
9,99690111,24,9798,5,2,10600.0,6.36,2,2015-03-04 08:00:00,1,22


In [309]:
### Extract features from transaction_df
# for MERCHANT_CATEGORY_XCD
# sum the total spent money on the merchant category for the individual customer
tmp = transaction_df.groupby(['ID_CPTE', 'MERCHANT_CATEGORY_XCD'])['TRANSACTION_AMT'].sum()
tmp.head()

ID_CPTE   MERCHANT_CATEGORY_XCD
10034823  2                         14.98
          5                        536.54
          6                        395.68
          8                        377.36
          11                       905.19
Name: TRANSACTION_AMT, dtype: float64

In [310]:
tmp = tmp.reset_index()

In [311]:
tmp.head()

Unnamed: 0,ID_CPTE,MERCHANT_CATEGORY_XCD,TRANSACTION_AMT
0,10034823,2,14.98
1,10034823,5,536.54
2,10034823,6,395.68
3,10034823,8,377.36
4,10034823,11,905.19


In [312]:
tmp = tmp.pivot_table('TRANSACTION_AMT', ['ID_CPTE'], 'MERCHANT_CATEGORY_XCD')
tmp.columns = ['MERCHANT_CATEGORY_' + str(i) for i in tmp.columns]

In [313]:
tmp = tmp.fillna(0)

In [314]:
tmp.head()

Unnamed: 0_level_0,MERCHANT_CATEGORY_0,MERCHANT_CATEGORY_1,MERCHANT_CATEGORY_2,MERCHANT_CATEGORY_3,MERCHANT_CATEGORY_4,MERCHANT_CATEGORY_5,MERCHANT_CATEGORY_6,MERCHANT_CATEGORY_7,MERCHANT_CATEGORY_8,MERCHANT_CATEGORY_9,...,MERCHANT_CATEGORY_45,MERCHANT_CATEGORY_46,MERCHANT_CATEGORY_47,MERCHANT_CATEGORY_48,MERCHANT_CATEGORY_49,MERCHANT_CATEGORY_50,MERCHANT_CATEGORY_51,MERCHANT_CATEGORY_52,MERCHANT_CATEGORY_53,MERCHANT_CATEGORY_54
ID_CPTE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10034823,0.0,0.0,14.98,0.0,0.0,536.54,395.68,0.0,377.36,0.0,...,38.84,0.0,6.36,0.0,0.0,1548.0,840.65,0.0,0.0,0.0
10069450,0.0,311.37,0.0,0.0,17.51,39.22,0.0,267.4,175.76,0.0,...,635.75,135.87,0.0,0.0,0.0,0.0,132.14,0.0,0.0,0.0
10081565,0.0,0.0,0.0,0.0,8.4,0.0,0.0,0.0,169.6,0.0,...,34.1,0.0,0.0,0.0,0.0,10.47,307.88,0.0,0.0,0.0
10083607,0.0,0.0,0.0,0.0,336.16,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1022.19,11.33,0.0,0.0,0.0
10097162,0.0,0.0,552.22,0.0,397.8,0.0,1308.31,10.3,170.69,0.0,...,0.0,0.0,0.0,0.0,0.0,241.82,775.75,0.0,0.0,0.0


In [315]:
merchant_category = list(set(transaction_df['MERCHANT_CATEGORY_XCD']))

In [316]:
tmp2 = pd.DataFrame()

### Payment table

In [317]:
payment_df.head()

Unnamed: 0,ID_CPTE,TRANSACTION_AMT,TRANSACTION_DTTM,PAYMENT_REVERSAL_XFLG
0,99690111,208.0,2015-04-26 00:00:00,Q
1,99690111,176.8,2015-05-28 00:00:00,Q
2,99690111,200.0,2015-03-27 04:00:00,Q
3,99690111,80.8,2015-04-02 00:00:00,Q
4,99690111,250.0,2015-11-24 00:00:00,Q


In [318]:
len(set(payment_df['ID_CPTE']))

11900

In [157]:
# null / total
payment_df['TRANSACTION_AMT'].isnull().sum() / len(payment_df)

0.00025656814449917899

In [158]:
# drop the null rows
payment_df = payment_df.dropna()

In [159]:
# get the month and year of TRANSACTION_DTTM
payment_df['TRANSACTION_DTTM'] = payment_df['TRANSACTION_DTTM'].apply(lambda x: str(x).split(' ')[0][:-3])

In [160]:
payment_df = payment_df.sort_values(['ID_CPTE', 'TRANSACTION_DTTM'])

In [164]:
payment_df = payment_df.groupby(['ID_CPTE', 'TRANSACTION_DTTM'])['TRANSACTION_AMT'].sum().reset_index()

In [167]:
payment_df = payment_df.groupby('ID_CPTE').tail(12)

In [170]:
payment_df['TRANSACTION_DTTM'] = payment_df['TRANSACTION_DTTM'].apply(lambda x: x.split('-')[1])

In [174]:
payment_transaction = payment_df.pivot_table('TRANSACTION_AMT', ['ID_CPTE'], 'TRANSACTION_DTTM')
payment_transaction.columns = ['transaction_' + str(i) for i in payment_transaction.columns + '_month']

In [176]:
payment_transaction = payment_transaction.reset_index()

In [178]:
payment_transaction = payment_transaction.fillna(0)

### Billing table

In [13]:
billing_df.head()

Unnamed: 0,ID_CPTE,PERIODID_MY,StatementDate,CurrentTotalBalance,CashBalance,CreditLimit,DelqCycle
0,99690111,2015-05-01,2015-05-03,8497.84,4293.12,16200.0,0
1,99690111,2014-11-01,2014-11-03,866.0,0.0,12000.0,0
2,99690111,2015-06-01,2015-05-31,10790.95,5224.44,16200.0,0
3,99690111,2015-10-01,2015-10-04,12388.46,4786.08,16200.0,0
4,99690111,2015-11-01,2015-11-02,12746.5,4818.48,16200.0,0


In [118]:
len(set(billing_df['ID_CPTE']))

11900

In [19]:
billing_df[billing_df['ID_CPTE'] == 99690111].sort_values(['StatementDate'])

Unnamed: 0,ID_CPTE,PERIODID_MY,StatementDate,CurrentTotalBalance,CashBalance,CreditLimit,DelqCycle
1,99690111,2014-11-01,2014-11-03,866.0,0.0,12000.0,0
7,99690111,2014-12-01,2014-12-03,1151.85,0.0,12000.0,0
10,99690111,2015-01-01,2015-01-02,2298.96,350.0,12000.0,0
8,99690111,2015-02-01,2015-01-31,4045.67,1148.45,16200.0,0
11,99690111,2015-03-01,2015-03-03,5926.2,2567.25,16200.0,0
12,99690111,2015-04-01,2015-03-31,6916.62,3307.33,16200.0,0
0,99690111,2015-05-01,2015-05-03,8497.84,4293.12,16200.0,0
2,99690111,2015-06-01,2015-05-31,10790.95,5224.44,16200.0,0
13,99690111,2015-07-01,2015-07-05,10560.0,5127.54,16200.0,0
5,99690111,2015-08-01,2015-08-02,10610.05,4753.35,16200.0,0


In [186]:
billing_df['PERIODID_MY'] = billing_df['PERIODID_MY'].apply(lambda x: x[:-3])

In [189]:
billing_df = billing_df.sort_values(['ID_CPTE', 'PERIODID_MY'])
billing_df = billing_df.reset_index(drop=True)

In [192]:
billing_df = billing_df.groupby('ID_CPTE').tail(12)
billing_df = billing_df.reset_index(drop=True)

In [195]:
billing_df['PERIODID_MY'] = billing_df['PERIODID_MY'].apply(lambda x: x[-2:])

In [203]:
# CurrentTotalBalance
total_balance = billing_df.pivot_table('CurrentTotalBalance', ['ID_CPTE'], 'PERIODID_MY')
total_balance.columns = ['total_balance_' + str(i) for i in total_balance.columns + '_month']

In [206]:
# CashBalance
cash_balance = billing_df.pivot_table('CashBalance', ['ID_CPTE'], 'PERIODID_MY')
cash_balance.columns = ['cash_balance_' + str(i) for i in cash_balance.columns + '_month']

In [208]:
# CreditLimit
credit_limit = billing_df.pivot_table('CreditLimit', ['ID_CPTE'], 'PERIODID_MY')
credit_limit.columns = ['credit_limit_' + str(i) for i in credit_limit.columns + '_month']

In [211]:
# DelqCycle
delq_cycle = billing_df.pivot_table('DelqCycle', ['ID_CPTE'], 'PERIODID_MY')
delq_cycle.columns = ['delq_cycle_' + str(i) for i in delq_cycle.columns + '_month']

In [215]:
total_balance = total_balance.reset_index()
cash_balance = cash_balance.reset_index()
credit_limit = credit_limit.reset_index()
delq_cycle = delq_cycle.reset_index()

In [253]:
tmp = total_balance.merge(cash_balance, on='ID_CPTE')
tmp = tmp.merge(credit_limit, on='ID_CPTE')
tmp = tmp.merge(delq_cycle, on='ID_CPTE')

In [254]:
tmp.head()

Unnamed: 0,ID_CPTE,total_balance_01_month,total_balance_02_month,total_balance_03_month,total_balance_04_month,total_balance_05_month,total_balance_06_month,total_balance_07_month,total_balance_08_month,total_balance_09_month,...,delq_cycle_03_month,delq_cycle_04_month,delq_cycle_05_month,delq_cycle_06_month,delq_cycle_07_month,delq_cycle_08_month,delq_cycle_09_month,delq_cycle_10_month,delq_cycle_11_month,delq_cycle_12_month
0,10001822,11479.66,11481.12,11199.6,11017.08,8079.42,9389.97,9315.23,9831.0,10254.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10007972,560.0,0.0,38.0,560.74,443.7,442.0,571.34,400.05,519.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10012520,1634.96,1846.95,1428.45,1013.52,619.03,410.55,495.91,641.68,876.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10025534,6107.9,6353.91,6277.08,6389.28,1274.11,4549.65,5282.59,6312.78,6401.85,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,10033579,409.76,251.45,432.0,416.58,385.84,474.88,495.0,524.28,552.12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [255]:
tmp = payment_transaction.merge(tmp, on='ID_CPTE')

In [256]:
tmp.head()

Unnamed: 0,ID_CPTE,transaction_01_month,transaction_02_month,transaction_03_month,transaction_04_month,transaction_05_month,transaction_06_month,transaction_07_month,transaction_08_month,transaction_09_month,...,delq_cycle_03_month,delq_cycle_04_month,delq_cycle_05_month,delq_cycle_06_month,delq_cycle_07_month,delq_cycle_08_month,delq_cycle_09_month,delq_cycle_10_month,delq_cycle_11_month,delq_cycle_12_month
0,10001822,318.0,522.5,374.5,4200.0,262.5,265.0,267.5,300.0,250.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10007972,784.34,168.28,1050.0,559.5,664.0,313.5,843.71,191.9,945.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10012520,0.0,86.1,458.0,1177.31,315.0,525.0,505.0,50.5,1115.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10025534,131.3,0.0,260.0,0.0,6264.0,0.0,2080.0,0.0,318.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,10033579,574.55,391.3,412.23,470.53,546.8,419.61,283.92,106.0,84.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [257]:
len(tmp)

11825

In [258]:
len(performance_df[performance_df['Default'] == 1]) / len(performance_df)

0.19336134453781512

In [259]:
performance_df[performance_df['Default'] == 1].head()

Unnamed: 0,ID_CPTE,PERIODID_MY,Default
18,75780289,2012-12-01,1
53,58022132,2013-12-01,1
56,25809739,2015-12-01,1
72,35143533,2013-12-01,1
137,94504449,2012-12-01,1


In [260]:
tmp = tmp.merge(performance_df[['ID_CPTE', 'Default']], on='ID_CPTE')

In [261]:
tmp = tmp.set_index('ID_CPTE')

In [262]:
tmp.head()

Unnamed: 0_level_0,transaction_01_month,transaction_02_month,transaction_03_month,transaction_04_month,transaction_05_month,transaction_06_month,transaction_07_month,transaction_08_month,transaction_09_month,transaction_10_month,...,delq_cycle_04_month,delq_cycle_05_month,delq_cycle_06_month,delq_cycle_07_month,delq_cycle_08_month,delq_cycle_09_month,delq_cycle_10_month,delq_cycle_11_month,delq_cycle_12_month,Default
ID_CPTE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001822,318.0,522.5,374.5,4200.0,262.5,265.0,267.5,300.0,250.0,231.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
10007972,784.34,168.28,1050.0,559.5,664.0,313.5,843.71,191.9,945.9,701.47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
10012520,0.0,86.1,458.0,1177.31,315.0,525.0,505.0,50.5,1115.0,612.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
10025534,131.3,0.0,260.0,0.0,6264.0,0.0,2080.0,0.0,318.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1
10033579,574.55,391.3,412.23,470.53,546.8,419.61,283.92,106.0,84.0,219.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0


### Create basic classes and functions

In [263]:
# Create basic scikit-learn wrapper model class
class SklearnWrapper:
    def __init__(self, clf, seed=0, params=None, seed_bool=True):
        if (seed_bool == True):
            params['random_state'] = seed
        self.clf = clf(**params)
    
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
        
    def predict(self, x):
        return self.clf.predict(x)

In [8]:
# create basic xgboost wrapper model class
class XgbWrapper:
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

In [9]:
# create basic lightGBM wrapper model class
class LightGbmWrapper:
    def __init(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 1550)
        self.verbose_eval = params.pop('verbose_eval', 100)
        
    def train(self, x_train, y_train):
        lgtrain = lgb.Dataset(x_train, y_train)
        self.lgbm = lgb.train(self.param, lgtrain, num_boost_round=self.nrounds, verbose_eval=self.verbose_eval)
    
    def predict(self, x):
        return self.lgbm.predict(lgb.Dataset(x))

In [10]:
# create out-of-fold predictions 
# make good use of k-fold CV's result 
# serving for the staking alogrithm 
# create a new column generated from model's score

def get_oof(clf, x_train, y, x_test):
    '''
    clf: the classifer, which can be logistic regression, SVM regression, Bayes classifier, etc.
    x_train: the training x in training dataset
    y: the training y in training dataset
    x_test: the testing x in training dataset 
    '''
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((nfold, ntest))
    
    for i, (train_index, test_index) in enumerate(kf):
        print('\nFold {}'.format(i))
        x_tr = x_train[train_index]
        y_tr = y[train_index]
        x_te = x_train[test_index]
        
        clf.train(x_tr, y_tr)
        
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)
        
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)       

In [264]:
tmp = tmp.fillna(0)
X = np.array(tmp.iloc[:, :-1])
y = np.array(tmp.iloc[:, -1])

In [265]:
len(X)

11825

In [246]:
from imblearn.over_sampling import SMOTE

In [266]:
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_sample(X, y)

In [268]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=42)

In [269]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

In [271]:
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [273]:
predict = clf.predict(X_test)

In [274]:
roc_auc_score(predict, y_test)

0.80967897397215616