In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import os, sys, gc, warnings, random, datetime
from skopt import BayesSearchCV
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve, accuracy_score
from sklearn.model_selection import StratifiedKFold , KFold
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.model_selection import train_test_split
# SETTINGS - change these to get something meaningful.
# NOTE(review): ITERATIONS is not read by any cell visible in this notebook; the
# hyperopt search budget is set separately through max_evals in the fmin call.
ITERATIONS = 10 # 1000
# TRAINING_SIZE = 100000 # 20000000
# TEST_SIZE = 25000



In [2]:
def seed_everything(seed=0):
    """Seed the Python and NumPy global random number generators.

    The seed is also written, as a string, to the ``PYTHONHASHSEED``
    environment variable of the current process.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Seed reused further down by the K-fold splitter and the final XGBoost parameters.
SEED = 42
seed_everything(SEED)

In [3]:
# Location of the pickled input table. Set the LENDING_CLUB_PKL environment
# variable to point the notebook at another copy; when it is unset the original
# local path is used, so existing runs behave exactly as before.
path_read = os.environ.get(
    "LENDING_CLUB_PKL",
    "/Users/a06411/Documents/data_hub/lending_club/lgb_selected.pkl",
)

In [4]:
### Load Data
# NOTE(review): unpickling can execute arbitrary code, so path_read must only
# ever point at a file from a trusted source.
df = pd.read_pickle(path_read)

In [5]:

# Random 80/20 hold-out split over the whole table.
# NOTE(review): X, y and all four split variables are reassigned by later cells
# (the year-based splits) before anything reads them, so this split looks unused.
y = df['loan_status']
X = df.drop(columns=['loan_status'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2021
)

In [6]:
# Parse the issue date in place, then derive the calendar year that the
# year-based train/test splits below filter on.
df['issue_d'] = pd.to_datetime(df['issue_d'])
df['issue_year'] = df['issue_d'].dt.year

In [7]:
# Out-of-time split: loans issued in 2018 form the hold-out set, every other
# year goes to training.
test = df.loc[df['issue_year'] == 2018]
train = df.loc[df['issue_year'] != 2018]

In [8]:
# Renumber both frames 0..n-1 so that positional and label-based row access agree.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [9]:
# Keep the ids and true labels of the 2018 hold-out rows; they are used at the
# end of the notebook to score the averaged test predictions.
answer = test[['id','loan_status']]
# test.drop('loan_status',1, inplace = True)

In [10]:
# Load Data with selected features
X = train.copy()
# Labels
y = train['loan_status']

# Remove Labels from Dataframe
X.drop(['loan_status'], axis = 1, inplace = True)
# Final Data Shapes
print(X.shape)
print(y.shape)

(1314290, 63)
(1314290,)


In [11]:
# Inner out-of-time split of the training data: 2017 is held back for validation.
ttest = train.loc[train['issue_year'] == 2017]
ttrain = train.loc[train['issue_year'] != 2017]

In [12]:
# Inputs and labels for the hyperparameter search: fit on the non-2017 rows of
# `train`, validate on the 2017 rows.
y_train = ttrain['loan_status']
X_train = ttrain.drop(columns=['loan_status'])

y_test = ttest['loan_status']
X_test = ttest.drop(columns=['loan_status'])

In [13]:
# Columns that must not be fed to the model: raw date fields, the row id, the
# label and the derived issue year, plus payment / recovery / last-FICO fields.
# NOTE(review): the payment and recovery fields are presumably only known after
# origination and would leak the outcome - confirm against the data dictionary.
remove_features = [
    # dates
    'earliest_cr_line',
    'issue_d',
    'last_credit_pull_d',
    'last_pymnt_d',
    'next_pymnt_d',
    # listing / payment / recovery fields
    'initial_list_status',
    'out_prncp',
    'out_prncp_inv',
    'total_pymnt',
    'total_pymnt_inv',
    'total_rec_prncp',
    'total_rec_int',
    'total_rec_late_fee',
    'recoveries',
    'collection_recovery_fee',
    'last_pymnt_amnt',
    'last_fico_range_high',
    'last_fico_range_low',
    # identifier, label, split helper
    'id',
    'loan_status',
    'issue_year',
]
features = [name for name in train.columns if name not in remove_features]

In [14]:
# Restrict both inner-split frames to the model input columns, in `features` order.
X_train = X_train[features]
X_test = X_test[features]

In [15]:
# Hyperopt search space for the XGBoost classifier tuned by `objective`.
# hp.quniform draws are floats rounded to a multiple of its last argument, so
# max_depth is sampled in steps of 3 between 6 and 24 and min_child_weight in
# steps of 1 between 0 and 10; both are cast with int() where they are consumed.
# n_estimators, seed and learning_rate are fixed values rather than searched ones.
space={'max_depth': hp.quniform("max_depth", 6, 24, 3),
        'gamma': hp.uniform ('gamma', 1,9),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
       'subsample' : hp.uniform('subsample', 0.5,1) , 
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 5000,
        'seed': 2021,
       'learning_rate' :0.1
    }

In [21]:
def objective(space):
    """Hyperopt objective: fit an XGBoost classifier and score it on the validation split.

    Parameters
    ----------
    space : dict
        One sampled point of the search space. Reads ``n_estimators``,
        ``max_depth``, ``gamma``, ``min_child_weight``, ``colsample_bytree`` and
        ``subsample``; ``learning_rate`` and ``seed`` are applied when present.

    Returns
    -------
    dict
        ``{'loss': -auc, 'status': STATUS_OK}``, where ``auc`` is the ROC AUC on
        the module-level ``(X_test, y_test)``. The sign is flipped because
        hyperopt minimises the loss.
    """
    clf = xgb.XGBClassifier(
        n_estimators=space['n_estimators'],
        # quniform samples floats; these two parameters are cast to integers.
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        min_child_weight=int(space['min_child_weight']),
        colsample_bytree=space['colsample_bytree'],
        subsample=space['subsample'],
        # Forward the fixed learning rate and seed from the search space so the
        # search runs with the values the space declares instead of library
        # defaults. None (key absent) leaves XGBoost's own default in place.
        learning_rate=space.get('learning_rate'),
        random_state=space.get('seed'),
    )
    # tree_method='gpu_hist' can be added to the constructor for GPU training;
    # leave it out when running on CPU.

    evaluation = [(X_train, y_train), (X_test, y_test)]

    # NOTE(review): newer XGBoost releases expect eval_metric and
    # early_stopping_rounds on the constructor rather than fit() - confirm
    # against the installed version before upgrading.
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=40, verbose=False)

    # ROC AUC is a ranking metric and needs continuous scores: use the predicted
    # probability of the positive class, not the thresholded labels from predict().
    proba = clf.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, proba)
    print ("SCORE:", auc)
    return {'loss': -auc, 'status': STATUS_OK }

## XGBoost CPU version


In [None]:
# Silence warnings from here on (`warnings` is already imported in the first cell).
warnings.filterwarnings('ignore')

# `trials` collects the result of every evaluation made by the search.
trials = Trials()
best_hyperparams = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials,
)

  0%|                                    | 0/20 [00:00<?, ?trial/s, best loss=?]

In [None]:
# Show the dict returned by fmin (the best sampled value for each searched parameter).
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

In [None]:
# Labels for the final fit, split off the full training (non-2018) and hold-out
# (2018) frames.
# NOTE: pop() removes the label column from the frames in place, so this cell
# cannot be re-run without rebuilding `train` and `test`. It also rebinds
# y_train / y_test, which until now held the 2017 validation split.
y_train = train.pop('loan_status').reset_index(drop=True)
y_test = test.pop('loan_status').reset_index(drop=True)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [None]:
# Rebuild the input column list from the label-free `train`, reusing the
# exclusion list defined earlier (an id/label-only list was tried and dropped).
features = [name for name in train.columns if name not in remove_features]

In [None]:
def make_xgb_prediction(train, y, test, features, model_params=None, folds=5):
    """Train one XGBoost booster per K-fold split and average their test predictions.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; only the ``features`` columns are used.
    y : array-like
        Labels, positionally aligned with the rows of ``train``.
    test : pandas.DataFrame
        Rows to predict; only the ``features`` columns are used.
    features : list
        Names of the input columns.
    model_params : dict, optional
        Parameters passed unchanged to ``xgb.train``.
    folds : int, default 5
        Number of shuffled K-fold splits (seeded with the module-level ``SEED``).

    Returns
    -------
    y_oof : numpy.ndarray
        Out-of-fold prediction for every training row.
    y_preds : numpy.ndarray
        Mean over folds of the predictions on ``test``.
    feature_importance : pandas.DataFrame
        A ``feature`` column plus one ``fold_<k>`` column per fold holding that
        booster's ``get_score()`` value (0 for features it never split on).
    """
    kfold = KFold(n_splits=folds, random_state=SEED, shuffle=True)

    x_train = train[features]
    x_test = test[features]
    # KFold yields positional row numbers. Holding the labels as a plain array
    # and selecting rows with .iloc keeps the selection correct whatever index
    # `train` and `y` carry (label-based lookup silently relies on a 0..n-1 index).
    y_arr = np.asarray(y)
    # The test matrix is the same for every fold, so build it once.
    dtest = xgb.DMatrix(x_test)

    y_preds = np.zeros(x_test.shape[0])
    y_oof = np.zeros(x_train.shape[0])
    score = 0

    feature_importance = pd.DataFrame({'feature': list(features)})

    for fold, (tr_idx, val_idx) in enumerate(kfold.split(x_train, y_arr), start=1):
        print(f'Fold: {fold}')

        x_tr, x_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_arr[tr_idx], y_arr[val_idx]

        print(x_tr.shape, x_val.shape)

        dtrain = xgb.DMatrix(x_tr, label=y_tr)
        dvalid = xgb.DMatrix(x_val, label=y_val)

        clf = xgb.train(
            model_params,
            dtrain,
            num_boost_round=10000,
            # Early stopping watches the last entry of `evals`, i.e. the validation fold.
            evals=[(dtrain, 'train'), (dvalid, 'valid')],
            verbose_eval=50,
            early_stopping_rounds=100,
        )

        # get_score() is a {feature name: score} dict that omits features never
        # used in a split; map it onto the full feature list and fill gaps with 0.
        fold_scores = clf.get_score()
        feature_importance[f'fold_{fold}'] = (
            feature_importance['feature'].map(fold_scores).fillna(0)
        )

        # NOTE(review): whether Booster.predict defaults to the best early-stopped
        # iteration depends on the XGBoost version - confirm for the one installed.
        y_pred_val = clf.predict(dvalid)

        y_oof[val_idx] = y_pred_val
        fold_auc = roc_auc_score(y_val, y_pred_val)
        print(f"Fold {fold} | AUC Score: {fold_auc}")

        score += fold_auc / folds
        y_preds += clf.predict(dtest) / folds

        # Release the per-fold slices before the next split is materialised.
        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f"\nMean AUC score = {score}")
    print(f"OOF AUC score = {roc_auc_score(y_arr, y_oof)}")

    return y_oof, y_preds, feature_importance

In [None]:
# # xgb model params
# xgb_params = {
#     'objective': 'binary:logistic',
#     'learning_rate': 0.1,
#     'max_depth': 9,
#     'colsample_bytree': 0.8,
#      'subsample': 0.8,
#     'eval_metric': 'auc',
#     'seed': SEED,

#     'min_child_weight' : 3,
#      'tree_method' : 'gpu_hist' 

# } 

In [None]:
# xgb model params
xgb_params = {
    'objective': 'binary:logistic',
    'learning_rate': 0.1,
    'max_depth': int(best_hyperparams['max_depth']),
    'colsample_bytree': best_hyperparams['colsample_bytree'],
     'subsample': best_hyperparams['subsample'],
    'eval_metric': 'auc',
    'seed': SEED,
    'gamma' : best_hyperparams['gamma'],
    'min_child_weight' : int(best_hyperparams['min_child_weight']) ,
    ##'tree_method' : 'gpu_hist' 

} 

In [None]:
# Cross-validated fit on the non-2018 data, with averaged predictions for the 2018 hold-out.
y_oof_xgb, y_preds_xgb, fi_xgb = make_xgb_prediction(
    train, y_train, test, features, model_params=xgb_params
)

In [None]:
# Score the fold-averaged predictions against the 2018 labels saved in `answer`.
# `answer` was taken from `test` after its index was reset, so its rows line up
# positionally with y_preds_xgb.
print(f"TEST AUC score = {roc_auc_score(answer['loan_status'], y_preds_xgb)}")
