In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_curve, auc, accuracy_score)
from sklearn.model_selection import learning_curve,GridSearchCV
from sklearn.model_selection import LeaveOneOut
from lightgbm import LGBMClassifier

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [16]:
# importing standard packages
import pandas as pd
import numpy as np 
from scipy import stats 

# importing the plot funnctions
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline 

# preprocessing/ model selection 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# importing the classifiers 
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

# importing the metrics 
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report,f1_score
from sklearn.metrics import roc_curve

# oversampling techniques 
from imblearn.over_sampling import SMOTE

# importing model saving package 
from joblib import dump, load

Using TensorFlow backend.


In [2]:
def resampling_unbalanced(train_x, train_y, sample_method):
    """ 

    Parameters
    ----------
    train_x : test_x 
            pd.DataFrame
    train_y : train_y 
            pd.DataFrame
    sample_method: 'over' or 'under'
            string

    Returns
    -------
    train_x : resampled train_x
                pd.DataFrame
    train_y : resampled train_y
                pd.DataFrame
        

    """
    
    
    if sample_method == "over":
        oversample = SMOTE()
        train_x_ros, train_y_ros = oversample.fit_resample(X_train, y_train)
        
        return train_x_ros, train_y_ros
        
    if sample_method == "under":
        # Concate X and Y train
        trainData = pd.concat([train_x, train_y],axis=1)
        # Class count
        count_class_0, count_class_1 = trainData["hospital_death"].value_counts()

        # Divide by class
        df_class_0 = trainData[trainData['hospital_death'] == 0]
        df_class_1 = trainData[trainData['hospital_death'] == 1]
        
        # Sample the majority class
        df_class_0_under = df_class_0.sample(count_class_1)
        
        # Put the train dataset together
        train_rus = pd.concat([df_class_0_under, df_class_1], axis=0)
        train_x_rus = train_rus.drop("hospital_death", axis = 1)
        train_y_rus = pd.DataFrame(train_rus['hospital_death'])
        
        return train_x_rus, train_y_rus

In [3]:
# grab the stored data frames
%store -r X_wids
%store -r y_wids

In [4]:
X_wids.shape, y_wids.shape

((91713, 598), (91713, 1))

In [5]:
# Split train-test dataset 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_wids, y_wids, test_size = 0.2, random_state = 31, stratify = y_wids)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(73370, 598) (18343, 598) (73370, 1) (18343, 1)


In [6]:
X_train_us = resampling_unbalanced(X_train, y_train, 'under')[0]
y_train_us = resampling_unbalanced(X_train, y_train, 'under')[1]

# GridSearch for LGBM

In [7]:
#set parameters without tuning

params = {'bosting_type' :'gbdt',
          'max_depth' : 7,
          'objective': 'binary',
          'learning_rate': 0.1,
          'n_estimators': 100}


In [8]:
#create parameters to search

gridParams = {
    'learning_rate': [0.1,0.05,0.01],
    'n_estimators': [100,150,200,250],
    'max_depth': [7, -1, 10],
    'boosting_type' : ['gbdt'],
    'objective' : ['binary'],
    }



In [9]:
# Create classifier to use

md_1 = LGBMClassifier(boosting_type= 'gbdt',
          objective = 'binary',
          max_depth = params['max_depth'],
          n_estimators = params['n_estimators'],
          learning_rate = params['learning_rate'] )

In [10]:
# To view the default model params:
md_1.get_params().keys()

dict_keys(['boosting_type', 'class_weight', 'colsample_bytree', 'importance_type', 'learning_rate', 'max_depth', 'min_child_samples', 'min_child_weight', 'min_split_gain', 'n_estimators', 'n_jobs', 'num_leaves', 'objective', 'random_state', 'reg_alpha', 'reg_lambda', 'silent', 'subsample', 'subsample_for_bin', 'subsample_freq'])

In [11]:
# Create the grid

grid = GridSearchCV(md_1, gridParams,
                    verbose=0,
                    cv=3,
                    n_jobs=1,
                   scoring = 'f1')

In [12]:
# Run the grid with under-sampled data
grid.fit(X_train_us, y_train_us)


# Print the best parameters found


print(grid.best_params_)
print(grid.best_score_)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


{'boosting_type': 'gbdt', 'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 200, 'objective': 'binary'}
0.8128678488536535


In [26]:
# Using parameters already set above, replace in the best from the grid search

params['max_depth'] = grid.best_params_['max_depth']
params['learning_rate'] = grid.best_params_['learning_rate']
params['n_estimators'] = grid.best_params_['n_estimators']



print('Fitting with params: ')
print(params)

Fitting with params: 
{'bosting_type': 'gbdt', 'max_depth': 7, 'objective': 'binary', 'learning_rate': 0.05, 'n_estimators': 200}


In [27]:
# create a dictionary of classifers with the hyperparameters you would like to try 
classifiers = {'LGB_U_tuned': LGBMClassifier(boosting_type= 'gbdt',
          objective = 'binary',
          max_depth = params['max_depth'],
          n_estimators = params['n_estimators'],
          learning_rate = params['learning_rate'])}

In [28]:
# keys for the classifiers 
selected_clfs = ['LGB_U_tuned']

In [29]:
clf_under_matrix = pd.DataFrame(columns = ['model', 'precision_test', 'recall_test', 'FPR_test', 'AUROC_test',
                                     'f1_test', 'f1_train', 'f1_CV'])

# for every classifer in the selected classifers list
for idx, classifier in enumerate(selected_clfs):

    # get the classifer and hyperparameters from the model
    clf = classifiers[classifier]

    # fit the model 
    clf.fit(X_train_us, y_train_us.values.ravel())
    print(f"-------- classifier being run is {classifier} ---------")

    print(f"-------- saving the the model ---------")
    # Write the model to file
    dump(clf, '{}.joblib'.format(classifier)) 

    print(f"-------- computing y_predict, y_prob, y_train_predict ---------")
    # prediction of y based on X_test
    y_predict = clf.predict(X_test)
    # prediction of y probability based on X_test
    y_proba = clf.predict_proba(X_test)[:,1]
    # prediction of y based on X_train
    y_train_predict = clf.predict(X_train_us)

    print(f"-------- creating the confusion matrix ---------") 
    # confusion matrix 
    cmat = confusion_matrix(y_test.values.ravel(), y_predict)
    # tp, fn, fp, tn
    tp = cmat[1,1]
    fn = cmat[1,0]
    fp = cmat[0,1]
    tn = cmat[0,0]

    print(f"-------- calculating the metrics ---------") 
    #precision score on test
    p_score_test = precision_score(y_test.values.ravel(), y_predict)

    #recall score on test
    r_score_test = tp/(tp+fn) 

    #FPR score on test
    fpr_test = fp/(fp+tn)

    #AUROC score on test
    auroc_test = roc_auc_score(y_test.values.ravel(), y_proba)

    #f1 score on test
    f1_test = f1_score(y_test.values.ravel(), y_predict)

    #f1 score on train
    f1_train = f1_score(y_train_us.values.ravel(), y_train_predict)

    #f1 score on cross validation 
    k_fold = 4
    f1_cv = cross_val_score(clf, X_train_us, y_train_us.values.ravel(), cv=k_fold, scoring = 'f1')
    print(f"Cross Validation is on {k_fold} folds.")

    print(f"-------- append values to the model matrix  ---------") 

    # append to matrix
    df2 = pd.DataFrame([[classifier,p_score_test,r_score_test, fpr_test, auroc_test, f1_test, f1_train, f1_cv]], 
                       columns=['model', 'precision_test', 'recall_test', 'FPR_test', 'AUROC_test',
                                     'f1_test', 'f1_train', 'f1_CV'])
    
    clf_under_matrix = pd.concat([df2, clf_under_matrix])


-------- classifier being run is LGB_U_tuned ---------
-------- saving the the model ---------
-------- computing y_predict, y_prob, y_train_predict ---------
-------- creating the confusion matrix ---------
-------- calculating the metrics ---------
Cross Validation is on 4 folds.
-------- append values to the model matrix  ---------


In [30]:
clf_under_matrix

Unnamed: 0,model,precision_test,recall_test,FPR_test,AUROC_test,f1_test,f1_train,f1_CV
0,LGB_U_tuned,0.291713,0.820594,0.188186,0.8997,0.430417,0.912183,"[0.8044657097288676, 0.816822429906542, 0.8191..."
