In [13]:
# importing standard packages
import pandas as pd
import numpy as np 
from scipy import stats 

# importing the plotting functions
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline 

# preprocessing/ model selection 
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score

# importing the classifiers 
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

# importing the metrics 
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report,f1_score
from sklearn.metrics import roc_curve

# oversampling techniques 
from imblearn.over_sampling import SMOTE

# undersampling techniques
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline, make_pipeline

# importing model saving package 
from joblib import dump, load


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
Using TensorFlow backend.


In [3]:
# Auto ML
# h2o supplies the AutoML runs further down. h2o.init() connects to an H2O
# server on localhost and, as the log below shows, starts a local one when
# none is already running.
import h2o
from h2o.automl import H2OAutoML, get_leaderboard

h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_152-release"; OpenJDK Runtime Environment (build 1.8.0_152-release-1056-b12); OpenJDK 64-Bit Server VM (build 25.152-b12, mixed mode)
  Starting server from /Users/meghnadiwan/anaconda3/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/tx/409dykm530z8n32rv0hs94zm0000gn/T/tmpih2_ikt_
  JVM stdout: /var/folders/tx/409dykm530z8n32rv0hs94zm0000gn/T/tmpih2_ikt_/h2o_meghnadiwan_started_from_python.out
  JVM stderr: /var/folders/tx/409dykm530z8n32rv0hs94zm0000gn/T/tmpih2_ikt_/h2o_meghnadiwan_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,04 secs
H2O cluster timezone:,America/Chicago
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.3
H2O cluster version age:,1 month and 5 days
H2O cluster name:,H2O_from_python_meghnadiwan_mzmklw
H2O cluster total nodes:,1
H2O cluster free memory:,902 Mb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


## Functions for Notebooks

In [14]:
def draw_conf_mat(mat, labels=None):
    """
    Draw a confusion matrix as an annotated heatmap.

    Parameters:
    ------------------
    mat:                ndarray of shape (n_classes, n_classes)
                        confusion matrix holding integer counts
                        (cells are formatted with "d")

    labels:             sequence of str, optional
                        Tick labels for both axes, one per class.
                        Defaults to "0", "1", ..., "n_classes - 1", which for a
                        2x2 matrix is the previous fixed ['0', '1'].

    Returns:
    ------------------
    ax:                 matplotlib Axes containing the heatmap
    """

    # Derive the labels from the matrix size so matrices with more than two
    # classes are labelled correctly instead of always getting ['0', '1'].
    if labels is None:
        labels = [str(i) for i in range(len(mat))]

    # plt / sns come from the notebook's import cell.
    fig, ax = plt.subplots(figsize=(10, 10))

    # Draw on the Axes we created rather than on whatever pyplot considers
    # "current", so the returned ax is guaranteed to be the one drawn on.
    sns.heatmap(mat, annot=True, fmt="d",
                xticklabels=labels,
                yticklabels=labels,
                ax=ax)
    ax.set_ylabel("Actual")
    ax.set_xlabel("Predicted")

    return ax

In [15]:
def draw_roc_curve(y_true, y_proba):
    """
    Plot the model's ROC curve together with a constant-score baseline.

    Parameters:
    ------------------
    y_true:             array-like of shape (n_samples,)
                        True label of target (y)

    y_proba:            array-like of shape (n_samples,)
                        The predicted probability of target (y)

    Returns:
    ------------------
    ax:                 matplotlib Axes holding both curves
    """
    # Baseline: score every sample identically (1) and trace its ROC curve.
    constant_scores = [1] * len(y_true)
    base_fpr, base_tpr, _ = roc_curve(y_true, constant_scores)
    model_fpr, model_tpr, _ = roc_curve(y_true, y_proba)

    fig, ax = plt.subplots(figsize=(10, 6))

    ax.plot(base_fpr, base_tpr, 'b', label='baseline')
    ax.plot(model_fpr, model_tpr, 'r', label='model')
    ax.legend()

    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curves')

    return ax

In [16]:
def metrics_imbalanced(your_confusion_matrix, y_true, y_proba):
    """
    Print precision, recall, fallout and the ROC AUC score.

    Precision, recall and fallout are derived from the four cells of the
    confusion matrix; the ROC AUC is computed from y_true and y_proba.

    Parameters:
    ------------------
    your_confusion_matrix:   ndarray of shape (n_classes, n_classes)
                             confusion matrix, indexed as [actual, predicted]

    y_true:                  array-like of shape (n_samples,)
                             True label of target (y)

    y_proba:                 array-like of shape (n_samples,)
                             The predicted probability of target (y)

    Returns:
    ------------------
    None; the four metrics are written to stdout.
    """
    cm = your_confusion_matrix

    # Row = actual class, column = predicted class.
    tn, fp = cm[0, 0], cm[0, 1]
    fn, tp = cm[1, 0], cm[1, 1]

    # Computed before anything is printed, so a failure here prints nothing.
    auroc = roc_auc_score(y_true, y_proba)

    print('Precision = {:0.3f}'.format(tp / (tp + fp)))
    print('Recall (TPR) = {:0.3f}'.format(tp / (tp + fn)))
    print('Fallout (FPR) = {:0.3f}'.format(fp / (fp + tn)))
    print('Roc_auc_score = {:0.3f}'.format(auroc))

In [17]:
def resampling_unbalanced(train_x, train_y, sample_method, random_state=None):
    """
    Rebalance a training set by SMOTE oversampling or random undersampling.

    Parameters
    ----------
    train_x : pd.DataFrame
            Training features.
    train_y : pd.DataFrame or pd.Series
            Training target: a single-column frame (e.g. "hospital_death")
            or a Series, sharing its index with train_x.
    sample_method : str
            "over"  -> oversample with SMOTE.
            "under" -> randomly undersample every class down to the size of
                       the smallest class.
    random_state : int or None, default None
            Seed passed to SMOTE / DataFrame.sample so the resampling can be
            reproduced. None keeps the previous unseeded behaviour.

    Returns
    -------
    train_x : resampled train_x
                pd.DataFrame
    train_y : resampled train_y
                for "under": single-column pd.DataFrame
                for "over": whatever SMOTE.fit_resample returns for train_y

    Raises
    ------
    ValueError
            If sample_method is neither "over" nor "under", or if train_y is
            a DataFrame with more than one column.
    """

    if sample_method == "over":
        # Resample the *arguments*. The earlier version read the notebook
        # globals X_train / y_train here, ignoring what the caller passed in.
        oversample = SMOTE(random_state=random_state)
        train_x_ros, train_y_ros = oversample.fit_resample(train_x, train_y)

        return train_x_ros, train_y_ros

    if sample_method == "under":
        # Normalise the target to a single-column frame and take the column
        # name from it instead of hard-coding "hospital_death".
        if isinstance(train_y, pd.Series):
            series_name = train_y.name if train_y.name is not None else "target"
            train_y = train_y.to_frame(name=series_name)
        if train_y.shape[1] != 1:
            raise ValueError(
                "train_y must have exactly one column, got %d" % train_y.shape[1]
            )
        target_col = train_y.columns[0]

        # Concatenate X and y so rows can be sampled together.
        train_data = pd.concat([train_x, train_y], axis=1)

        # value_counts() is sorted by frequency (largest first), so take the
        # minimum explicitly rather than assuming class 0 is the majority.
        class_counts = train_data[target_col].value_counts()
        minority_size = int(class_counts.min())

        # Keep the smallest class as is; sample every larger class down to it.
        # Iterating in value_counts order puts the majority rows first,
        # matching the previous output layout.
        balanced_parts = []
        for label in class_counts.index:
            class_rows = train_data[train_data[target_col] == label]
            if len(class_rows) > minority_size:
                class_rows = class_rows.sample(minority_size, random_state=random_state)
            balanced_parts.append(class_rows)

        # Put the train dataset back together and split features / target.
        train_rus = pd.concat(balanced_parts, axis=0)
        train_x_rus = train_rus.drop(target_col, axis=1)
        train_y_rus = train_rus[[target_col]]

        return train_x_rus, train_y_rus

    raise ValueError(
        "sample_method must be 'over' or 'under', got %r" % (sample_method,)
    )

## Bring in the X_wids and y_wids from the WIDS_Feature_Engineer.ipynb
Run the WIDS Feature Engineer notebook first: it must have saved `X_wids` and `y_wids` with `%store` so that the cell below can restore them with `%store -r`.

In [40]:
# grab the stored data frames
# `%store -r NAME` restores a variable previously saved with `%store NAME`;
# here that is expected to have been done by WIDS_Feature_Engineer.ipynb.
%store -r X_wids
%store -r y_wids

In [41]:
# Sanity check: features and target should have the same number of rows.
X_wids.shape, y_wids.shape

((91713, 598), (91713, 1))

In [42]:
# Remove the 'hospital_admit_source_Observation' column from the features.
# NOTE(review): the reason for excluding this column is not recorded here.
# errors='ignore' keeps the cell idempotent: because X_wids is overwritten,
# re-running the cell used to raise KeyError once the column was gone.
X_wids = X_wids.drop(columns=['hospital_admit_source_Observation'], errors='ignore')

In [43]:
# Join features and target column-wise (rows aligned on the index) into the
# single frame that is handed to H2O below, then display it.
train = pd.concat([X_wids, y_wids],axis=1)
train

Unnamed: 0_level_0,age,bmi,elective_surgery,height,pre_icu_los_days,readmission_status,weight,albumin_apache,apache_post_operative,arf_apache,...,apache_3j_diagnosis_desc_Sepsis_of_urinary_tract_origin,apache_3j_diagnosis_desc_Skin_surgery,apache_3j_diagnosis_desc_Stroke,apache_3j_diagnosis_desc_Subarachnoid_haemorrhage,apache_3j_diagnosis_desc_Subdural_Epidural_haematoma,apache_3j_diagnosis_desc_Unstable_angina,apache_3j_diagnosis_desc_Valvular_heart_surgery,apache_3j_diagnosis_desc_Viral_pneumonia,apache_3j_diagnosis_desc_unknown,hospital_death
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25312,68.0,22.730000,0,180.3,0.541667,0,73.9,2.3,0,0.0,...,1,0,0,0,0,0,0,0,0,0
59342,77.0,27.420000,0,160.0,0.927778,0,70.2,2.6,0,0.0,...,0,0,0,0,0,0,0,0,0,0
50777,25.0,31.950000,0,172.7,0.000694,0,95.3,3.0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
46918,81.0,22.640000,1,165.1,0.000694,0,61.7,3.0,1,0.0,...,0,0,0,0,0,0,1,0,0,0
34377,19.0,27.654655,0,188.0,0.073611,0,80.3,3.0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78108,75.0,23.060250,0,177.8,0.298611,0,72.9,3.0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
13486,56.0,47.179671,0,183.0,0.120139,0,158.0,3.0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
58179,48.0,27.236914,0,170.2,0.046528,0,78.9,2.9,0,0.0,...,0,0,0,0,0,0,0,0,0,0
120598,65.0,23.297481,0,154.9,0.081944,0,55.9,3.0,0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
# Upload the pandas frame to the H2O cluster and convert the target to a
# factor (categorical) column.
# NOTE(review): presumably so AutoML treats this as classification - confirm.
train_hf = h2o.H2OFrame(train)
train_hf['hospital_death'] = train_hf['hospital_death'].asfactor()

Parse progress: |█████████████████████████████████████████████████████████| 100%


## Run Auto ML

In [45]:
# y: name of the response column; x: every other column name (predictors).
y = "hospital_death"
x = train_hf.columns
# list.remove raises ValueError if the response column is missing.
x.remove(y)

In [None]:
# First AutoML run: max_models = 10, fixed seed, no class balancing requested.
aml = H2OAutoML(max_models = 10, seed = 12)
aml.train(x = x, y = y, training_frame = train_hf)

In [None]:
# Leaderboard of the AutoML run currently bound to `aml`; show its first rows.
lb = aml.leaderboard
lb.head()

In [50]:
# Second AutoML run: only 2 models, with balance_classes = True.
# This rebinds `aml`, so the previous run's object is no longer reachable
# under that name. The recorded execution of this cell was interrupted.
aml = H2OAutoML(max_models = 2, seed = 12, balance_classes = True) # max_runtime_secs is left unset
aml.train(x = x, y = y, training_frame = train_hf)

AutoML progress: |████ (cancelled)


KeyboardInterrupt: 

In [None]:
# Leaderboard of whichever AutoML run `aml` currently refers to (after the
# cell above, the balance_classes run); rebinds `lb` as well.
lb = aml.leaderboard
lb.head()

## Train Test Split without Correction of Imbalanced Data

In [11]:
# Split train-test dataset
# 80/20 split with a fixed random_state; stratify = y_wids keeps the class
# proportions of the target the same in the train and test parts.
# (train_test_split is also imported in the first import cell.)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_wids, y_wids, test_size = 0.2, random_state = 31, stratify = y_wids)

# Confirm the resulting shapes.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(73370, 597) (18343, 597) (73370, 1) (18343, 1)


## Models with Correction Of Imbalanced Data Using Undersampling

In [19]:
# Undersample the training split only; X_test / y_test are left untouched.
X_train_U, y_train_U = resampling_unbalanced(X_train, y_train, "under")

In [20]:
# Shapes after undersampling: row counts of features and target must match.
X_train_U.shape, y_train_U.shape

((12664, 597), (12664, 1))