# Regression

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import random
import xgboost as xgb
from mlxtend.plotting import plot_decision_regions

# Use the 'ggplot' style sheet for the figures drawn below.
plt.style.use('ggplot')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
np.set_printoptions(suppress=True) # Suppress scientific notation where possible

In [None]:
from sklearn import datasets, metrics
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, precision_recall_curve, accuracy_score, f1_score, fbeta_score
from sklearn.preprocessing import OneHotEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from collections import Counter

# Load the dataset from the CSV file in the notebook's working directory.
df = pd.read_csv('./diabetes.csv')

In [None]:
# Remove the 'Unnamed: 0' column (presumably a stray index column written when
# the CSV was saved -- confirm). errors='ignore' keeps the cell re-runnable:
# `del df[...]` raises KeyError once the column is already gone.
df.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)

In [None]:
# Preview the first rows of the data.
df.head()

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

The `readmitted` column currently contains three categorical values for whether a patient is readmitted. We'll simplify the values in the column and designate 0 to mean not readmitted and 1 to signify readmitted.

In [None]:
# Collapse the readmission categories into a binary target:
# 'NO' -> 0 (not readmitted); '>30' and '<30' -> 1 (readmitted).
# The result is assigned back to the column instead of using
# replace(..., inplace=True) on the selection: that chained in-place form is
# deprecated in pandas 2.x and leaves `df` unchanged under Copy-on-Write.
df['readmitted'] = df['readmitted'].replace({'NO':0, '>30':1, '<30':1})
df['readmitted'].value_counts()

Let's also look at a heatmap to get an idea of which features are more closely correlated in order to choose predictor variables for our model.

In [None]:
# Columns (including the target) whose pairwise correlations are plotted.
heatmap_columns = [
    'age', 'admission_type_id', 'discharge_disposition_id', 'time_in_hospital',
    'num_lab_procedures', 'num_procedures', 'num_medications', 'number_diagnoses',
    'readmitted',
]
df1 = df[heatmap_columns]
correlations = df1.corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(7, 7))
sns.heatmap(correlations, cmap='coolwarm', annot=True)

### Variable Pre-Processing

We'll pre-process data using the following steps:
1. Split the data into train, test, and validation sets.
2. One-hot encode categorical variables for each set.
3. Standardize continuous variables for each set.
4. Combine the processed features into a dataframe.

In [None]:
# Split data into 3: 60% train, 20% validation, 20% test
feature_columns = ['race', 'gender', 'age', 'time_in_hospital',
                   'num_lab_procedures', 'num_procedures', 'num_medications',
                   'diag_1', 'diag_2', 'diag_3']

# Step 1: hold out 20% of all rows as the test set.
X_train, X_test, label_train, label_test = train_test_split(
    df[feature_columns], df['readmitted'], test_size=0.2, random_state=2018)

# Step 2: carve the validation set out of the remaining 80% only
# (0.25 * 0.8 = 0.2 of all rows). Splitting the full dataframe a second time
# would put test rows back into the train/validation sets and leak them into
# model fitting.
X_train, X_val, label_train, label_val = train_test_split(
    X_train, label_train, test_size=0.25, random_state=2019)

In [None]:
# Names of the categorical columns that will be one-hot encoded.
cat_variable = ['race', 'gender', 'diag_1', 'diag_2', 'diag_3']

# Categorical part of the training features.
X_train_cat = X_train[cat_variable]

In [None]:
# One-hot encode the categorical training columns.
from sklearn.preprocessing import OneHotEncoder

# Dense output; the first level of every feature is dropped.
ohe = OneHotEncoder(drop='first', sparse=False)
# Learn the categories from the training data and encode it in one step.
cats = ohe.fit_transform(X_train_cat)

In [None]:
# Wrap the encoded training matrix in a DataFrame that carries the encoder's
# feature names and the row index of the original training rows.
columns = ohe.get_feature_names(cat_variable)
X_train_cat_df = pd.DataFrame(data=cats, index=X_train_cat.index, columns=columns)
X_train_cat_df.head()

In [None]:
# Encode the test set with the encoder that was fitted on the training data.
X_test_cat = X_test[cat_variable]
cats_test = ohe.transform(X_test_cat)

# Same column naming as for the training frame; keep the test row index.
cat_columns = ohe.get_feature_names(cat_variable)
X_test_cat_df = pd.DataFrame(data=cats_test, index=X_test_cat.index, columns=cat_columns)
X_test_cat_df.head()

In [None]:
# Encode the validation set with the encoder that was fitted on the training data.
X_val_cat = X_val[cat_variable]
cats_val = ohe.transform(X_val_cat)

# Same column naming as for the training frame; keep the validation row index.
cat_columns = ohe.get_feature_names(cat_variable)
X_val_cat_df = pd.DataFrame(data=cats_val, index=X_val_cat.index, columns=cat_columns)
X_val_cat_df.head()

In [None]:
# Pre-process continuous variables: select the columns that will be standardized.
X_train_cont = X_train[['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications']]

X_train_cont.head()

In [None]:
# Standardize the continuous training columns.
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
# Learn the scaling parameters from the training data and apply them in one step.
X_train_scaled = ss.fit_transform(X_train_cont)

# Back to a DataFrame with the original column names and row index.
cont_columns = X_train_cont.columns
X_train_scaled_df = pd.DataFrame(data=X_train_scaled, index=X_train_cont.index, columns=cont_columns)

X_train_scaled_df.head()

In [None]:
# Scale the test set with the scaler that was fitted on the training data.
X_test_cont = X_test[['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications']]
cont_columns = X_test_cont.columns

X_test_scaled = ss.transform(X_test_cont)
X_test_scaled_df = pd.DataFrame(data=X_test_scaled, index=X_test_cont.index, columns=cont_columns)

X_test_scaled_df.head()

In [None]:
# Scale the validation set with the scaler that was fitted on the training data.
X_val_cont = X_val[['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications']]
cont_columns = X_val_cont.columns

X_val_scaled = ss.transform(X_val_cont)
X_val_scaled_df = pd.DataFrame(data=X_val_scaled, index=X_val_cont.index, columns=cont_columns)

X_val_scaled_df.head()

In [None]:
# Training design matrix: one-hot encoded columns followed by the scaled
# continuous columns, joined side by side on the row index.
train_parts = [X_train_cat_df, X_train_scaled_df]
X_train_combined = pd.concat(train_parts, axis=1)

X_train_combined.head()

In [None]:
# Test design matrix: one-hot encoded columns followed by the scaled
# continuous columns, joined side by side on the row index.
test_parts = [X_test_cat_df, X_test_scaled_df]
X_test_combined = pd.concat(test_parts, axis=1)

X_test_combined.head()

In [None]:
# Validation design matrix: one-hot encoded columns followed by the scaled
# continuous columns, joined side by side on the row index.
val_parts = [X_val_cat_df, X_val_scaled_df]
X_val_combined = pd.concat(val_parts, axis=1)

X_val_combined.head()

## KNN
Let's now fit our data to a kNN model and look at the accuracy, precision, and recall scores.

In [None]:
def print_accuracy(model, x_tr, y_tr, x_te, y_te):
    '''
    Print ``model.score`` on the training and the test data as percentages.

    model      -- fitted estimator exposing ``score(X, y)``
    x_tr, y_tr -- training features and labels
    x_te, y_te -- test features and labels
    '''
    print("The accuracy score for {} is...".format(model))
    # One line per data set, scored in the order training -> test.
    for template, features, labels in (
        ("Training: {:6.2f}%", x_tr, y_tr),
        ("Test set: {:6.2f}%", x_te, y_te),
    ):
        print(template.format(100*model.score(features, labels)))

In [None]:
def precision_and_recall(y_te, y_pred, threshold=0.5):
    '''
    Print precision and recall of ``y_pred`` against the true labels ``y_te``.

    ``threshold`` only selects the banner line that is printed; it is not
    applied to ``y_pred``, which must already hold the final class predictions.
    '''
    if threshold == 0.5:
        print("Default threshold...")
    else:
        print("Threshold of {}...".format(threshold))

    # zero_division=0 is passed so a degenerate prediction yields a score of 0.
    precision_pct = 100*precision_score(y_te, y_pred, zero_division=0)
    recall_pct = 100*recall_score(y_te, y_pred, zero_division=0)
    print("Precision: {:6.2f}%, Recall: {:6.2f}%".format(precision_pct, recall_pct))

In [None]:
def print_scores(model, x_tr, y_tr, x_te, y_te, y_pred, threshold=0.5):
    '''
    Print accuracy, precision, and recall scores for a fitted model.

    Accuracy comes from ``model.score`` on the training and test data;
    precision and recall compare ``y_pred`` with ``y_te``. ``threshold`` only
    selects the banner line that is printed; it is not applied to ``y_pred``.
    '''
    # Accuracy part: one line per data set, training first.
    print("The accuracy score for {} is...".format(model))
    for template, features, labels in (
        ("Training: {:6.2f}%", x_tr, y_tr),
        ("Test set: {:6.2f}%", x_te, y_te),
    ):
        print(template.format(100*model.score(features, labels)))

    # Precision / recall part.
    if threshold == 0.5:
        print("Default threshold...")
    else:
        print("Threshold of {}...".format(threshold))
    precision_pct = 100*precision_score(y_te, y_pred, zero_division=0)
    recall_pct = 100*recall_score(y_te, y_pred, zero_division=0)
    print("Precision: {:6.2f}%, Recall: {:6.2f}%".format(precision_pct, recall_pct))

In [None]:
# Baseline kNN classifier with 5 neighbours, fitted on the training features.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_combined, label_train)

# Accuracy on the training and on the test data.
print_accuracy(knn, X_train_combined, label_train, X_test_combined, label_test)

In [None]:
# using the default threshold of 0.5, which is what vanilla predict does
y_predict = knn.predict(X_test_combined)

# Precision and recall of the kNN class predictions on the test set.
precision_and_recall(label_test, y_predict)

In [None]:
# Classify as positive whenever the predicted probability of class 1 exceeds
# the new threshold of 0.06 instead of the default 0.5.
knn_positive_proba = knn.predict_proba(X_test_combined)[:, 1]
y_predict = knn_positive_proba > 0.06

precision_and_recall(label_test, y_predict, 0.06)

Let's look at the confusion matrix for our KNeighborsClassifier.

In [None]:
# Confusion matrix for the baseline kNN model on the test set.
knn_confusion = confusion_matrix(label_test, knn.predict(X_test_combined))
plt.figure(dpi=150)
# confusion_matrix orders its rows/columns by sorted label, i.e. [0, 1].
# Series.unique() returns values in order of first appearance, which need not
# match, so the tick labels are given explicitly.
sns.heatmap(knn_confusion, cmap=plt.cm.Blues, annot=True, square=True, fmt='.0f',
           xticklabels=[0, 1],
           yticklabels=[0, 1])

# The target is readmission (0 = not readmitted, 1 = readmitted).
plt.xlabel('Predicted readmission')
plt.ylabel('Actual readmission')
plt.title('kNN confusion matrix');

### Random Oversampling
Given that there is a class imbalance for the `readmitted` target variable, we may be able to improve the model by utilizing random oversampling. 

In [None]:
# Number of records per class of the target variable.
df['readmitted'].value_counts()

Before we move on to a different model, let's try to improve the current model by using random oversampling. There is a class imbalance in our target variable, so a naive model will always be about as accurate as the imbalance itself. In other words, there are 31019 records where a patient is not readmitted to the hospital and just 8706 where a patient is readmitted. That means about 78% of the patients in our data aren't readmitted, so a classification model that always predicts "not readmitted" would already be about 78% accurate.

In [None]:
# Randomly oversample the minority class of the training data so that both
# classes are equally represented.
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
# fit_resample is the supported method name; fit_sample was only a deprecated
# alias and is no longer available in recent imbalanced-learn releases.
X_resampled, y_resampled = ros.fit_resample(X_train_combined, label_train)

In [None]:
# Yay, balanced classes! Count the records per class after oversampling.
Counter(y_resampled)

In [None]:
# Run the analysis again on the oversampled training data.
# What do we expect to see with balanced classes??
# cell takes ~15s to run
knn_resampled = KNeighborsClassifier()
clf_ros = knn_resampled.fit(X_resampled, y_resampled)

# Confusion matrix for the oversampled kNN classifier on the test set
knn_resampled_confusion = confusion_matrix(label_test, knn_resampled.predict(X_test_combined))
plt.figure(dpi=150)
sns.heatmap(knn_resampled_confusion, cmap=plt.cm.Blues, annot=True, square=True, fmt='.0f',
           xticklabels=[0, 1],
           yticklabels=[0, 1])

# The target is readmission (0 = not readmitted, 1 = readmitted).
plt.xlabel('Predicted readmission')
plt.ylabel('Actual readmission')
plt.title('Oversampled kNN confusion matrix');

We'll look at the scores again for the resampled data.

In [None]:
# Accuracy of the oversampled kNN model on its (oversampled) training data and on the test set.
print_accuracy(knn_resampled, X_resampled, y_resampled, X_test_combined, label_test)

In [None]:
# Precision and recall of the oversampled kNN model on the test set.
precision_and_recall(label_test, knn_resampled.predict(X_test_combined))

We can visualize the model performance by looking at the predicted probabilities.

In [None]:
# Predicted probability of the positive class from the baseline kNN model.
prediction = knn.predict_proba(X_test_combined)[:,1]

plt.figure(figsize=(15,7))
# One histogram per true class: negatives first (blue), then positives (red).
for true_class, legend_name, colour in ((0, 'Negatives', 'b'), (1, 'Positives', 'r')):
    plt.hist(prediction[label_test==true_class], bins=50, label=legend_name, color=colour)
plt.xlabel('Probability of being Positive Class', fontsize=25)
plt.ylabel('Number of records in each bucket', fontsize=25)
plt.legend(fontsize=15)
plt.tick_params(axis='both', labelsize=25, pad=5)
plt.show()

## Logistic Regression
Let's now look at the same information for the logistic regression model to compare. We'll look at the baseline model and then use random oversampling to see if we can improve, and then we'll look at the scores again.

In [None]:
# Baseline logistic regression (C=0.95), fitted on the training features.
logit = LogisticRegression(C=0.95).fit(X_train_combined, label_train)

# Accuracy on the training and on the test data.
print_accuracy(logit, X_train_combined, label_train, X_test_combined, label_test)

In [None]:
# using the default threshold of 0.5, which is what vanilla predict does
y_predict = logit.predict(X_test_combined)

# Precision and recall of the logistic regression class predictions on the test set.
precision_and_recall(label_test, y_predict)

In [None]:
# Classify as positive whenever the predicted probability of class 1 exceeds
# the new threshold of 0.06 instead of the default 0.5.
logit_positive_proba = logit.predict_proba(X_test_combined)[:, 1]
y_predict = logit_positive_proba > 0.06

precision_and_recall(label_test, y_predict, 0.06)

Let's do a prediction using our Logistic Regression model. We can see that the logistic regression model predicts an outcome of 0, or not readmitted, roughly 80% of the time and an outcome of 1, or readmitted, about 20% of the time.

In [None]:
# Class probabilities for every test row: column 0 is class 0, column 1 is class 1.
prediction = logit.predict_proba(X_test_combined)
prediction

We'll also look at the confusion matrix for the Logistic Regression model.

In [None]:
# Confusion matrix for the baseline logistic regression model on the test set
logit_confusion = confusion_matrix(label_test, logit.predict(X_test_combined))
plt.figure(dpi=150)
sns.heatmap(logit_confusion, cmap=plt.cm.Blues, annot=True, square=True, fmt='.0f',
           xticklabels=[0, 1],
           yticklabels=[0, 1])

# The target is readmission (0 = not readmitted, 1 = readmitted).
plt.xlabel('Predicted readmission')
plt.ylabel('Actual readmission')
plt.title('Logistic regression confusion matrix');

Given that our categorical variables have a large number of classes, a logistic model will already be hard to interpret. If we find other models that are less interpretable but have better scores, we should go with those.

In [None]:
# Look at the logistic regression model coefficients (one per column of X_train_combined)
logit.coef_

We'll try random oversampling for the LogisticRegression model like we did for our KNeighborsClassifier to see what improvements can be made.

In [None]:
# Run the analysis again on the oversampled training data.
# What do we expect to see with balanced classes??
# cell takes ~15s to run
lr_resampled = LogisticRegression()
clf_ros = lr_resampled.fit(X_resampled, y_resampled)

# Confusion matrix for the oversampled logistic regression on the test set
lr_resampled_confusion = confusion_matrix(label_test, lr_resampled.predict(X_test_combined))
plt.figure(dpi=150)
sns.heatmap(lr_resampled_confusion, cmap=plt.cm.Blues, annot=True, square=True, fmt='.0f',
           xticklabels=[0, 1],
           yticklabels=[0, 1])

# The target is readmission (0 = not readmitted, 1 = readmitted).
plt.xlabel('Predicted readmission')
plt.ylabel('Actual readmission')
plt.title('Oversampled Logistic Regression confusion matrix');

The scores for the resampled logistic regression model don't seem like an improvement over the original.

In [None]:
# Accuracy of the oversampled logistic regression on its (oversampled) training data and on the test set.
print_accuracy(lr_resampled, X_resampled, y_resampled, X_test_combined, label_test)

In [None]:
# Precision and recall of the oversampled logistic regression on the test set.
precision_and_recall(label_test, lr_resampled.predict(X_test_combined))

In [None]:
# Predicted probability of the positive class from the baseline logistic regression.
prediction = logit.predict_proba(X_test_combined)[:,1]

plt.figure(figsize=(15,7))
# One histogram per true class: negatives first (blue), then positives (red).
for true_class, legend_name, colour in ((0, 'Negatives', 'b'), (1, 'Positives', 'r')):
    plt.hist(prediction[label_test==true_class], bins=50, label=legend_name, color=colour)
plt.xlabel('Probability of being Positive Class', fontsize=25)
plt.ylabel('Number of records in each bucket', fontsize=25)
plt.legend(fontsize=15)
plt.tick_params(axis='both', labelsize=25, pad=5)
plt.show()

## Decision Tree

In [None]:
# Baseline decision tree limited to depth 4, fitted on the training features.
decisiontree = DecisionTreeClassifier(max_depth=4).fit(X_train_combined, label_train)

# Class predictions for the test set, then accuracy / precision / recall.
y_pred = decisiontree.predict(X_test_combined)
print_scores(decisiontree, X_train_combined, label_train, X_test_combined, label_test, y_pred)

In [None]:
# Confusion matrix for the baseline decision tree on the test set
dt_confusion = confusion_matrix(label_test, decisiontree.predict(X_test_combined))
plt.figure(dpi=150)
sns.heatmap(dt_confusion, cmap=plt.cm.Blues, annot=True, square=True, fmt='.0f',
           xticklabels=[0, 1],
           yticklabels=[0, 1])

# The target is readmission (0 = not readmitted, 1 = readmitted).
plt.xlabel('Predicted readmission')
plt.ylabel('Actual readmission')
plt.title('Decision Tree Confusion Matrix');

### Oversampling

In [None]:
# Run the analysis again on the oversampled training data.
# What do we expect to see with balanced classes??
# cell takes ~15s to run
# NOTE(review): unlike the baseline tree (max_depth=4) this tree has no depth
# limit -- confirm that the difference is intended.
dt_resampled = DecisionTreeClassifier()
clf_ros = dt_resampled.fit(X_resampled, y_resampled)

# Confusion matrix for the oversampled decision tree on the test set
dt_resampled_confusion = confusion_matrix(label_test, dt_resampled.predict(X_test_combined))
plt.figure(dpi=150)
sns.heatmap(dt_resampled_confusion, cmap=plt.cm.Blues, annot=True, square=True, fmt='.0f',
           xticklabels=[0, 1],
           yticklabels=[0, 1])

# The target is readmission (0 = not readmitted, 1 = readmitted).
plt.xlabel('Predicted readmission')
plt.ylabel('Actual readmission')
plt.title('Oversampled Decision Tree Confusion Matrix');

In [None]:
# Accuracy of the oversampled decision tree on its (oversampled) training data and on the test set.
print_accuracy(dt_resampled, X_resampled, y_resampled, X_test_combined, label_test)

In [None]:
# Precision and recall of the oversampled decision tree on the test set.
precision_and_recall(label_test, dt_resampled.predict(X_test_combined))

In [None]:
# Predicted probability of the positive class from the baseline decision tree.
prediction = decisiontree.predict_proba(X_test_combined)[:,1]

plt.figure(figsize=(15,7))
# One histogram per true class: negatives first (blue), then positives (red).
for true_class, legend_name, colour in ((0, 'Negatives', 'b'), (1, 'Positives', 'r')):
    plt.hist(prediction[label_test==true_class], bins=50, label=legend_name, color=colour)
plt.xlabel('Probability of being Positive Class', fontsize=25)
plt.ylabel('Number of records in each bucket', fontsize=25)
plt.legend(fontsize=15)
plt.tick_params(axis='both', labelsize=25, pad=5)
plt.show()

## Random Forest
We'll first look at the scores for the model, and then we'll try oversampling and look at the scores again.

In [None]:
# Baseline random forest with 100 trees, fitted on the training features.
randomforest = RandomForestClassifier(n_estimators=100).fit(X_train_combined, label_train)

# Class predictions for the test set, then accuracy / precision / recall.
y_pred = randomforest.predict(X_test_combined)
print_scores(randomforest, X_train_combined, label_train, X_test_combined, label_test, y_pred)

In [None]:
# Confusion matrix for the baseline random forest on the test set
rf_confusion = confusion_matrix(label_test, randomforest.predict(X_test_combined))
plt.figure(dpi=150)
sns.heatmap(rf_confusion, cmap=plt.cm.Blues, annot=True, square=True, fmt='.0f',
           xticklabels=[0, 1],
           yticklabels=[0, 1])

# The target is readmission (0 = not readmitted, 1 = readmitted).
plt.xlabel('Predicted readmission')
plt.ylabel('Actual readmission')
# This figure shows the random forest, not the decision tree.
plt.title('Random Forest Confusion Matrix');

### Oversampling

In [None]:
# Run the analysis again on the oversampled training data.
# What do we expect to see with balanced classes??
# cell takes ~15s to run
rf_resampled = RandomForestClassifier(n_estimators=100)
clf_ros = rf_resampled.fit(X_resampled, y_resampled)

# Confusion matrix for the oversampled random forest on the test set
rf_resampled_confusion = confusion_matrix(label_test, rf_resampled.predict(X_test_combined))
plt.figure(dpi=150)
sns.heatmap(rf_resampled_confusion, cmap=plt.cm.Blues, annot=True, square=True, fmt='.0f',
           xticklabels=[0, 1],
           yticklabels=[0, 1])

# The target is readmission (0 = not readmitted, 1 = readmitted).
plt.xlabel('Predicted readmission')
plt.ylabel('Actual readmission')
plt.title('Oversampled Random Forest confusion matrix');

In [None]:
# Accuracy of the oversampled random forest on its (oversampled) training data and on the test set.
print_accuracy(rf_resampled, X_resampled, y_resampled, X_test_combined, label_test)

In [None]:
# Precision and recall of the oversampled random forest on the test set.
precision_and_recall(label_test, rf_resampled.predict(X_test_combined))

In [None]:
# Predicted probability of the positive class from the baseline random forest.
prediction = randomforest.predict_proba(X_test_combined)[:,1]

plt.figure(figsize=(15,7))
# One histogram per true class: negatives first (blue), then positives (red).
for true_class, legend_name, colour in ((0, 'Negatives', 'b'), (1, 'Positives', 'r')):
    plt.hist(prediction[label_test==true_class], bins=50, label=legend_name, color=colour)
plt.xlabel('Probability of being Positive Class', fontsize=15)
plt.ylabel('Number of records in each bucket', fontsize=15)
plt.title('Performance Analysis of Random Forest Model', fontsize=25)
plt.legend(fontsize=15)
plt.tick_params(axis='both', labelsize=25, pad=5)
plt.show()

In [None]:
# Predicted probability of the positive class from the oversampled random forest.
prediction = rf_resampled.predict_proba(X_test_combined)[:,1]

plt.figure(figsize=(15,7))
# One histogram per true class of the test labels.
plt.hist(prediction[label_test==0], bins=50, label='Negatives', color='b')
plt.hist(prediction[label_test==1], bins=50, label='Positives', color='r')
plt.xlabel('Probability of being Positive Class', fontsize=15)
plt.ylabel('Number of records in each bucket', fontsize=15)
# Name the oversampled model in the title so this figure can be told apart
# from the baseline random forest histogram.
plt.title('Performance Analysis of Oversampled Random Forest Model', fontsize=25)
plt.legend(fontsize=15)
plt.tick_params(axis='both', labelsize=25, pad=5)
plt.show()

In [None]:
def plot_feature_importance(importance, names, model_type, min_importance=0.01):
    '''
    Draw a horizontal bar chart of feature importances, largest first.

    importance     -- importance score of each feature
    names          -- feature names, in the same order as ``importance``
    model_type     -- model name used as the prefix of the chart title
    min_importance -- features whose importance is below this value are left
                      out of the chart (default 0.01)
    '''
    # Pair every feature name with its importance.
    fi_df = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    })

    # Sort the DataFrame in order of decreasing feature importance
    fi_df = fi_df.sort_values(by=['feature_importance'], ascending=False)
    fi_df = fi_df.reset_index(drop=True)

    # Drop the features whose importance is below the cut-off
    fi_df = fi_df[~(fi_df['feature_importance'] < min_importance)]

    # Define size of bar plot
    plt.figure(figsize=(10,8))
    # Plot Seaborn bar chart
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    # Add chart labels; a separating space is inserted so that e.g.
    # 'Random Forest' yields 'Random Forest Feature Importance'.
    plt.title(model_type.rstrip() + ' Feature Importance', fontsize=25)
    plt.xlabel('Feature Importance', fontsize=15)
    plt.ylabel('Feature Name', fontsize=15)

In [None]:
# Feature importances of the oversampled random forest, labelled with the training column names.
plot_feature_importance(rf_resampled.feature_importances_, X_resampled.columns, 'Random Forest')

## XGBoost

In [None]:
# Hyper-parameters of the gradient-boosted classifier.
xgb_params = dict(
    n_estimators=30000,           # upper bound on boosting rounds; early stopping is requested in fit() below
    max_depth=4,
    objective='binary:logistic',  # binary classification objective
    use_label_encoder=False,
    learning_rate=.05,
    subsample=.8,
    min_child_weight=3,
    colsample_bytree=.8,
)
gbm = xgb.XGBClassifier(**xgb_params)

# Data sets evaluated during training: the training data and the validation data.
eval_set = [(X_train_combined, label_train), (X_val_combined, label_val)]

# Fit with the 'error' metric and early stopping after 50 rounds.
fit_model = gbm.fit(
    X_train_combined,
    label_train,
    eval_set=eval_set,
    eval_metric='error',
    early_stopping_rounds=50,
    verbose=False,
)

# Class predictions for the test set, then accuracy / precision / recall.
y_pred = gbm.predict(X_test_combined)

print_scores(gbm, X_train_combined, label_train, X_test_combined, label_test, y_pred)

In [None]:
# Confusion matrix for the XGBoost model on the test set.
# This cell belongs to the XGBoost section, so it evaluates `gbm` rather than
# re-plotting the random forest.
xgb_confusion = confusion_matrix(label_test, gbm.predict(X_test_combined))
plt.figure(dpi=150)
sns.heatmap(xgb_confusion, cmap=plt.cm.Blues, annot=True, square=True, fmt='.0f',
           xticklabels=[0, 1],
           yticklabels=[0, 1])

# The target is readmission (0 = not readmitted, 1 = readmitted).
plt.xlabel('Predicted readmission')
plt.ylabel('Actual readmission')
plt.title('XGBoost Confusion Matrix');

In [None]:
# Run the analysis again on the oversampled training data.
# What do we expect to see with balanced classes??
# cell takes ~15s to run
# NOTE(review): this cell sits under the XGBoost heading but refits the
# oversampled Random Forest -- confirm whether an oversampled XGBoost model
# was intended here.
rf_resampled = RandomForestClassifier(n_estimators=100)
clf_ros = rf_resampled.fit(X_resampled, y_resampled)

# Confusion matrix for the oversampled random forest on the test set
rf_resampled_confusion = confusion_matrix(label_test, rf_resampled.predict(X_test_combined))
plt.figure(dpi=150)
sns.heatmap(rf_resampled_confusion, cmap=plt.cm.Blues, annot=True, square=True, fmt='.0f',
           xticklabels=[0, 1],
           yticklabels=[0, 1])

# The target is readmission (0 = not readmitted, 1 = readmitted).
plt.xlabel('Predicted readmission')
plt.ylabel('Actual readmission')
plt.title('Oversampled Random Forest confusion matrix');

In [None]:
# Predicted probability of the positive class from the XGBoost model.
# This cell closes the XGBoost section, so it uses `gbm`; plotting `knn` here
# would only repeat the histogram from the kNN section.
prediction = gbm.predict_proba(X_test_combined)[:,1]

plt.figure(figsize=(15,7))
# One histogram per true class of the test labels.
plt.hist(prediction[label_test==0], bins=50, label='Negatives', color='b')
plt.hist(prediction[label_test==1], bins=50, label='Positives', color='r')
plt.xlabel('Probability of being Positive Class', fontsize=25)
plt.ylabel('Number of records in each bucket', fontsize=25)
plt.legend(fontsize=15)
plt.tick_params(axis='both', labelsize=25, pad=5)
plt.show()

## Compare Models
Evaluate the RMSE of various models used.

In [None]:
def rmse(actuals, preds):
    '''
    Return the Root Mean Squared Error between ``actuals`` and ``preds``.

    Both arguments must support element-wise subtraction and the result must
    offer a ``mean()`` method (e.g. NumPy arrays or pandas Series).
    '''
    residuals = actuals - preds
    squared_residuals = residuals ** 2
    return np.sqrt(squared_residuals.mean())

In [None]:
# Map each display name to the baseline model fitted earlier in the notebook.
# A plain list of name strings has neither .items() nor .predict(), so the
# fitted estimators are looked up through this dict instead.
models = {
    'kNN': knn,
    'LogisticRegression': logit,
    'DecisionTree': decisiontree,
    'RandomForest': randomforest,
    'XGBoost': gbm,
}
# RMSE between each model's test-set class predictions and the true labels.
for name, fitted_model in models.items():
    print("RMSE for {}: {}\n".format(name, rmse(fitted_model.predict(X_test_combined), label_test)))

In [None]:
plt.figure()

# Add the models to the list that you want to view on the ROC plot
models = [
{
    'label': 'k Nearest Neighbors',
    'model': KNeighborsClassifier(n_neighbors=5),
},
{
    'label': 'Logistic Regression',
    'model': LogisticRegression(C=0.95),
},
{
    'label': 'Decision Tree',
    'model': DecisionTreeClassifier(max_depth=4),
},
{
    'label': 'Random Forest',
    'model': RandomForestClassifier(n_estimators=100),
}
]

# Fit every model on the training data and draw its ROC curve on the test set
for m in models:
    model = m['model'] # select the model
    model.fit(X_train_combined, label_train) # train the model
    # Predicted probability of the positive class for every test row
    positive_proba = model.predict_proba(X_test_combined)[:,1]
    # Compute False positive rate, and True positive rate
    fpr, tpr, thresholds = metrics.roc_curve(label_test, positive_proba)
    # Area under the curve, computed from the same probabilities as the curve.
    # Hard 0/1 predictions would give the area of a single-threshold curve,
    # which does not match the line that is plotted.
    auc = metrics.roc_auc_score(label_test, positive_proba)
    # Now, plot the computed values
    plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % (m['label'], auc))
# Custom settings for the plot; the dashed diagonal marks a random classifier
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('1-Specificity(False Positive Rate)')
plt.ylabel('Sensitivity(True Positive Rate)')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()   # Display

In [None]:
plt.figure()

# Add the models to the list that you want to view on the ROC plot
models = [
{
    'label': 'k Nearest Neighbors',
    'model': KNeighborsClassifier(n_neighbors=5),
},
{
    'label': 'Logistic Regression',
    'model': LogisticRegression(C=0.95),
},
{
    'label': 'Decision Tree',
    'model': DecisionTreeClassifier(max_depth=4),
},
{
    'label': 'Random Forest',
    'model': RandomForestClassifier(n_estimators=100),
}
]

# Fit every model on the oversampled training data and draw its ROC curve on the test set
for m in models:
    model = m['model'] # select the model
    model.fit(X_resampled, y_resampled) # train the model
    # Predicted probability of the positive class for every test row
    positive_proba = model.predict_proba(X_test_combined)[:,1]
    # Compute False positive rate, and True positive rate
    fpr, tpr, thresholds = metrics.roc_curve(label_test, positive_proba)
    # Area under the curve, computed from the same probabilities as the curve.
    # Hard 0/1 predictions would give the area of a single-threshold curve,
    # which does not match the line that is plotted.
    auc = metrics.roc_auc_score(label_test, positive_proba)
    # Now, plot the computed values
    plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % (m['label'], auc))
# Custom settings for the plot; the dashed diagonal marks a random classifier
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('1-Specificity(False Positive Rate)', fontsize=15)
plt.ylabel('Sensitivity(True Positive Rate)', fontsize=15)
plt.title('Receiver Operating Characteristic', fontsize=25)
plt.legend(loc="lower right")
plt.show()   # Display