In [None]:
# Optional exploratory pair plot, left disabled.
# NOTE(review): this cell sits above the import and data-loading cells, so
# un-commenting it as-is would fail on a fresh kernel (`sns` and `data` are
# only defined further down) - move it below the data load before enabling.
# sns.set_palette('colorblind')
# sns.pairplot(data=data, height=3)

In [None]:
#import libraries
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Styling used for VSCode: switch matplotlib to its built-in dark theme so
# figures match a dark editor background. This is a global setting, so it
# applies to every figure drawn after this cell runs.
from matplotlib import style
style.use('dark_background')

# from sklearn.externals import joblib


In [None]:
# Read the survey extract into a DataFrame and show its (rows, columns) size.
# The path is relative, so the CSV must sit next to the notebook.
DATA_CSV = 'diabetes_binary_health_indicators_BRFSS2015.csv'
data = pd.read_csv(DATA_CSV)
data.shape

In [None]:
#check for completeness of data
# info() lists each column's dtype and non-null count, so missing values show
# up as a non-null count smaller than the total number of rows.
data.info()

In [None]:
# Pairwise correlation between every pair of columns (the label included).
# In the heatmap, brighter cells indicate more correlation.
corr = data.corr()
print(corr)
axis_labels = corr.columns
sns.heatmap(corr, xticklabels=axis_labels, yticklabels=axis_labels)

Initial Data Visualization Interpretations from Heatmap

1. HighBP, HighChol, BMI, HeartDiseaseorAttack, GenHlth, and Age all have significant correlation with the outcome variable.

In [None]:
#Splitting Features and Label
# Column 0 holds the target label; every remaining column is a predictor.
y = data.iloc[:, 0]
# Slice through to the end of the frame: a `1:-1` slice would silently drop
# the last column, so the model would never see that feature.
X = data.iloc[:, 1:]
X.head()

In [None]:


# Hold out 5% of the rows for evaluation; the fixed seed makes the split
# reproducible from run to run.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.05, random_state=42)
X_train, X_test, y_train, y_test = split_parts




In [None]:
%%time
#training and fitting Logistic Regression to model
model = LogisticRegression(solver='liblinear', random_state=0)
model.fit(X_train,y_train)

#evaluating model
accuracy = model.score(X_test, y_test)
print("accuracy = ", accuracy * 100, "%")


In [None]:
# Show the fitted bias term first, then the per-feature weights.
for fitted_parameter in (model.intercept_, model.coef_):
    print(fitted_parameter)

In [None]:
# Pair each training column with its fitted weight, order the rows from the
# most negative to the most positive weight, and flag the sign of each one.
coeff = list(model.coef_[0])
labels = list(X_train.columns)
features = (
    pd.DataFrame({'Features': labels, 'importance': coeff})
    .sort_values(by=['importance'], ascending=True)
    .assign(positive=lambda frame: frame['importance'] > 0)
    .set_index('Features')
)

# Horizontal bars: blue for positive weights, red for zero or negative ones.
bar_colors = features['positive'].map({True: 'blue', False: 'red'})
features['importance'].plot(kind='barh', figsize=(11, 6), color=bar_colors)
plt.xlabel('Importance')

Interpretations on the Visualization of the Weights :

1. CholCheck, HighBP, GenHlth, and HighChol have significant influence on the model.
2. HvyAlcoholConsump has a negative weight in the model, meaning that heavier alcohol consumption is associated with a lower predicted likelihood of diabetes.
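3. The heatmap suggested that features such as BMI and Age are highly correlated with the outcome variable, but the model assigns larger weights to CholCheck, HighBP, GenHlth, and HighChol instead. Caveat: the features have not been normalized, so the coefficient magnitudes are not directly comparable across features; the data needs to be normalized before this statement can be considered accurate.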

In [None]:
# Score the held-out rows twice: once as per-class probabilities
# (probability of being class '0' and class '1') and once as hard labels.
predictionProbability = model.predict_proba(X_test)
prediction = model.predict(X_test)

for caption, values in (('Probability:', predictionProbability),
                        ('prediction:', prediction)):
    print(caption, values)


In [None]:
#confusion matrix
# Rows are the actual classes and columns the predicted classes, in sorted
# label order, so a binary 0/1 problem prints as [[TN, FP], [FN, TP]].
from sklearn.metrics import confusion_matrix

# Store the result under its own name: assigning it to `confusion_matrix`
# would shadow the imported function and break any later call to it.
conf_matrix = confusion_matrix(y_test, prediction)
print(conf_matrix)

Confusion Matrix Interpretations

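Taking `Diabetes_binary = 1` as the positive class (scikit-learn prints the matrix as `[[TN, FP], [FN, TP]]`):

- True negative is 10653.
- True positive is 284.
- False positive is 213.
- False negative is 1524.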

In [None]:
# Per-class precision, recall and F1 for the held-out predictions.
from sklearn.metrics import classification_report

report_text = classification_report(y_test, prediction)
print(report_text)

In [None]:
#ROC Curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Use the class-1 probabilities for BOTH the curve and the AUC. Feeding the
# hard 0/1 output of predict() into roc_auc_score scores a single operating
# point, so the number in the legend would not be the area under the curve
# that is actually drawn.
positive_scores = model.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, positive_scores)
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save before show(): show() may leave an empty figure behind it.
plt.savefig('Log_ROC')
plt.show()

In [None]:
# Decision surface of a logistic regression fitted on a 2-D PCA projection of
# the features, with the held-out test points drawn on top of it.
from matplotlib.colors import ListedColormap
from sklearn.decomposition import PCA

# Create color maps: light shades for the predicted regions, bold for points
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA'])
cmap_bold = ListedColormap(['#FF0000', '#006400'])

# Learn the 2-D projection from the training split only.
pca = PCA(n_components=2)
pca.fit(X_train)
PCAX_train = pca.transform(X_train)

# Fit a separate 2-feature classifier for the plot. It gets its own name so
# the full-feature `model` from the earlier cells is not overwritten (which
# would make those cells fail when re-run against the original features).
pca_model = LogisticRegression(solver='liblinear', random_state=0)
pca_model.fit(PCAX_train, y_train)

# Define the bounds of the plotted domain and evaluate the classifier on a
# 100 x 100 grid covering it.
x_min, x_max = PCAX_train[:, 0].min() - .1, PCAX_train[:, 0].max() + .1
y_min, y_max = PCAX_train[:, 1].min() - .1, PCAX_train[:, 1].max() + .1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                     np.linspace(y_min, y_max, 100))
Z = pca_model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Plot the test points, projected with the SAME PCA that was fitted on the
# training data. Refitting PCA on the test split would place the points in a
# different coordinate system from the decision surface behind them.
PCAX_test = pca.transform(X_test)
plt.scatter(PCAX_test[:, 0], PCAX_test[:, 1], c=y_test, cmap=cmap_bold, s=1)
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('Logistic Regression')
plt.colorbar()
plt.axis('tight')
plt.show()



In [None]:
# Disabled experiment: the whole cell is a single triple-quoted string, so none
# of the statsmodels code inside it executes; the notebook only evaluates the
# string itself.
# NOTE(review): before re-enabling, check that `anova_lm` accepts a Logit
# result - TODO confirm against the statsmodels documentation.
'''
# Another way to apply Logistic Regression model to data 
import statsmodels.api as sm
from statsmodels.formula.api import logit

logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary2())

# Using Anova to check for significance
model1 = logit('Diabetes_binary ~ C(HighBP) + C(HighChol) + C(CholCheck) + C(BMI) + C(Smoker) + C(Stroke) + C(HeartDiseaseorAttack) + C(PhysActivity) + C(Fruits) + C(Veggies) + C(HvyAlcoholConsump) + C(AnyHealthcare) + C(NoDocbcCost) + C(GenHlth) + C(MentHlth) + C(PhysHlth) + C(DiffWalk) + C(Sex) + C(Age) + C(Education) + C(Income)', data=data).fit()

print(model1.params)
anova_table = sm.stats.anova_lm(model1, typ=2)
anova_table
'''