# Pattern Recognition and Machine Learning
## Week 4 Tutorial

In [1]:
import pandas as pd

## 3 Diabetes Data Analysis
### 3.1 Import and Loading dataset

In [None]:
# Read the comma-separated diabetes records into a DataFrame.
diabete_dataset = pd.read_csv(
    filepath_or_buffer="data/diabetes.csv",
    sep=",",
)

In [None]:
# Preview the first 10 rows of the loaded frame.
diabete_dataset.head(10)

In [None]:
# (number of rows, number of columns) of the loaded frame.
diabete_dataset.shape

### 3.2 Explore the data

In [None]:
# Print a per-column summary (dtype and non-null count) of the frame.
diabete_dataset.info()

### 3.3 Further analysis

In [None]:
# Pairwise correlation between columns (pandas uses Pearson by default).
diabete_dataset.corr()

In [None]:
import seaborn as sn
import matplotlib.pyplot as plt

# Annotated heatmap of the pairwise column correlations.
sn.heatmap(
    data=diabete_dataset.corr(),
    annot=True,
)
plt.show()

## 4 Diabetes Classification from Logistic Regression
### 4.1 Feature extraction

In [None]:
# Separate the predictor columns from the label column.
feature_columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

y = diabete_dataset['Outcome']  # target
X = diabete_dataset[feature_columns]  # features

In [None]:
# Report the dimensions of the feature matrix and the target vector.
print(f"Feature: {X.shape}")
print(f"Target: {y.shape}")

### 4.2 Splitting the dataset

In [None]:
# split X and y into training and testing datasets
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

In [None]:
# Shape of the training feature set produced by the split.
X_train.shape

### 4.3 Build a Logistic Regression model and make a prediction

In [None]:
from sklearn.linear_model import LogisticRegression

# C is the inverse regularisation strength; max_iter caps the solver iterations.
lgr = LogisticRegression(
    C=10,
    max_iter=5000,
)

In [None]:
# Train on the training split, then predict a class label for every test row.
# fit() returns the estimator itself, so the two steps can be chained.
result = lgr.fit(X_train, y_train).predict(X_test)

### 4.4 Model Evaluation using Confusion Matrix

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report

# Rows are the true classes, columns are the predicted classes.
conf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=result)
print(conf_matrix)

In [None]:
# Overall accuracy, followed by the support-weighted averages of the
# per-class precision, recall and F1.
# NOTE: with average='weighted', recall is the support-weighted mean of the
# per-class recalls, which is numerically the same as accuracy.
print("Accuracy:", metrics.accuracy_score(y_test, result))
for caption, scorer in (
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
    ("F1-score:", metrics.f1_score),
):
    print(caption, scorer(y_test, result, average='weighted'))

### 4.5 ROC Curve

In [None]:
# Score each test row with the predicted probability of the second class in
# lgr.classes_ instead of the hard class prediction: roc_curve sweeps a
# threshold over continuous scores, so feeding it 0/1 labels collapses the
# curve to a single operating point and understates the AUC.
# NOTE(review): assumes Outcome is coded 0/1 so column 1 is the positive class — confirm.
positive_scores = lgr.predict_proba(X_test)[:, 1]

fpr, tpr, _ = metrics.roc_curve(y_test, positive_scores)
auc = metrics.roc_auc_score(y_test, positive_scores)

auc

In [None]:
# Draw the ROC curve; the legend (lower-right corner, loc=4) carries the AUC value.
plt.title("ROC curve for Diabetes classifier")
plt.xlabel("False positive rate (1-Specificity)")
plt.ylabel("True positive rate (Sensitivity)")
plt.plot(
    fpr,
    tpr,
    label="auc=" + str(auc),
)
plt.legend(loc=4)
plt.show()

In [17]:
# Collect the positions of the misclassified test rows.
# Positions are 0-based and follow the order of y_test; they are not the
# DataFrame index labels. A row is misclassified when its predicted label
# differs from its true label.
misclassifiedIndexes = [
    position
    for position, (label, predict) in enumerate(zip(y_test, result))
    if label != predict
]

In [None]:
import numpy as np

# Show the misclassified positions as a 1-D array (transposing a 1-D array
# is a no-op, so the plain array is displayed directly).
np.array(misclassifiedIndexes)

### 4.6 Find the C that maximizes the F1-score

In [None]:
def linear_regression(c):
    """Fit a logistic regression with regularisation parameter ``c`` and score it.

    Despite its name, this trains a ``LogisticRegression`` classifier; the
    name is kept so existing calls keep working.

    The model is trained on the notebook-level ``X_train``/``y_train`` and
    evaluated on ``X_test``/``y_test``.

    Parameters
    ----------
    c : float
        Value passed as ``C`` to ``LogisticRegression``.

    Returns
    -------
    float
        Weighted-average F1-score of the predictions on the test split.
    """
    model = LogisticRegression(C=c, max_iter=5000)
    predictions = model.fit(X_train, y_train).predict(X_test)
    return metrics.f1_score(y_test, predictions, average='weighted')

In [None]:
# Candidate values of C, one per decade from 1e-4 to 1e2.
# The grid is written out explicitly rather than generated by repeatedly
# multiplying by 10, so neither the exact values tried nor the number of
# steps depends on floating-point rounding.
c_values = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]

# One weighted F1-score per candidate, in the same order as c_values.
f1_values = [linear_regression(candidate) for candidate in c_values]

f1_lgr = pd.DataFrame({
    "c": c_values,
    "f1": f1_values
})

In [None]:
# Value(s) of c whose F1-score equals the best score in the table (ties are all kept).
f1_lgr.loc[f1_lgr['f1'].eq(f1_lgr['f1'].max()), 'c']

In [None]:
# Rank the candidates from best to worst F1-score.
f1_lgr.sort_values(by='f1', ascending=False)