In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
from lem2Alg import *
from sklearn.preprocessing import StandardScaler

import numpy as np
from sklearn.metrics import mean_squared_error as mse

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc

import matplotlib.pyplot as plt


In [2]:
columns = ["preg", "glu", "bp", "st", "insulin", "BMI", "dpf", "age", "oc"]
df = pd.read_csv("diabetes.txt", names=columns, sep=" ")

In [3]:
for item in ["glu", "bp", "st", "insulin", "BMI"]:
    df[item].replace(0, df[item].median(), inplace=True)

df['bmi_age'] = df['BMI'] * df['age']
df['bmi_bp'] = df['BMI'] * df['bp']
df['bp_age'] = df['bp'] * df['age']
df['glu_ins'] = df['glu'] * df['insulin']
df['age_glu'] = df['age'] * df['glu']
df['bmi_ins'] = df['BMI'] * df['insulin']
df['preg_ins'] = df['preg'] * df['insulin']
df['bmi_st'] = df['BMI'] * df['st']
df['age_preg'] = df['age'] * df['preg']
df['dpf_glu'] = df['dpf'] * df['glu']
df['preg_glu'] = df['preg'] * df['glu']
df['dpf_insulin'] = df['dpf'] * df['insulin']
df['dpf_insulin'] = df['dpf'] * df['BMI']
df['glu_bmi_preg'] = df['glu'] * df['age'] * df['preg'] 
df['glu_bmi_age'] = df['glu'] * df['age'] * df['BMI']
df['glu_ins_age'] = df['glu'] * df['age'] * df['insulin'] 
df['bmi_ins_glu'] = df['BMI'] * df['glu'] * df['insulin'] 
df['bmi_preg_age'] = df['BMI'] * df['age'] * df['preg'] 
df['bmi_glu_dpf'] = df['BMI'] * df['glu'] * df['dpf'] 
df['preg_glu_ins'] = df['preg'] * df['glu'] * df['insulin']

df['bmi+st'] = df['BMI'] + df['st']
df['bmi+age'] = df['BMI'] + df['age']
df['glu+ins'] = df['glu'] + df['insulin']
df['preg+ins'] = df['preg'] + df['insulin']




class_0 = df[df['oc'] == 0]
class_1 = df[df['oc'] == 1]

from sklearn.utils import resample
spam_upsample = resample(class_1,
             replace=True,
             n_samples=len(class_0),
             random_state=42)

df = pd.concat([class_0, spam_upsample])
df = df.sample(frac=1).reset_index(drop=True)

In [4]:
model = lem2Classifier()

X = df.drop(['oc'], axis=1)
y = df['oc']

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_scaled = scaler.fit_transform(X)
x_scaled = pd.DataFrame(x_scaled)
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)



In [6]:
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
scores, sens, spec, pre, aucs = [], [], [], [], []
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(y_pred)
    predictions_series = pd.Series(y_pred)
    # Calculate the F1 score
    sensitivity = recall_score(y_test.astype(int), predictions_series.astype(int), pos_label=1)
    sens.append(sensitivity)
    specificity = recall_score(y_test.astype(int), predictions_series.astype(int), pos_label=0)
    spec.append(specificity)
    precision = precision_score(y_test.astype(int), predictions_series.astype(int)) 
    pre.append(precision)
    f1 = f1_score(y_test, y_pred)
    scores.append(f1)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)

{'preg': 401, 'age_preg': 54, 'glu': 107, 'oc': 518, 'BMI': 93, 'age': 64, 'age_glu': 5, 'insulin': 317, 'bp': 99, 'preg_ins': 122, 'dpf': 117, 'bp_age': 21, 'preg+ins': 122, 'glu+ins': 5, 'bmi_age': 51, 'bmi_bp': 7, 'bmi_ins': 6, 'bmi+st': 4, 'preg_glu': 12, 'bmi_st': 3, 'st': 34, 'glu_ins': 12, 'bmi+age': 5, 'preg_glu_ins': 4}
[False, False, False, True, True, False, True, True, False, True, False, False, True, False, True, True, False, False, True, False, False, True, True, True, True, True, True, False, False, True, True, False, True, True, False, False, True, False, True, True, False, True, True, True, False, True, True, True, False, True, True, True, True, True, False, True, True, True, True, False, True, True, False, True, False, False, True, False, False, True, True, False, True, True, False, True, False, False, False, False, True, False, True, False, False, True, False, True, False, True, True, True, True, True, False, True, True, True, False, True]
{'preg': 385, 'age_preg': 5

In [None]:
# Print the average F1 score
print("Average F1 score:", np.mean(scores))
average_auc = sum(aucs) / len(aucs)
print('Average AUC:', average_auc)
print("Average sens score:", np.mean(sens))
print("Average spec score:", np.mean(spec))
print("Average prec score:", np.mean(pre))

# Plot the ROC curve
plt.plot(fpr, tpr, label='AUC = %0.2f' % average_auc)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

''' Average F1 score: 0.7983312240502646
Average AUC: 0.7909999999999999
Average sens score: 0.828
Average spec score: 0.754
Average prec score: 0.7734465152302541 '''

''' Average F1 score: 0.8151394118422811
Average AUC: 0.8049999999999999
Average sens score: 0.858
Average spec score: 0.752
Average prec score: 0.7780823982521291 '''

In [None]:
import seaborn as sns
xd = confusion_matrix(y_test.astype(int), predictions_series.astype(int), labels=model.decision_variables)
plt.figure(figsize=(8, 6))
sns.heatmap(xd, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()