In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
diabetes_data_df = pd.read_csv(
    Path('Resources/cleaned_diabetes_df.csv')   
)

# Review the DataFrame
diabetes_data_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,BloodPressureRange,GlucoseRange,BMIRange
0,148,72,35,125,33.6,0.627,50,1,Elevated,Prediabetic,Obesity Class I
1,85,66,29,125,26.6,0.351,31,0,Normal,Normal,Overweight
2,183,64,29,125,23.3,0.672,32,1,Normal,Prediabetic,Normal Weight
3,89,66,23,94,28.1,0.167,21,0,Normal,Normal,Overweight
4,137,40,35,168,43.1,2.288,33,1,Low,Normal,Obesity Class III
...,...,...,...,...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63,0,Elevated,Normal,Obesity Class I
764,122,70,27,125,36.8,0.340,27,0,Elevated,Normal,Obesity Class II
765,121,72,23,112,26.2,0.245,30,0,Elevated,Normal,Overweight
766,126,60,29,125,30.1,0.349,47,1,Normal,Normal,Obesity Class I


In [5]:
# Separate the data into labels and features

# Separate the y variable, the labels
y= diabetes_data_df["Outcome"]


# Separate the X variable, the features
X = diabetes_data_df.drop(columns="Outcome")

In [9]:
#Create a LabelEncoder
label_encoder = LabelEncoder()

#Apply label encoding to selected columns
columns_to_encode = ["BloodPressureRange", "GlucoseRange", "BMIRange"]


for column in columns_to_encode:
    diabetes_data_df[column] = label_encoder.fit_transform(diabetes_data_df[column])

In [10]:
# Review the y variable Series
y[:5]

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [11]:
# Review the X variable DataFrame
x[:5]

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,BloodPressureRange,GlucoseRange,BMIRange
0,148,72,35,125,33.6,0.627,50,Elevated,Prediabetic,Obesity Class I
1,85,66,29,125,26.6,0.351,31,Normal,Normal,Overweight
2,183,64,29,125,23.3,0.672,32,Normal,Prediabetic,Normal Weight
3,89,66,23,94,28.1,0.167,21,Normal,Normal,Overweight
4,137,40,35,168,43.1,2.288,33,Low,Normal,Obesity Class III


In [11]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

In [16]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression_model = LogisticRegression(max_iter=1000, random_state=1)

# Fit the model using training data
lr_model = logistic_regression_model.fit(x_train, y_train)

In [17]:
# Make a prediction using the testing data
testing_predictions = logistic_regression_model.predict(x_test)

In [18]:
# Generate a confusion matrix for the model
test_matrix = confusion_matrix(y_test, testing_predictions)
# Print the confusion matrix for the testing data
print(test_matrix)

[[103  19]
 [ 32  34]]


In [19]:
# Print the classification report for the model
# Create and save the training classification report
testing_report = classification_report(y_test, testing_predictions)

# Print the training classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.76      0.84      0.80       122
           1       0.64      0.52      0.57        66

    accuracy                           0.73       188
   macro avg       0.70      0.68      0.69       188
weighted avg       0.72      0.73      0.72       188

