In [30]:
# Import the modules
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [31]:
# Load the cleaned diabetes dataset from the Resources folder
cleaned_csv_path = Path('Resources/cleaned_diabetes_df.csv')
diabetes_data_df = pd.read_csv(cleaned_csv_path)

# Show the loaded DataFrame as the cell output
diabetes_data_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,BloodPressureRange,GlucoseRange,BMIRange
0,148,72,35,125,33.6,0.627,50,1,Elevated,Prediabetic,Obesity Class I
1,85,66,29,125,26.6,0.351,31,0,Normal,Normal,Overweight
2,183,64,29,125,23.3,0.672,32,1,Normal,Prediabetic,Normal Weight
3,89,66,23,94,28.1,0.167,21,0,Normal,Normal,Overweight
4,137,40,35,168,43.1,2.288,33,1,Low,Normal,Obesity Class III
...,...,...,...,...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63,0,Elevated,Normal,Obesity Class I
764,122,70,27,125,36.8,0.340,27,0,Elevated,Normal,Obesity Class II
765,121,72,23,112,26.2,0.245,30,0,Elevated,Normal,Overweight
766,126,60,29,125,30.1,0.349,47,1,Normal,Normal,Obesity Class I


In [32]:
# Ordinal codes for each categorical range feature
bp_mapping = {"Low": 0, "Normal": 1, "Elevated": 2, "High Stage 1": 3, "High Stage 2": 4}
glucose_mapping = {"Normal": 0, "Prediabetic": 1, "Diabetic": 2}
bmi_mapping = {"Normal Weight": 0, "Overweight": 1, "Obesity Class I": 2, "Obesity Class II": 3, "Obesity Class III": 4}

# Source column -> (encoded column, mapping), so all three features are
# encoded and checked by the same code path
range_encodings = {
    'BloodPressureRange': ('EncodedBP', bp_mapping),
    'GlucoseRange': ('EncodedGlucose', glucose_mapping),
    'BMIRange': ('EncodedBMI', bmi_mapping),
}

# Map the categories to numerical values
for source_column, (encoded_column, mapping) in range_encodings.items():
    diabetes_data_df[encoded_column] = diabetes_data_df[source_column].map(mapping)

    # Series.map produces NaN for any value that is not a key of the mapping.
    # Report those values here so missing codes are visible instead of
    # silently turning into NaN in the encoded column.
    not_encoded = diabetes_data_df[encoded_column].isna()
    if not_encoded.any():
        unmapped_values = diabetes_data_df.loc[not_encoded, source_column].unique().tolist()
        print(
            f"{source_column}: {int(not_encoded.sum())} row(s) could not be encoded; "
            f"unmapped values: {unmapped_values}"
        )

diabetes_data_df.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,BloodPressureRange,GlucoseRange,BMIRange,EncodedBP,EncodedGlucose,EncodedBMI
0,148,72,35,125,33.6,0.627,50,1,Elevated,Prediabetic,Obesity Class I,2.0,1,2.0
1,85,66,29,125,26.6,0.351,31,0,Normal,Normal,Overweight,1.0,0,1.0
2,183,64,29,125,23.3,0.672,32,1,Normal,Prediabetic,Normal Weight,1.0,1,0.0
3,89,66,23,94,28.1,0.167,21,0,Normal,Normal,Overweight,1.0,0,1.0
4,137,40,35,168,43.1,2.288,33,1,Low,Normal,Obesity Class III,0.0,0,4.0


In [33]:
# Build a copy of the data in which the current row index becomes an explicit "id" column
diabetes_data_with_index = (
    diabetes_data_df
    .reset_index()
    .rename(columns={"index": "id"})
)
diabetes_data_with_index.head()

Unnamed: 0,id,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,BloodPressureRange,GlucoseRange,BMIRange,EncodedBP,EncodedGlucose,EncodedBMI
0,0,148,72,35,125,33.6,0.627,50,1,Elevated,Prediabetic,Obesity Class I,2.0,1,2.0
1,1,85,66,29,125,26.6,0.351,31,0,Normal,Normal,Overweight,1.0,0,1.0
2,2,183,64,29,125,23.3,0.672,32,1,Normal,Prediabetic,Normal Weight,1.0,1,0.0
3,3,89,66,23,94,28.1,0.167,21,0,Normal,Normal,Overweight,1.0,0,1.0
4,4,137,40,35,168,43.1,2.288,33,1,Low,Normal,Obesity Class III,0.0,0,4.0


In [34]:
# Write the id-tagged data to a CSV file in the Resources folder.
# index=False keeps the DataFrame's row index out of the file.
output_file_path = Path('Resources/diabetes_data_with_index.csv')
diabetes_data_with_index.to_csv(path_or_buf=output_file_path, index=False)

In [35]:
# Drop the categorical Range columns.
# A new frame is assigned instead of dropping in place, and columns that are
# already gone are ignored, so re-running this cell does not raise a KeyError.
range_columns = ["BloodPressureRange", "GlucoseRange", "BMIRange"]
diabetes_data_df = diabetes_data_df.drop(columns=range_columns, errors="ignore")
diabetes_data_df.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,EncodedBP,EncodedGlucose,EncodedBMI
0,148,72,35,125,33.6,0.627,50,1,2.0,1,2.0
1,85,66,29,125,26.6,0.351,31,0,1.0,0,1.0
2,183,64,29,125,23.3,0.672,32,1,1.0,1,0.0
3,89,66,23,94,28.1,0.167,21,0,1.0,0,1.0
4,137,40,35,168,43.1,2.288,33,1,0.0,0,4.0


In [36]:
# Write the current state of diabetes_data_df to a CSV file in the Resources folder.
# index=False keeps the DataFrame's row index out of the file.
# NOTE(review): this export runs before rows with missing EncodedBP/EncodedBMI
# values are removed further down - confirm that is intended.
output_file_path = Path('Resources/diabetes_data_df.csv')
diabetes_data_df.to_csv(path_or_buf=output_file_path, index=False)

In [37]:
# Remove rows that lack an encoded blood-pressure or BMI value,
# then renumber the remaining rows from 0
diabetes_data_df = (
    diabetes_data_df
    .dropna(subset=["EncodedBP", "EncodedBMI"])
    .reset_index(drop=True)
)

diabetes_data_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,EncodedBP,EncodedGlucose,EncodedBMI
0,148,72,35,125,33.6,0.627,50,1,2.0,1,2.0
1,85,66,29,125,26.6,0.351,31,0,1.0,0,1.0
2,183,64,29,125,23.3,0.672,32,1,1.0,1,0.0
3,89,66,23,94,28.1,0.167,21,0,1.0,0,1.0
4,137,40,35,168,43.1,2.288,33,1,0.0,0,4.0
...,...,...,...,...,...,...,...,...,...,...,...
746,101,76,48,180,32.9,0.171,63,0,2.0,0,2.0
747,122,70,27,125,36.8,0.340,27,0,2.0,0,3.0
748,121,72,23,112,26.2,0.245,30,0,2.0,0,1.0
749,126,60,29,125,30.1,0.349,47,1,1.0,0,2.0


In [38]:
# Median of every column (computed down the rows), returned as a Series
median_values = diabetes_data_df.median(axis=0)
median_values

Glucose                     117.000
BloodPressure                72.000
SkinThickness                29.000
Insulin                     125.000
BMI                          32.300
DiabetesPedigreeFunction      0.378
Age                          29.000
Outcome                       0.000
EncodedBP                     2.000
EncodedGlucose                0.000
EncodedBMI                    2.000
dtype: float64

In [39]:
# Split the DataFrame into the labels (y) and the features (x)
target_column = "Outcome"

# y: the label column
y = diabetes_data_df[target_column]

# x: every column except the label
x = diabetes_data_df.drop(columns=[target_column])

In [40]:
# Preview the first five labels
y.head(5)

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [41]:
# Preview the first five rows of the feature DataFrame
x.head(5)

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,EncodedBP,EncodedGlucose,EncodedBMI
0,148,72,35,125,33.6,0.627,50,2.0,1,2.0
1,85,66,29,125,26.6,0.351,31,1.0,0,1.0
2,183,64,29,125,23.3,0.672,32,1.0,1,0.0
3,89,66,23,94,28.1,0.167,21,1.0,0,1.0
4,137,40,35,168,43.1,2.288,33,0.0,0,4.0


In [42]:
# train_test_split is already imported in the notebook's first (imports) cell,
# so it is not imported again here.

# Split the features and labels into training and testing sets.
# random_state=1 fixes the shuffle so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

In [43]:
# LogisticRegression is imported in the notebook's first (imports) cell.

# Instantiate the Logistic Regression model.
# random_state=1 is passed for reproducibility; max_iter=1000 sets the cap on
# solver iterations.
logistic_regression_model = LogisticRegression(max_iter=1000, random_state=1)

# Fit the model using training data.
# fit() returns the estimator itself, so lr_model and
# logistic_regression_model refer to the same fitted object.
lr_model = logistic_regression_model.fit(x_train, y_train)

In [44]:
# Predict a label for every row of the testing features
testing_predictions = logistic_regression_model.predict(X=x_test)

In [45]:
# Build the confusion matrix from the true and predicted testing labels
test_matrix = confusion_matrix(y_true=y_test, y_pred=testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

[[103  19]
 [ 32  34]]


In [46]:
# Create and save the classification report for the model's
# predictions on the testing data
testing_report = classification_report(y_true=y_test, y_pred=testing_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.76      0.84      0.80       122
           1       0.64      0.52      0.57        66

    accuracy                           0.73       188
   macro avg       0.70      0.68      0.69       188
weighted avg       0.72      0.73      0.72       188



In [47]:
# Save the fitted model to disk; the returned list of written file names is the cell output
joblib.dump(value=lr_model, filename="diabetes_predictor.joblib")

['diabetes_predictor.joblib']