In [2]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib

In [3]:
# Load the diabetes dataset stored as a CSV file in the Resources folder
csv_path = Path('Resources/median_diabetes_df.csv')
diabetes_data_df = pd.read_csv(csv_path)

# Display the DataFrame to inspect its columns and values
diabetes_data_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,72,35,125,33.6,0.627,50,1
1,85,66,29,125,26.6,0.351,31,0
2,183,64,29,125,23.3,0.672,32,1
3,89,66,23,94,28.1,0.167,21,0
4,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63,0
764,122,70,27,125,36.8,0.340,27,0
765,121,72,23,112,26.2,0.245,30,0
766,126,60,29,125,30.1,0.349,47,1


In [4]:
# Compute the median of every column (axis=0 is the column-wise default)
median_values = diabetes_data_df.median(axis=0)

# Display the resulting Series of per-column medians
median_values

Glucose                     117.0000
BloodPressure                72.0000
SkinThickness                29.0000
Insulin                     125.0000
BMI                          32.3000
DiabetesPedigreeFunction      0.3725
Age                          29.0000
Outcome                       0.0000
dtype: float64

In [5]:
# Split the DataFrame into the target (labels) and the predictors (features)

# Labels: the "Outcome" column on its own
y = diabetes_data_df.loc[:, "Outcome"]

# Features: every remaining column once "Outcome" is dropped
x = diabetes_data_df.drop(columns=["Outcome"])

In [6]:
# Preview the first five labels of the y Series
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [7]:
# Preview the first five rows of the x feature DataFrame
x.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,148,72,35,125,33.6,0.627,50
1,85,66,29,125,26.6,0.351,31
2,183,64,29,125,23.3,0.672,32
3,89,66,23,94,28.1,0.167,21
4,137,40,35,168,43.1,2.288,33


In [8]:
# train_test_split is already imported in the notebook's first (imports) cell,
# so it is not imported a second time here.

# Split the features and labels into training and testing sets.
# No test_size is passed, so scikit-learn's default hold-out fraction applies;
# random_state=1 fixes the shuffle so the split is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

In [9]:
# Bring in scikit-learn's LogisticRegression estimator class
from sklearn.linear_model import LogisticRegression

# Build the classifier: at most 1000 solver iterations, seeded with
# random_state=1
logistic_regression_model = LogisticRegression(random_state=1, max_iter=1000)

# Train on the training split. fit() returns the estimator itself, so
# lr_model is simply a second name bound to the same fitted model object.
logistic_regression_model.fit(x_train, y_train)
lr_model = logistic_regression_model

In [10]:
# Predict the class label of every row in the testing features, using the
# logistic regression model that was fitted on the training data
testing_predictions = logistic_regression_model.predict(x_test)

In [11]:
# Build the confusion matrix comparing the true test labels with the predictions
test_matrix = confusion_matrix(y_true=y_test, y_pred=testing_predictions)

# Show the matrix (rows are true classes, columns are predicted classes)
print(test_matrix)

[[107  16]
 [ 26  43]]


In [12]:
# Build the classification report (precision, recall, f1-score, support)
# for the testing data
testing_report = classification_report(y_true=y_test, y_pred=testing_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.80      0.87      0.84       123
           1       0.73      0.62      0.67        69

    accuracy                           0.78       192
   macro avg       0.77      0.75      0.75       192
weighted avg       0.78      0.78      0.78       192



In [13]:
# Serialize the fitted model to "diabetes_model.joblib" in the current working
# directory; joblib.dump returns the list of file names it wrote, which the
# notebook echoes as this cell's output.
# NOTE(review): joblib files are pickle-based — only load this artifact from a trusted source.
joblib.dump(lr_model, "diabetes_model.joblib")

['diabetes_model.joblib']