In [1]:
# https://stackoverflow.com/questions/56107259/how-to-save-a-trained-model-by-scikit-learn
# https://scikit-learn.org/stable/modules/model_persistence.html

In [2]:
import pandas as pd


# Preprocessing
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler


# Model definition
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Save the model
import joblib

In [3]:
result_df = pd.read_csv('../model_data/merged_df.csv')
result_df.head()

Unnamed: 0.1,Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,PatientID,Outcome,BirthYear,City,State,Country,Pregnancies,Age
0,0,101,58,17,265,24.2,0.614,1017,0,1998,Winona,Minnesota,United States,2.0,23
1,1,108,70,0,0,30.5,0.955,1031,1,1988,Springfield,Illinois,United States,8.0,33
2,2,148,60,27,318,30.9,0.15,1033,1,1992,Socorro,Texas,United States,4.0,29
3,3,113,76,0,0,33.3,0.278,1035,1,1998,Erie,Pennsylvania,United States,0.0,23
4,4,83,86,19,0,29.3,0.317,1048,0,1987,Sioux Falls,South Dakota,United States,4.0,34


In [4]:
# lowercase all of these column names
result_df.columns = result_df.columns.str.lower()

Separate target variable- Outcome from the dataframe.

In [5]:
X = result_df.drop(columns=['outcome'])
y = result_df['outcome']


Split the data set in training and testing with 80-20 ratio.

In [6]:
# Asking us to seed the data would be helpful

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1234)

In milestone 2 of Project 1, you identified the relevant features using a correlation matrix (Coefficients values > 0.2 or 20%). You should have four features with more than 20% correlation.

In [7]:
# The four features are glucose, bmi, pregnancies, age

features_use = ['glucose', 'bmi', 'pregnancies', 'age']
features_drop = ['patientid', 'birthyear', 'city', 'state', 'country']

Define ML pipeline with pre-processing steps and a logistic regression model. Pre-processing can include the following:

Select 4 relevant/significant features to build your ML model or drop irrelevant/non-significant features.
Normalize these features using Standard Scaler in sklearn. You can use ColumnTransformer, which can drop features and apply standard scaler.

In [8]:
column_transformers_v1 = ColumnTransformer(transformers=[('drop_columns', 'drop', features_drop),
                                                   ('scale_features', StandardScaler(), features_use)
])


Train the model on training data and test it on testing data.

In [9]:
# Logistic regression


lr_model = LogisticRegression()


# create sklearn ML pipeline 
model_pipeline_v1 = Pipeline(steps=[
                    ('pre_processing', column_transformers_v1),
                    ('linear_model', lr_model)        
                ])

# train the model on training dataset
model_pipeline_v1.fit(X_train, y_train)

train_prediction_v1 = model_pipeline_v1.predict(X_train)

test_prediction_v1 = model_pipeline_v1.predict(X_test)

You can analyze the accuracy of the model. Also, check the confusion matrix.

In [10]:
accuracy_score(train_prediction_v1, y_train)

0.7627737226277372

In [11]:
confusion_matrix(train_prediction_v1, y_train)

array([[315,  87],
       [ 43, 103]], dtype=int64)

In [12]:
classification_report(train_prediction_v1, y_train)

'              precision    recall  f1-score   support\n\n           0       0.88      0.78      0.83       402\n           1       0.54      0.71      0.61       146\n\n    accuracy                           0.76       548\n   macro avg       0.71      0.74      0.72       548\nweighted avg       0.79      0.76      0.77       548\n'

In [13]:
print(accuracy_score(test_prediction_v1, y_test))
print(confusion_matrix(test_prediction_v1, y_test))
print(classification_report(test_prediction_v1, y_test))

0.782608695652174
[[80 21]
 [ 9 28]]
              precision    recall  f1-score   support

           0       0.90      0.79      0.84       101
           1       0.57      0.76      0.65        37

    accuracy                           0.78       138
   macro avg       0.74      0.77      0.75       138
weighted avg       0.81      0.78      0.79       138




You build the first version of the model. Now, you will save the model so that you can re-use it later or share it with others. You will also reload the saved model and test it on testing data.

As you will build one more revision, you can postfix your model file name as v1 (for example, ml_pipeline_v1.pkl)

Save the model either using pickle or joblib. The file format should be .pkl

In [14]:
joblib.dump(model_pipeline_v1, '../saved_models/ml_pipeline_v1.pkl')

['../saved_models/ml_pipeline_v1.pkl']

Verify whether the pickle file generated

In [15]:
from os.path import exists

exists('../saved_models/ml_pipeline_v1.pkl')

True

Load the model that you exported

In [16]:
test_pipeline_v1 = joblib.load('../saved_models/ml_pipeline_v1.pkl')

Generate predictions on testing data using the model you loaded

In [17]:
test_prediction_v1 = test_pipeline_v1.predict(X_test)

Verify the accuracy and confusion matrix. It should match your previous observation.

In [18]:
accuracy_score(test_prediction_v1, y_test)

0.782608695652174

In [19]:
confusion_matrix(test_prediction_v1, y_test)

array([[80, 21],
       [ 9, 28]], dtype=int64)

In [20]:
classification_report(test_prediction_v1, y_test)

'              precision    recall  f1-score   support\n\n           0       0.90      0.79      0.84       101\n           1       0.57      0.76      0.65        37\n\n    accuracy                           0.78       138\n   macro avg       0.74      0.77      0.75       138\nweighted avg       0.81      0.78      0.79       138\n'