# Machine Learning Best Practices Assignment

In [5]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [6]:
import pickle
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

<IPython.core.display.Javascript object>

### Import the [Pima Indians Diabetes data set](https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/pima_indians_diabetes.csv).

In [7]:
data = pd.read_csv(
    "https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/pima_indians_diabetes.csv"
)

<IPython.core.display.Javascript object>

### Split the data into training and test sets, with the target variable being the outcome column.

In [8]:
data

Unnamed: 0,pregnancies,glucose,blood_pressure,skin_thickness,insulin,bmi,diabetes_pedigree,age,outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


<IPython.core.display.Javascript object>

In [9]:
X = data.drop("outcome", axis=1)
y = data.outcome
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

<IPython.core.display.Javascript object>

### Train a Random Forest Classifier on the data without doing any transformations and print a classification report.

This will provide us with a basis for comparison.

In [10]:
model = RandomForestClassifier(random_state=1)

<IPython.core.display.Javascript object>

In [11]:
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
print("Train Report\n")
print(classification_report(y_train, y_pred_train))

print("Test Report\n")
print(classification_report(y_test, y_pred_test))

Train Report

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       380
           1       1.00      1.00      1.00       196

    accuracy                           1.00       576
   macro avg       1.00      1.00      1.00       576
weighted avg       1.00      1.00      1.00       576

Test Report

              precision    recall  f1-score   support

           0       0.79      0.88      0.83       120
           1       0.75      0.60      0.67        72

    accuracy                           0.78       192
   macro avg       0.77      0.74      0.75       192
weighted avg       0.77      0.78      0.77       192



<IPython.core.display.Javascript object>

### Reduce the data down to 3 dimensions using PCA. Then do the train-test split, fit the model, and print a classification report.

Compare these results to the previous ones.

In [13]:
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.25)
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
print("Train Report\n")
print(classification_report(y_train, y_pred_train))

print("Test Report\n")
print(classification_report(y_test, y_pred_test))

Train Report

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       375
           1       1.00      1.00      1.00       201

    accuracy                           1.00       576
   macro avg       1.00      1.00      1.00       576
weighted avg       1.00      1.00      1.00       576

Test Report

              precision    recall  f1-score   support

           0       0.76      0.88      0.81       125
           1       0.68      0.48      0.56        67

    accuracy                           0.74       192
   macro avg       0.72      0.68      0.69       192
weighted avg       0.73      0.74      0.73       192



<IPython.core.display.Javascript object>

In [14]:
# Test accuracy went down from 0.78 to 0.74 after applying PCA

<IPython.core.display.Javascript object>

### Fit the model and print a classification report again, but this time, perform the train-test split before you transform the data using PCA.

Compare these results to the previous ones.

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

pca = PCA(n_components=3)
X_pca_train = pca.fit_transform(X_train)
X_pca_test = pca.transform(X_test)

model.fit(X_pca_train, y_train)

y_pred_train = model.predict(X_pca_train)
y_pred_test = model.predict(X_pca_test)
print("Train Report\n")
print(classification_report(y_train, y_pred_train))

print("Test Report\n")
print(classification_report(y_test, y_pred_test))

Train Report

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       380
           1       1.00      1.00      1.00       196

    accuracy                           1.00       576
   macro avg       1.00      1.00      1.00       576
weighted avg       1.00      1.00      1.00       576

Test Report

              precision    recall  f1-score   support

           0       0.74      0.83      0.78       120
           1       0.64      0.50      0.56        72

    accuracy                           0.71       192
   macro avg       0.69      0.67      0.67       192
weighted avg       0.70      0.71      0.70       192



<IPython.core.display.Javascript object>

In [16]:
# Test accuracy went down again from 0.74 to 0.71

<IPython.core.display.Javascript object>

### Using the Random Forest Classifier, perform 10-fold cross validation on the training set and print the mean cross validation score.

In [17]:
scores = cross_val_score(model, X_pca_train, y_train, cv=10)
scores.mean()

0.7240471869328494

<IPython.core.display.Javascript object>

### Create a pipeline with a PCA step and a Random Forest Classifier step. Perform the train-test split again, fit the pipeline, and then generate a classification report.

Compare these results to the previous ones.

In [24]:
steps = [("pca", PCA(n_components=3)), ("rf", model)]

pipeline = Pipeline(steps)
print(pipeline)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

pipeline.fit(X_train, y_train)

y_pred_train = pipeline.predict(X_train)
y_pred_test = pipeline.predict(X_test)
print("Train Report\n")
print(classification_report(y_train, y_pred_train))

print("Test Report\n")
print(classification_report(y_test, y_pred_test))


Pipeline(steps=[('pca', PCA(n_components=3)),
                ('rf', RandomForestClassifier(random_state=1))])
Train Report

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       378
           1       1.00      1.00      1.00       198

    accuracy                           1.00       576
   macro avg       1.00      1.00      1.00       576
weighted avg       1.00      1.00      1.00       576

Test Report

              precision    recall  f1-score   support

           0       0.74      0.84      0.78       122
           1       0.63      0.49      0.55        70

    accuracy                           0.71       192
   macro avg       0.68      0.66      0.67       192
weighted avg       0.70      0.71      0.70       192



<IPython.core.display.Javascript object>

In [31]:
# The test accuracy is the same as the previous step were we split before applying PCA

<IPython.core.display.Javascript object>

### Using the pipeline you built, perform 10-fold cross validation on the training set and print the mean cross validation score.

How does this score compare to the previous one?

In [26]:
scores = cross_val_score(pipeline, X_train, y_train, cv=10)
scores.mean()

0.7224137931034483

<IPython.core.display.Javascript object>

In [27]:
# 0.722 vs 0.724 for the previous step. The difference is probably because we re-ran the
# train test split

<IPython.core.display.Javascript object>

### Use GridSearchCV to find the optimal set of parameters from the ones below.

- PCA Number of Components: 2, 3, 4, 5, 6, 7, 8
- Random Forest Number of Estimators: 10, 20, 50, 100, 200

In [29]:
steps = [("pca", PCA(n_components=3)), ("rf", model)]

pipeline = Pipeline(steps)
print(pipeline)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

grid_params = {
    "pca__n_components": [2, 3, 4, 5, 6, 7, 8],
    "rf__n_estimators": [10, 20, 50, 100, 200],
}
grid = GridSearchCV(pipeline, param_grid=grid_params, cv=10, verbose=True, n_jobs=-1)
grid.fit(X_train, y_train)
grid.best_params_

Pipeline(steps=[('pca', PCA(n_components=3)),
                ('rf', RandomForestClassifier(random_state=1))])
Fitting 10 folds for each of 35 candidates, totalling 350 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 350 out of 350 | elapsed:   19.8s finished


{'pca__n_components': 8, 'rf__n_estimators': 100}

<IPython.core.display.Javascript object>

### Using the best estimator pipeline from above, fit the pipeline to the training set and generate a classification report showing the results.

Compare these results to the previous ones.

In [30]:
grid.best_estimator_.fit(X_train, y_train)

y_pred_train = grid.best_estimator_.predict(X_train)
y_pred_test = grid.best_estimator_.predict(X_test)
print("Train Report\n")
print(classification_report(y_train, y_pred_train))

print("Test Report\n")
print(classification_report(y_test, y_pred_test))

Train Report

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       381
           1       1.00      1.00      1.00       195

    accuracy                           1.00       576
   macro avg       1.00      1.00      1.00       576
weighted avg       1.00      1.00      1.00       576

Test Report

              precision    recall  f1-score   support

           0       0.77      0.89      0.82       119
           1       0.76      0.56      0.65        73

    accuracy                           0.77       192
   macro avg       0.76      0.73      0.74       192
weighted avg       0.76      0.77      0.76       192



<IPython.core.display.Javascript object>

In [38]:
# The test accuracy improved from 0.71 to 0.77 using the best grid parameters

<IPython.core.display.Javascript object>

### Fit the best estimator pipeline to the entire data set and save your model to disk using pickle.

In [40]:
final_model = grid.best_estimator_.fit(X, y)

with open("final_model.pkl", "wb") as f:
    pickle.dump(final_model, f)

<IPython.core.display.Javascript object>

### Load the model you saved to disk, create a copy of the features in the data, and generate a set of predictions for those features. 

In [43]:
with open("final_model.pkl", "rb") as f:
    imported_model = pickle.load(f)

# pretend new data
new_data = X.copy()

preds = imported_model.predict(new_data)
preds

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,

<IPython.core.display.Javascript object>