# Machine Learning Best Practices Assignment

In [26]:
import pickle
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

### Import the [Pima Indians Diabetes data set](https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/pima_indians_diabetes.csv).

In [27]:
data = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/pima_indians_diabetes.csv')

In [28]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pregnancies        768 non-null    int64  
 1   glucose            768 non-null    int64  
 2   blood_pressure     768 non-null    int64  
 3   skin_thickness     768 non-null    int64  
 4   insulin            768 non-null    int64  
 5   bmi                768 non-null    float64
 6   diabetes_pedigree  768 non-null    float64
 7   age                768 non-null    int64  
 8   outcome            768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [29]:
data.head()

Unnamed: 0,pregnancies,glucose,blood_pressure,skin_thickness,insulin,bmi,diabetes_pedigree,age,outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### Split the data into training and test sets, with the target variable being the outcome column.

In [30]:
X = data.drop(columns=['outcome'])
y = data['outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [31]:
print(data.shape)
print(X.shape)
print(y.shape)

(768, 9)
(768, 8)
(768,)


### Train a Random Forest Classifier on the data without doing any transformations and print a classification report.

This will provide us with a basis for comparison.

In [32]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
preds = rfc.predict(X_test)

In [33]:
print(classification_report(preds, y_test))

              precision    recall  f1-score   support

           0       0.86      0.74      0.80       111
           1       0.51      0.70      0.59        43

    accuracy                           0.73       154
   macro avg       0.69      0.72      0.69       154
weighted avg       0.76      0.73      0.74       154



### Reduce the data down to 3 dimensions using PCA. Then do the train-test split, fit the model, and print a classification report.

Compare these results to the previous ones.

In [34]:
pca = PCA(n_components=3)
pca_ = pca.fit_transform(X)

X1 = pca_
y1 = data['outcome']

X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=.2)


rfc1 = RandomForestClassifier()
rfc1.fit(X_train1, y_train1)
preds1 = rfc1.predict(X_test1)

In [35]:
print(classification_report(preds1, y_test1))

              precision    recall  f1-score   support

           0       0.87      0.75      0.81       114
           1       0.49      0.68      0.57        40

    accuracy                           0.73       154
   macro avg       0.68      0.71      0.69       154
weighted avg       0.77      0.73      0.75       154



### Fit the model and print a classification report again, but this time, perform the train-test split before you transform the data using PCA.

Compare these results to the previous ones.

In [36]:
X2 = data.drop(columns=['outcome'])
y2 = data['outcome']

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=.2)

pca1 = PCA(n_components=3)
pca_1 = pca1.fit_transform(X_train2)
pca_t1 = pca1.transform(X_test2)

rfc2 = RandomForestClassifier()
rfc2.fit(pca_1, y_train2)
preds2 = rfc2.predict(pca_t1)

In [37]:
print(classification_report(preds2, y_test2))

              precision    recall  f1-score   support

           0       0.74      0.71      0.73        98
           1       0.53      0.55      0.54        56

    accuracy                           0.66       154
   macro avg       0.63      0.63      0.63       154
weighted avg       0.66      0.66      0.66       154



### Using the Random Forest Classifier, perform 10-fold cross validation on the training set and print the mean cross validation score.

In [38]:
cvs = cross_val_score(RandomForestClassifier(), X_train2, y_train2, cv=10)
print(cvs.mean())

0.7703595980962454


In [39]:
cvs

array([0.79032258, 0.79032258, 0.72580645, 0.77419355, 0.78688525,
       0.75409836, 0.75409836, 0.7704918 , 0.80327869, 0.75409836])

### Create a pipeline with a PCA step and a Random Forest Classifier step. Perform the train-test split again, fit the pipeline, and then generate a classification report.

Compare these results to the previous ones.

In [40]:
pipeline = Pipeline([('pca', PCA(n_components=3)),
                     ('rf', RandomForestClassifier())])

X3 = data.drop(columns=['outcome'])
y3 = data['outcome']
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, test_size=.2)

pipeline.fit(X_train3, y_train3)
pipe_preds = pipeline.predict(X_test3)

print(classification_report(pipe_preds, y_test3))


              precision    recall  f1-score   support

           0       0.83      0.78      0.80       116
           1       0.43      0.53      0.48        38

    accuracy                           0.71       154
   macro avg       0.63      0.65      0.64       154
weighted avg       0.73      0.71      0.72       154



### Using the pipeline you built, perform 10-fold cross validation on the training set and print the mean cross validation score.

How does this score compare to the previous one?

In [41]:
cvs1 = cross_val_score(pipeline, X_train3, y_train3, cv=10)
print(cvs1.mean())

0.7002379693283978


The score decreased which is to be expected. The previous CVS was pulling on a feature set that included all the features. The pipeline CVS is only pulling from the top 3 "features". As such some information is lost in the modeling process and the score has taken a hit. 

### Use GridSearchCV to find the optimal set of parameters from the ones below.

- PCA Number of Components: 2, 3, 4, 5, 6, 7, 8
- Random Forest Number of Estimators: 10, 20, 50, 100, 200

In [57]:
pipeline1 = Pipeline([('pca', PCA()),
                     ('rf', RandomForestClassifier())])

X4 = data.drop(columns=['outcome'])
y4 = data['outcome']
X_train4, X_test4, y_train4, y_test4 = train_test_split(X4, y4, test_size=.2)


parameters = {'pca__n_components': [2,3,4,5,6,7,8],
              'rf__n_estimators': [10,20,50,100,200]
             }

search = GridSearchCV(pipeline1, parameters, cv=10)
search.fit(X_train4, y_train4)


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('pca', PCA()),
                                       ('rf', RandomForestClassifier())]),
             param_grid={'pca__n_components': [2, 3, 4, 5, 6, 7, 8],
                         'rf__n_estimators': [10, 20, 50, 100, 200]})

### Using the best estimator pipeline from above, fit the pipeline to the training set and generate a classification report showing the results.

Compare these results to the previous ones.

In [58]:
print(search.best_params_)

{'pca__n_components': 8, 'rf__n_estimators': 20}


In [59]:
best_pipe = search.best_estimator_
print(best_pipe)
gs_pred = best_pipe.predict(X_test4)
print(classification_report(gs_pred, y_test4))

Pipeline(steps=[('pca', PCA(n_components=8)),
                ('rf', RandomForestClassifier(n_estimators=20))])
              precision    recall  f1-score   support

           0       0.83      0.77      0.80       106
           1       0.56      0.65      0.60        48

    accuracy                           0.73       154
   macro avg       0.70      0.71      0.70       154
weighted avg       0.75      0.73      0.74       154



### Fit the best estimator pipeline to the entire data set and save your model to disk using pickle.

In [49]:
best_pipe.fit(X,y)
with open('model.pkl', 'wb') as f:
  pickle.dump(best_pipe, f)

### Load the model you saved to disk, create a copy of the features in the data, and generate a set of predictions for those features. 

In [50]:
with open('model.pkl','rb') as f:
  loaded_pipe = pickle.load(f)

preds_0 = loaded_pipe.predict(X)
print(classification_report(preds_0, y))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       500
           1       1.00      1.00      1.00       268

    accuracy                           1.00       768
   macro avg       1.00      1.00      1.00       768
weighted avg       1.00      1.00      1.00       768



In [56]:
loaded_pipe.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('pca', PCA(n_components=6)),
                ('rf',
                 RandomForestClassifier(n_estimators=200, random_state=2))])>