# Machine Learning Best Practices Assignment

In [29]:
import pickle
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

### Import the [Pima Indians Diabetes data set](https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/pima_indians_diabetes.csv).

In [30]:
data = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/pima_indians_diabetes.csv')

In [66]:
# data.info()
# data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pregnancies        768 non-null    int64  
 1   glucose            768 non-null    int64  
 2   blood_pressure     768 non-null    int64  
 3   skin_thickness     768 non-null    int64  
 4   insulin            768 non-null    int64  
 5   bmi                768 non-null    float64
 6   diabetes_pedigree  768 non-null    float64
 7   age                768 non-null    int64  
 8   outcome            768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


Unnamed: 0,pregnancies,glucose,blood_pressure,skin_thickness,insulin,bmi,diabetes_pedigree,age,outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### Split the data into training and test sets, with the target variable being the outcome column.

In [32]:
X = data.drop('outcome', 1)
y = data['outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Train a Random Forest Classifier on the data without doing any transformations and print a classification report.

This will provide us with a basis for comparison.

In [33]:
model = RandomForestClassifier(random_state=42)

In [34]:
model.fit(X_train, y_train)
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

In [35]:
print('Test Report:')
print(classification_report(y_test, test_preds))

Test Report:
              precision    recall  f1-score   support

           0       0.79      0.78      0.78        99
           1       0.61      0.62      0.61        55

    accuracy                           0.72       154
   macro avg       0.70      0.70      0.70       154
weighted avg       0.72      0.72      0.72       154



### Reduce the data down to 3 dimensions using PCA. Then do the train-test split, fit the model, and print a classification report.

Compare these results to the previous ones.

In [36]:
pca = PCA(n_components=3, random_state=42)
X_pca = pca.fit_transform(X)

X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X, y, test_size=0.2, random_state=42)

In [37]:
model_pca = RandomForestClassifier(random_state=42)
model_pca.fit(X_train_pca, y_train_pca)
pca_train_preds = model.predict(X_train_pca)
pca_test_preds = model.predict(X_test_pca)

In [38]:
print('Test Report (PCA before split):')
print(classification_report(y_test_pca, pca_test_preds))

Test Report (PCA before split):
              precision    recall  f1-score   support

           0       0.79      0.78      0.78        99
           1       0.61      0.62      0.61        55

    accuracy                           0.72       154
   macro avg       0.70      0.70      0.70       154
weighted avg       0.72      0.72      0.72       154



### Fit the model and print a classification report again, but this time, perform the train-test split before you transform the data using PCA.

Compare these results to the previous ones.

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
pca_after = PCA(n_components=3, random_state=42)
X_train_pca_after = pca_after.fit_transform(X_train)
X_test_pca_after = pca_after.transform(X_test)

In [41]:
model_pca2 = RandomForestClassifier(random_state=42)
model_pca2.fit(X_train_pca_after, y_train)
train_pca_preds = model_pca2.predict(X_train_pca_after)
test_pca_preds = model_pca2.predict(X_test_pca_after)

In [42]:
print('Test Report (PCA after split):')
print(classification_report(y_test, test_pca_preds))

Test Report (PCA after split):
              precision    recall  f1-score   support

           0       0.76      0.83      0.79        99
           1       0.63      0.53      0.57        55

    accuracy                           0.72       154
   macro avg       0.69      0.68      0.68       154
weighted avg       0.71      0.72      0.71       154



### Using the Random Forest Classifier, perform 10-fold cross validation on the training set and print the mean cross validation score.

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
model_cross_val = RandomForestClassifier(random_state=42)
scores = cross_val_score(model_cross_val, X_train, y_train, cv=5)

In [45]:
scores.mean()

0.7720511795281888

### Create a pipeline with a PCA step and a Random Forest Classifier step. Perform the train-test split again, fit the pipeline, and then generate a classification report.

Compare these results to the previous ones.

In [46]:
pipeline = Pipeline([('pca_pipe', PCA(n_components=3, random_state=42)),
                     ('model_pipe', RandomForestClassifier(random_state=42))])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)
preds_pipe = pipeline.predict(X_test)

print('Test Report (with Pipe):')
print(classification_report(y_test, preds_pipe))

Test Report (with Pipe):
              precision    recall  f1-score   support

           0       0.76      0.83      0.79        99
           1       0.63      0.53      0.57        55

    accuracy                           0.72       154
   macro avg       0.69      0.68      0.68       154
weighted avg       0.71      0.72      0.71       154



### Using the pipeline you built, perform 10-fold cross validation on the training set and print the mean cross validation score.

How does this score compare to the previous one?

In [47]:
pipeline = Pipeline([('pca_pipe', PCA(n_components=3, random_state=42)),
                     ('model_pipe', RandomForestClassifier(random_state=42))])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scores_pipe = cross_val_score(pipeline, X_train, y_train, cv=5)

In [48]:
scores_pipe.mean()

0.7279888044782087

### Use GridSearchCV to find the optimal set of parameters from the ones below.

- PCA Number of Components: 2, 3, 4, 5, 6, 7, 8
- Random Forest Number of Estimators: 10, 20, 50, 100, 200

In [50]:
pipeline = Pipeline([('pca_pipe', PCA(n_components=3, random_state=42)),
                     ('model_pipe', RandomForestClassifier(random_state=42))])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

parameters = {'pca_pipe__n_components': [2, 3, 4, 5, 6, 7, 8],
              'model_pipe__n_estimators': [10, 20, 50, 100, 200]}

search_pipe = GridSearchCV(pipeline, parameters, cv=10)
search_pipe.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pca_pipe',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=3, random_state=42,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('model_pipe',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                          

In [51]:
search_pipe.best_score_

0.7734531993654151

In [56]:
best_params = search_pipe.best_params_
best_params

{'model_pipe__n_estimators': 200, 'pca_pipe__n_components': 6}

### Using the best estimator pipeline from above, fit the pipeline to the training set and generate a classification report showing the results.

Compare these results to the previous ones.

In [54]:
preds_search_pipe = search_pipe.predict(X_test)

In [55]:
print('Test Report (with GridSearch and Pipe):')
print(classification_report(y_test, preds_search_pipe))

Test Report (with GridSearch and Pipe):
              precision    recall  f1-score   support

           0       0.78      0.78      0.78        99
           1       0.60      0.60      0.60        55

    accuracy                           0.71       154
   macro avg       0.69      0.69      0.69       154
weighted avg       0.71      0.71      0.71       154



### Fit the best estimator pipeline to the entire data set and save your model to disk using pickle.

In [58]:
pipeline = Pipeline([('pca_pipe', PCA(n_components=6, random_state=42)),
                     ('model_pipe', RandomForestClassifier(n_estimators=200, random_state=42))])

pipeline.fit(X, y)

Pipeline(memory=None,
         steps=[('pca_pipe',
                 PCA(copy=True, iterated_power='auto', n_components=6,
                     random_state=42, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('model_pipe',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=200, n_jobs=None,
                                        oob_score=False, random_state=42,
            

In [59]:
with open('model.pkl', 'wb') as file:
  pickle.dump(pipeline, file)

### Load the model you saved to disk, create a copy of the features in the data, and generate a set of predictions for those features. 

In [61]:
with open('model.pkl', 'rb') as file:
  load_pipe = pickle.load(file)

In [71]:
data_samp = data.sample(2000, replace=True)

In [72]:
X_samp = data_samp.drop('outcome', 1)
y_samp = data_samp['outcome']

In [73]:
samp_preds = load_pipe.predict(X_samp)

In [74]:
print('Test Report (with pickle pipeline):')
print(classification_report(y_samp, samp_preds))

Test Report (with pickle pipeline):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1336
           1       1.00      1.00      1.00       664

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000

