# Machine Learning Best Practices Assignment

In [24]:
import pickle
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

### Import the [Pima Indians Diabetes data set](https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/pima_indians_diabetes.csv).

In [25]:
data = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/pima_indians_diabetes.csv')

### Split the data into training and test sets, with the target variable being the outcome column.

In [26]:
X = data.drop('outcome', axis=1)
y = data['outcome']

X_train, X_test, y_train, y_test = train_test_split(X,y)

### Train a Random Forest Classifier on the data without doing any transformations and print a classification report.

This will provide us with a basis for comparison.

In [27]:
model = RandomForestClassifier(random_state=1)

In [28]:
model.fit(X_train, y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.75      0.85      0.80       114
           1       0.73      0.58      0.64        78

    accuracy                           0.74       192
   macro avg       0.74      0.71      0.72       192
weighted avg       0.74      0.74      0.73       192



### Reduce the data down to 3 dimensions using PCA. Then do the train-test split, fit the model, and print a classification report.

Compare these results to the previous ones.

In [29]:
pca = PCA(n_components=3)
pca_components = pca.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(pca_components, y)

In [30]:
model = RandomForestClassifier(random_state=1)
model.fit(X_train, y_train)
preds = model.predict(X_test)
print(classification_report(preds, y_test))

              precision    recall  f1-score   support

           0       0.79      0.77      0.78       124
           1       0.59      0.62      0.60        68

    accuracy                           0.71       192
   macro avg       0.69      0.69      0.69       192
weighted avg       0.72      0.71      0.71       192



### Fit the model and print a classification report again, but this time, perform the train-test split before you transform the data using PCA.

Compare these results to the previous ones.

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
pca = PCA(n_components=3)
pca_X_train = pca.fit_transform(X_train)
pca_X_test = pca.transform(X_test)
model = RandomForestClassifier(random_state=1)
model.fit(pca_X_train, y_train)
preds = model.predict(pca_X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.76      0.84      0.80       127
           1       0.62      0.49      0.55        65

    accuracy                           0.72       192
   macro avg       0.69      0.67      0.67       192
weighted avg       0.71      0.72      0.72       192



### Using the Random Forest Classifier, perform 10-fold cross validation on the training set and print the mean cross validation score.

In [32]:
scores = cross_val_score(model, pca_X_train, y_train, cv=10)
print('10-fold cross validation mean score: ', round(scores.mean(), 2))

10-fold cross validation mean score:  0.73


### Create a pipeline with a PCA step and a Random Forest Classifier step. Perform the train-test split again, fit the pipeline, and then generate a classification report.

Compare these results to the previous ones.

In [33]:
pipeline = Pipeline([('pca', PCA(n_components=3)),
                     ('rf', RandomForestClassifier(random_state=1))])
X_train, X_test, y_train, y_test = train_test_split(X,y)

pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)

print(classification_report(preds, y_test))

              precision    recall  f1-score   support

           0       0.81      0.76      0.78       139
           1       0.46      0.55      0.50        53

    accuracy                           0.70       192
   macro avg       0.64      0.65      0.64       192
weighted avg       0.72      0.70      0.71       192



### Using the pipeline you built, perform 10-fold cross validation on the training set and print the mean cross validation score.

How does this score compare to the previous one?

In [34]:
scores = cross_val_score(pipeline, X_train, y_train, cv=10)
print('10-fold cross validation mean score: ', round(scores.mean(), 2))

10-fold cross validation mean score:  0.74


This score is slightly higer than for the last model.

### Use GridSearchCV to find the optimal set of parameters from the ones below.

- PCA Number of Components: 2, 3, 4, 5, 6, 7, 8
- Random Forest Number of Estimators: 10, 20, 50, 100, 200

In [36]:
pipeline = Pipeline([('pca', PCA()),
                     ('rf', RandomForestClassifier(random_state=1))])
X_train, X_test, y_train, y_test = train_test_split(X,y)

parameters = {'pca__n_components': [2, 3, 4, 5, 6, 7, 8],
              'rf__n_estimators': [10, 20, 50, 100, 200],
              }
search = GridSearchCV(pipeline, parameters, cv=10)
search.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('rf',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
      

### Using the best estimator pipeline from above, fit the pipeline to the training set and generate a classification report showing the results.

Compare these results to the previous ones.

In [37]:
pipeline = search.best_estimator_

pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)

print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87       126
           1       0.78      0.68      0.73        66

    accuracy                           0.82       192
   macro avg       0.81      0.79      0.80       192
weighted avg       0.82      0.82      0.82       192



These results are significantly better than any of the previous ones, which makes sense given we've used the best estimator from GridSearchCV. 

### Fit the best estimator pipeline to the entire data set and save your model to disk using pickle.

In [38]:
import pickle

pipeline.fit(X,y)

with open('pipeline.pkl', 'wb') as f:
  pickle.dump(pipeline, f)

### Load the model you saved to disk, create a copy of the features in the data, and generate a set of predictions for those features. 

In [42]:
copy = X.copy()

with open('pipeline.pkl', 'rb') as f:
  loaded_pipe = pickle.load(f)

preds = loaded_pipe.predict(copy)