# Machine Learning Best Practices Assignment

In [23]:
import pickle
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

### Import the [Pima Indians Diabetes data set](https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/pima_indians_diabetes.csv).

In [24]:
data = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/pima_indians_diabetes.csv')

### Split the data into training and test sets, with the target variable being the outcome column.

In [25]:
X = data.drop('outcome', axis=1)
y = data.outcome
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Train a Random Forest Classifier on the data without doing any transformations and print a classification report.

This will provide us with a basis for comparison.

In [26]:
model = RandomForestClassifier(random_state=1)

In [27]:
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.76      0.86      0.80       119
           1       0.70      0.55      0.62        73

    accuracy                           0.74       192
   macro avg       0.73      0.70      0.71       192
weighted avg       0.74      0.74      0.73       192



### Reduce the data down to 3 dimensions using PCA. Then do the train-test split, fit the model, and print a classification report.

Compare these results to the previous ones.

In [28]:
pca = PCA(n_components=3)
pca_X = pca.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(pca_X, y)
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.72      0.83      0.77       120
           1       0.62      0.46      0.53        72

    accuracy                           0.69       192
   macro avg       0.67      0.65      0.65       192
weighted avg       0.68      0.69      0.68       192



### Fit the model and print a classification report again, but this time, perform the train-test split before you transform the data using PCA.

Compare these results to the previous ones.

In [29]:
pca = PCA(n_components=3)
X_train, X_test, y_train, y_test = train_test_split(X, y)
pca_X_train = pca.fit_transform(X_train)
pca_X_test = pca.transform(X_test)
model.fit(pca_X_train, y_train)
pred = model.predict(pca_X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.82      0.79      0.81       134
           1       0.56      0.60      0.58        58

    accuracy                           0.73       192
   macro avg       0.69      0.70      0.69       192
weighted avg       0.74      0.73      0.74       192



### Using the Random Forest Classifier, perform 10-fold cross validation on the training set and print the mean cross validation score.

In [30]:
rf = RandomForestClassifier()
print(cross_val_score(rf, X_train, y_train, cv=10).mean())

0.7343617664851784


### Create a pipeline with a PCA step and a Random Forest Classifier step. Perform the train-test split again, fit the pipeline, and then generate a classification report.

Compare these results to the previous ones.

In [31]:
pca_rf_pipeline = Pipeline([('pca', PCA(n_components=3)), ('rf', RandomForestClassifier())])
X_train, X_test, y_train, y_test = train_test_split(X, y)
pca_rf_pipeline.fit(X_train, y_train)
pred = pca_rf_pipeline.predict(X_test)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.75      0.85      0.80       118
           1       0.69      0.55      0.62        74

    accuracy                           0.73       192
   macro avg       0.72      0.70      0.71       192
weighted avg       0.73      0.73      0.73       192



### Using the pipeline you built, perform 10-fold cross validation on the training set and print the mean cross validation score.

How does this score compare to the previous one?

In [32]:
print(cross_val_score(pca_rf_pipeline, X_train, y_train, cv=10).mean())

0.7392921960072596


In [None]:
#slight increase in score, but very similar and could be due to random splitting

### Use GridSearchCV to find the optimal set of parameters from the ones below.

- PCA Number of Components: 2, 3, 4, 5, 6, 7, 8
- Random Forest Number of Estimators: 10, 20, 50, 100, 200

In [33]:
parameters = {'pca__n_components': [2, 3, 4, 5, 6, 7, 8], 
              'rf__n_estimators': [10, 20, 50, 100, 200]}
search = GridSearchCV(pca_rf_pipeline, parameters, cv=10)
search.fit(X_train, y_train)

print('search best score:', search.best_score_)
print('search best parameters:', search.best_params_)

search best score: 0.7724742891712039
search best parameters: {'pca__n_components': 8, 'rf__n_estimators': 50}


### Using the best estimator pipeline from above, fit the pipeline to the training set and generate a classification report showing the results.

Compare these results to the previous ones.

In [36]:
pca_rf_pipeline = Pipeline([('pca', PCA(n_components=8)), ('rf', RandomForestClassifier(n_estimators=50))])
pca_rf_pipeline.fit(X_train, y_train)
pred = pca_rf_pipeline.predict(X_test)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.75      0.86      0.80       118
           1       0.71      0.54      0.62        74

    accuracy                           0.74       192
   macro avg       0.73      0.70      0.71       192
weighted avg       0.74      0.74      0.73       192



### Fit the best estimator pipeline to the entire data set and save your model to disk using pickle.

In [41]:
pca_rf_pipeline.fit(X, y)
with open('model.pkl', 'wb') as f:
  pickle.dump(pca_rf_pipeline, f)

### Load the model you saved to disk, create a copy of the features in the data, and generate a set of predictions for those features. 

In [42]:
with open('model.pkl', 'rb') as f:
  load_pipe = pickle.load(f)

data_copy = X.copy()
preds = load_pipe.predict(data_copy)