# Machine Learning Best Practices Assignment

In [1]:
import pickle
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

### Import the [Pima Indians Diabetes data set](https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/pima_indians_diabetes.csv).

In [3]:
data = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/pima_indians_diabetes.csv')
data

Unnamed: 0,pregnancies,glucose,blood_pressure,skin_thickness,insulin,bmi,diabetes_pedigree,age,outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


### Split the data into training and test sets, with the target variable being the outcome column.

In [5]:
X = data.drop(columns='outcome')
y = data.outcome
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

### Train a Random Forest Classifier on the data without doing any transformations and print a classification report.

This will provide us with a basis for comparison.

In [7]:
rfc = RandomForestClassifier(random_state=1)
model = rfc.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [9]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.87      0.84        52
           1       0.68      0.60      0.64        25

    accuracy                           0.78        77
   macro avg       0.75      0.73      0.74        77
weighted avg       0.77      0.78      0.78        77



### Reduce the data down to 3 dimensions using PCA. Then do the train-test split, fit the model, and print a classification report.

Compare these results to the previous ones.

In [10]:
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

X_train2, X_test2, y_train2, y_test2 = train_test_split(X_pca, y, test_size=0.1)

model2 = rfc.fit(X_train2, y_train2)
y_pred2 = model2.predict(X_test2)

print(classification_report(y_test2, y_pred2))

              precision    recall  f1-score   support

           0       0.75      0.81      0.78        53
           1       0.50      0.42      0.45        24

    accuracy                           0.69        77
   macro avg       0.63      0.61      0.62        77
weighted avg       0.68      0.69      0.68        77



### Fit the model and print a classification report again, but this time, perform the train-test split before you transform the data using PCA.

Compare these results to the previous ones.

In [20]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(X, y, test_size=0.1)

X_train_pca = pca.fit_transform(X_train3)
X_test_pca = pca.transform(X_test3)

model3 = rfc.fit(X_train_pca, y_train)

y_pred3 = model3.predict(X_test_pca)

print(classification_report(y_test3, y_pred3))


              precision    recall  f1-score   support

           0       0.65      0.84      0.73        49
           1       0.43      0.21      0.29        28

    accuracy                           0.61        77
   macro avg       0.54      0.53      0.51        77
weighted avg       0.57      0.61      0.57        77



### Using the Random Forest Classifier, perform 10-fold cross validation on the training set and print the mean cross validation score.

In [23]:
print(cross_val_score(rfc, X_train, y_train, cv=10).mean())

0.7612422360248446


### Create a pipeline with a PCA step and a Random Forest Classifier step. Perform the train-test split again, fit the pipeline, and then generate a classification report.

Compare these results to the previous ones.

In [25]:
rf_pca_pipeline = Pipeline([('pca', PCA(n_components=3)), ('rf', RandomForestClassifier())])
X_train4, X_test4, y_train4, y_test4 = train_test_split(X, y, test_size=0.1)
rf_pca_pipeline.fit(X_train4, y_train4)
y_pred4 = rf_pca_pipeline.predict(X_test4)

print(classification_report(y_test4, y_pred4))

              precision    recall  f1-score   support

           0       0.81      0.75      0.78        52
           1       0.55      0.64      0.59        25

    accuracy                           0.71        77
   macro avg       0.68      0.70      0.69        77
weighted avg       0.73      0.71      0.72        77



### Using the pipeline you built, perform 10-fold cross validation on the training set and print the mean cross validation score.

How does this score compare to the previous one?

In [26]:
print(cross_val_score(rf_pca_pipeline, X_train, y_train, cv=10).mean())

0.7178260869565217


### Use GridSearchCV to find the optimal set of parameters from the ones below.

- PCA Number of Components: 2, 3, 4, 5, 6, 7, 8
- Random Forest Number of Estimators: 10, 20, 50, 100, 200

In [27]:
grid = {'pca__n_components': [2, 3, 4, 5, 6, 7, 8],
        'rf__n_estimators': [10, 20, 50, 100, 200]}
search = GridSearchCV(rf_pca_pipeline, grid, cv=10)
search.fit(X_train, y_train)

print('best search score:', search.best_score_)
print('best seach parameters:', search.best_params_)

best search score: 0.7626708074534161
best seach parameters: {'pca__n_components': 8, 'rf__n_estimators': 200}


### Using the best estimator pipeline from above, fit the pipeline to the training set and generate a classification report showing the results.

Compare these results to the previous ones.

In [28]:
rf_pca_pipeline2 = Pipeline([('pca', PCA(n_components=8)), ('rf', RandomForestClassifier(n_estimators=200))])
rf_pca_pipeline2.fit(X_train, y_train)
pred = rf_pca_pipeline2.predict(X_test)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.86      0.81      0.83        52
           1       0.64      0.72      0.68        25

    accuracy                           0.78        77
   macro avg       0.75      0.76      0.76        77
weighted avg       0.79      0.78      0.78        77



### Fit the best estimator pipeline to the entire data set and save your model to disk using pickle.

In [29]:
rf_pca_pipeline2.fit(X, y)
with open('model.pkl', 'wb') as f:
  pickle.dump(rf_pca_pipeline2, f)

### Load the model you saved to disk, create a copy of the features in the data, and generate a set of predictions for those features. 

In [30]:
with open('model.pkl', 'rb') as f:
  load_pipe = pickle.load(f)

data_copy = X.copy()
preds = load_pipe.predict(data_copy)