# Machine Learning Best Practices Assignment

In [1]:
import pickle
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

### Import the [Pima Indians Diabetes data set](https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/pima_indians_diabetes.csv).

In [2]:
data = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/pima_indians_diabetes.csv')

In [3]:
print(data.shape)
data.head()

(768, 9)


Unnamed: 0,pregnancies,glucose,blood_pressure,skin_thickness,insulin,bmi,diabetes_pedigree,age,outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### Split the data into training and test sets, with the target variable being the outcome column.

In [4]:
X = data.drop(['outcome'], axis=1)
y = data.outcome

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Train a Random Forest Classifier on the data without doing any transformations and print a classification report.

This will provide us with a basis for comparison.

In [5]:
forest = RandomForestClassifier(random_state=1)

In [6]:
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.85      0.83       101
           1       0.68      0.60      0.64        53

    accuracy                           0.77       154
   macro avg       0.74      0.73      0.73       154
weighted avg       0.76      0.77      0.76       154



### Reduce the data down to 3 dimensions using PCA. Then do the train-test split, fit the model, and print a classification report.

Compare these results to the previous ones.

In [7]:
#pca
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

#split
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2)

In [8]:
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.78      0.76       100
           1       0.56      0.52      0.54        54

    accuracy                           0.69       154
   macro avg       0.66      0.65      0.65       154
weighted avg       0.68      0.69      0.69       154



*Precision and recall scores for the test set goes down when using PCA before train test split.*

### Fit the model and print a classification report again, but this time, perform the train-test split before you transform the data using PCA.

Compare these results to the previous ones.

In [11]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#pca
pca = PCA(n_components=3)
X_tr_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

#model
forest.fit(X_tr_pca, y_train)
y_pred = forest.predict(X_test_pca)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.87      0.83        99
           1       0.71      0.58      0.64        55

    accuracy                           0.77       154
   macro avg       0.75      0.73      0.73       154
weighted avg       0.76      0.77      0.76       154



*The precision and recall is better than when pca was performed before the train test split, but it seems to have similar numbers as when the model was fitted without any transformation*

### Using the Random Forest Classifier, perform 10-fold cross validation on the training set and print the mean cross validation score.

In [12]:
#cv=10
#cv with reg train
scores = cross_val_score(forest, X_train, y_train, cv=10)
print('Cross-validation score (10-folds):', scores.mean())

Cross-validation score (10-folds): 0.770386039132734


In [13]:
#cv with train pca
scores = cross_val_score(forest, X_tr_pca, y_train, cv=10)
print('Cross-validation score (10-folds):', scores.mean())

Cross-validation score (10-folds): 0.7183765203595982


### Create a pipeline with a PCA step and a Random Forest Classifier step. Perform the train-test split again, fit the pipeline, and then generate a classification report.

Compare these results to the previous ones.

In [14]:
pipeline = Pipeline([('pca', PCA(n_components=3)),
                     ('forest', RandomForestClassifier(random_state=1))])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86       107
           1       0.71      0.57      0.64        47

    accuracy                           0.80       154
   macro avg       0.77      0.74      0.75       154
weighted avg       0.79      0.80      0.79       154



*The scores at least for the class "0" are better than any previous reports, but the class balance is a little worse than the others.*

### Using the pipeline you built, perform 10-fold cross validation on the training set and print the mean cross validation score.

How does this score compare to the previous one?

In [15]:
scores = cross_val_score(pipeline, X_train, y_train, cv=10)
print('Cross-validation score (10-folds):', scores.mean())

Cross-validation score (10-folds): 0.6952670544685351


*The cv score from the pipeline is lower than the previous one.*

### Use GridSearchCV to find the optimal set of parameters from the ones below.

- PCA Number of Components: 2, 3, 4, 5, 6, 7, 8
- Random Forest Number of Estimators: 10, 20, 50, 100, 200

In [23]:
pipeline = Pipeline([('pca', PCA()),
                     ('forest', RandomForestClassifier(random_state=1))])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

parameters = {'pca__n_components': [2, 3, 4, 5, 6, 7, 8],
              'forest__n_estimators': [10, 20, 50, 100, 200],
              }

search = GridSearchCV(pipeline, parameters, cv=10)
search.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('forest',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
  

In [26]:
print('Grid best CV score:', search.best_score_)
print('Grid best parameters:', search.best_params_)

Grid best CV score: 0.7557905869910101
Grid best parameters: {'forest__n_estimators': 100, 'pca__n_components': 6}


### Using the best estimator pipeline from above, fit the pipeline to the training set and generate a classification report showing the results.

Compare these results to the previous ones.

In [25]:
pipeline = search.best_estimator_

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.82      0.81        97
           1       0.69      0.65      0.67        57

    accuracy                           0.76       154
   macro avg       0.74      0.74      0.74       154
weighted avg       0.76      0.76      0.76       154



### Fit the best estimator pipeline to the entire data set and save your model to disk using pickle.

In [27]:
import pickle

pipeline.fit(X, y)

with open('model.pkl', 'wb') as f:
  pickle.dump(pipeline, f)

### Load the model you saved to disk, create a copy of the features in the data, and generate a set of predictions for those features. 

In [28]:
copy = X.copy()

with open('model.pkl', 'rb') as f:
  loaded_pipe = pickle.load(f)

preds = loaded_pipe.predict(copy)