# Machine Learning Best Practices Assignment

In [39]:
import pickle
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

### Import the [Pima Indians Diabetes data set](https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/pima_indians_diabetes.csv).

In [49]:
data = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/pima_indians_diabetes.csv')
data

Unnamed: 0,pregnancies,glucose,blood_pressure,skin_thickness,insulin,bmi,diabetes_pedigree,age,outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


### Split the data into training and test sets, with the target variable being the outcome column.

In [50]:
y = data.outcome
X = data.drop('outcome',axis=1)

### Train a Random Forest Classifier on the data without doing any transformations and print a classification report.

This will provide us with a basis for comparison.

In [56]:
model = RandomForestClassifier(random_state=1)
model.fit(X,y)
print(classification_report(y, model.predict(X)))

1.0

### Reduce the data down to 3 dimensions using PCA. Then do the train-test split, fit the model, and print a classification report.

Compare these results to the previous ones.

In [62]:
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)
X_train, X_test,y_train,y_test = train_test_split(X_pca,y,test_size=0.2)
model = RandomForestClassifier(random_state=1).fit(X_train, y_train)
print(classification_report(y_test,model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.76      0.81      0.79        97
           1       0.64      0.56      0.60        57

    accuracy                           0.72       154
   macro avg       0.70      0.69      0.69       154
weighted avg       0.72      0.72      0.72       154



### Fit the model and print a classification report again, but this time, perform the train-test split before you transform the data using PCA.

Compare these results to the previous ones.

In [63]:
pca = PCA(n_components=3)
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
model = RandomForestClassifier(random_state=1).fit(X_train_pca, y_train)
print(classification_report(y_test,model.predict(X_test_pca)))

              precision    recall  f1-score   support

           0       0.72      0.87      0.79        99
           1       0.63      0.40      0.49        55

    accuracy                           0.70       154
   macro avg       0.68      0.63      0.64       154
weighted avg       0.69      0.70      0.68       154



### Using the Random Forest Classifier, perform 10-fold cross validation on the training set and print the mean cross validation score.

In [65]:
scores = cross_val_score(model, X_train, y_train, cv=10)
scores.mean()

0.745848757271285

### Create a pipeline with a PCA step and a Random Forest Classifier step. Perform the train-test split again, fit the pipeline, and then generate a classification report.

Compare these results to the previous ones.

In [64]:
pipeline = Pipeline([('pca', PCA(n_components=3)),
                     ('rf', RandomForestClassifier(random_state=1))])

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)
pipeline.fit(X_train,y_train)
y_preds=pipeline.predict(X_test)

print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

           0       0.79      0.85      0.82        99
           1       0.69      0.60      0.64        55

    accuracy                           0.76       154
   macro avg       0.74      0.72      0.73       154
weighted avg       0.75      0.76      0.76       154



### Using the pipeline you built, perform 10-fold cross validation on the training set and print the mean cross validation score.

How does this score compare to the previous one?

In [66]:
scores = cross_val_score(pipeline, X_train,y_train,cv=10)
scores.mean()

0.6855367530407191

### Use GridSearchCV to find the optimal set of parameters from the ones below.

- PCA Number of Components: 2, 3, 4, 5, 6, 7, 8
- Random Forest Number of Estimators: 10, 20, 50, 100, 200

In [68]:
pipeline = Pipeline([('pca', PCA()),
                     ('rf', RandomForestClassifier(random_state=1))])
parameters = {'pca__n_components': [2,3,4,5,6,7,8],
              'rf__n_estimators': [10,20,50,100,200],}
search = GridSearchCV(pipeline,parameters,cv=10)
search.fit(X_train,y_train)

print(search.best_score_)
print(search.best_params_)


0.7474881015335801
{'pca__n_components': 6, 'rf__n_estimators': 50}


### Using the best estimator pipeline from above, fit the pipeline to the training set and generate a classification report showing the results.

Compare these results to the previous ones.

In [69]:
pipeline = Pipeline([('pca', PCA(n_components=6)),
                     ('rf', RandomForestClassifier(random_state=1,n_estimators=50))])

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)
pipeline.fit(X_train,y_train)
y_preds=pipeline.predict(X_test)

print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

           0       0.81      0.84      0.83        99
           1       0.69      0.65      0.67        55

    accuracy                           0.77       154
   macro avg       0.75      0.75      0.75       154
weighted avg       0.77      0.77      0.77       154



### Fit the best estimator pipeline to the entire data set and save your model to disk using pickle.

In [70]:
pipeline = search.best_estimator_
pipeline.fit(X,y)
y_pred = pipeline.predict(X_test)
with open('model.pkl', 'wb') as f:
  pickle.dump(pipeline,f)

### Load the model you saved to disk, create a copy of the features in the data, and generate a set of predictions for those features. 

In [77]:
with open('model.pkl', 'rb') as f:
  loaded_pipe = pickle.load(f)
preds = loaded_pipe.predict(X)
data['pred'] = preds
data['result'] = data['outcome'] - data['pred']
data.result.value_counts()

#all predictions are the same as 'outcome' column

0    768
Name: result, dtype: int64