# Machine Learning Best Practices Assignment

In [40]:
import pickle
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

### Import the [Pima Indians Diabetes data set](https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/pima_indians_diabetes.csv).

In [41]:
data = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/pima_indians_diabetes.csv')
data.shape

(768, 9)

### Split the data into training and test sets, with the target variable being the outcome column.

In [42]:
y = data['outcome']
X = data.drop(columns=['outcome']).copy()

#Split the data.
SIZE = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=SIZE)
print('There are {:d} training samples and {:d} test samples'.format(X_train.shape[0], X_test.shape[0]))

There are 614 training samples and 154 test samples


### Train a Random Forest Classifier on the data without doing any transformations and print a classification report.

This will provide us with a basis for comparison.

In [43]:
model = RandomForestClassifier(random_state=1)

In [44]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.83      0.85       105
           1       0.67      0.73      0.70        49

    accuracy                           0.80       154
   macro avg       0.77      0.78      0.77       154
weighted avg       0.81      0.80      0.80       154



### Reduce the data down to 3 dimensions using PCA. Then do the train-test split, fit the model, and print a classification report.

Compare these results to the previous ones.

In [45]:
#principle component analysis, scree plot.
pca = PCA(n_components = 3).fit_transform(X)

#Split the data.
SIZE = 0.2
pca_train, pca_test, y_train, y_test = train_test_split(pca, y, test_size=SIZE)
print('There are {:d} training samples and {:d} test samples'.format(pca_train.shape[0], pca_test.shape[0]))

There are 614 training samples and 154 test samples


In [46]:
model.fit(pca_train, y_train)
y_pred = model.predict(pca_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.83      0.80        99
           1       0.65      0.56      0.60        55

    accuracy                           0.73       154
   macro avg       0.71      0.70      0.70       154
weighted avg       0.73      0.73      0.73       154



### Fit the model and print a classification report again, but this time, perform the train-test split before you transform the data using PCA.

Compare these results to the previous ones.

In [47]:
#Split the data.
SIZE = 0.2
pca_train, pca_test, y_train, y_test = train_test_split(X, y, test_size=SIZE)
print('There are {:d} training samples and {:d} test samples'.format(pca_train.shape[0], pca_test.shape[0]))

There are 614 training samples and 154 test samples


In [48]:
#principle component analysis
pca = PCA(n_components = 3)
train_comp = pca.fit_transform(pca_train)
test_comp = pca.transform(pca_test)

model.fit(pca_train, y_train)
y_pred = model.predict(pca_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.88      0.84        99
           1       0.74      0.62      0.67        55

    accuracy                           0.79       154
   macro avg       0.77      0.75      0.76       154
weighted avg       0.78      0.79      0.78       154



### Using the Random Forest Classifier, perform 10-fold cross validation on the training set and print the mean cross validation score.

In [49]:
score_train = cross_val_score(model, X_train, y_train, cv=10)
print("training: {}".format(score_train.mean()))

training: 0.6596509783183502


### Create a pipeline with a PCA step and a Random Forest Classifier step. Perform the train-test split again, fit the pipeline, and then generate a classification report.

Compare these results to the previous ones.

In [50]:
pipeline = Pipeline([('pca', PCA(n_components=3)),
                     ('rf', RandomForestClassifier(random_state=1))])

#Split the data.
SIZE = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=SIZE)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.87      0.80       101
           1       0.64      0.43      0.52        53

    accuracy                           0.72       154
   macro avg       0.69      0.65      0.66       154
weighted avg       0.71      0.72      0.70       154



I have to omit the very first one due to the support balance. In comparison o  the others - the score looks better; actually. 

### Using the pipeline you built, perform 10-fold cross validation on the training set and print the mean cross validation score.

How does this score compare to the previous one?

In [51]:
score_train = cross_val_score(pipeline, X_train, y_train, cv=10)
print("training: {}".format(score_train.mean()))

training: 0.7246694870438921


### Use GridSearchCV to find the optimal set of parameters from the ones below.

- PCA Number of Components: 2, 3, 4, 5, 6, 7, 8
- Random Forest Number of Estimators: 10, 20, 50, 100, 200

In [52]:
pipeline = Pipeline([('pca', PCA()),
                     ('rf', RandomForestClassifier(random_state=1))])
params = { 'pca__n_components' :  [2, 3, 4, 5, 6, 7, 8],
           'rf__n_estimators' : [10, 20, 50, 100, 200]
          }

search = GridSearchCV(pipeline, params, cv=10)
search.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('rf',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
      

### Using the best estimator pipeline from above, fit the pipeline to the training set and generate a classification report showing the results.

Compare these results to the previous ones.

In [53]:
pipeline = search.best_estimator_

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.83      0.80       101
           1       0.61      0.51      0.56        53

    accuracy                           0.72       154
   macro avg       0.69      0.67      0.68       154
weighted avg       0.71      0.72      0.71       154



Better than some worse than a few others. Not bad. 

### Fit the best estimator pipeline to the entire data set and save your model to disk using pickle.

In [54]:
pipeline.fit(X,y)

with open("model.pkl", 'wb') as f:
  pickle.dump(pipeline, f)

### Load the model you saved to disk, create a copy of the features in the data, and generate a set of predictions for those features. 

In [60]:
from sklearn.utils import resample
new_data = resample(data, random_state=42, replace=True, n_samples=2800)
py = new_data['outcome']
nX = new_data.drop(columns='outcome').copy()

with open("model.pkl", 'rb') as f:
  loaded_pipe = pickle.load(f)

preds = loaded_pipe.predict(nX)

In [61]:
print(classification_report(py, preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1812
           1       1.00      1.00      1.00       988

    accuracy                           1.00      2800
   macro avg       1.00      1.00      1.00      2800
weighted avg       1.00      1.00      1.00      2800



This would probably be more fun with a larger data set, one that had a subset prior to running through this whole process.

I forgot how large you said a good size of data is to work with - but whatever that is, double it. Then split it in two. Do all the necessary work on one set and then have a look at how this pickle does on the untouched data. I can even run a classification report in that scenario. Sigh. 

Otherwise, I just get to witness how I'm still going to overfit the data if I don't have any new data. 

I can assume it's a hundred percent correct. 

:'(

Oh. It is 100% correct, right? Because I resampled it and then ran the report?

Sigh. 

In [69]:
#just wanted to look at something. 
results = pd.DataFrame(search.cv_results_)

In [70]:
results.describe(include="all")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_pca__n_components,param_rf__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
count,35.0,35.0,35.0,35.0,35.0,35.0,35,35.0,35.0,35.0,35.0,35.0,35.0,35.0,35.0,35.0,35.0,35.0,35.0,35.0
unique,,,,,7.0,5.0,35,,,,,,,,,,,,,
top,,,,,8.0,20.0,"{'pca__n_components': 6, 'rf__n_estimators': 100}",,,,,,,,,,,,,
freq,,,,,5.0,7.0,1,,,,,,,,,,,,,
mean,0.163131,0.003184,0.008249,0.000528,,,,0.765438,0.728571,0.8,0.745622,0.733021,0.755972,0.748009,0.715222,0.674005,0.72459,0.739045,0.046238,17.971429
std,0.134631,0.004355,0.005505,0.00113,,,,0.034845,0.034876,0.035682,0.033213,0.023037,0.066623,0.053059,0.049514,0.035512,0.042786,0.023769,0.008632,10.239731
min,0.021069,0.000173,0.002504,3.8e-05,,,,0.693548,0.66129,0.725806,0.677419,0.704918,0.655738,0.655738,0.639344,0.590164,0.606557,0.685431,0.029493,1.0
25%,0.048068,0.001011,0.004104,0.000137,,,,0.741935,0.709677,0.790323,0.725806,0.721311,0.688525,0.704918,0.672131,0.655738,0.704918,0.723678,0.040129,9.5
50%,0.128226,0.001949,0.005746,0.000219,,,,0.758065,0.725806,0.806452,0.741935,0.721311,0.770492,0.754098,0.721311,0.672131,0.721311,0.742597,0.046761,18.0
75%,0.232955,0.00412,0.009755,0.000318,,,,0.790323,0.75,0.822581,0.774194,0.754098,0.819672,0.786885,0.770492,0.704918,0.745902,0.759704,0.051483,26.5
