# Machine Learning Best Practices Assignment

In [1]:
import pickle
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

### Import the [Pima Indians Diabetes data set](https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/pima_indians_diabetes.csv).

In [2]:
data = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/pima_indians_diabetes.csv')
data

### Split the data into training and test sets, with the target variable being the outcome column.

In [6]:
y = data.outcome
X = data.drop('outcome',axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Train a Random Forest Classifier on the data without doing any transformations and print a classification report.

This will provide us with a basis for comparison.

In [10]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.78      0.78      0.78       102
           1       0.58      0.58      0.58        52

    accuracy                           0.71       154
   macro avg       0.68      0.68      0.68       154
weighted avg       0.71      0.71      0.71       154



### Reduce the data down to 3 dimensions using PCA. Then do the train-test split, fit the model, and print a classification report.

Compare these results to the previous ones.

In [17]:
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_pca,y,test_size=0.2)

model = RandomForestClassifier().fit(X_train, y_train)
print(classification_report(y_test,model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.80      0.75      0.77       109
           1       0.47      0.53      0.50        45

    accuracy                           0.69       154
   macro avg       0.63      0.64      0.64       154
weighted avg       0.70      0.69      0.69       154



### Fit the model and print a classification report again, but this time, perform the train-test split before you transform the data using PCA.

Compare these results to the previous ones.

In [15]:
pca = PCA(n_components=3)

X_train, X_test,y_train,y_test = train_test_split(X, y, test_size=0.2)

X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

model = RandomForestClassifier().fit(X_train_pca, y_train)
print(classification_report(y_test,model.predict(X_test_pca)))

              precision    recall  f1-score   support

           0       0.71      0.85      0.77        91
           1       0.70      0.51      0.59        63

    accuracy                           0.71       154
   macro avg       0.70      0.68      0.68       154
weighted avg       0.71      0.71      0.70       154



### Using the Random Forest Classifier, perform 10-fold cross validation on the training set and print the mean cross validation score.

In [16]:
scores = cross_val_score(model, X_train, y_train, cv=10)
scores.mean()

0.7524590163934426

### Create a pipeline with a PCA step and a Random Forest Classifier step. Perform the train-test split again, fit the pipeline, and then generate a classification report.

Compare these results to the previous ones.

In [14]:
pipeline = Pipeline([('pca', PCA(n_components=3)),
                     ('rf', RandomForestClassifier())])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

pipeline.fit(X_train, y_train)
y_preds = pipeline.predict(X_test)
print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

           0       0.79      0.85      0.82        99
           1       0.69      0.60      0.64        55

    accuracy                           0.76       154
   macro avg       0.74      0.72      0.73       154
weighted avg       0.75      0.76      0.76       154



### Using the pipeline you built, perform 10-fold cross validation on the training set and print the mean cross validation score.

How does this score compare to the previous one?

In [18]:
scores = cross_val_score(pipeline, X_train, y_train, cv=10)
scores.mean()

0.7537546271813855

### Use GridSearchCV to find the optimal set of parameters from the ones below.

- PCA Number of Components: 2, 3, 4, 5, 6, 7, 8
- Random Forest Number of Estimators: 10, 20, 50, 100, 200

In [20]:
pipeline = Pipeline([('pca', PCA()),
                     ('rf', RandomForestClassifier())])
parameters = {'pca__n_components': [2,3,4,5,6,7,8],
              'rf__n_estimators': [10,20,50,100,200],}
search = GridSearchCV(pipeline, parameters, cv=10)
search.fit(X_train, y_train)

print(search.best_score_)
print(search.best_params_)

ValueError: n_components=4 must be between 0 and min(n_samples, n_features)=3 with svd_solver='full'

ValueError: n_components=4 must be between 0 and min(n_samples, n_features)=3 with svd_solver='full'

ValueError: n_components=4 must be between 0 and min(n_samples, n_features)=3 with svd_solver='full'

ValueError: n_components=4 must be between 0 and min(n_samples, n_features)=3 with svd_solver='full'

ValueError: n_components=4 must be between 0 and min(n_samples, n_features)=3 with svd_solver='full'

ValueError: n_components=4 must be between 0 and min(n_samples, n_features)=3 with svd_solver='full'

ValueError: n_components=4 must be between 0 and min(n_samples, n_features)=3 with svd_solver='full'

ValueError: n_components=4 must be between 0 and min(n_samples, n_features)=3 with svd_solver='full'

ValueError: n_components=4 must be between 0 and min(n_samples, n_features)=3 with svd_solver='full'

ValueError: n_components=4 must be between 0 and min(n_samples, n_features)=3 with

0.7620042305658382
{'pca__n_components': 3, 'rf__n_estimators': 20}


### Using the best estimator pipeline from above, fit the pipeline to the training set and generate a classification report showing the results.

Compare these results to the previous ones.

In [21]:
pipeline = Pipeline([('pca', PCA(n_components=6)),
                     ('rf', RandomForestClassifier(n_estimators=50))])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
pipeline.fit(X_train,y_train)
y_preds = pipeline.predict(X_test)

print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.80      0.85      0.82       105
           1       0.63      0.55      0.59        49

    accuracy                           0.75       154
   macro avg       0.71      0.70      0.71       154
weighted avg       0.75      0.75      0.75       154



### Fit the best estimator pipeline to the entire data set and save your model to disk using pickle.

In [23]:
pipeline = search.best_estimator_
pipeline.fit(X, y)
y_pred = pipeline.predict(X_test)
with open('model.pkl', 'wb') as f:
  pickle.dump(pipeline,f)

### Load the model you saved to disk, create a copy of the features in the data, and generate a set of predictions for those features. 

In [24]:
with open('model.pkl', 'rb') as f:
  loaded_pipe = pickle.load(f)
preds = loaded_pipe.predict(X)
data['pred'] = preds
data['result'] = data['outcome'] - data['pred']
data.result.value_counts()

 0    763
 1      4
-1      1
Name: result, dtype: int64