<a href="https://colab.research.google.com/github/swhaley01/Coding-Dojo/blob/main/Week_07/Lecture_02/Challenge/SOLUTIONS_Challenge_GridSearchCV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<center>
<img src="https://course_report_production.s3.amazonaws.com/rich/rich_files/rich_files/2470/s300/cd-logo-blue-600x600.png" alt="Coding Dojo Logo" class="center" height="50">

# Classification - Pipelines and GridSearchCV

*Make a copy of this notebook to edit!*

<img src="https://external-content.duckduckgo.com/iu/?u=https%3A%2F%2Fcdn.searchenginejournal.com%2Fwp-content%2Fuploads%2F2020%2F08%2Fcopy-the-colab-notebook-to-your-google-drive-5f2579179f746.jpg&f=1&nofb=1" alt="Make a copy" class="center" height="300">

</center>

In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the data file
# stored there can be read with an ordinary file path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Task

Your task is to build the best model possible using [this dataset](https://archive.ics.uci.edu/ml/datasets/abalone). Your goal is to predict the `Sex` of each abalone.

In [None]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load the raw abalone data from the mounted Drive.
# header=None: no row of the file is treated as column names, so the columns
# come back numbered 0-8.
abalone_path = '/content/drive/MyDrive/Coding Dojo/Exercises/2 - DS Machine Learning/DS Machine Learning-Week 1/abalone.data'
df = pd.read_csv(abalone_path, header=None)


In [None]:
# Rename columns with a dictionary.
# The frame was read with header=None, so the keys are the positional column
# numbers 0-8; the values are the attribute names of the abalone dataset.
df = df.rename(columns={0: 'Sex',
                        1: 'Length',
                        2: 'Diameter',
                        3: 'Height',
                        4: 'Whole Weight',
                        5: 'Shucked Weight',  # fixed typo: was 'Shgucked Weight'
                        6: 'Viscera Weight',
                        7: 'Shell Weight',
                        8: 'Rings'})

In [None]:
# check for duplicates: counts rows that exactly repeat an earlier row
# (0 means every row is unique)
df.duplicated().sum()

0

In [None]:
# Check the info of the data: dtype and non-null count for every column,
# which shows whether anything is missing or non-numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Sex              4177 non-null   object 
 1   Length           4177 non-null   float64
 2   Diameter         4177 non-null   float64
 3   Height           4177 non-null   float64
 4   Whole Weight     4177 non-null   float64
 5   Shgucked Weight  4177 non-null   float64
 6   Viscera Weight   4177 non-null   float64
 7   Shell Weight     4177 non-null   float64
 8   Rings            4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


In [None]:
# Check the class balance: number of rows for each value of the target 'Sex'
df['Sex'].value_counts()

M    1528
I    1342
F    1307
Name: Sex, dtype: int64

In [None]:
# Separate the target ('Sex') from the feature columns.
y = df['Sex']
X = df.drop(columns='Sex')

# Split into train and test sets.
# stratify=y keeps the class proportions the same in both sets;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [None]:
# instantiate scaler and one hot encoder
scaler = StandardScaler()

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`,
# and 1.4 removed the old name. Try the current spelling first and fall back to
# the old one so the notebook runs on either side of the rename. In both cases
# the encoder returns a dense array and ignores categories unseen during fit.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [None]:
# Build the column selectors: each one picks columns by dtype,
# so no column names have to be listed by hand.
numeric_selector = make_column_selector(dtype_include='number')
category_selector = make_column_selector(dtype_include='object')

In [None]:
# Pair each transformer with the selector for the columns it should handle.
scaler_tuple = scaler, numeric_selector
ohe_tuple = encoder, category_selector

# Combine both pairs in one column transformer; any column matched by neither
# selector is passed through unchanged.
col_transformer = make_column_transformer(
    ohe_tuple,
    scaler_tuple,
    remainder='passthrough',
)

In [None]:
# create and fit a baseline KNN model using the default hyperparameters
# use a pipeline with the column transformer and the model, so the
# preprocessing and the classifier are fit together on the training data
knn = KNeighborsClassifier()

knn_pipe = make_pipeline(col_transformer, knn)

# the fitted pipeline is the last expression, so the cell displays its repr
knn_pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f546c444e90>),
                                                 ('standardscaler',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f546c8fa390>)])),
                ('kneighborsclassifier', KNeighborsClassifier())])

In [None]:
# get parameters out from the pipeline
# the keys have the form '<step name>__<parameter name>', which is the same
# naming used for the hyperparameter grid passed to GridSearchCV
knn_pipe.get_params()

{'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('onehotencoder',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f546c444e90>),
                                 ('standardscaler', StandardScaler(),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f546c8fa390>)]),
 'columntransformer__n_jobs': None,
 'columntransformer__onehotencoder': OneHotEncoder(handle_unknown='ignore', sparse=False),
 'columntransformer__onehotencoder__categories': 'auto',
 'columntransformer__onehotencoder__drop': None,
 'columntransformer__onehotencoder__dtype': numpy.float64,
 'columntransformer__onehotencoder__handle_unknown': 'ignore',
 'columntransformer__onehotencoder__sparse': False,
 'columntransformer__remainde

In [None]:
# Baseline check: score the default (untuned) pipeline on both splits.
# A large gap between the two numbers points to overfitting.
train_preds = knn_pipe.predict(X_train)
test_preds = knn_pipe.predict(X_test)

baseline_train_acc = accuracy_score(y_train, train_preds)
baseline_test_acc = accuracy_score(y_test, test_preds)

print('train accuracy:', baseline_train_acc)
print('\n')
print('test accuracy:', baseline_test_acc)

train accuracy: 0.6819923371647509


test accuracy: 0.5301435406698565


# GridSearchCV
So far we have only tried the default model, so the next step is to tune its hyperparameters to improve our results.
To understand your options, use the documentation; the link is provided below.
It is beyond the scope of this course to discuss all of the hyperparameters, but you can try them out and see how each one affects the results.

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [None]:
# Using the documentation as your guide, define a dictionary of the parameters
# you want to tune and the values you want to try out.
# Each key is '<pipeline step name>__<parameter name>' so the grid search knows
# which step of the pipeline the value belongs to.
params = {
    'kneighborsclassifier__n_neighbors': range(1, 20),
    'kneighborsclassifier__weights': ['uniform', 'distance'],
    'kneighborsclassifier__p': [2, 3, 4],
}

In [None]:
# Instantiate a grid search with the model you want to use and your
# hyperparameter dictionary, then fit it on the training data.
# Every combination in the grid is cross-validated, so depending on how many
# parameters and values you are trying, this may take a while!
knn_grid = GridSearchCV(estimator=knn_pipe, param_grid=params)

knn_grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('onehotencoder',
                                                                         OneHotEncoder(handle_unknown='ignore',
                                                                                       sparse=False),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f546c444e90>),
                                                                        ('standardscaler',
                                                                         StandardScaler(),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f546c8fa390>)])),
                                       ('kneighbo

In [None]:
# Check the optimal combination of hyperparameters found by the grid search
# (the one with the best mean cross-validation score)
knn_grid.best_params_

{'kneighborsclassifier__n_neighbors': 11,
 'kneighborsclassifier__p': 3,
 'kneighborsclassifier__weights': 'distance'}

In [None]:
# extract the best version of the model from the gridsearch object
# (with GridSearchCV's default refit=True this is the whole pipeline, refit on
# all of the training data using the best hyperparameters)
best_knn = knn_grid.best_estimator_

In [None]:
# Run a classification report on both train and test datasets for the best model.
# How is your model doing?  Did the performance improve? How is the overfitting?
train_preds = best_knn.predict(X_train)
test_preds = best_knn.predict(X_test)

# Build both reports first, then print them under their headings.
train_report = classification_report(y_train, train_preds)
test_report = classification_report(y_test, test_preds)

print('Best Model Train Metrics')
print(train_report)
print('\nBest Model Test Metrics')
print(test_report)

Best Model Train Metrics
              precision    recall  f1-score   support

           F       1.00      1.00      1.00       980
           I       1.00      1.00      1.00      1006
           M       1.00      1.00      1.00      1146

    accuracy                           1.00      3132
   macro avg       1.00      1.00      1.00      3132
weighted avg       1.00      1.00      1.00      3132


Best Model Test Metrics
              precision    recall  f1-score   support

           F       0.43      0.35      0.39       327
           I       0.70      0.74      0.72       336
           M       0.46      0.51      0.48       382

    accuracy                           0.53      1045
   macro avg       0.53      0.53      0.53      1045
weighted avg       0.53      0.53      0.53      1045



In [None]:
# Print the accuracy score for both the training and testing sets.
# train_preds / test_preds are the tuned model's predictions from the cell above.
best_train_acc = accuracy_score(y_train, train_preds)
best_test_acc = accuracy_score(y_test, test_preds)

print('Best Model Train Metrics')
print(best_train_acc)
print('\nBest Model Test Metrics')
print(best_test_acc)

Best Model Train Metrics
1.0

Best Model Test Metrics
0.5320574162679426


In [None]:
# Your turn! Try creating a pipeline with gridsearchCV using two other classification models! 
# Remember to explore the hyperparameter options in the documentation for the model