# Classification Workflow with Pipelines

Let's add pipelines into our workflow! This is an example of an end-to-end, data-to-model process with all the tools we've discussed so far, including Pipeline, Column Transformers, Grid Search, SMOTE, and more.

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

In [None]:
titanic = pd.read_csv('data/titanic.csv')
titanic.head()

In [None]:
# Separate the target from the predictors. The passenger id and name
# columns are left out of the predictors along with the target itself.
target_col = 'Survived'
y = titanic[target_col]
X = titanic.drop(columns=['PassengerId', 'Name', target_col])

# Hold out a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

## Using `Pipeline` and `FunctionTransformer`

Let's try a very simple pipeline first.

In [None]:
def grab_numeric(df):
    """Return only the float- and int-typed columns of ``df``."""
    wanted_dtypes = ['float', 'int']
    numeric_only = df.select_dtypes(include=wanted_dtypes)
    return numeric_only

In [None]:
# The FunctionTransformer will turn my function
# into a transformer.

GrabNumeric = FunctionTransformer(grab_numeric)

In [None]:
pipe = Pipeline(steps=[('num', GrabNumeric),
                       ('ss', StandardScaler())])

In [None]:
pipe.fit(X_train)

In [None]:
pipe.transform(X_train)

That looks like it worked!

## Using `Pipeline` and `ColumnTransformer`

When we use the `ColumnTransformer` we'll want to choose the relevant column numbers, so let's remind ourselves which columns are where:

In [None]:
X.head()

In [None]:
# Two mini-pipelines, one per kind of column. Both get plugged into the
# ColumnTransformer.

# Numeric columns: fill in missing values (SimpleImputer with its default
# settings), then standardize.
subpipe_num = Pipeline(
    steps=[
        ('num_impute', SimpleImputer()),
        ('ss', StandardScaler()),
    ]
)

# Categorical columns: fill in missing values with the most frequent value,
# then one-hot encode. `handle_unknown='ignore'` keeps the encoder from
# raising on categories it did not see while fitting.
subpipe_cat = Pipeline(
    steps=[
        ('cat_impute', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore')),
    ]
)

In [None]:
# Each transformer is given the positions (within X) of the columns it
# should handle. `remainder='passthrough'` tells the ColumnTransformer to
# leave any column that is not listed below unchanged.
CT = ColumnTransformer(
    transformers=[
        ('subpipe_num', subpipe_num, [2, 3, 4, 6]),
        ('subpipe_cat', subpipe_cat, [0, 1, 5, 7, 8]),
    ],
    remainder='passthrough',
)

In [None]:
# The `ColumnTransformer` will take care of our preprocessing,
# so now we can add our model at the end of the pipeline.

logreg_model_pipe = Pipeline(steps=[('ct', CT),
                            ('logreg', LogisticRegression(random_state=42))])

In [None]:
logreg_model_pipe.fit(X_train, y_train)

In [None]:
logreg_model_pipe.score(X_train, y_train)

## Bringing in Our Modeling Class from before

In [None]:
class ModelWithCV():
    '''Structure to save the model and more easily see its cross-validation.

    Holds an estimator together with the data it is evaluated on, and keeps
    the cross-validation scores plus their mean, median and standard
    deviation once `cross_validate` has run.
    '''

    def __init__(self, model, model_name, X, y, cv_now=True):
        '''
        Store the model and data, optionally cross-validating right away.

        Args:
          model:
            Estimator (or pipeline) that is handed to `cross_val_score`
          model_name:
            Label used in the printed summary and in the plot title
          X:
            Default feature data to cross-validate on
          y:
            Default target values to cross-validate on
          cv_now:
            Optional; if True (default), run `cross_validate` immediately
        '''
        self.model = model
        self.name = model_name
        self.X = X
        self.y = y
        # For CV results (all stay None until `cross_validate` has run)
        self.cv_results = None
        self.cv_mean = None
        self.cv_median = None
        self.cv_std = None
        #
        if cv_now:
            self.cross_validate()

    def cross_validate(self, X=None, y=None, kfolds=10):
        '''
        Perform cross-validation and store the results on the instance.

        Sets `cv_results` (the per-fold scores) as well as `cv_mean`,
        `cv_median` and `cv_std`. Nothing is returned.

        Args:
          X:
            Optional; Training data to perform CV on. Otherwise use X from object
          y:
            Optional; Target values to perform CV on. Otherwise use y from object
          kfolds:
            Optional; Number of folds for CV (default is 10)
        '''

        # Compare against None explicitly: the truth value of a DataFrame,
        # Series or multi-element array is ambiguous, so `X if X else ...`
        # raises ValueError as soon as real data is passed in.
        cv_X = self.X if X is None else X
        cv_y = self.y if y is None else y

        self.cv_results = cross_val_score(self.model, cv_X, cv_y, cv=kfolds)
        self.cv_mean = np.mean(self.cv_results)
        self.cv_median = np.median(self.cv_results)
        self.cv_std = np.std(self.cv_results)

    def print_cv_summary(self):
        '''
        Print the mean and standard deviation of the stored CV scores.

        Requires `cross_validate` to have run; the summary values are None
        before that and cannot be formatted as floats.
        '''
        cv_summary = (
        f'''CV Results for `{self.name}` model:
            {self.cv_mean:.5f} ± {self.cv_std:.5f} accuracy
        ''')
        print(cv_summary)

    def plot_cv(self, ax):
        '''
        Plot the cross-validation values using the array of results and given
        Axis for plotting.

        Draws a violin plot of the scores with the individual fold scores
        overlaid as a swarm plot, and returns the same Axis.
        '''
        ax.set_title(f'CV Results for `{self.name}` Model')
        # Thinner violinplot with higher bw
        sns.violinplot(y=self.cv_results, ax=ax, bw=.4)
        sns.swarmplot(
                y=self.cv_results,
                color='orange',
                size=10,
                alpha= 0.8,
                ax=ax
        )

        return ax

In [None]:
log_pipe = ModelWithCV(logreg_model_pipe, model_name='log_pipe', X=X_train, y=y_train)

In [None]:
fig, ax = plt.subplots()

log_pipe.plot_cv(ax=ax);

## Trying Other Models at the End of the Pipeline

Can I have multiple models in a single pipeline? Yes. We'll forgo this here, but for more on this see [here](https://stackoverflow.com/questions/48507651/multiple-classification-models-in-a-scikit-pipeline-python).

### Random Forest

In [None]:
rfc = RandomForestClassifier(random_state=42)

rfc_model_pipe = Pipeline([('ct', CT), ('rfc', rfc)])

In [None]:
rfc_model_pipe.fit(X_train, y_train)

In [None]:
rfc_model_pipe.score(X_train, y_train)

In [None]:
forest_pipe = ModelWithCV(model=rfc_model_pipe,
                          model_name='forest_pipe',
                          X=X_train,
                          y=y_train)

In [None]:
fig, ax = plt.subplots()

forest_pipe.plot_cv(ax=ax);

### Gradient Booster

In [None]:
gbc_model_pipe = Pipeline([('ct', CT), ('gbc', GradientBoostingClassifier(random_state=42))])

In [None]:
gbc_model_pipe.fit(X_train, y_train)

In [None]:
gbc_model_pipe.score(X_train, y_train)

In [None]:
boost_pipe = ModelWithCV(model=gbc_model_pipe,
                         model_name='boost_pipe',
                         X=X_train,
                         y=y_train)

In [None]:
fig, ax = plt.subplots()

boost_pipe.plot_cv(ax=ax);

## Tuning and Cross-Validating

In [None]:
# Hyperparameter grid. The `rfc__` prefix addresses the pipeline step
# named 'rfc'.
params = {
    'rfc__criterion': ['gini', 'entropy'],
    'rfc__min_samples_leaf': [1, 5, 10],
}

# Search the grid over the whole pipeline using 10 cross-validation folds.
gs = GridSearchCV(estimator=rfc_model_pipe, param_grid=params, cv=10)

In [None]:
gs.fit(X_train, y_train)

In [None]:
gs.best_params_

In [None]:
# Mean test score for each of six models

gs.cv_results_['mean_test_score']

## `imblearn` Pipelines

### Dealing with the Target Imbalance

One big advantage of the process outlined above is that, by putting the preprocessing inside the Pipeline, it lets us use sklearn's cross validator (and thus, GridSearchCV and related tuning tools) without causing data leakage to the validation slice.  This is because Pipelines let us apply steps like StandardScaler after the cross val has taken in the unprocessed data and split it into the training and validation slices. This is important because tools like StandardScaler look at all the rows in a training set upon which they are fit. Then, once we've fit the Scaler, we have to transform the x data of our test and/or validation set using the fit we found with the training data. Pipelines make this easy.

However, there are preprocessing steps we might want to take and NOT apply to our test/validation set. One example of this is SMOTE. SMOTE creates synthetic data points for training our models, but we always want to test on 'real' data, that is, un-SMOTEd test/validation data. So Pipelines help us by baking the transformations into our process so that we can treat our X_test just as we treat our X_train, but that can hurt us when we want to use something like SMOTE and *not* apply to the X_test a step that we applied to the X_train.

Enter the [imblearn Pipeline](https://imbalanced-learn.org/stable/references/generated/imblearn.pipeline.Pipeline.html#imblearn.pipeline.Pipeline). Remember that imblearn is just an offshoot of sklearn with some special sauce for imbalanced datasets. One of the advantages that imblearn offers is a distinction between 'resamplers' and 'transformers'. An imblearn.Pipeline is just like an sklearn.Pipeline except that it will *only apply the resamplers during a fit()*. This means we can put SMOTE or another resampler into an imblearn Pipeline and not end up accidentally 'validating' our model on a bunch of synthetic points.

Let's use `SMOTE()`:

In [None]:
y_train.value_counts()

In [None]:
sm = SMOTE(sampling_strategy='auto', random_state=42)

In [None]:
sm2 = SMOTE(sampling_strategy=0.8, random_state=42)

To make things simple (and avoid errors!), let's just grab the numeric types and eliminate the NaNs from X_train:

In [None]:
# Keep only the float/int columns and discard every row that still has a
# missing value, then select the target values for the rows that remain.
numeric_part = X_train.select_dtypes(['float', 'int'])
X_train_clean = numeric_part.dropna()
y_train_clean = y_train.loc[X_train_clean.index]

In [None]:
y_train_clean.value_counts()

In [None]:
# Even distribution

X_clean_resmp, y_clean_resmp = sm.fit_resample(X_train_clean, y_train_clean)

y_clean_resmp.value_counts()

In [None]:
# Distribution where count of 1's = 0.8 * count of 0's

X_clean_resmp2, y_clean_resmp2 = sm2.fit_resample(X_train_clean, y_train_clean)

y_clean_resmp2.value_counts()

### `imblearn` Pipeline

Of course, we want to be able to perform all of our preprocessing steps from above, but just now add `SMOTE`. Good thing we can throw it all into a pipeline!

In [None]:
imb_pipe = ImPipeline(steps=[('ct', CT),
                             ('sm', SMOTE(random_state=42)),
                            ('rfc', RandomForestClassifier(random_state=42))])

In [None]:
# When we use .fit() here it will engage the SMOTE part of the pipeline.
imb_pipe.fit(X_train, y_train)

In [None]:
# But it won't when we call .score()
imb_pipe.score(X_train, y_train)

### Gridsearching

In [None]:
parameters = {'rfc__criterion': ['gini', 'entropy'],
          'rfc__min_samples_leaf': [1, 5, 10],
          'sm__k_neighbors': [3, 5, 9]}

gs = GridSearchCV(estimator=imb_pipe,
                 param_grid=parameters,
                 cv=10)

In [None]:
# Likewise here, the Pipeline should use SMOTE on the training slices of the cross-val, but not on the validation holdout used to score each version of the GridSearch.
gs.fit(X_train, y_train)

In [None]:
gs.best_params_

## Evaluation on Test Set

Let's suppose that we choose our final model from this last GridSearch run. Note that the optimal parameters are also the default values!

In [None]:
# Take the pipeline that the grid search actually selected. With
# GridSearchCV's default `refit=True` it has already been refit on all of
# X_train using the best parameters, so this no longer relies on the winning
# parameters happening to equal `imb_pipe`'s defaults.
final_model = gs.best_estimator_

In [None]:
plot_confusion_matrix(final_model, X_test, y_test);

In [None]:
y_hat = final_model.predict(X_test)

In [None]:
# Score the test-set predictions once, rounded to two decimals, and then
# report all four metrics together.
test_accuracy = round(accuracy_score(y_test, y_hat), 2)
test_recall = round(recall_score(y_test, y_hat), 2)
test_precision = round(precision_score(y_test, y_hat), 2)
test_f1 = round(f1_score(y_test, y_hat), 2)

print(f"""
Our final model's accuracy on the test set is {test_accuracy}. \n
Our final model's recall on the test set is {test_recall} \n
Our final model's precision on the test set is {test_precision} \n
Our final model's f1-score on the test is {test_f1}.
""")

## Exercise: Your Turn!

Use SMOTE and an estimator (model) of your choice in a pipeline to model the exoplanets' method of discovery ("method"). You can build a model one feature at a time or just throw them all in from the beginning. Consider using the LabelEncoder (from sklearn.preprocessing) to code up the target. You'll also need to make a choice about how to handle the null values.

In [None]:
exos = sns.load_dataset('planets')

In [None]:
### Your work here






