In [1]:
from __future__ import annotations

# Exercise: NLP Pipeline with Scikit-learn

# Preparation

We'll first want to make sure spaCy is ready to use.

In [2]:
# ! python -m spacy download en_core_web_sm

In [3]:
import spacy

# Load the `en_core_web_sm` spaCy pipeline once; the same `nlp` object is
# handed to the lemmatizing transformer later in the notebook.
nlp = spacy.load('en_core_web_sm')

## Data Preparation

Let's also read some data into a pandas DataFrame.

In [4]:
import pandas as pd

# The path is relative to the directory the notebook is run from.
df = pd.read_csv('../data/reviews.csv')

# `info()` prints the column / dtype summary; the trailing `head()` is the
# cell's displayed value (first rows of the frame).
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9101 entries, 0 to 9100
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   review            9101 non-null   object
 1   votes_helpful     9101 non-null   int64 
 2   votes_unhelpful   9101 non-null   int64 
 3   date_day_of_week  9101 non-null   object
 4   date_month        9101 non-null   object
 5   date_year         9101 non-null   int64 
 6   recommend         9101 non-null   int64 
dtypes: int64(4), object(3)
memory usage: 497.8+ KB


Unnamed: 0,review,votes_helpful,votes_unhelpful,date_day_of_week,date_month,date_year,recommend
0,"Not much to write about here, but it does exac...",0,0,Friday,February,2014,1
1,The product does exactly as it should and is q...,13,14,Saturday,March,2013,1
2,The primary job of this device is to block the...,1,1,Wednesday,August,2013,1
3,Nice windscreen protects my MXL mic and preven...,0,0,Friday,February,2014,1
4,This pop filter is great. It looks and perform...,0,0,Friday,February,2014,1


### Preparing features (`X`) & target (`y`)

In [5]:
# Alias so the rest of the cell reads in terms of `data`
data = df

# The `recommend` column is the label; every remaining column is a feature
y = data['recommend'].copy()
X = data.drop(columns='recommend')

print('Labels:', y.unique())
print('Features:')
display(X.head())

Labels: [1 0]
Features:


Unnamed: 0,review,votes_helpful,votes_unhelpful,date_day_of_week,date_month,date_year
0,"Not much to write about here, but it does exac...",0,0,Friday,February,2014
1,The product does exactly as it should and is q...,13,14,Saturday,March,2013
2,The primary job of this device is to block the...,1,1,Wednesday,August,2013
3,Nice windscreen protects my MXL mic and preven...,0,0,Friday,February,2014
4,This pop filter is great. It looks and perform...,0,0,Friday,February,2014


Next we need to split the data into train & test sets so we can evaluate our
end model's performance.

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as the test set. The rows are shuffled first and the
# fixed seed makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=27
)

# Building a Pipeline: Splitting Numerical, Categorical, and Text Data

We need to separate the data into the different feature types so we can better
process & utilize them as features for our model.
Depending on the situation, you may instead use only certain features & feature
types, or even do more feature engineering by combining the given data columns!

However, in this scenario, we're going to have you define the following feature
groups:

- Numerical: `num_features`
- Categorical: `cat_features`
- Text: `text_features`

In [7]:
# This will be useful to use in creating a pipeline
from sklearn.pipeline import Pipeline

In [8]:
# Group the feature columns by type: numerical, categorical, and text.

# Numerical: select numeric dtypes explicitly. Selecting "everything that is
# not `object`" would also pick up boolean, datetime, categorical, or dedicated
# string-dtype columns, none of which belong in the numerical pipeline.
num_features = (
    X
    .select_dtypes(include='number').columns
    .drop(
        [
            'date_year', # More of category than a numerical feature
        ],
    )
)
print('Numerical features:', num_features)

# Categorical: the date parts. Indexing `X` (instead of writing a bare list)
# raises immediately if any of these columns is missing from the data.
cat_features = (
    X[[
        'date_day_of_week',
        'date_month',
        'date_year',
    ]].columns
)
print('Categorical features:', cat_features)


# Text: the free-form review body.
text_features = (
    X[[
        'review',
    ]].columns
)
print('Review Text features:', text_features)


Numerical features: Index(['votes_helpful', 'votes_unhelpful'], dtype='object')
Categorical features: Index(['date_day_of_week', 'date_month', 'date_year'], dtype='object')
Review Text features: Index(['review'], dtype='object')


## Numerical Features Pipeline

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Numerical features: fill missing values with the column mean, then rescale
# each column with MinMaxScaler.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler()),
    ]
)

num_pipeline

## Categorical Features Pipeline

In [10]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# Categorical features, in three steps:
#   1. map each category to an integer code (categories unseen during fit
#      become -1),
#   2. fill missing values with the most frequent code,
#   3. one-hot encode the codes (codes unseen during fit are ignored).
cat_pipeline = Pipeline(
    steps=[
        (
            'ordinal_encoder',
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
        ),
        ('imputer', SimpleImputer(strategy='most_frequent')),
        (
            'cat_encoder',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
        ),
    ]
)

cat_pipeline

## Text Feature Pipeline

For the text part of the pipeline, there are multiple ways we can process the
text.

We are specifically going to utilize spaCy and some built-in Python functions to
process the text in our custom Scikit-learn `Transformers`.

### Custom `Transformer`: Count Characters

You will create a `CountCharacter()` Scikit-learn `Transformer` using 
[`BaseEstimator`](https://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html) and
[`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html#sklearn.base.TransformerMixin).

This custom `Transformer` will take in a single character (as a string) and
return the number of times that character appears in each text input.
This way we have a way to see how many times a certain character
(like an exclamation point `!`)
appears.
You can use built-in Python functions to do this.

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin


class CountCharacter(BaseEstimator, TransformerMixin):
    """Transformer that counts occurrences of one character in each text.

    Parameters
    ----------
    character : str
        The string to count; passed as-is to ``str.count``.
    """

    def __init__(self, character: str):
        self.character = character

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return one single-element row ``[count]`` per text in ``X``.

        ``X`` is iterated item by item and each item must provide a
        ``count`` method (i.e. be a string).
        """
        counts = []
        for text in X:
            counts.append([text.count(self.character)])
        return counts

Now we will use `CountCharacter()` to create a feature for the following:

- Number of spaces in the text
- Number of exclamations (`!`) in the text
- Number of question marks (`?`) in the text

You may find using [`FeatureUnion`](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.FeatureUnion.html)
to be useful in your pipeline.

> Note:
> We also provided an `initial_text_preprocess` to make sure the text is in the
> expected shape for your `CountCharacter()`.

In [12]:
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer
import numpy as np

# Flatten the incoming text column to 1-D so the transformers that follow can
# iterate over it one text at a time.
# `np.ravel` replaces `np.reshape(..., newshape=-1)`: the `newshape` keyword is
# deprecated in NumPy 2.1+, while `np.ravel` produces the same flattened array
# on every NumPy version.
initial_text_preprocess = Pipeline([
    (
        'dimension_reshaper',
        FunctionTransformer(np.ravel),
    ),
])

# One count column each for spaces, `!`, and `?`.
# Deliberately NOT named `feature_engineering`: that name is bound to the
# combined ColumnTransformer later in the notebook, and sharing it would make
# the result depend on which cell was executed last.
character_count_features = FeatureUnion([
    ('count_spaces', CountCharacter(character=' ')),
    ('count_exclamations', CountCharacter(character='!')),
    ('count_question_marks', CountCharacter(character='?')),
])

# Flatten the text column, then compute the three character counts.
# Step names are unchanged so parameter paths such as
# `feature_engineering__count_spaces__character` keep working.
character_counts_pipeline = Pipeline([
    (
        'initial_text_preprocess',
        initial_text_preprocess,
    ),
    (
        'feature_engineering',
        character_count_features,
    ),
])
character_counts_pipeline

### Custom `Transformer`: spaCy and TF-IDF

Next we want to use TF-IDF to get a vector representation of the review text.

But before we use TF-IDF, we can simplify the text with lemmatization. This way
words like 'good' and 'better' are converted to the same value. This
representation will carry over into TF-IDF.

Create a custom `Transformer` called `SpacyLemmatizer()` to lemmatize the text
given.
Then in your `tfidf_pipeline`, use `SpacyLemmatizer()` followed by
a [`TfidfVectorizer`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html).

> Note:
> As before, we provided an `initial_text_preprocess` to ensure the text is
> in the expected shape for your `SpacyLemmatizer()`.

In [13]:
class SpacyLemmatizer(BaseEstimator, TransformerMixin):
    """Transformer that rewrites each text as its space-joined lemmas.

    Tokens flagged as stop words (``token.is_stop``) are dropped.

    Parameters
    ----------
    nlp
        Object whose ``pipe`` method turns an iterable of texts into
        documents of tokens (here, the loaded spaCy pipeline).
    """

    def __init__(self, nlp):
        self.nlp = nlp

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a list with one lemmatized string per text in ``X``."""
        lemmatized = []
        for doc in self.nlp.pipe(X):
            kept_lemmas = [token.lemma_ for token in doc if not token.is_stop]
            lemmatized.append(' '.join(kept_lemmas))
        return lemmatized

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Text -> TF-IDF vector, in three steps:
#   1. flatten the incoming text column to 1-D,
#   2. lemmatize each text with spaCy,
#   3. vectorize the lemmatized text with TF-IDF.
tfidf_pipeline = Pipeline([
    (
        'dimension_reshaper',
        # `np.ravel` replaces `np.reshape(..., newshape=-1)`: the `newshape`
        # keyword is deprecated in NumPy 2.1+, while `np.ravel` produces the
        # same flattened array on every NumPy version.
        FunctionTransformer(np.ravel),
    ),
    (
        'lemmatizer',
        SpacyLemmatizer(nlp=nlp),
    ),
    (
        'tfidf_vectorizer',
        TfidfVectorizer(
            stop_words='english',
        ),
    ),
])
tfidf_pipeline

# Combine Feature Engineering Pipelines

In [15]:
from sklearn.compose import ColumnTransformer

# Route each column group to its own pipeline. The review text is listed twice
# so it feeds both the character counts and the TF-IDF vectors.
feature_engineering = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features),
        ('character_counts', character_counts_pipeline, text_features),
        ('tfidf_text', tfidf_pipeline, text_features),
    ],
)

feature_engineering

# Train & Evaluate Model

Now that we have the feature engineering pipeline created, we will append a
machine learning model (a classifier) to be trained with the feature
engineering pipeline you created.

We specifically will use a
[RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)
but in practice, you could use a different kind of model with the features
you've created.

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

# Feature engineering followed by the classifier. `make_pipeline` names the
# steps automatically; the parameter search later in the notebook addresses
# the classifier step as `randomforestclassifier`.
model_pipeline = make_pipeline(feature_engineering, RandomForestClassifier(random_state=27))

model_pipeline.fit(X_train, y_train)

## Evaluate Model

Now that your model has been fitted, let's observe the accuracy of the model.

In [17]:
from sklearn.metrics import accuracy_score

# Score the fitted pipeline on the held-out test set
y_pred_forest_pipeline = model_pipeline.predict(X_test)
accuracy_forest_pipeline = accuracy_score(
    y_true=y_test,
    y_pred=y_pred_forest_pipeline,
)

print('Accuracy:', accuracy_forest_pipeline)

Accuracy: 0.8902305159165752


## Fine-Tune Model

Finally, we can use a parameter search to better adjust our model.

Using either 
[`RandomizedSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)
or
[`GridSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
allows us to use cross-validation (CV) to better evaluate different models
independent of the test set.

After finding the best parameters based on our search, we can use this
fine-tuned model against the test set to observe its performance.

----

Note that parameter searches can take a significant amount of time. We recommend
using `RandomizedSearchCV` since this allows you to specify a number of
iterations over a set of parameter combinations.

In [18]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values to sample from. Keys follow the `<step name>__<parameter>`
# convention, where the step name is the one `make_pipeline` gave the
# classifier.
my_distributions = {
    'randomforestclassifier__max_features': [100, 150, 250],
    'randomforestclassifier__n_estimators': [150, 200],
}

param_search = RandomizedSearchCV(
    estimator=model_pipeline,
    param_distributions=my_distributions,
    n_iter=6,         # 3 x 2 options above = 6 combinations in total
    cv=5,             # 5-fold cross-validation
    n_jobs=-1,        # use every available processor
    refit=True,       # refit with the best parameters once the search ends
    verbose=3,        # log parameters, score, and time for every fit
    random_state=27,
)

param_search.fit(X_train, y_train)

# Best parameter combination found by the search
param_search.best_params_

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 2/5] END randomforestclassifier__max_features=100, randomforestclassifier__n_estimators=150;, score=0.885 total time= 2.0min
[CV 1/5] END randomforestclassifier__max_features=100, randomforestclassifier__n_estimators=150;, score=0.885 total time= 2.0min
[CV 3/5] END randomforestclassifier__max_features=100, randomforestclassifier__n_estimators=150;, score=0.883 total time= 2.0min
[CV 4/5] END randomforestclassifier__max_features=100, randomforestclassifier__n_estimators=150;, score=0.887 total time= 2.0min
[CV 5/5] END randomforestclassifier__max_features=100, randomforestclassifier__n_estimators=150;, score=0.882 total time= 2.0min
[CV 2/5] END randomforestclassifier__max_features=100, randomforestclassifier__n_estimators=200;, score=0.885 total time= 2.0min
[CV 1/5] END randomforestclassifier__max_features=100, randomforestclassifier__n_estimators=200;, score=0.885 total time= 2.0min
[CV 4/5] END randomforestclassifier__



[CV 5/5] END randomforestclassifier__max_features=150, randomforestclassifier__n_estimators=150;, score=0.885 total time= 2.0min
[CV 4/5] END randomforestclassifier__max_features=150, randomforestclassifier__n_estimators=150;, score=0.886 total time= 2.0min
[CV 1/5] END randomforestclassifier__max_features=150, randomforestclassifier__n_estimators=200;, score=0.886 total time= 2.0min
[CV 2/5] END randomforestclassifier__max_features=150, randomforestclassifier__n_estimators=200;, score=0.886 total time= 2.0min
[CV 4/5] END randomforestclassifier__max_features=150, randomforestclassifier__n_estimators=200;, score=0.886 total time= 2.0min
[CV 1/5] END randomforestclassifier__max_features=250, randomforestclassifier__n_estimators=150;, score=0.885 total time= 2.0min
[CV 3/5] END randomforestclassifier__max_features=150, randomforestclassifier__n_estimators=200;, score=0.886 total time= 2.0min
[CV 5/5] END randomforestclassifier__max_features=150, randomforestclassifier__n_estimators=200;,

{'randomforestclassifier__n_estimators': 150,
 'randomforestclassifier__max_features': 150}

In [19]:
# Pipeline refit with the best parameters found (the search above was created
# with refit=True); the bare name displays it as the cell output.
model_best = param_search.best_estimator_
model_best

In [20]:
# Score the tuned pipeline on the held-out test set.
# Note: this rebinds the same two names used for the untuned model's results.
y_pred_forest_pipeline = model_best.predict(X_test)
accuracy_forest_pipeline = accuracy_score(
    y_true=y_test,
    y_pred=y_pred_forest_pipeline,
)

print('Accuracy:', accuracy_forest_pipeline)

Accuracy: 0.8858397365532382
