# Vectorizer Tuning

In [1]:
import pickle
import string
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Load the pickled reviews object from the working directory.
# NOTE(review): pickle.load can execute arbitrary code, so "reviews_2" must
# come from a trusted source.
with open("reviews_2", "rb") as pkl_reviews:
    data = pickle.load(pkl_reviews)

data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lowercase the text.

In [2]:
# Translation table that deletes every character in string.punctuation.
# Built once so each call is a single pass over the text instead of one
# str.replace() scan per punctuation character.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuations(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is stripped;
    whitespace (including newlines), letters, digits and non-ASCII symbols
    are left untouched, so removed characters may leave double spaces behind.

    Args:
        text: The string to clean.

    Returns:
        A new string without ASCII punctuation.
    """
    return text.translate(_PUNCTUATION_TABLE)

def lower_all(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

# Derive the cleaned column in a single chain: strip punctuation first,
# then lower-case the result.
data["clean_text"] = (
    data["reviews"]
    .apply(remove_punctuations)
    .apply(lower_all)
)
print(data.head())

  target                                            reviews  \
0    neg  plot : two teen couples go to a church party ,...   
1    neg  the happy bastard's quick movie review \ndamn ...   
2    neg  it is movies like these that make a jaded movi...   
3    neg   " quest for camelot " is warner bros . ' firs...   
4    neg  synopsis : a mentally unstable man undergoing ...   

                                          clean_text  
0  plot  two teen couples go to a church party  d...  
1  the happy bastards quick movie review \ndamn t...  
2  it is movies like these that make a jaded movi...  
3    quest for camelot  is warner bros   first fe...  
4  synopsis  a mentally unstable man undergoing p...  


## Tuning

👇 Tune a vectorizer of your choice (or try both!) and a MultinomialNB model simultaneously.

First, let's create a pipeline using a CountVectorizer:

In [3]:
# Two-step pipeline: raw token counts feeding a multinomial Naive Bayes model.
pipeline = Pipeline(steps=[
    ("count_vectorizer", CountVectorizer()),
    ("nb", MultinomialNB()),
])

# Search grid: only the Naive Bayes `alpha` is tuned in this cell; the
# vectorizer keeps its default settings.
parameters = {
    "nb__alpha": [0.01, 0.1, 0.5, 1],
}

# Exhaustive 5-fold cross-validated search, scored on accuracy.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring="accuracy",
    cv=5,
    refit=True,
    n_jobs=1,
    verbose=1,
)

grid_search.fit(data["clean_text"], data["target"])

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [4]:
# Report the best parameter combination found by the search and its score.
print("Best parameters with a CountVectorizer: ", grid_search.best_params_, sep="\n")
print(f"Best score: {grid_search.best_score_}")

Best parameters with a CountVectorizer: 
{'nb__alpha': 1}
Best score: 0.8130000000000001


Now, let's do the same thing with a TF-IDF vectorizer:

In [5]:
# Two-step pipeline: TF-IDF weighted features feeding a multinomial Naive Bayes model.
pipeline = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("nb", MultinomialNB()),
])

# Search grid covering both steps: the vectorizer's n-gram range and the
# Naive Bayes `alpha` (6 x 4 = 24 candidates).
parameters = {
    "tfidf__ngram_range": [(1, 2), (1, 3), (1, 4), (2, 2), (2, 3), (2, 4)],
    "nb__alpha": [0.01, 0.1, 0.5, 1],
}

# Exhaustive 5-fold cross-validated search, scored on accuracy.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring="accuracy",
    cv=5,
    refit=True,
    n_jobs=1,
    verbose=1,
)

grid_search.fit(data["clean_text"], data["target"])

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [6]:
# Report the best parameter combination found by the search and its score.
print("Best parameters with a TfidfVectorizer: ", grid_search.best_params_, sep="\n")
print(f"Best score: {grid_search.best_score_}")

Best parameters with a TfidfVectorizer: 
{'nb__alpha': 0.01, 'tfidf__ngram_range': (2, 4)}
Best score: 0.8405000000000001


⚠️ Please push the exercise once you are done 🙃

## 🏁 