# CountVectorizer vs HashingVectorizer

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv(
    '../00.data/kaggle.com/word2vec-nlp-tutorial/labeledTrainData.tsv',
    sep='\t',
    header=0,
    quoting=3
)
df['review'] = df['review'].str.replace('<br />', ' ')
df['review'] = df['review'].str.replace('[^a-zA-Z]', ' ', regex=True)
df = df.head(1000)
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,With all this stuff going down at the moment ...
1,"""2381_9""",1,The Classic War of the Worlds by Timothy ...
2,"""7759_3""",0,The film starts with a manager Nicholas Bell...
3,"""3630_4""",0,It must be assumed that those who praised thi...
4,"""9495_8""",1,Superbly trashy and wondrously unpretentious ...


In [3]:
from sklearn.model_selection import train_test_split

X_data = df['review']
y_data = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, stratify=y_data, test_size=0.3, random_state=156
)

X_train.shape, X_test.shape

((700,), (300,))

## CountVectorizer

In [4]:
%%time
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(
    stop_words='english',
    ngram_range=(1,2)
)
cv.fit(X_train)
X_train_vec = cv.transform(X_train)
X_test_vec = cv.transform(X_test)

CPU times: user 1.08 s, sys: 53.1 ms, total: 1.13 s
Wall time: 1.51 s


In [5]:
%%time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression(C=10)
lr.fit(X_train_vec, y_train)
pred = lr.predict(X_test_vec)

accuracy_score(y_test, pred)

CPU times: user 1.58 s, sys: 357 ms, total: 1.94 s
Wall time: 1.42 s


0.7766666666666666

In [6]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('cv', CountVectorizer(
        stop_words='english', 
        ngram_range=(1,2)
    )),
    ('lr', LogisticRegression(C=10))
])
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
score = accuracy_score(y_test, pred)
print('CountVectorizer + LogisticRegression Accuracy: {}'.format(score))

CountVectorizer + LogisticRegression Accuracy: 0.7766666666666666
CPU times: user 2.37 s, sys: 398 ms, total: 2.77 s
Wall time: 4.57 s


## HashingVectorizer

In [7]:
%%time
from sklearn.feature_extraction.text import HashingVectorizer

hv = HashingVectorizer(
    stop_words='english',
    ngram_range=(1,2)
)
hv.fit(X_train)
X_train_vec = hv.transform(X_train)
X_test_vec = hv.transform(X_test)

CPU times: user 410 ms, sys: 18.7 ms, total: 428 ms
Wall time: 1.45 s


In [8]:
%%time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression(C=10)
lr.fit(X_train_vec, y_train)
pred = lr.predict(X_test_vec)

accuracy_score(y_test, pred)

CPU times: user 7.87 s, sys: 1.13 s, total: 8.99 s
Wall time: 14.5 s


0.7633333333333333

In [10]:
%%time
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('hv', HashingVectorizer(
        stop_words='english', 
        ngram_range=(1,2)
    )),
    ('lr', LogisticRegression(C=10))
])
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
score = accuracy_score(y_test, pred)

print('HashingVectorizer + LogisticRegression Accuracy: {}'.format(score))

HashingVectorizer + LogisticRegression Accuracy: 0.7633333333333333
CPU times: user 7.84 s, sys: 1.06 s, total: 8.9 s
Wall time: 11.9 s


In [11]:
%%time
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from scipy.stats import uniform, randint

# Extension of sag that also allows for L1 regularization. Should generally train faster than sag
pipe = Pipeline([
    ('hv', HashingVectorizer(stop_words='english')),
    ('lr', LogisticRegression(solver='saga'))
])

# Set up the grid search with 5-fold cross validation
rs = RandomizedSearchCV(
    estimator=pipe,
    param_distributions={
        'hv__ngram_range': [(1, randint.rvs(1,10))],
        'lr__C': [randint.rvs(1,20)]
    },
    cv=5,
    scoring='accuracy',
    verbose=10, 
    n_jobs=-1,
)
rs.fit(X_train, y_train)
print('best_score_: {}'.format(rs.best_score_))
print('best_params_: {}'.format(rs.best_params_))

estimator = rs.best_estimator_
pred = estimator.predict(X_test)
score = accuracy_score(y_test, pred)
print('accuracy_score: {}'.format(score))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
best_score_: 0.7914285714285715
best_params_: {'lr__C': 18, 'hv__ngram_range': (1, 2)}
accuracy_score: 0.7533333333333333
CPU times: user 3.61 s, sys: 231 ms, total: 3.84 s
Wall time: 18.9 s


In [12]:
%%time
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from scipy.stats import uniform, randint

# Extension of sag that also allows for L1 regularization. Should generally train faster than sag
pipe = Pipeline([
    ('hv', HashingVectorizer(stop_words='english')),
    ('lr', LogisticRegression(solver='saga'))
])

# Set up the grid search with 5-fold cross validation
gs = GridSearchCV(
    estimator=pipe,
    param_grid={
        'hv__ngram_range': [(1, 3), (1, 4), (1, 5)],
        'lr__C': [17, 18, 19]
    },
    cv=5,
    scoring='accuracy',
    verbose=10, 
    n_jobs=-1,
)
gs.fit(X_train, y_train)
print('best_score_: {}'.format(gs.best_score_))
print('best_params_: {}'.format(gs.best_params_))

estimator = gs.best_estimator_
pred = estimator.predict(X_test)
score = accuracy_score(y_test, pred)
print('accuracy_score: {}'.format(score))

Fitting 5 folds for each of 9 candidates, totalling 45 fits
best_score_: 0.7914285714285715
best_params_: {'hv__ngram_range': (1, 4), 'lr__C': 19}
accuracy_score: 0.75
CPU times: user 3.19 s, sys: 202 ms, total: 3.39 s
Wall time: 4min 23s


In [14]:
%%time
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from scipy.stats import uniform, randint

# Extension of sag that also allows for L1 regularization. Should generally train faster than sag
pipe = Pipeline([
    ('hv', HashingVectorizer(stop_words='english')),
    ('lr', LogisticRegression(solver='saga'))
])

# Set up the grid search with 5-fold cross validation
gs = RandomizedSearchCV(
    estimator=pipe,
    param_distributions={
        'hv__ngram_range': (1, randint(1,10)),
        'lr__C': randint(1,100)
    },
    cv=5,
    scoring='accuracy',
    verbose=10, 
    n_jobs=-1,
)
gs.fit(X_train, y_train)
print('best_score_: {}'.format(gs.best_score_))
print('best_params_: {}'.format(gs.best_params_))

estimator = gs.best_estimator_
pred = estimator.predict(X_test)
score = accuracy_score(y_test, pred)
print('accuracy_score: {}'.format(score))

Fitting 5 folds for each of 10 candidates, totalling 50 fits


TypeError: cannot unpack non-iterable int object