In [134]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.nlp import *
from sklearn.linear_model import LogisticRegression

## Tokenizing and term document matrix creation

In [135]:
PATH='data/aclImdb/'
names = ['neg','pos']

In [141]:
trn,trn_y = texts_labels_from_folders(f'{PATH}train',names)
val,val_y = texts_labels_from_folders(f'{PATH}test',names)

In [144]:
veczr = CountVectorizer(tokenizer=tokenize)

In [145]:
trn_term_doc = veczr.fit_transform(trn)
val_term_doc = veczr.transform(val)

In [150]:
vocab = veczr.get_feature_names(); vocab[5000:5005]

['aussie', 'aussies', 'austen', 'austeniana', 'austens']

## Naive Bayes

We define the **log-count ratio** $r$ for each word $f$:

$r = \log \frac{\text{ratio of feature $f$ in positive documents}}{\text{ratio of feature $f$ in negative documents}}$

where ratio of feature $f$ in positive documents is the number of times a positive document has a feature divided by the number of positive documents.

In [161]:
def pr(y_i):
    p = x[y==y_i].sum(0)
    return p+1

In [162]:
x=trn_term_doc
y=trn_y

p = pr(1)/pr(1).sum()
q = pr(0)/pr(0).sum()
r = np.log(p/q)
b = np.log((y==1).mean() / (y==0).mean())

In [172]:
val_term_doc.shape

(25000, 75132)

In [179]:
x.shape

(25000, 75132)

In [173]:
r.shape

(1, 75132)

In [174]:
(val_term_doc @ r.T).shape

(25000, 1)

In [175]:
val_term_doc @ r.T

matrix([[ -8.61109],
        [ -4.82614],
        [  2.91111],
        ...,
        [117.64775],
        [  2.4274 ],
        [  5.32726]])

In [176]:
val_y

array([0, 0, 0, ..., 1, 1, 1])

In [177]:
pre_preds = val_term_doc @ r.T + b
preds = pre_preds.T>0
(preds==val_y).mean()

0.8074

In [206]:
pre_preds = val_term_doc @ np.stack([np.log(p), np.log(q)]).T + b

In [207]:
pre_preds

matrix([[ -841.49517,  -832.88408],
        [ -893.40053,  -888.57439],
        [-1479.0384 , -1481.9495 ],
        ...,
        [-7603.59445, -7721.2422 ],
        [-1036.11346, -1038.54086],
        [ -725.37551,  -730.70278]])

In [208]:
preds = pre_preds.T[0] > pre_preds.T[1]
(preds==val_y).mean()

0.8074

## Logistic regression (sklearn)

Here is how we can fit logistic regression where the features are the unigrams.

In [210]:
LogisticRegression

sklearn.linear_model._logistic.LogisticRegression

In [212]:
m = LogisticRegression(C=1e8, dual=False, max_iter=1000)
m.fit(x, y)
preds = m.predict(val_term_doc)
(preds==val_y).mean()

0.85184

...and the regularized version

In [209]:
m = LogisticRegression(C=0.01, dual=False, max_iter=1000)
m.fit(x, y)
preds = m.predict(val_term_doc)
(preds==val_y).mean()

0.8826

## Logistic regression (PyTorch)

## Deep NB