In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.nlp import *
from sklearn.linear_model import LogisticRegression
from datetime import datetime
from sklearn.utils import shuffle

## create dataset

In [2]:
PATH='data/aclImdb/'
names = ['neg','pos']
names1 = ['neg', 'pos_']
names2 = ['neg_', 'pos']

In [3]:
! ls {PATH}train

labeledBow.feat [34mpos[m[m             unsupBow.feat   urls_pos.txt
[34mneg[m[m             [34munsup[m[m           urls_neg.txt    urls_unsup.txt


In [4]:
trn1,trn1_y = texts_labels_from_folders(f'{PATH}train',names1)
val1,val1_y = texts_labels_from_folders(f'{PATH}test',names1)

In [5]:
trn2,trn2_y = texts_labels_from_folders(f'{PATH}train',names2)
val2,val2_y = texts_labels_from_folders(f'{PATH}test',names2)

In [6]:
len(trn1), len(trn1_y), len(trn2), len(trn2_y)

(12500, 12500, 12500, 12500)

In [7]:
len(val1), len(val2)

(12500, 12500)

In [8]:
assert (trn1_y==0).all()
(trn1_y==0).all()

True

In [9]:
assert (trn1_y==0).all()
(val1_y==0).all()

True

In [10]:
assert (trn2_y==1).all()
(trn2_y==1).all()

True

In [11]:
assert (val2_y==1).all()
(val2_y==1).all()

True

In [12]:
def sample_util(data, label, n):
    assert len(data) == label.shape[0]
    idx = np.random.choice(range(len(data)),n)
    data_new = [data[i] for i in idx]
    label_new = label[idx]
    return data_new, label_new

In [13]:
def sample(data1, label1, data2, label2, n):
    t1, t2 = sample_util(data1, label1, n)
    t3, t4 = sample_util(data2, label2, n)
    data = t1 + t3
    label = np.concatenate((t2, t4))
    return data, label

In [14]:
# trn, trn_y = sample(trn1, trn1_y, trn2, trn2_y, 64*100)
# val, val_y = sample(val1, val1_y, val2, val2_y, 64*20)

In [15]:
trn,trn_y = texts_labels_from_folders(f'{PATH}train',names)
val,val_y = texts_labels_from_folders(f'{PATH}test',names)

## create vectors and vocab

In [16]:
veczr = CountVectorizer(tokenizer=tokenize)

In [17]:
trn_term_doc = veczr.fit_transform(trn)

In [18]:
val_term_doc = veczr.transform(val)

In [19]:
vocab = veczr.get_feature_names(); vocab[5000:5005]

['aussie', 'aussies', 'austen', 'austeniana', 'austens']

In [20]:
vocab[:5]

['\x08\x08\x08\x08a', '\x10own', '!', '"', '#']

In [21]:
len(vocab)

75132

In [22]:
trn_term_doc = trn_term_doc.sign()

In [23]:
val_term_doc = val_term_doc.sign()

In [24]:
trn_term_doc.shape, val_term_doc.shape

((25000, 75132), (25000, 75132))

## Naive Bayes

We define the **log-count ratio** $r$ for each word $f$:

$r = \log \frac{\text{ratio of feature $f$ in positive documents}}{\text{ratio of feature $f$ in negative documents}}$

where ratio of feature $f$ in positive documents is the number of times a positive document has a feature divided by the number of positive documents.

In [25]:
def pr(y_i):
    p = x[y==y_i].sum(0)
    return p+1

In [26]:
x=trn_term_doc
y=trn_y

p = pr(1)/pr(1).sum()
q = pr(0)/pr(0).sum()
r = np.log(p/q)
b = np.log((y==1).mean() / (y==0).mean())

In [27]:
val_term_doc.shape

(25000, 75132)

In [28]:
val_term_doc.shape[1]

75132

In [29]:
x.shape

(25000, 75132)

In [30]:
r.shape

(1, 75132)

In [31]:
b

0.0

In [32]:
val_term_doc

<25000x75132 sparse matrix of type '<class 'numpy.int64'>'
	with 3640465 stored elements in Compressed Sparse Row format>

In [33]:
(val_term_doc @ r.T).shape

(25000, 1)

In [34]:
val_term_doc @ r.T

matrix([[-5.25616],
        [-3.41848],
        [ 0.42768],
        ...,
        [68.81967],
        [ 4.6196 ],
        [ 3.74532]])

In [35]:
val_y

array([0, 0, 0, ..., 1, 1, 1])

In [36]:
pre_preds = val_term_doc @ r.T + b
preds = pre_preds.T>0
(preds==val_y).mean()

0.82844

In [37]:
pre_preds = val_term_doc @ np.stack([np.log(p), np.log(q)]).T + b

In [38]:
pre_preds

matrix([[ -617.48225,  -612.22609],
        [ -549.41436,  -545.99589],
        [-1036.89288, -1037.32056],
        ...,
        [-4026.31409, -4095.13376],
        [ -633.7084 ,  -638.328  ],
        [ -508.107  ,  -511.85232]])

In [39]:
preds = pre_preds.T[0] > pre_preds.T[1]
nb_score = (preds==val_y).mean()
nb_score

0.82844

In [40]:
type(val_term_doc)

scipy.sparse.csr.csr_matrix

In [41]:
val_term_doc[0]

<1x75132 sparse matrix of type '<class 'numpy.int64'>'
	with 83 stored elements in Compressed Sparse Row format>

In [42]:
xx = val_term_doc[0]

In [43]:
xx.toarray().shape

(1, 75132)

## Logistic regression (sklearn)

Here is how we can fit logistic regression where the features are the unigrams.

In [44]:
LogisticRegression

sklearn.linear_model._logistic.LogisticRegression

In [45]:
m = LogisticRegression(C=1e8, dual=False, max_iter=1000)
m.fit(x, y)
preds = m.predict(val_term_doc)
lr_score = (preds==val_y).mean()
lr_score

0.85704

...and the regularized version

In [46]:
m = LogisticRegression(C=1.0, dual=False, max_iter=1000)
m.fit(x, y)
preds = m.predict(val_term_doc)
lr_score2 = (preds==val_y).mean()
lr_score2

0.87396

## Logistic regression (PyTorch)

In [47]:
def binary_loss(pred, y):
    pred = pred.clamp(-10, 10)
    # y2 = torch.max(y,axis=1)[0]
    y2 = y
    p = torch.exp(pred) / (1+torch.exp(pred))
    result = torch.mean(-(y2 * torch.log(p) + (1-y2)*torch.log(1-p)))
    # return result.reshape(1, -1)
    return result

In [48]:
def score(pred, y):
    return np.sum(to_np((pred > 0) == y))

In [49]:
class SimpleNB2(nn.Module):
    def __init__(self, nf, ny):
        super().__init__()
        self.w = nn.Embedding(nf, ny)
        # self.w = nn.Embedding(nf+1, ny)
        # self.w.weight.data.uniform_(-1, 1)
        self.w.weight.data = torch.FloatTensor(r)[0].reshape(-1,1)
        # self.r = nn.Embedding(nf, ny)
        
    def forward(self, feat_idx):
        idx = feat_idx.nonzero()[1]
        v = self.w(V(idx))
        x = v.sum()
        return x

In [50]:
net_a = SimpleNB2(len(vocab),1)

loss = binary_loss
# loss = torch.nn.CrossEntropyLoss
lr = 1e-2
losses=[]

In [51]:
idx = trn_term_doc[0].nonzero()[1]
idx

array([    8,    13,    15,    16,  1039,  1041,  1050,  3219,  3814,  4926,  6304,  7012,  8696,  8775,
       10930, 15444, 24540, 24770, 25593, 26188, 28081, 34716, 40649, 46096, 46749, 46986, 47002, 57049,
       59330, 61967, 61969, 63122, 63341, 65484, 66458, 66554, 66684, 67252, 69418, 72931, 73145, 73488,
       73782], dtype=int32)

In [52]:
net_a.w(V(idx)).sum()

tensor(3.9171, grad_fn=<SumBackward0>)

In [53]:
net_a(trn_term_doc[-1])

tensor(13.8084, grad_fn=<SumBackward0>)

In [54]:
trn_y[-1]

1

In [55]:
pred = net_a(trn_term_doc[-1])

In [56]:
y = trn_y[-1]

In [57]:
binary_loss(pred, y)

tensor(4.5420e-05, grad_fn=<MeanBackward0>)

In [58]:
score(net_a(trn_term_doc[-1]), trn_y[-1])

1

In [59]:
trn_term_doc[0] @ r.T

matrix([[3.91715]])

In [60]:
trn_term_doc[-1] @ r.T

matrix([[13.80843]])

In [61]:
net_a(trn_term_doc[0])

tensor(3.9171, grad_fn=<SumBackward0>)

In [62]:
(net_a(trn_term_doc[0]) > 0) == trn_y[0]

tensor(False)

In [63]:
lr

0.01

In [66]:
import os
filename = 'acc.txt'
try:
    os.remove(filename)
    print('removed')
except:
    print('pass')
    pass

removed


In [67]:
ii = 1

In [68]:
ii = 67

In [69]:
_x = trn_term_doc[ii]
_y = trn_y[ii]

In [70]:
_y_pred = net_a(_x)
_y_pred

tensor(-10.8404, grad_fn=<SumBackward0>)

In [71]:
l = loss(_y_pred, V(_y))
# l = loss(yt, y_pred)
# loss_list.append(l)
# print(f'{index}, {l}, {datetime.now().time()}')

In [72]:
l

tensor(4.5420e-05, grad_fn=<MeanBackward0>)

In [73]:
l.backward()

In [74]:
l

tensor(4.5420e-05, grad_fn=<MeanBackward0>)

In [75]:
net_a.w.weight.grad.data

tensor([[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]])

In [76]:
net_a.w.weight.data -= net_a.w.weight.grad.data * lr
# net2.b.data -= net2.b.grad.data * lr

net_a.w.weight.grad.data.zero_()

tensor([[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]])

In [77]:
net_a.w.weight.data

tensor([[ 0.6904],
        [-0.6959],
        [-0.0575],
        ...,
        [ 0.6904],
        [-0.0028],
        [-0.6959]])

## Train

In [78]:
train_loss_list = []
val_loss_list = []
val_acc_list = []
train_acc_list = []

In [79]:
net_a = SimpleNB2(len(vocab),1)

loss = binary_loss
# loss = torch.nn.CrossEntropyLoss
lr = 1e-3

In [80]:
trn_scores = []
for x, y in tqdm(zip(trn_term_doc, trn_y), total=trn_term_doc.shape[0]):
    trn_scores.append(score(net_a(x),y))
print(np.mean(to_np(trn_scores)))

100%|██████████| 25000/25000 [00:04<00:00, 5090.41it/s]
0.90968


In [81]:
val_scores = []
for x, y in tqdm(zip(val_term_doc, val_y), total=val_term_doc.shape[0]):
    val_scores.append(score(net_a(x),y))
print(np.mean(to_np(val_scores)))

100%|██████████| 25000/25000 [00:05<00:00, 4864.08it/s]
0.82844


In [82]:
if False:
    train_acc_scores = []
    for x, y in tqdm(zip(trn_term_doc, trn_y), total=trn_term_doc.shape[0]):
        train_acc_scores.append(score(net_a(x),y))
    l3 = np.mean(to_np(train_acc_scores))
    print(l3)

    acc_scores = []
    for x, y in tqdm(zip(val_term_doc, val_y), total=val_term_doc.shape[0]):
        acc_scores.append(score(net_a(x),y))
    l4 = np.mean(to_np(acc_scores))
    print(l4)

In [83]:
print(f'lr={lr}')
f = open(filename, 'a')

from datetime import datetime
train_loss_list = []
val_loss_list = []
val_acc_list = []
train_acc_list = []

# loss_list = [0]
loss_list = []
for epoch in range(5000):
    if epoch % 1 == 0:
        train_scores = []
        for x, y in tqdm(zip(trn_term_doc, trn_y), total=trn_term_doc.shape[0]):
            train_scores.append(loss(net_a(x), V(y)))
        l1 = np.mean(to_np(train_scores))
        train_loss_list.append(l1)

        val_scores = []
        for x, y in tqdm(zip(val_term_doc, val_y), total=val_term_doc.shape[0]):
            val_scores.append(loss(net_a(x), V(y)))
        l2 = np.mean(to_np(val_scores))
        val_loss_list.append(l2)

        train_acc_scores = []
        for x, y in tqdm(zip(trn_term_doc, trn_y), total=trn_term_doc.shape[0]):
            train_acc_scores.append(score(net_a(x),y))
        l3 = np.mean(to_np(train_acc_scores))
        train_acc_list.append(l3)
        
        acc_scores = []
        for x, y in tqdm(zip(val_term_doc, val_y), total=val_term_doc.shape[0]):
            acc_scores.append(score(net_a(x),y))
        l4 = np.mean(to_np(acc_scores))
        val_acc_list.append(l4)

        # print(f'epoch={epoch}, score={np.mean(val_scores)}')
        # print(f'epoch={epoch}, score={l2}')
        print(f'epoch={epoch}, train-loss={l1}')
        print(f'epoch={epoch}, valid-loss={l2}')
        print(f'epoch={epoch}, train-acc={l3}')
        print(f'epoch={epoch}, valid-acc={l4}')
        f.write(f"{epoch}\t{l1}\t{l2}\t{l3}\t{l4}\t{nb_score}\t{lr_score}\t{lr_score2}\n")
        f.flush()

    print('')
    print('epoch:', epoch)
    print('time:', datetime.now())
    shuffle_x, shuffle_y = shuffle(trn_term_doc, trn_y)
    for _x, _y in tqdm(zip(shuffle_x, shuffle_y), total=shuffle_x.shape[0]):
        _y_pred = net_a(_x)
        l = loss(_y_pred, V(_y))
        # l = loss(yt, y_pred)
        loss_list.append(l)
        # print(f'{index}, {l}, {datetime.now().time()}')

        # Backward pass: 
        # compute gradient of the loss with respect to 
        # model parameters
        l.backward()
        net_a.w.weight.data -= net_a.w.weight.grad.data * lr
        # net2.b.data -= net2.b.grad.data * lr
        
        net_a.w.weight.grad.data.zero_()
        # net2.b.grad.data.zero_()   

f.close()

lr=0.001
100%|██████████| 25000/25000 [00:07<00:00, 3197.62it/s]
100%|██████████| 25000/25000 [00:07<00:00, 3258.21it/s]
100%|██████████| 25000/25000 [00:04<00:00, 5054.64it/s]
100%|██████████| 25000/25000 [00:04<00:00, 5047.96it/s]
epoch=0, train-loss=0.4026816487312317
epoch=0, valid-loss=0.8581857085227966
epoch=0, train-acc=0.90968
epoch=0, valid-acc=0.82844

epoch: 0
time: 2020-01-06 00:33:04.901118
100%|██████████| 25000/25000 [00:23<00:00, 1084.23it/s]
100%|██████████| 25000/25000 [00:07<00:00, 3537.56it/s]
100%|██████████| 25000/25000 [00:07<00:00, 3559.92it/s]
100%|██████████| 25000/25000 [00:04<00:00, 5045.30it/s]
100%|██████████| 25000/25000 [00:04<00:00, 5040.32it/s]
epoch=1, train-loss=0.3187373876571655
epoch=1, valid-loss=0.7220572829246521
epoch=1, train-acc=0.9244
epoch=1, valid-acc=0.84884

epoch: 1
time: 2020-01-06 00:33:52.875047
100%|██████████| 25000/25000 [00:22<00:00, 1088.76it/s]
100%|██████████| 25000/25000 [00:07<00:00, 3548.97it/s]
100%|██████████| 25000/250

KeyboardInterrupt: 

In [None]:
len(loss_list)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
len(train_loss_list)

In [None]:
length=len(train_loss_list)
df = pd.DataFrame({
    'train':train_loss_list[:length], 
    'valid':val_loss_list[:length],
    'train_acc':train_acc_list[:length], 
    'valid_acc':val_acc_list[:length]
})

In [None]:
df

In [None]:
df[['train','valid']].plot(subplots=True)

In [None]:
df.plot(subplots=True)

In [None]:
plt.plot(loss_list[:1280])

In [None]:
plt.plot(loss_list)

In [None]:
plt.plot(loss_list)

## Deep NB