# Python dependencies
- pytorch
- torchtext

# Data dependencies
- GloVe 6B (100-dimensional vectors)
- aclImdb

In [1]:
import os
import time
import random
import collections
import tarfile
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
from tqdm import tqdm

In [2]:
# settings
# Device string handed to train(), which moves the model and every batch onto it.
kDevice = "cpu"
# Root data directory, relative to the current working directory: readImdb() looks for
# "aclImdb" inside it and the GloVe vectors are cached in its "glove" sub-directory.
kDataDir = "data" # put in the current directory

In [3]:
# read data
def readImdb(data_dir, part_folder):
    """Load one split of the aclImdb reviews as a shuffled list of samples.

    Every file found in ``<data_dir>/aclImdb/<part_folder>/pos`` and
    ``.../neg`` is read as bytes, decoded as UTF-8, stripped of newline
    characters and lower-cased.

    Args:
        data_dir: directory that contains the ``aclImdb`` folder.
        part_folder: sub-folder of ``aclImdb`` to read (the notebook uses
            "train" and "test").

    Returns:
        A list of ``[review_text, label]`` pairs, label 1 for "pos" and 0 for
        "neg", shuffled in place with ``random.shuffle``.
    """
    split_dir = os.path.join(data_dir, "aclImdb", part_folder)
    samples = []
    # The sub-folder name carries the label information.
    for sentiment, target in (("pos", 1), ("neg", 0)):
        sentiment_dir = os.path.join(split_dir, sentiment)
        for file_name in tqdm(os.listdir(sentiment_dir)):
            review_path = os.path.join(sentiment_dir, file_name)
            with open(review_path, "rb") as handle:
                raw_bytes = handle.read()
            review_text = raw_bytes.decode("utf-8").replace('\n', '').lower()
            samples.append([review_text, target])
    random.shuffle(samples)
    return samples

In [4]:
# Read both splits; the train split is read (and shuffled) before the test split.
train_data = readImdb(kDataDir, "train")
test_data = readImdb(kDataDir, "test")

100%|█████████████████████████████████| 12500/12500 [00:01<00:00, 10397.65it/s]
100%|█████████████████████████████████| 12500/12500 [00:01<00:00, 10448.58it/s]
100%|█████████████████████████████████| 12500/12500 [00:01<00:00, 11034.06it/s]
100%|█████████████████████████████████| 12500/12500 [00:01<00:00, 11091.43it/s]


In [5]:
# pre process data
def tokenizer(text):
    """Split ``text`` on single space characters and lower-case each piece.

    Only the space character is a separator, so consecutive spaces produce
    empty-string tokens and other whitespace stays inside the tokens.
    """
    tokens = []
    for piece in text.split(' '):
        tokens.append(piece.lower())
    return tokens

def getTokenizedImdb(data):
    """Tokenize the review text of every sample in ``data``.

    Args:
        data: list of ``[string, int]`` pairs; the integer label is ignored.

    Returns:
        A list with one token list per review, in the same order as ``data``.
    """
    tokenized_reviews = []
    for review, _label in data:
        tokenized_reviews.append(tokenizer(review))
    return tokenized_reviews

def getImdbVocab(data):
    """Build a torchtext vocabulary from the token frequencies of ``data``.

    Args:
        data: list of ``[string, int]`` pairs as returned by ``readImdb``.

    Returns:
        ``Vocab.Vocab`` built from the token counter with ``min_freq=5``.
    """
    token_counts = collections.Counter()
    for tokens in getTokenizedImdb(data):
        token_counts.update(tokens)
    # min_freq=5 filters out words that occur fewer than 5 times.
    return Vocab.Vocab(token_counts, min_freq=5)

In [6]:
# Build the vocabulary from the training split only (test_data is not passed in).
vocab = getImdbVocab(train_data)

In [7]:
def pad(x, max_len):
    """Return a list of exactly ``max_len`` items built from the list ``x``.

    Longer inputs are truncated to their first ``max_len`` items; shorter (or
    equally long) inputs are extended on the right with zeros. A new list is
    returned in both cases.
    """
    current_len = len(x)
    if current_len > max_len:
        return x[:max_len]
    missing = max_len - current_len
    return x + [0] * missing

In [8]:
def preprocessImdb(data, vocab, max_len=500):
    """Convert raw samples into index and label tensors.

    Each review is tokenized, every token is looked up in ``vocab.stoi`` and
    the resulting index list is truncated / zero-padded to ``max_len`` items
    by ``pad``.

    Args:
        data: list of ``[string, int]`` pairs (review text, label).
        vocab: object exposing a ``stoi`` mapping from token to integer index.
        max_len: number of token indices kept per review. Defaults to 500,
            the value that used to be hard-coded here.

    Returns:
        ``(features, labels)``: ``features`` has one row of ``max_len`` indices
        per review, ``labels`` holds the label of each review in the same order.
    """
    tokenized_data = getTokenizedImdb(data)
    features = torch.tensor([pad([vocab.stoi[word] for word in words], max_len) for words in tokenized_data])
    labels = torch.tensor([score for _, score in data])
    return features, labels

In [20]:
# Mini-batch size shared by both loaders (it is also passed to train()).
batch_size = 64
# Both splits are indexed with the vocabulary built from the training data.
train_set = Data.TensorDataset(*preprocessImdb(data=train_data, vocab=vocab))
test_set = Data.TensorDataset(*preprocessImdb(data=test_data, vocab=vocab))
# Only the training batches are reshuffled; the test loader keeps dataset order.
train_iter = Data.DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)
test_iter = Data.DataLoader(dataset=test_set, batch_size=batch_size, shuffle=False)

In [21]:
class TextRNN(nn.Module):
    """Bidirectional LSTM text classifier producing two scores per sequence.

    Args:
        vocab_len: number of rows of the embedding table.
        embed_size: dimension of each embedding vector (LSTM input size).
        num_hiddens: hidden size of the LSTM, per direction.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, vocab_len, embed_size, num_hiddens, num_layers):
        super().__init__()
        # Sub-modules are created in the order embedding -> encoder -> decoder.
        self.embedding = nn.Embedding(num_embeddings=vocab_len, embedding_dim=embed_size)
        # Bidirectional LSTM; it consumes sequence-first input (batch_first is not set).
        self.encoder = nn.LSTM(
            input_size=embed_size,
            hidden_size=num_hiddens,
            num_layers=num_layers,
            bidirectional=True,
        )
        # Fully connected layer: its input concatenates the encoder outputs of
        # two time steps, each 2 * num_hiddens wide (both directions).
        self.decoder = nn.Linear(in_features=4 * num_hiddens, out_features=2)

    def forward(self, inputs):
        """Score a batch of index sequences.

        Args:
            inputs: integer tensor of shape (batch_size, words_len).

        Returns:
            Tensor of shape (batch_size, 2) produced by the decoder.
        """
        # Swap the two axes so the sequence dimension comes first, then embed:
        # (words_len, batch_size, embed_size).
        time_major = inputs.permute(1, 0)
        embedded = self.embedding(time_major)
        hidden_states, _ = self.encoder(embedded)
        # Use the encoder outputs of the first and the last time step as features.
        first_step = hidden_states[0]
        last_step = hidden_states[-1]
        features = torch.cat((first_step, last_step), dim=-1)
        return self.decoder(features)

In [22]:
# Build a bidirectional LSTM network with 2 stacked layers.
embed_size = 100
num_hiddens = 100
num_layers = 2
# Keep the constructor arguments simple so the model is convenient to use from C++.
net = TextRNN(vocab_len=len(vocab), embed_size=embed_size, num_hiddens=num_hiddens, num_layers=num_layers)

In [23]:
# Download and cache the large-scale pretrained GloVe vectors through torchtext.
# website link: https://nlp.stanford.edu/projects/glove
# domestic link: https://sunyanhust.github.io/post/nlp-chang-yong-mo-xing-he-shu-ju-ji-gao-su-xia-zai/
# You can also manually download glove.6B.100d.zip, rename it to glove.6B.zip and put it in the cache dir.
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=os.path.join(kDataDir, "glove"))

In [24]:
def loadPretrainedEmbedding(words, pretrained_vocab):
    """Collect the pretrained vector of every word into one matrix.

    Args:
        words: sequence of words; row ``i`` of the result belongs to ``words[i]``.
        pretrained_vocab: object exposing ``stoi`` (word -> row index) and
            ``vectors`` (indexable by that row index).

    Returns:
        A ``len(words) x dim`` float tensor, where ``dim`` is the length of
        ``pretrained_vocab.vectors[0]``. Rows of words whose lookup raises
        ``KeyError`` (out-of-vocabulary words) stay all zeros; the number of
        such words is printed when it is non-zero.
    """
    vector_dim = pretrained_vocab.vectors[0].shape[0]
    embed = torch.zeros(len(words), vector_dim)
    missing = 0  # out-of-vocabulary counter
    for row, word in enumerate(words):
        try:
            embed[row, :] = pretrained_vocab.vectors[pretrained_vocab.stoi[word]]
        except KeyError:
            # Word unknown to the pretrained vocabulary: keep the zero row.
            missing += 1

    if missing > 0:
        print ("there are %d oov words" % missing)

    return embed

In [25]:
# Overwrite the embedding table with the GloVe vectors looked up for vocab.itos
# (words missing from GloVe keep an all-zero row).
net.embedding.weight.data.copy_(loadPretrainedEmbedding(vocab.itos, glove_vocab))
net.embedding.weight.requires_grad = False # pretrained vectors do not need to be updated

there are 21202 oov words


In [26]:
# train
def evaluate_accuracy(data_iter, net, device=None):
    """Compute the classification accuracy of ``net`` over ``data_iter``.

    The network is switched to eval mode once for the whole pass and its
    previous train/eval mode is restored afterwards, even if a batch raises.
    (Previously the mode was toggled on every batch and the network was always
    left in train mode.)

    Args:
        data_iter: iterable of ``(X, y)`` batches; ``y`` holds the class indices.
        net: module whose output has one score per class along dim 1.
        device: device the batches are moved to; defaults to the device of the
            first parameter of ``net``.

    Returns:
        Fraction of samples whose arg-max prediction equals the label.

    Raises:
        ZeroDivisionError: if ``data_iter`` yields no samples.
    """
    if device is None:
        # If no device is specified, use the device of the net.
        device = next(net.parameters()).device
    was_training = net.training
    net.eval()  # eval mode disables dropout
    acc_sum, n = 0.0, 0
    try:
        with torch.no_grad():
            for X, y in data_iter:
                predictions = net(X.to(device)).argmax(dim=1)
                acc_sum += (predictions == y.to(device)).float().sum().cpu().item()
                n += y.shape[0]
    finally:
        # Put the network back into whatever mode the caller had it in.
        net.train(was_training)
    return acc_sum / n

def train(net, train_iter, test_iter, batch_size, loss, optimizer, device, num_epochs):
    """Train ``net`` for ``num_epochs`` epochs and print per-epoch statistics.

    Args:
        net: model to train; it is moved to ``device`` first.
        train_iter: iterable of ``(X, y)`` training batches.
        test_iter: iterable of ``(X, y)`` batches passed to ``evaluate_accuracy``.
        batch_size: accepted for the caller's convenience; not used in the body.
        loss: callable ``loss(y_hat, y)`` returning a scalar tensor.
        optimizer: optimizer stepping the parameters of ``net``.
        device: device the model and the batches are moved to.
        num_epochs: number of passes over ``train_iter``.

    After each epoch one line is printed with the mean batch loss, the training
    accuracy, the test accuracy and the elapsed time (evaluation included).
    """
    net = net.to(device)
    print("training on ", device)
    for epoch in range(num_epochs):
        epoch_loss = 0.0   # sum of the batch losses
        correct = 0.0      # correctly classified training samples
        seen = 0           # training samples processed
        num_batches = 0
        start = time.time()
        for X, y in train_iter:
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)

            # Standard optimisation step: clear old gradients, back-propagate, update.
            optimizer.zero_grad()
            l.backward()
            optimizer.step()

            epoch_loss += l.cpu().item()
            hits = (y_hat.argmax(dim=1) == y).sum()
            correct += hits.cpu().item()
            seen += y.shape[0]
            num_batches += 1

        test_acc = evaluate_accuracy(test_iter, net)
        elapsed = time.time() - start
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, epoch_loss / num_batches, correct / seen, test_acc, elapsed))

In [46]:
# Optimisation settings.
lr = 0.01
num_epochs = 5

loss = nn.CrossEntropyLoss()
# Give Adam only the parameters that still require gradients
# (the embedding weights had requires_grad set to False earlier).
optimizer = torch.optim.Adam([p for p in net.parameters() if p.requires_grad], lr=lr)
# Training may take a long time on CPU.
train(net, train_iter, test_iter, batch_size, loss, optimizer, kDevice, num_epochs)

training on  cpu
train iter batch_count 1
train iter batch_count 2
train iter batch_count 3
train iter batch_count 4
train iter batch_count 5
train iter batch_count 6
train iter batch_count 7
train iter batch_count 8
train iter batch_count 9
train iter batch_count 10
train iter batch_count 11
train iter batch_count 12
train iter batch_count 13
train iter batch_count 14
train iter batch_count 15
train iter batch_count 16
train iter batch_count 17
train iter batch_count 18
train iter batch_count 19
train iter batch_count 20
train iter batch_count 21
train iter batch_count 22
train iter batch_count 23
train iter batch_count 24
train iter batch_count 25
train iter batch_count 26
train iter batch_count 27
train iter batch_count 28
train iter batch_count 29
train iter batch_count 30
train iter batch_count 31
train iter batch_count 32
train iter batch_count 33
train iter batch_count 34
train iter batch_count 35
train iter batch_count 36
train iter batch_count 37
train iter batch_count 38
trai

train iter batch_count 307
train iter batch_count 308
train iter batch_count 309
train iter batch_count 310
train iter batch_count 311
train iter batch_count 312
train iter batch_count 313
train iter batch_count 314
train iter batch_count 315
train iter batch_count 316
train iter batch_count 317
train iter batch_count 318
train iter batch_count 319
train iter batch_count 320
train iter batch_count 321
train iter batch_count 322
train iter batch_count 323
train iter batch_count 324
train iter batch_count 325
train iter batch_count 326
train iter batch_count 327
train iter batch_count 328
train iter batch_count 329
train iter batch_count 330
train iter batch_count 331
train iter batch_count 332
train iter batch_count 333
train iter batch_count 334
train iter batch_count 335
train iter batch_count 336
train iter batch_count 337
train iter batch_count 338
train iter batch_count 339
train iter batch_count 340
train iter batch_count 341
train iter batch_count 342
train iter batch_count 343
t

train iter batch_count 220
train iter batch_count 221
train iter batch_count 222
train iter batch_count 223
train iter batch_count 224
train iter batch_count 225
train iter batch_count 226
train iter batch_count 227
train iter batch_count 228
train iter batch_count 229
train iter batch_count 230
train iter batch_count 231
train iter batch_count 232
train iter batch_count 233
train iter batch_count 234
train iter batch_count 235
train iter batch_count 236
train iter batch_count 237
train iter batch_count 238
train iter batch_count 239
train iter batch_count 240
train iter batch_count 241
train iter batch_count 242
train iter batch_count 243
train iter batch_count 244
train iter batch_count 245
train iter batch_count 246
train iter batch_count 247
train iter batch_count 248
train iter batch_count 249
train iter batch_count 250
train iter batch_count 251
train iter batch_count 252
train iter batch_count 253
train iter batch_count 254
train iter batch_count 255
train iter batch_count 256
t

train iter batch_count 133
train iter batch_count 134
train iter batch_count 135
train iter batch_count 136
train iter batch_count 137
train iter batch_count 138
train iter batch_count 139
train iter batch_count 140
train iter batch_count 141
train iter batch_count 142
train iter batch_count 143
train iter batch_count 144
train iter batch_count 145
train iter batch_count 146
train iter batch_count 147
train iter batch_count 148
train iter batch_count 149
train iter batch_count 150
train iter batch_count 151
train iter batch_count 152
train iter batch_count 153
train iter batch_count 154
train iter batch_count 155
train iter batch_count 156
train iter batch_count 157
train iter batch_count 158
train iter batch_count 159
train iter batch_count 160
train iter batch_count 161
train iter batch_count 162
train iter batch_count 163
train iter batch_count 164
train iter batch_count 165
train iter batch_count 166
train iter batch_count 167
train iter batch_count 168
train iter batch_count 169
t

train iter batch_count 44
train iter batch_count 45
train iter batch_count 46
train iter batch_count 47
train iter batch_count 48
train iter batch_count 49
train iter batch_count 50
train iter batch_count 51
train iter batch_count 52
train iter batch_count 53
train iter batch_count 54
train iter batch_count 55
train iter batch_count 56
train iter batch_count 57
train iter batch_count 58
train iter batch_count 59
train iter batch_count 60
train iter batch_count 61
train iter batch_count 62
train iter batch_count 63
train iter batch_count 64
train iter batch_count 65
train iter batch_count 66
train iter batch_count 67
train iter batch_count 68
train iter batch_count 69
train iter batch_count 70
train iter batch_count 71
train iter batch_count 72
train iter batch_count 73
train iter batch_count 74
train iter batch_count 75
train iter batch_count 76
train iter batch_count 77
train iter batch_count 78
train iter batch_count 79
train iter batch_count 80
train iter batch_count 81
train iter b

train iter batch_count 349
train iter batch_count 350
train iter batch_count 351
train iter batch_count 352
train iter batch_count 353
train iter batch_count 354
train iter batch_count 355
train iter batch_count 356
train iter batch_count 357
train iter batch_count 358
train iter batch_count 359
train iter batch_count 360
train iter batch_count 361
train iter batch_count 362
train iter batch_count 363
train iter batch_count 364
train iter batch_count 365
train iter batch_count 366
train iter batch_count 367
train iter batch_count 368
train iter batch_count 369
train iter batch_count 370
train iter batch_count 371
train iter batch_count 372
train iter batch_count 373
train iter batch_count 374
train iter batch_count 375
train iter batch_count 376
train iter batch_count 377
train iter batch_count 378
train iter batch_count 379
train iter batch_count 380
train iter batch_count 381
train iter batch_count 382
train iter batch_count 383
train iter batch_count 384
train iter batch_count 385
t

train iter batch_count 262
train iter batch_count 263
train iter batch_count 264
train iter batch_count 265
train iter batch_count 266
train iter batch_count 267
train iter batch_count 268
train iter batch_count 269
train iter batch_count 270
train iter batch_count 271
train iter batch_count 272
train iter batch_count 273
train iter batch_count 274
train iter batch_count 275
train iter batch_count 276
train iter batch_count 277
train iter batch_count 278
train iter batch_count 279
train iter batch_count 280
train iter batch_count 281
train iter batch_count 282
train iter batch_count 283
train iter batch_count 284
train iter batch_count 285
train iter batch_count 286
train iter batch_count 287
train iter batch_count 288
train iter batch_count 289
train iter batch_count 290
train iter batch_count 291
train iter batch_count 292
train iter batch_count 293
train iter batch_count 294
train iter batch_count 295
train iter batch_count 296
train iter batch_count 297
train iter batch_count 298
t

In [57]:
# predict
def predict(net, vocab, sentence):
    """Classify a single sentence as "positive" or "negative".

    The sentence is tokenized, mapped to indices through ``vocab.stoi`` and fed
    to ``net`` as a batch of one. The forward pass runs under
    ``torch.no_grad()`` so no autograd graph is recorded for inference (the
    printed scores therefore carry no ``grad_fn``).

    Args:
        net: module (eager or traced) mapping a (1, num_tokens) index tensor to
            a (1, 2) score tensor.
        vocab: object exposing a ``stoi`` mapping from token to integer index.
        sentence: raw text to classify.

    Returns:
        "positive" when the arg-max class is 1, otherwise "negative". The raw
        scores and the predicted label are also printed.
    """
    # Run on whatever device the model parameters live on.
    device = next(net.parameters()).device
    words = tokenizer(sentence)
    sentence_tensor = torch.tensor([vocab.stoi[word] for word in words], device=device)
    with torch.no_grad():
        output = net(sentence_tensor.view((1, -1)))
    label = torch.argmax(output, dim=1).item()
    print ("output:", output)
    print ("label:", label)
    return "positive" if label == 1 else "negative"

In [58]:
sentence1 = "I feel the movie kind of great and to my taste"
# Display the (1, num_tokens) input tensor so the same indices can be used from C++.
sentence_tensor1 = torch.tensor(
    [vocab.stoi[token] for token in tokenizer(sentence1)],
    device=next(net.parameters()).device,
).view(1, -1)
print ("input:", sentence_tensor1)

res = predict(net, vocab, sentence1)
print (res)

input: tensor([[   9,  223,    2,   20,  232,    5,   88,    4,    6,   57, 1743]])
output: tensor([[-1.7009,  1.5822]], grad_fn=<AddmmBackward>)
label: 1
positive


In [59]:
sentence2 = "the movie has bad experience"
# Display the (1, num_tokens) input tensor so the same indices can be used from C++.
sentence_tensor2 = torch.tensor(
    [vocab.stoi[token] for token in tokenizer(sentence2)],
    device=next(net.parameters()).device,
).view(1, -1)
print ("input:", sentence_tensor2)

res = predict(net, vocab, sentence2)
print (res)

input: tensor([[  2,  20,  41,  97, 802]])
output: tensor([[ 0.2492, -0.2555]], grad_fn=<AddmmBackward>)
label: 0
negative


In [47]:
# export model
# Trace the network with an example input and save the TorchScript module to disk.
example_sentence = "funny movie and make me exciting"
# Build the example input from example_sentence itself (it used to be built from
# sentence2 by mistake, which left example_sentence unused).
example_sentence_tensor = torch.tensor([vocab.stoi[word] for word in tokenizer(example_sentence)], device=list(net.parameters())[0].device).view(1, -1)
traced_script_module = torch.jit.trace(net, example_sentence_tensor)
traced_script_module.save("text_rnn.pt")

In [61]:
# Use the exported (traced) module to predict: it goes through the same predict()
# helper as the eager model, here on sentence2.
predict(traced_script_module, vocab, sentence2)

output: tensor([[ 0.2492, -0.2555]], grad_fn=<AddBackward0>)
label: 0


'negative'