# Sentiment Analysis (SA) with pretrained language model example

## Step 1: Load gluon

In [1]:
import mxnet as mx
from mxnet import gluon, autograd

  from ._conv import register_converters as _register_converters


## Step 2: Load SA raw data using gluon.data and load pretrained LM

In [2]:
# Device used for every model and array in this notebook: GPU index 2.
context = mx.gpu(2)

# Read the serialized vocabulary (JSON text) and rebuild the Vocabulary object from it.
# NOTE(review): the file name suggests this is the WikiText-2 vocabulary the language
# model was trained with — confirm.
with open('./wikitext2_vocab.json', 'r') as file:
    wikitext2_vocab_json = file.read()
vocab = gluon.text.vocab.Vocabulary.json_deserialize(wikitext2_vocab_json)

In [None]:
# lm_model, vocab = gluon.model_zoo.text.standard_lstm_lm_650('wikitext-2', wikitext2_vocab, True, ctx = context)

In [3]:
from mxnet.gluon.model_zoo.text.lm import StandardRNN, AWDRNN
# Build an LSTM language model whose vocabulary size matches the loaded vocabulary.
# NOTE(review): the positional arguments look like (mode, vocab size, embedding size,
# hidden size, number of layers, dropout, tie weights) — confirm against the
# StandardRNN constructor.
lm_model = StandardRNN('lstm', len(vocab), 650, 650, 2, 0.5, True)
# Initialize the parameters on the target device first, then load the saved weights
# from the local parameter file.
lm_model.initialize(mx.init.Xavier(), ctx=context)
lm_model.load_params('./standard_lstm_lm_650.params', ctx=context)

In [None]:
# param_dict = mx.nd.load('./model.params.29')
# new_param_dict = {}
# #standardrnn0_
# for k, v in param_dict.items():
#     nk = k.split('standardrnn0_')[1]
#     new_param_dict[nk] = v
# mx.nd.save('./standard_lstm_lm_650.params', new_param_dict)
# print(new_param_dict)
    

In [4]:
# IMDB train and test splits, stored under data/imdb.
# Later cells iterate these datasets as (text, score) pairs.
train = gluon.data.text.IMDB(root='data/imdb', segment='train')
test = gluon.data.text.IMDB(root='data/imdb', segment='test')

## Step 3: Load a user-defined tokenizer and tokenize the SA raw data

In [5]:
import spacy
# Load the spaCy English pipeline once; tokenizer() below uses only its tokenizer.
# NOTE(review): the 'en' shortcut name appears to be accepted only by older spaCy
# releases — confirm against the installed version.
spacy_en = spacy.load('en')

def tokenizer(text):
    """Split ``text`` into a list of token strings.

    Uses the tokenizer of the module-level spaCy pipeline ``spacy_en``.
    """
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]

  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]


In [6]:
# Tokenize every training review and collect its raw score in parallel lists,
# so train_tokenized[k] and train_labels[k] describe the same review.
train_tokenized = []
train_labels = []
for text, score in train:
    train_tokenized.append(tokenizer(text))
    train_labels.append(score)
# Same procedure for the test split.
test_tokenized = []
test_labels = []
for text, score in test:
    test_tokenized.append(tokenizer(text))
    test_labels.append(score)

## Step 4: Map the tokenized data to NDArray-based instances using the vocabulary of the LM's training data

In [7]:
def encode_samples(x_raw_samples, vocab):
    """Map tokenized samples to lists of vocabulary indices.

    Args:
        x_raw_samples: iterable of samples, each an iterable of tokens.
        vocab: object exposing a ``token_to_idx`` mapping from token to index.

    Returns:
        A list with one list of indices per sample. Tokens that are not in
        ``vocab.token_to_idx`` are encoded as index 0.
    """
    return [
        [
            vocab.token_to_idx[token] if token in vocab.token_to_idx else 0
            for token in tokens
        ]
        for tokens in x_raw_samples
    ]
    
def encode_labels(y_raw_samples):
    """Convert review scores to binary sentiment labels.

    Scores of 7 or more become 1 and scores of 4 or less become 0.
    Scores strictly between 4 and 7 produce no label at all, so the result
    can be shorter than the input.

    Args:
        y_raw_samples: iterable of numeric scores.

    Returns:
        A list of 0/1 labels in input order.
    """
    return [
        1 if rating >= 7 else 0
        for rating in y_raw_samples
        if rating >= 7 or rating <= 4
    ]

def pad_samples(x_encoded_samples, maxlen=500, val=0):
    """Return the samples truncated or right-padded to exactly ``maxlen`` items.

    Args:
        x_encoded_samples: iterable of sequences (e.g. lists of token indices).
        maxlen: target length of every returned sample.
        val: value appended to samples shorter than ``maxlen``.

    Returns:
        A new list of new lists. The input samples are never modified.
    """
    x_samples = []
    for sample in x_encoded_samples:
        # Slice into a fresh list so that padding never extends the caller's
        # own sample in place.
        padded = list(sample[:maxlen])
        # A non-positive count yields an empty list, so samples that are
        # already long enough are left as truncated.
        padded.extend([val] * (maxlen - len(padded)))
        x_samples.append(padded)
    return x_samples
    


In [8]:
# Convert the token lists into lists of vocabulary indices
# (tokens missing from the vocabulary become index 0).
x_encoded_train = encode_samples(train_tokenized, vocab)
x_encoded_test = encode_samples(test_tokenized, vocab)

In [9]:
# Truncate or pad every review to exactly 500 indices (padding value 0) and
# place the resulting arrays on the target device.
x_train = mx.nd.array(pad_samples(x_encoded_train, 500, 0), ctx = context)
x_test = mx.nd.array(pad_samples(x_encoded_test, 500, 0), ctx = context)

In [10]:
# Binary labels on the same device: 1 for scores >= 7, 0 for scores <= 4.
y_train = mx.nd.array(encode_labels(train_labels), ctx = context)
y_test = mx.nd.array(encode_labels(test_labels), ctx = context)

## Step 5: Build the SA classifier: the pretrained LM encoder's hidden state as the feature, plus a binary dense layer as the decoder

In [11]:
# Number of output classes (labels are 0 or 1).
nclass = 2
# Hyperparameters
lr = 0.001
epochs = 10
# eval() and the training loop feed one review at a time and reshape it with this value.
batch_size = 1

# Classifier: pretrained embedding -> pretrained encoder -> last time step -> dense layer.
model = gluon.nn.Sequential()
with model.name_scope():
    model.add(lm_model.embedding)
    model.add(lm_model.encoder)
    # Keep only the final time step of the encoder output as the sequence feature.
    # NOTE(review): inputs are padded to a fixed length, so the final step may be a
    # padding position — confirm this is intended.
    model.add(gluon.nn.HybridLambda('SequenceLast'))
    model.add(gluon.nn.Dense(nclass, flatten=False))

# Only the new dense layer (index 3) is initialized here; the embedding and encoder
# were initialized and loaded when lm_model was built.
model[3].initialize(mx.init.Xavier(), ctx = context)
# The optimizer receives every parameter of the model, including the pretrained ones.
trainer = gluon.Trainer(model.collect_params(), 'sgd',
                       {'learning_rate': lr})
loss = gluon.loss.SoftmaxCrossEntropyLoss()

## Step 6: Report evaluation results: train and test accuracy

In [12]:
def eval(x_samples, y_samples):
    """Return the accuracy of the module-level ``model`` on the given samples.

    Each sample is reshaped with the module-level ``batch_size``, moved to
    ``context`` and scored on its own; the predicted class is the argmax over
    axis 1 of the model output.

    NOTE: this name shadows the builtin ``eval``; it is kept because other
    cells call it under this name.

    Args:
        x_samples: indexable collection of encoded samples.
        y_samples: labels, indexed in step with ``x_samples``.

    Returns:
        The value part of ``mx.metric.Accuracy().get()``.
    """
    metric = mx.metric.Accuracy()
    for idx, sample in enumerate(x_samples):
        batch = mx.nd.reshape(sample, (-2, batch_size)).as_in_context(context)
        label = y_samples[idx].as_in_context(context)
        scores = model(batch)
        metric.update(preds=mx.nd.argmax(scores, axis=1), labels=label)
    return metric.get()[1]

## Step 7: Train SA model and evaluate on train and test set

In [None]:
# Train one sample at a time for `epochs` passes over the training set, then
# report accuracy on both splits after each pass.
for epoch in range(epochs):
    for i, data in enumerate(x_train):
        # Append a trailing axis of size batch_size to the sample
        # (-2 keeps the sample's existing dimensions).
        data = mx.nd.reshape(data, (-2, batch_size)).as_in_context(context)
        target = y_train[i].as_in_context(context)
        # Record the forward pass so that gradients can be computed from the loss.
        with autograd.record():
            output = model(data)
            L = loss(output, target)
        L.backward()
        trainer.step(batch_size)
        if i % 1000 == 0:
            # L is an NDArray, so its full representation is printed.
            print("Batch %s. loss %s"%(i, L))
    # eval here is the accuracy helper defined in this file, not the Python builtin.
    train_accuracy = eval(x_train, y_train)
    test_accuracy = eval(x_test, y_test)
    print("Epoch %s. Train_acc %s, Test_acc %s"%(epoch, train_accuracy, test_accuracy))

Batch 0. loss 
[0.06101925]
<NDArray 1 @gpu(2)>
Batch 1000. loss 
[0.00621813]
<NDArray 1 @gpu(2)>
Batch 2000. loss 
[0.00489169]
<NDArray 1 @gpu(2)>
Batch 3000. loss 
[0.00227122]
<NDArray 1 @gpu(2)>
Batch 4000. loss 
[0.0014558]
<NDArray 1 @gpu(2)>
Batch 5000. loss 
[0.00248873]
<NDArray 1 @gpu(2)>
Batch 6000. loss 
[0.00143651]
<NDArray 1 @gpu(2)>
Batch 7000. loss 
[0.0006521]
<NDArray 1 @gpu(2)>
Batch 8000. loss 
[0.00105492]
<NDArray 1 @gpu(2)>
Batch 9000. loss 
[0.1721312]
<NDArray 1 @gpu(2)>
Batch 10000. loss 
[0.0005992]
<NDArray 1 @gpu(2)>
Batch 11000. loss 
[0.00045945]
<NDArray 1 @gpu(2)>
