In [1]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torchtext
# Legacy wrapper (a no-op since tensors and Variables were merged); kept
# importable in case other cells still reference it.
from torch.autograd import Variable

from rcnn.load_data import load_dataset
from rcnn.models.RCNN import RCNN

In [2]:
# Make GPU 3 the current CUDA device before any data or model is created.
torch.cuda.set_device(3)

# load_dataset() returns six objects; unpack them under the names the rest of
# the notebook uses (TEXT for tokenising/vocabulary lookups, mapping for the
# label <-> class-index table, and so on).
(
    TEXT,
    vocab_size,
    word_embeddings,
    train_iter,
    valid_iter,
    mapping,
) = load_dataset()


Length of Text Vocabulary: 82174
Vector size of Text Vocabulary:  torch.Size([82174, 300])
Label Length: 5
Label Mapping:  defaultdict(<function _default_unk_index at 0x7feaf894b400>, {'5': 0, '4': 1, '1': 2, '3': 3, '2': 4})
Most frequent:  [('the', 552891), ('and', 387193), ('i', 340429), ('a', 280821), ('to', 273912), ('was', 204645), ('it', 165267), ('of', 159128), ('is', 137961), ('for', 133581), ('in', 125574), ('my', 108601), ('that', 100386), ('we', 93376), ('they', 93247), ('you', 91329), ('with', 91262), ('this', 90064), ('but', 83384), ('on', 77544)]


In [3]:
def map_forward(label):
    """Look up the class index that the global ``mapping`` assigns to ``label``.

    The keys of ``mapping`` are strings (e.g. ``'5'``), so ``label`` is
    stringified before the lookup.

    Returns:
        The value stored under ``str(label)``, or ``None`` when the key is
        not present.
    """
    key = str(label)
    return mapping.get(key)

In [4]:
# Convert a class index produced by the model (0-4) back to the original
# star rating (1-5).
def map_reverse(label):
    """Return, as a float, the first key of the global ``mapping`` whose value equals ``label``.

    Returns:
        ``float(key)`` for the first matching entry, or ``None`` when no
        entry of ``mapping`` has ``label`` as its value.
    """
    return next(
        (float(key) for key, index in mapping.items() if label == index),
        None,
    )

In [15]:
# Hyper-parameters handed to the RCNN constructor. load_state_dict() below
# requires every parameter shape to match the checkpoint, so these must be
# the values the checkpoint was trained with.
batch_size = 128
output_size = 5          # five label classes
hidden_size = 256
embedding_length = 300   # width of the pretrained word vectors

model = RCNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)

# map_location='cpu' deserialises the checkpoint into host memory instead of
# onto whichever GPU it was saved from; without it the load fails (or wastes
# memory on another card) when that device is missing or busy. The weights
# are copied into `model` here and follow it when it is moved to the GPU.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
checkpoint = torch.load('checkpoint/rcnn_ep7.pth', map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])

In [16]:
# Move the model's parameters to GPU 3; as the cell's last expression the
# returned module is echoed below.
model.to(torch.device("cuda", 3))

RCNN(
  (word_embeddings): Embedding(82174, 300)
  (dropout): Dropout(p=0.7)
  (lstm): LSTM(300, 256, bidirectional=True)
  (W2): Linear(in_features=812, out_features=256, bias=True)
  (label): Linear(in_features=256, out_features=5, bias=True)
)

In [17]:
# Test split; later cells read its 'text' and 'stars' columns.
test_csv_path = 'data/test_gt.csv'
test = pd.read_csv(test_csv_path)

In [18]:
# Sanity check: show how TEXT.preprocess tokenises the review at index 1.
sample_review = test['text'][1]
sample_tokens = TEXT.preprocess(sample_review)
print(sample_tokens)

['was', 'given', 'a', 'massage', 'envy', 'gift', 'card', 'for', 'mother', "'s", 'day', 'and', 'got', 'an', 'additional', 'ad', 'on', 'with', 'it', 'one', 'hour', 'massage', 'and', 'a', 'sugar', 'foot', 'scrub', 'so', 'relaxed', 'tricia', 'was', 'amazing', 'perfect', 'pressure', 'and', 'she', 'really', 'listened', 'to', 'what', 'i', 'needed', 'will', 'definitely', 'be', 'back']


In [24]:
# Predict a star rating for every review in the test set, one review at a time.

# Build inputs on the device the model's weights live on rather than a
# hard-coded GPU index, so input and parameters can never disagree.
device = next(model.parameters()).device

# Evaluation mode (disables dropout) is loop-invariant: set it once.
model.eval()

pred = []
with torch.no_grad():  # inference only - no autograd bookkeeping needed
    for text in test['text']:
        tokens = TEXT.preprocess(text)
        # Batch holding a single review: shape (1, number_of_tokens).
        token_ids = torch.tensor(
            [[TEXT.vocab.stoi[token] for token in tokens]],
            dtype=torch.long,
            device=device,
        )
        # The second argument matches the single-review batch - presumably
        # the batch size expected by RCNN.forward; confirm against the model.
        output = model(token_ids, 1)
        # argmax over the raw scores: softmax is monotonic, so applying it
        # first would not change which class wins.
        class_index = torch.argmax(output[0]).item()
        # Translate the class index back to the original star rating.
        pred.append(map_reverse(class_index))

  


In [25]:
# Test-set accuracy: fraction of reviews whose predicted star rating equals
# the value in the 'stars' column.
test_accuracy = np.mean(pred == test['stars'])
print(test_accuracy)

0.7085


In [26]:
# Write the test-set predictions to disk as a single-column ("pre") CSV,
# without the row index.
sub_df = pd.DataFrame({"pre": pred})
sub_df.to_csv("data/pre_rcnn_h256.csv", index=False)

In [27]:
# Predict a star rating for every review in the validation set, one review
# at a time.
val = pd.read_csv('data/mod_valid.csv')

# Build inputs on the device the model's weights live on. A bare .cuda()
# targets the *current* CUDA device, which only matches the model's GPU if
# torch.cuda.set_device was called with the same index beforehand.
device = next(model.parameters()).device

# Evaluation mode (disables dropout) is loop-invariant: set it once.
model.eval()

pred_val = []
with torch.no_grad():  # inference only - no autograd bookkeeping needed
    for text in val['text']:
        tokens = TEXT.preprocess(text)
        # Batch holding a single review: shape (1, number_of_tokens).
        token_ids = torch.tensor(
            [[TEXT.vocab.stoi[token] for token in tokens]],
            dtype=torch.long,
            device=device,
        )
        # The second argument matches the single-review batch - presumably
        # the batch size expected by RCNN.forward; confirm against the model.
        output = model(token_ids, 1)
        # argmax over the raw scores: softmax is monotonic, so applying it
        # first would not change which class wins.
        class_index = torch.argmax(output[0]).item()
        # Translate the class index back to the original star rating.
        pred_val.append(map_reverse(class_index))

In [28]:
# Validation accuracy: fraction of reviews whose predicted star rating equals
# the value in the 'stars' column.
val_accuracy = np.mean(pred_val == val['stars'])
print(val_accuracy)

0.726
