In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
import collections
import os
import random
import torch
import numpy as np
import time
import torch.utils.data as Data
from torch import nn
from tqdm import tqdm
from collections import Counter
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab

In [6]:
# Read the data

def read_data(folder='train', data_root='/content/drive/MyDrive/'):
    data = []
    for label in ['pos', 'neg']:
        folder_name = os.path.join(data_root, folder, label)
        for file in tqdm(os.listdir(folder_name)):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '').lower()
                data.append([review, 1 if label == 'pos' else 0])
    random.shuffle(data)
    return data

DATA_ROOT = '/content/drive/MyDrive'
data_root = os.path.join(DATA_ROOT, "aclImdb")
train_data, test_data = read_data('train', data_root), read_data('test', data_root)

100%|██████████| 12500/12500 [01:32<00:00, 135.58it/s] 
100%|██████████| 12500/12500 [00:14<00:00, 863.90it/s] 
100%|██████████| 12500/12500 [00:14<00:00, 858.74it/s] 
100%|██████████| 12500/12500 [03:15<00:00, 63.88it/s]  


In [29]:
# build the function we need in the next steps 

def tokenizer(text):
    return [tok.lower() for tok in text.split(' ')]

def get_tokenized_data(data):
    return [tokenizer(review) for review, _ in data]

def get_vocab_data(data):
    tokenized_data = get_tokenized_data(data)
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return vocab(counter,min_freq=10)

vocab = get_vocab_data(train_data) # This is used for filter some words have low frequency

In [8]:
# calculate TD-IDF results

def TF_IDF_trans(word,counter1,counter2):
  TF1 = 0
  TF2 = 0
  flag1 = 0
  flag2 = 0 
  TF1 = counter1[word]
  TF2 = counter2[word]
  flag1 = 1 if TF1!=0 else 0
  flag2 = 1 if TF2!=0 else 0
  IDF = np.log((2+1)/(flag1+flag2+1))+1
  return [TF1*IDF,TF2*IDF]

In [9]:
cato1 = [train_data[i] for i in range(len(train_data)) if train_data[i][1] == 1]
cato2 = [train_data[i] for i in range(len(train_data)) if train_data[i][1] == 0]
cato1 = get_tokenized_data(cato1)
cato2 = get_tokenized_data(cato2)
counter1 = collections.Counter([tk for st in cato1 for tk in st])  #used to calculate TF-IDF
counter2 = collections.Counter([tk for st in cato2 for tk in st])

In [11]:
tokenized_data = get_tokenized_data(train_data)
embedding = {} # record the TD-IDF results
for words in tokenized_data:
  for word in words:
    if word not in embedding:
      embedding[word] = TF_IDF_trans(word,counter1,counter2)

In [30]:
# Preprocess the data

def preprocess_data(data, embedding):
  tokenized_data = get_tokenized_data(data)
  max_l = 500 
  def pad(x):
    return x[:max_l] if len(x) > max_l else x + [[0,0]] * (max_l -len(x))
  features = torch.tensor([pad([embedding[word] for word in words if word in embedding and word in vocab]) for words in tokenized_data])
  labels = torch.tensor([score for _, score in data])
  return features, labels

In [31]:
# Create data iterator

batch_size = 64
train_set = Data.TensorDataset(*preprocess_data(train_data, embedding))
test_set = Data.TensorDataset(*preprocess_data(test_data, embedding))
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)

In [45]:
# check the data

for X, y in train_iter:
  print('X', X.shape, 'y', y.shape)
  break

X torch.Size([64, 500, 2]) y torch.Size([64])


In [38]:
# Create Bi-LSTM model

class BiRNN(nn.Module):
  def __init__(self, embed_size, num_hiddens, num_layers):
    super(BiRNN, self).__init__()
    self.LSTM = nn.LSTM(input_size=embed_size,hidden_size=num_hiddens,num_layers=num_layers,bidirectional=True)
    self.fc = nn.Linear(4*num_hiddens, 2)
    self.dropout = nn.Dropout(0.8)
  def forward(self, inputs):
    inputs = inputs.float()
    outputs, _ = self.LSTM(inputs.permute(1,0,2))
    outputs = self.dropout(outputs)
    result = torch.cat((outputs[0], outputs[-1]), -1)
    outs = self.fc(result)
    return outs 

In [39]:
# Parameters we need

embed_size, num_hiddens, num_layers = 2, 100, 2
net = BiRNN(embed_size, num_hiddens, num_layers)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
loss = nn.CrossEntropyLoss()

In [40]:
def evaluate_accuracy(data_iter, net,device):
  acc_sum, n = 0.0, 0
  with torch.no_grad():
    for X, y in data_iter:
      net.eval()
      acc_sum += (net(X.to(device)).argmax(dim=1) ==y.to(device)).float().sum().cpu().item()
      net.train()
      n += y.shape[0]
  return acc_sum / n

In [43]:
def train(train_iter, test_iter, net, loss, optimizer, device,num_epochs):
  net = net.to(device)
  batch_count = 0
  for epoch in range(num_epochs):
    train_loss_sum, train_acc_sum, n = 0.0, 0.0, 0
    for X, y in train_iter: 
      optimizer.zero_grad()
      X = X.to(device) 
      y = y.to(device)
      y_hat = net(X) 
      l = loss(y_hat, y)
      l.backward()
      optimizer.step()
      train_loss_sum += l.cpu().item()
      train_acc_sum += (y_hat.argmax(dim=1) ==y).sum().cpu().item()
      n += y.shape[0]
      batch_count += 1
    test_acc = evaluate_accuracy(test_iter, net,device)
    print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'% (epoch + 1, train_loss_sum / batch_count,train_acc_sum / n, test_acc))

In [44]:
train(train_iter, test_iter, net, loss, optimizer, device,num_epochs)

epoch 1, loss 0.2145, train acc 0.916, test acc 0.815
epoch 2, loss 0.1056, train acc 0.917, test acc 0.830
epoch 3, loss 0.0701, train acc 0.917, test acc 0.836
epoch 4, loss 0.0523, train acc 0.917, test acc 0.837
epoch 5, loss 0.0418, train acc 0.917, test acc 0.838
