In [1]:
import csv
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.nn import utils
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torchtext.data.utils import get_tokenizer
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import GloVe, vocab, build_vocab_from_iterator

# Run on the GPU when CUDA reports one; otherwise fall back to the CPU.
# `device` stays a plain string because later cells pass it as `device=`.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"
print(device)

cuda


In [2]:
# torchtext's built-in "basic_english" tokenizer turns a review string into tokens.
tokenizer = get_tokenizer("basic_english")

# Pretrained 200-dimensional GloVe "6B" vectors, cached on the shared drive.
glove = GloVe(
  name = "6B",
  dim = 200,
  cache = "/content/drive/Shareddrives/CSE258/",
)

In [3]:
# Embedding matrix that the LSTM model loads as frozen weights.
pretrained_weights = glove.vectors

# Build the token -> id lookup from GloVe's own token table.
# min_freq = 0 is presumably there so no GloVe token is filtered out
# (glove.stoi values are indices, not counts) - confirm against torchtext docs.
vocabulary = vocab(glove.stoi, min_freq = 0)

# Append "<unk>" one past the last existing id and make it the fallback for
# any token that is not in the vocabulary.
unk_idx = len(vocabulary)
vocabulary.insert_token("<unk>", unk_idx)
vocabulary.set_default_index(unk_idx)

In [4]:
class reviewsDataset(Dataset):
  """Map-style dataset over a CSV file with `review` and `rating` columns."""

  def __init__(self, csv_file):
    """Load the whole CSV at `csv_file` into memory as a DataFrame."""
    self.data = pd.read_csv(csv_file)

  def __len__(self):
    """Number of rows in the CSV."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return row `idx` as `{'review': str, 'rating': value}`.

    The review is always passed through `str`, so a missing review comes
    back as the string 'nan' rather than a float NaN.
    """
    row = self.data.iloc[idx]
    return {'review': str(row['review']), 'rating': row['rating']}

In [5]:
def collate_fn(data):
  """Turn a list of dataset items into model-ready batch tensors.

  Args:
    data: iterable of dicts with a 'review' string and a 'rating' value,
      as produced by `reviewsDataset.__getitem__`.

  Returns:
    A tuple `(reviews, ratings, length)`:
      reviews - list of 1-D long tensors of token ids, one per kept review
      ratings - 1-D float tensor with the rating of each kept review
      length  - list with the token count of each kept review
    Reviews that contain no in-vocabulary token are dropped, so the batch
    may be smaller than `len(data)`.
  """
  reviews = []
  ratings = []
  length = []

  for d in data:
    # Drop out-of-vocabulary tokens. `unk_idx` sits one past the ids built
    # from GloVe, so it presumably has no row in the pretrained embedding
    # matrix. Using the name instead of a literal 400000 keeps this filter
    # correct if a different vector set is loaded.
    tokens = [t for t in vocabulary(tokenizer(d['review'])) if t != unk_idx]
    if not tokens:
      continue

    reviews.append(torch.tensor(tokens, dtype = torch.long, device = device))
    length.append(len(tokens))
    ratings.append(d['rating'])

  ratings = torch.tensor(ratings, dtype = torch.float, device = device)
  return reviews, ratings, length

In [6]:
class LSTM(nn.Module):
  """Rating regressor: frozen pretrained embeddings -> LSTM -> two-layer head.

  The LSTM's hidden size equals the embedding width, and the head narrows it
  to half that width before producing a single score per review.
  """

  def __init__(self, weight):
    """Build the network around the pretrained embedding matrix `weight`.

    Args:
      weight: 2-D float tensor of embeddings; its last dimension sets both
        the LSTM input size and the LSTM hidden size.
    """
    super().__init__()
    embedding_dim = weight.shape[-1]
    half_dim = embedding_dim // 2
    self.hidden_size = embedding_dim

    # freeze = True: the pretrained vectors are not updated during training.
    self.embedding = nn.Embedding.from_pretrained(weight, freeze = True)

    self.lstm = nn.LSTM(embedding_dim, embedding_dim, batch_first = True)
    self.linear1 = nn.Linear(embedding_dim, half_dim)
    self.dropout = nn.Dropout(p=0.6)
    self.linear2 = nn.Linear(half_dim, 1)
    self.relu = nn.ReLU()

  def forward(self, input):
    """Score a batch of variable-length reviews.

    Args:
      input: list of 1-D long tensors of token ids, one per review.

    Returns:
      1-D tensor with one predicted rating per review, in input order.
    """
    embedded = [self.embedding(review) for review in input]
    # enforce_sorted = False lets reviews arrive in any length order.
    packed = utils.rnn.pack_sequence(embedded, enforce_sorted = False)

    # Only the final hidden state feeds the head; per-step outputs and the
    # cell state are unused.
    _, (final_hidden, _) = self.lstm(packed)
    features = self.linear1(final_hidden)
    features = self.dropout(self.relu(features))
    scores = self.linear2(features)

    # (1, batch, 1) -> (batch,)
    return scores.squeeze(0).squeeze(-1)

# Build the GloVe-backed model, then move its parameters to the chosen device.
lstm = LSTM(pretrained_weights)
lstm = lstm.to(device)

In [None]:
class BiLSTM(nn.Module):
  """Rating regressor: trainable embeddings -> bidirectional LSTM -> two-layer head."""

  def __init__(self, vocab_size, hidden_size):
    """Build the network.

    Args:
      vocab_size: number of rows in the embedding table (largest token id + 1).
      hidden_size: embedding width, also used as the per-direction LSTM
        hidden size.
    """
    super().__init__()
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size

    # Unlike `LSTM`, these embeddings are learned from scratch.
    self.embedding = nn.Embedding(self.vocab_size, self.hidden_size)

    self.lstm = nn.LSTM(self.hidden_size, self.hidden_size, batch_first = True, bidirectional = True)
    # The head consumes both directions' final states side by side.
    self.linear1 = nn.Linear(2*self.hidden_size, self.hidden_size)
    self.linear2 = nn.Linear(self.hidden_size, 1)
    self.relu = nn.ReLU()

  def forward(self, input):
    """Score a batch of variable-length reviews.

    Args:
      input: list of 1-D long tensors of token ids, one per review.

    Returns:
      1-D tensor of shape (batch,) with one predicted rating per review,
      including when the batch holds a single review.
    """
    embeddings = [self.embedding(r) for r in input]
    packed_embeddings = utils.rnn.pack_sequence(embeddings, enforce_sorted = False)

    _, (h_n, _) = self.lstm(packed_embeddings)
    # h_n is (2, batch, hidden): one final state per direction. Join them
    # feature-wise into (batch, 2 * hidden).
    h_n = torch.cat((h_n[0], h_n[1]), dim = 1)
    h_n_linear1 = self.relu(self.linear1(h_n))
    h_n_linear2 = self.linear2(h_n_linear1)

    # Squeeze only the trailing size-1 feature dimension. The leading
    # dimension is the batch here, so a squeeze(0) would turn a one-review
    # batch into a 0-dim tensor that no longer matches the targets.
    return h_n_linear2.squeeze(-1)

# BiLSTM expects (vocab_size, hidden_size); passing the weight tensor alone
# raised a TypeError. Size the trainable embedding table like the GloVe matrix:
# one row per pretrained token, same vector width.
bilstm = BiLSTM(pretrained_weights.shape[0], pretrained_weights.shape[-1]).to(device)

In [7]:
# Locations of the cleaned train / validation / test splits.
dataset_path = '/content/drive/Shareddrives/CSE258/clean_data/train.csv'
valid_dataset_path = '/content/drive/Shareddrives/CSE258/clean_data/valid.csv'
test_dataset_path = '/content/drive/Shareddrives/CSE258/clean_data/test.csv'

# Each split is read fully into memory.
dataset = reviewsDataset(dataset_path)
valid_dataset = reviewsDataset(valid_dataset_path)
test_dataset = reviewsDataset(test_dataset_path)

# All loaders share the batch size and the tokenising collate function.
# NOTE(review): no `shuffle` argument is passed for the training loader, so
# DataLoader's default ordering applies - confirm that is intended.
dataloader = DataLoader(dataset, batch_size = 64, collate_fn = collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size = 64, collate_fn = collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size = 64, collate_fn = collate_fn)

In [8]:
def test_loss(dataset, dataloader, model, criterion):
  model.eval()
  test_loss = 0

  for i, (packed_sequence, ratings, _) in enumerate(dataloader):
    prediction = model(packed_sequence)
    test_loss += criterion(prediction, ratings)

  return test_loss/len(dataset)

def validation_loss(dataset, dataloader, model, criterion):
  """Average loss of `model` over the validation split.

  Args:
    dataset: the split being scored; only its length is used, as the
      denominator of the average.
    dataloader: iterable yielding `(reviews, ratings, lengths)` batches.
    model: callable mapping a batch of reviews to predictions. It is left
      in eval mode on return.
    criterion: loss function, expected to sum over the batch.

  Returns:
    Total loss over all batches divided by `len(dataset)`. If the collate
    function drops items (this file's `collate_fn` drops empty reviews), the
    denominator still counts them.
  """
  model.eval()
  total = 0

  # Evaluation needs no gradients. Without no_grad every batch's autograd
  # graph stays alive through the running sum, so memory grows across the
  # whole split.
  with torch.no_grad():
    for packed_sequence, ratings, _ in dataloader:
      total += criterion(model(packed_sequence), ratings)

  return total/len(dataset)

def train(input, target, model, optimizer, criterion):
  """Run one optimisation step on a single batch.

  Args:
    input: batch passed straight to `model`.
    target: tensor of ground-truth values; its first dimension is the
      batch size.
    model: module being trained.
    optimizer: optimiser over `model`'s parameters.
    criterion: loss function, expected to sum over the batch.

  Returns:
    The batch loss divided by the batch size, as a tensor detached from the
    autograd graph.
  """
  prediction = model(input)
  loss = criterion(prediction, target)

  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  # Detach before returning: callers add this value to running totals, and
  # a graph-attached loss would chain every step's history together for the
  # whole epoch.
  return loss.detach()/target.shape[0]

def train_iter(model, epochs, print_every = 244, lr = 0.001):
  """Train `model` with Adam on summed squared error, printing progress.

  Reads the module-level `dataloader`, `valid_dataset` / `valid_dataloader`
  and `test_dataset` / `test_dataloader`.

  Args:
    model: module to train, updated in place.
    epochs: number of full passes over `dataloader`.
    print_every: print the mean training loss after this many batches.
    lr: Adam learning rate.
  """
  criterion = nn.MSELoss(reduction = 'sum')
  optimizer = optim.Adam(model.parameters(), lr = lr)

  for epoch in range(epochs):
    # The end-of-epoch evaluation leaves the model in eval mode, so switch
    # back to train mode at the start of every epoch.
    model.train()
    loss_per_epoch = loss_per_print = 0

    for i, (reviews, ratings, _lengths) in enumerate(dataloader):
      batch_loss = train(reviews, ratings, model, optimizer, criterion)
      loss_per_print += batch_loss
      loss_per_epoch += batch_loss

      if (i+1)%print_every != 0:
        continue

      # Report the mean over the window just finished, then start a new one.
      print(f"Epoch : {epoch+1}/{epochs}, i: {i+1}/{len(dataloader)}, Training Loss: {loss_per_print/(print_every)}")
      loss_per_print = 0

    # End-of-epoch report: training mean, then validation and test loss.
    print("==============================")
    print(f"Epoch {epoch+1} Summary")
    print(f"Training Loss: {loss_per_epoch/len(dataloader)}")
    valid_loss = validation_loss(valid_dataset, valid_dataloader, model, criterion)
    print(f"Validation Loss: {valid_loss}")
    testing_loss = test_loss(test_dataset, test_dataloader, model, criterion)
    print(f"Test Loss: {testing_loss}")
    print("==============================")

# Train the GloVe-backed LSTM for ten epochs with the default print interval and learning rate.
train_iter(lstm, epochs = 10)

Epoch : 1/10, i: 244/2446, Training Loss: 3.051501750946045
Epoch : 1/10, i: 488/2446, Training Loss: 2.544358491897583
Epoch : 1/10, i: 732/2446, Training Loss: 2.1468088626861572
Epoch : 1/10, i: 976/2446, Training Loss: 1.8294475078582764
Epoch : 1/10, i: 1220/2446, Training Loss: 1.8028931617736816
Epoch : 1/10, i: 1464/2446, Training Loss: 1.740801215171814
Epoch : 1/10, i: 1708/2446, Training Loss: 1.7076895236968994
Epoch : 1/10, i: 1952/2446, Training Loss: 1.6558178663253784
Epoch : 1/10, i: 2196/2446, Training Loss: 1.6239523887634277
Epoch : 1/10, i: 2440/2446, Training Loss: 1.5827375650405884
Epoch 1 Summary
Training Loss: 1.9680137634277344
Validation Loss: 1.3882640600204468
Test Loss: 1.4040980339050293
Epoch : 2/10, i: 244/2446, Training Loss: 1.535567045211792
Epoch : 2/10, i: 488/2446, Training Loss: 1.4973684549331665
Epoch : 2/10, i: 732/2446, Training Loss: 1.5110254287719727
Epoch : 2/10, i: 976/2446, Training Loss: 1.4281284809112549
Epoch : 2/10, i: 1220/2446, 