In [None]:
import csv
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.nn import utils
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torchtext.data.utils import get_tokenizer
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import GloVe, vocab, build_vocab_from_iterator

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is a plain string that later cells pass to torch.tensor(...) and .to(...).
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [None]:
# Shared tokenizer: used both when building the vocabulary (yield_tokens)
# and when encoding batches (collate_fn), so the two stay consistent.
tokenizer = get_tokenizer("basic_english")

def yield_tokens(filereader):
    """Lazily produce one token list per row of ``filereader``.

    Args:
        filereader: Iterable of indexable rows (e.g. a ``csv.reader``). The
            value at position 5 of each row is treated as the review text.

    Yields:
        The result of applying the module-level ``tokenizer`` to that text.
    """
    yield from (tokenizer(record[5]) for record in filereader)

def build_vocab_from_dataset(dataset_path = '/content/drive/Shareddrives/CSE258/clean_data/train.csv', min_freq = 5):
    """Build a torchtext vocabulary from the review text of a CSV file.

    Args:
        dataset_path: Path to a comma-separated file whose first row is a
            header. Rows are tokenized by ``yield_tokens``.
        min_freq: Minimum token frequency forwarded to
            ``build_vocab_from_iterator``.

    Returns:
        The vocabulary, with ``"<unk>"`` registered as a special token and
        set as the default index for out-of-vocabulary lookups.
    """
    # The context manager guarantees the file handle is closed, including when
    # tokenization or vocabulary construction raises part-way through.
    with open(dataset_path, newline='') as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(filereader)  # skip the header row so column names are not counted as tokens

        # The row generator is consumed here, so the file must still be open.
        vocabulary = build_vocab_from_iterator(yield_tokens(filereader), min_freq = min_freq, specials=["<unk>"])

    vocabulary.set_default_index(vocabulary["<unk>"])

    return vocabulary

# Build the vocabulary once, using the function's default path and min_freq.
# collate_fn and the model's embedding size both read this module-level name.
vocabulary = build_vocab_from_dataset()

In [None]:
class reviewsDataset(Dataset):
  """Map-style dataset of review/rating pairs backed by a CSV file.

  The file is read eagerly into a pandas DataFrame; it must provide
  'review' and 'rating' columns.
  """

  def __init__(self, csv_file):
    """Load ``csv_file`` into memory with ``pd.read_csv``."""
    self.data = pd.read_csv(csv_file)

  def __len__(self):
    """Number of rows in the loaded frame."""
    return self.data.shape[0]

  def __getitem__(self, idx):
    """Return the sample at position ``idx`` as a dict.

    The review is always coerced to ``str`` (so a missing value becomes the
    text 'nan'); the rating is returned as stored in the row.
    """
    row = self.data.iloc[idx]
    return {'review': str(row['review']), 'rating': row['rating']}

In [None]:
# Location of the training split (the same path build_vocab_from_dataset uses by default).
train_dataset_path = '/content/drive/Shareddrives/CSE258/clean_data/train.csv'

def collate_fn(data):
  """Turn a list of dataset samples into model-ready batch pieces.

  Args:
    data: Sequence of dicts with 'review' (text) and 'rating' entries.

  Returns:
    A 3-tuple ``(reviews, ratings, length)``:
      reviews -- list of 1-D long tensors of vocabulary indices, one per
                 sample, left unpadded (lengths may differ);
      ratings -- 1-D float tensor holding every sample's rating;
      length  -- list with the token count of each review.
    All tensors are created on the module-level ``device``.
  """
  # Tokenize and map to vocabulary indices once per sample.
  encoded = [vocabulary(tokenizer(sample['review'])) for sample in data]

  reviews = [torch.tensor(ids, dtype = torch.long, device = device) for ids in encoded]
  length = [len(ids) for ids in encoded]
  ratings = torch.tensor([sample['rating'] for sample in data], dtype = torch.float, device = device)

  return reviews, ratings, length

# Training data in batches of 64; collate_fn encodes text and moves tensors to `device`.
# NOTE(review): no `shuffle` argument is passed, so the DataLoader default applies and
# batches follow CSV row order every epoch — confirm this is intended for training.
train_dataset = reviewsDataset(train_dataset_path)
train_dataloader = DataLoader(train_dataset, batch_size = 64, collate_fn = collate_fn)

In [None]:
class RNN(nn.Module):
  """Sequence regressor producing one scalar per input sequence.

  Pipeline: token embedding -> single-layer ``nn.RNN`` -> Linear + ReLU ->
  Linear to one output, applied to the recurrent layer's final hidden state.
  """

  def __init__(self, vocab_size, hidden_size):
    """
    Args:
      vocab_size: Number of rows in the embedding table.
      hidden_size: Width of both the embeddings and the recurrent state.
    """
    super().__init__()
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    half_width = int(self.hidden_size/2)

    self.embedding = nn.Embedding(self.vocab_size, self.hidden_size)

    # Despite the attribute name this is a plain nn.RNN, not an LSTM; the
    # name is kept because it determines the keys in the state dict.
    self.lstm = nn.RNN(self.hidden_size, self.hidden_size, batch_first = True)
    self.linear1 = nn.Linear(self.hidden_size, half_width)
    self.linear2 = nn.Linear(half_width, 1)
    self.relu = nn.ReLU()

  def forward(self, input):
    """Score a batch of variable-length index sequences.

    Args:
      input: Iterable of 1-D long tensors of token indices.

    Returns:
      Tensor with one prediction per sequence, in the order given.
    """
    # Embed each sequence on its own, then pack so no padding is needed;
    # enforce_sorted=False lets the sequences arrive in any length order.
    packed = utils.rnn.pack_sequence([self.embedding(seq) for seq in input], enforce_sorted = False)

    # Only the final hidden state is used; per-step outputs are discarded.
    _, final_hidden = self.lstm(packed)
    scores = self.linear2(self.relu(self.linear1(final_hidden)))

    # Drop the leading layer dimension and the trailing size-1 feature dimension.
    return scores.squeeze(0).squeeze(-1)

# One embedding row per vocabulary entry, hidden size 256; parameters are moved to `device`.
rnn = RNN(len(vocabulary), 256).to(device)

In [None]:
# Validation split: same dataset class, batch size and collate function as the training loader.
valid_dataset_path = '/content/drive/Shareddrives/CSE258/clean_data/valid.csv'
valid_dataset = reviewsDataset(valid_dataset_path)
valid_dataloader = DataLoader(valid_dataset, batch_size = 64, collate_fn = collate_fn)

# Test split, built the same way.
test_dataset_path = '/content/drive/Shareddrives/CSE258/clean_data/test.csv'
test_dataset = reviewsDataset(test_dataset_path)
test_dataloader = DataLoader(test_dataset, batch_size = 64, collate_fn = collate_fn)

In [None]:
def test_loss(model, criterion):
  """Average ``criterion`` over the test set.

  Switches ``model`` to eval mode (it is not switched back here), sums the
  per-batch criterion values over ``test_dataloader`` without tracking
  gradients, and divides the sum by the number of samples in ``test_dataset``.

  Args:
    model: Callable taking the batch's list of sequences.
    criterion: Loss called as ``criterion(prediction, ratings)``.

  Returns:
    Summed loss divided by ``len(test_dataset)``.
  """
  model.eval()
  torch.cuda.empty_cache()

  running_total = 0
  with torch.no_grad():
    for sequences, ratings, _ in test_dataloader:
      running_total += criterion(model(sequences), ratings)

  return running_total/len(test_dataset)

def validation_loss(model, criterion):
  """Average ``criterion`` over the validation set.

  Switches ``model`` to eval mode (it is not switched back here), sums the
  per-batch criterion values over ``valid_dataloader`` and divides the sum by
  the number of samples in ``valid_dataset``.

  Args:
    model: Callable taking the batch's list of sequences.
    criterion: Loss called as ``criterion(prediction, ratings)``.

  Returns:
    Summed loss divided by ``len(valid_dataset)``.
  """
  model.eval()
  valid_loss = 0

  # Evaluation needs no gradients. Without no_grad() every batch's autograd
  # graph would stay referenced by the running sum, so memory would grow with
  # the size of the validation set. test_loss already evaluates this way.
  with torch.no_grad():
    for i, (packed_sequence, ratings, _) in enumerate(valid_dataloader):
      prediction = model(packed_sequence)
      valid_loss += criterion(prediction, ratings)

  return valid_loss/len(valid_dataset)

def train(input, target, model, optimizer, criterion):
  """Run one optimisation step on a single batch.

  Args:
    input: Batch passed straight to ``model``.
    target: Tensor of targets; its first dimension is used as the batch size.
    model: Model to update.
    optimizer: Optimizer over the model's parameters.
    criterion: Loss called as ``criterion(prediction, target)``.

  Returns:
    The batch loss divided by ``target.shape[0]``, as a tensor detached from
    the autograd graph.
  """
  prediction = model(input)
  loss = criterion(prediction, target)

  # Clear gradients left from the previous step before accumulating new ones.
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  # Detach before returning: callers sum this value over a whole epoch, and an
  # attached tensor would keep every batch's graph reachable through that sum.
  return loss.detach()/target.shape[0]

def train_iter(model, epochs, print_every = int(len(train_dataloader)/10), lr = 0.001):
  """Train ``model`` on ``train_dataloader`` and report losses every epoch.

  Uses Adam with a summed MSE criterion. After each epoch it prints the mean
  training loss plus the losses from ``validation_loss`` and ``test_loss``.

  Args:
    model: Model to train; updated in place.
    epochs: Number of passes over ``train_dataloader``.
    print_every: Number of batches between progress lines. The default is a
      tenth of the loader length, evaluated once when the function is defined.
    lr: Learning rate for Adam.
  """
  # The default interval is 0 for loaders with fewer than 10 batches, which
  # would make the modulo below raise ZeroDivisionError; report every batch instead.
  print_every = max(1, print_every)

  optimizer = optim.Adam(model.parameters(), lr = lr)
  criterion = nn.MSELoss(reduction = 'sum')

  for epoch in range(epochs):
    # The evaluation helpers leave the model in eval mode, so re-enable
    # training mode at the start of every epoch.
    model.train()
    loss_per_epoch = 0
    loss_per_print = 0

    for i, (packed_sequence, ratings, _) in enumerate(train_dataloader):
      loss = train(packed_sequence, ratings, model, optimizer, criterion)
      loss_per_epoch += loss
      loss_per_print += loss

      if (i+1)%print_every == 0:
        print(f"Epoch : {epoch+1}/{epochs}, i: {i+1}/{len(train_dataloader)}, Training Loss: {loss_per_print/(print_every)}")
        loss_per_print = 0

    print("==============================")
    print(f"Epoch {epoch+1} Summary")
    print(f"Training Loss: {loss_per_epoch/len(train_dataloader)}")
    valid_loss = validation_loss(model, criterion)
    print(f"Validation Loss: {valid_loss}")
    testing_loss = test_loss(model, criterion)
    # Previously labelled "Validation Loss", which made the two lines indistinguishable.
    print(f"Test Loss: {testing_loss}")
    print("==============================")

# Train for 10 epochs with the default progress interval and learning rate.
train_iter(rnn, 10)

Epoch : 1/10, i: 244/2446, Training Loss: 2.836580276489258


KeyboardInterrupt: ignored