# Text Classification with Amazon Review Data

CSI4121 Big Data, Spring 2023

Homework 1 - 2020147565 Younghan Park

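In this notebook, we classify Amazon video-game reviews by sentiment: reviews rated below 2.5 stars are labeled negative (0), reviews rated above 3.5 stars are labeled positive (1), and the remaining neutral reviews are discarded.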

https://nijianmo.github.io/amazon/index.html

## Preparing Data

### Loading Data

In [None]:
from google.colab import drive
import os

# Mount Google Drive under /content/drive.
drive.mount('/content/drive', force_remount=True)
# Absolute path: the relative form 'drive/MyDrive/...' only resolves while the
# working directory is still /content, so re-running this cell would fail.
os.chdir('/content/drive/MyDrive/연세대학교 2학년 2학기 (2023-1)/CSI4121 Big Data/Homework 1')

In [None]:
!ls

In [None]:
import gzip
import json

# Each line of the gzip archive is one JSON-encoded review record.
with gzip.open('Video_Games_5.json.gz') as f:
  data = [json.loads(raw_line.strip()) for raw_line in f]

In [None]:
import pandas as pd

# Keep only the rating and the review text, dropping rows where either is missing.
df = pd.DataFrame.from_dict(data)[['overall', 'reviewText']].dropna(subset=['overall', 'reviewText'])

# Discard neutral ratings (2.5 <= overall <= 3.5) and cap the dataset at the
# first 200,000 remaining reviews. .copy() yields an independent frame so the
# column assignments below and in later cells do not write into a slice.
df = df[(df['overall'] < 2.5) | (df['overall'] > 3.5)].head(200000).copy()

# Binarize the label in one vectorized step instead of a row-by-row loop:
# 1 for ratings above 3.5, 0 for ratings below 2.5. The column keeps its dtype.
df['overall'] = (df['overall'] > 3.5).astype(df['overall'].dtype)

print(df.head())
print('# of Datas:', len(df))

### Preprocessing Dataset

First, we tokenize each review with the Penn Treebank tokenizer (the default tokenizer of the `nltk` package).

After that, we build a word-to-integer mapping ordered by word frequency; indices start at 1 so that 0 can be used for padding.

In [None]:
from nltk.tokenize import TreebankWordTokenizer

# Split every review into Penn Treebank tokens and keep the token lists in a new column.
tokenizer = TreebankWordTokenizer()
df['tokenized'] = df['reviewText'].map(tokenizer.tokenize)

In [None]:
from collections import Counter

# Flatten all token lists into a single corpus and count each word.
corpus = [word for text in df['tokenized'] for word in text]
count_words = Counter(corpus)
# Words ordered from most to least frequent.
sorted_words = count_words.most_common()
# Number the words starting at 1, most frequent first.
vocab_to_int = {word: rank for rank, (word, _) in enumerate(sorted_words, start=1)}

In [None]:
# Encode each tokenized review as the list of its vocabulary indices.
X = [[vocab_to_int[word] for word in text] for text in df['tokenized']]

df['X'] = X
# Number of tokens in each review.
df['len_X'] = df['X'].map(len)

In [None]:
import matplotlib.pyplot as plt

print(df['len_X'].describe())

# Histogram of review lengths (in tokens) with 50-token-wide bins.
df['len_X'].hist(bins = range(min(df['len_X']), max(df['len_X']) + 50, 50))
plt.title('Sentence length distribution', size=15)
# Call show(); the bare name `plt.show` only evaluates to the function object.
plt.show()

In [None]:
print(df['overall'].describe())

# Histogram of the label column.
df['overall'].hist(bins = 5)
plt.title('Label distribution', size=15)
# Call show(); the bare name `plt.show` only evaluates to the function object.
plt.show()

In [None]:
import numpy as np

seq_len = 300
# Fixed-width matrix of token ids: reviews shorter than seq_len are right-padded
# with 0, longer ones are cut to their first seq_len tokens.
features = np.zeros((len(df['X']), seq_len), dtype=int)
for row, encoded in enumerate(df['X']):
  clipped = encoded[:seq_len]
  features[row, :len(clipped)] = clipped

In [None]:
# One indicator column per distinct label value.
onehot_y = pd.get_dummies(df['overall']).to_numpy()
# Number of distinct label values.
num_category = df['overall'].nunique()

### Preparing Dataloaders

In [None]:
from sklearn.model_selection import train_test_split

# 80% train, then the remaining 20% is halved into validation and test.
# A fixed random_state makes the split, and therefore every reported metric,
# reproducible across kernel restarts.
X_train, X_remain, y_train, y_remain = train_test_split(features, onehot_y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_remain, y_remain, test_size=0.5, random_state=42)

In [None]:
from torch.utils.data import TensorDataset, DataLoader
import torch

def _as_dataset(inputs, targets):
  """Wrap a pair of NumPy arrays (inputs, targets) as a TensorDataset."""
  return TensorDataset(torch.from_numpy(inputs), torch.from_numpy(targets))

train_data = _as_dataset(X_train, y_train)
val_data = _as_dataset(X_val, y_val)
test_data = _as_dataset(X_test, y_test)

# Number of reviews per mini-batch.
batch_size = 256

In [None]:
# Only the training loader shuffles. Evaluation does not benefit from shuffling,
# and a fixed order keeps the samples discarded by drop_last (and hence the
# validation/test metrics) the same from run to run.
# drop_last=True is left as-is so every batch still has exactly `batch_size` rows.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
val_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size, drop_last=True)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size, drop_last=True)

In [None]:
# Pull one batch from the training loader to inspect its sizes and contents.
dataiter = iter(train_loader)
sample_X, sample_y = next(dataiter)
for caption, value in (
    ('Sample input X size:', sample_X.size()),
    ('Sample input X:', sample_X),
    ('Sample label Y size:', sample_y.size()),
    ('Sample label y:', sample_y),
):
  print(caption, value)

## Model

We are going to use a bidirectional LSTM.

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")
print("GPU is available" if is_cuda else "GPU not available, use CPU")

In [None]:
import torch.nn as nn

class Bi_LSTM(nn.Module):
  """Bidirectional multi-layer LSTM classifier over integer-encoded token sequences.

  Args:
    output_size: number of output logits (one per class).
    hidden_dim: hidden size of each LSTM direction.
    n_layers: number of stacked LSTM layers.
    vocab_size: number of rows in the embedding table.
    embedding_dim: size of each token embedding.
    drop_prob: dropout probability applied between stacked LSTM layers.
    padding_idx: token id used for padding; its embedding stays at zero and
      receives no gradient.
  """
  def __init__(self, output_size, hidden_dim, n_layers, vocab_size, embedding_dim, drop_prob=0.5, padding_idx=0):
    super().__init__()

    self.output_size = output_size
    self.hidden_dim = hidden_dim
    self.n_layers = n_layers
    # Derived from the constructor argument instead of a notebook-level global,
    # so the class can be built without any outside variable.
    self.num_category = output_size

    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True, bidirectional=True)

    self.fc = nn.Linear(2 * hidden_dim, output_size)
    self.dropout = nn.Dropout(0.3)

  def forward(self, x, hidden):
    """Return (logits, new_hidden) for a batch of token-id sequences.

    Args:
      x: integer tensor of token ids, batch first.
      hidden: (h0, c0) initial LSTM state, e.g. from init_hidden.

    Returns:
      logits of shape (batch, output_size) and the LSTM's final (h_n, c_n).
    """
    embeds = self.embedding(x)
    _, hidden = self.lstm(embeds, hidden)

    # Summarize each sequence with the final hidden state of both directions of
    # the top layer: h_n[-2] is the forward pass after the last token, h_n[-1]
    # the backward pass after the first token. Slicing the per-step output at
    # the last position would give a backward half that has read only one token.
    h_n = hidden[0]
    summary = torch.cat((h_n[-2], h_n[-1]), dim=1)

    # Dropout regularizes the features; the logits themselves are left intact.
    out = self.fc(self.dropout(summary))

    return out, hidden

  def init_hidden(self, batch_size):
    """Return a zero (h0, c0) pair for batch_size sequences on the model's device."""
    target_device = next(self.parameters()).device
    # Leading dimension is layers * 2 because the LSTM is bidirectional.
    state_shape = (2 * self.n_layers, batch_size, self.hidden_dim)
    h0 = torch.zeros(state_shape, device=target_device)
    c0 = torch.zeros(state_shape, device=target_device)
    return (h0, c0)

In [None]:
# Model hyperparameters.
embedding_dim = 128
hidden_dim = 256
n_layers = 2
output_size = num_category
vocab_size = len(vocab_to_int) + 1  # 1 for padding(0)

model = Bi_LSTM(
    output_size=output_size,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
).to(device)

print(model)

## Training

In [None]:
import sklearn.utils.class_weight as class_weight

# 'balanced' class weights computed from the label column, moved to the
# training device as a float tensor.
balanced_weights = class_weight.compute_class_weight('balanced', classes=np.unique(df['overall']), y=df['overall'])
class_weights = torch.tensor(balanced_weights, dtype=torch.float, device=device)
class_weights

In [None]:
# Optimisation settings.
epochs = 20   # passes over the training set
clip = 5      # gradient-clipping threshold
lr = 0.001    # Adam learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Cross-entropy weighted per class by class_weights.
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [None]:
def acc(pred, label):
  """Count the rows whose highest-scoring column is the same in pred and label.

  Both arguments are 2-D tensors with one row per sample. The result is a
  0-dim integer tensor, not a Python number.
  """
  predicted_class = pred.argmax(dim=1)
  true_class = label.argmax(dim=1)
  return predicted_class.eq(true_class).int().sum()

In [None]:
# Per-epoch metric histories, appended to by the training loop.
train_loss_history, val_loss_history = [], []
train_acc_history, val_acc_history = [], []
# Lowercase np.inf: the `np.Inf` alias was removed in NumPy 2.0.
valid_loss_min = np.inf

In [None]:
len(train_loader.dataset)

In [None]:
from tqdm import tqdm

for epoch in tqdm(range(epochs)):
  # ---- training pass ----
  train_losses = []
  train_acc = 0
  total_train_num = 0
  model.train()

  for inputs, labels in train_loader:
    inputs, labels = inputs.to(device), labels.to(device)

    # Each batch holds unrelated, freshly shuffled reviews, so it starts from a
    # zero state sized to the actual batch instead of inheriting the previous
    # batch's final state.
    h = model.init_hidden(inputs.size(0))

    optimizer.zero_grad()
    output, h = model(inputs, h)

    loss = criterion(output.float(), labels.float())
    loss.backward()
    train_losses.append(loss.item())

    train_acc += acc(output, labels)
    total_train_num += output.size(0)

    # Clip the gradient norm before the optimizer step.
    nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()

  # ---- validation pass ----
  val_losses = []
  val_acc = 0.0
  total_val_num = 0
  model.eval()

  # No gradients are needed for evaluation; skipping them saves memory and time.
  with torch.no_grad():
    for inputs_v, labels_v in val_loader:
      inputs_v, labels_v = inputs_v.to(device), labels_v.to(device)

      h_v = model.init_hidden(inputs_v.size(0))
      output_v, h_v = model(inputs_v, h_v)

      loss_v = criterion(output_v.float(), labels_v.float())
      val_losses.append(loss_v.item())

      val_acc += acc(output_v, labels_v)
      total_val_num += output_v.size(0)

  epoch_train_loss = np.mean(train_losses)
  epoch_val_loss = np.mean(val_losses)
  # Accuracy in percent: correct predictions divided by the number of samples seen.
  # These stay tensors; the plotting cell calls .item() on each history entry.
  epoch_train_acc = train_acc / total_train_num * 100
  epoch_val_acc = val_acc / total_val_num * 100

  train_loss_history.append(epoch_train_loss)
  val_loss_history.append(epoch_val_loss)
  train_acc_history.append(epoch_train_acc)
  val_acc_history.append(epoch_val_acc)
  print(f'Epoch {epoch+1}')
  print(f'train_loss : {epoch_train_loss} val_loss : {epoch_val_loss}')
  print(f'train_accuracy : {epoch_train_acc} val_accuracy : {epoch_val_acc}')
  print(25*'==')


## Test

In [None]:
test_losses = []
num_correct = 0
num_tot = 0

model.eval()
# Evaluation needs no gradients; no_grad avoids building the autograd graph.
with torch.no_grad():
    for inputs, labels in tqdm(test_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        # Fresh zero state per batch, sized to the actual batch: the reviews in
        # consecutive batches are unrelated.
        test_h = model.init_hidden(inputs.size(0))

        output, test_h = model(inputs, test_h)
        test_loss = criterion(output.float(), labels.float())
        test_losses.append(test_loss.item())

        # Same correctness count as the training and validation passes.
        num_correct += acc(output, labels).item()
        num_tot += output.size(0)

print("Test loss: {:.3f}".format(np.mean(test_losses)))

# Accuracy in percent over all evaluated samples.
test_acc = num_correct/num_tot * 100
print("Test accuracy: {:.3f}".format(test_acc))

In [None]:
# Training vs. validation loss per epoch, drawn through the explicit Axes API.
fig, ax = plt.subplots()
ax.plot(train_loss_history, label='training loss')
ax.plot(val_loss_history, label='validation loss')
ax.set_title('Training and Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch; .item() turns each stored value
# into a plain Python number for plotting.
train_acc_values = [x.item() for x in train_acc_history]
val_acc_values = [x.item() for x in val_acc_history]

fig, ax = plt.subplots()
ax.plot(train_acc_values, label='training acc')
ax.plot(val_acc_values, label='validation acc')
ax.set_title('Training and Validation Accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()