In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# Show which GPU (if any) is attached to this runtime before training.
!nvidia-smi

Fri Sep 23 13:48:58 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# Colab-only: mount Google Drive at /content/drive.
# NOTE(review): the data cell reads from the relative path ./drive/MyDrive/...,
# which presumably relies on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os
import sys
import warnings

from torch.utils.data import DataLoader, Dataset
from transformers import BertPreTrainedModel, BertModel, AutoConfig, AutoTokenizer
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm import tqdm, trange
from collections import defaultdict

# Load the Dataset

In [6]:
# Review text (r_id, r_text) for the train / validation / test splits.
df_train_text, df_val_text, df_test_text = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "./drive/MyDrive/CS760/train_text.parquet.snappy",
        "./drive/MyDrive/CS760/val_text.parquet.snappy",
        "./drive/MyDrive/CS760/test_text.parquet.snappy",
    )
)

# Per-review "main" columns for the same three splits.
df_train_main, df_val_main, df_test_main = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "./drive/MyDrive/CS760/train_main.parquet.snappy",
        "./drive/MyDrive/CS760/val_main.parquet.snappy",
        "./drive/MyDrive/CS760/test_main.parquet.snappy",
    )
)

# Shapes of the text frames first, then (after the dashed rule) the main frames.
print(
    f"Shape of the training dataset : {df_train_text.shape}",
    f"Shape of the validation dataset : {df_val_text.shape}",
    f"Shape of the test dataset : {df_test_text.shape}",
    "-"*20,
    f"Shape of the training dataset : {df_train_main.shape}",
    f"Shape of the validation dataset : {df_val_main.shape}",
    f"Shape of the test dataset : {df_test_main.shape}",
    sep="\n",
)

Shape of the training dataset : (2060626, 2)
Shape of the validation dataset : (257578, 2)
Shape of the test dataset : (257579, 2)
--------------------
Shape of the training dataset : (2060626, 13)
Shape of the validation dataset : (257578, 13)
Shape of the test dataset : (257579, 13)


In [7]:
# Preview the first rows of the training review-text frame.
df_train_text.head()

Unnamed: 0,r_id,r_text
0,5508740,Honestly one of the best meals I've ever had i...
1,1633913,*****I will start this review by saying that I...
2,1427664,Cute and small bakery with good service and th...
3,5536866,After looking into multiple tinting companies ...
4,3380896,A verbal snapshot of the things that instantly...


In [8]:
# Preview the first rows of the training "main" frame (it carries the r_useful column).
df_train_main.head()

Unnamed: 0,r_id,r_stars,r_stars_square,r_length,u_friends_count,u_review_count,u_month_age,b_stars,b_review_count,r_sen,r_sub,r_rea,r_useful
0,5508740,5,25,54,1,2,23.093032,4.0,7400,0.197348,0.293939,72.22,1
1,1633913,1,1,224,47,43,57.182693,3.0,27,0.088636,0.320328,89.28,5
2,1427664,4,16,18,35,112,44.484607,4.0,103,0.36875,0.4125,87.72,1
3,5536866,5,25,86,156,21,28.229579,5.0,5,0.15,0.485,49.86,2
4,3380896,5,25,79,6660,2492,40.995708,4.0,709,0.345833,0.541667,57.78,14


In [9]:
def merge_data(df1:pd.DataFrame, df2:pd.DataFrame, on:str, suffixes:tuple=None) -> pd.DataFrame:
  """Join two frames on a shared key and keep only the text and target columns.

  Args:
    df1: Left frame of the join.
    df2: Right frame of the join.
    on: Name of the key column present in both frames.
    suffixes: Suffixes applied to overlapping column names; ``('_x', '_y')``
      is used when ``None`` is given.

  Returns:
    A frame containing only the ``r_text`` and ``r_useful`` columns of the
    joined result.
  """
  effective_suffixes = ('_x', '_y') if suffixes is None else suffixes
  joined = df1.merge(df2, on=on, suffixes=effective_suffixes)
  return joined[['r_text', 'r_useful']]

# Build one (r_text, r_useful) frame per split by joining text and main frames on r_id.
df_train, df_val, df_test = (
    merge_data(text_frame, main_frame, "r_id", suffixes=('_text', '_main'))
    for text_frame, main_frame in (
        (df_train_text, df_train_main),
        (df_val_text, df_val_main),
        (df_test_text, df_test_main),
    )
)

In [10]:
# Preview the merged training frame: review text next to its r_useful value.
df_train.head()

Unnamed: 0,r_text,r_useful
0,Honestly one of the best meals I've ever had i...,1
1,*****I will start this review by saying that I...,5
2,Cute and small bakery with good service and th...,1
3,After looking into multiple tinting companies ...,2
4,A verbal snapshot of the things that instantly...,14


In [11]:
def _count_words(review):
  """Return the number of whitespace-separated words in one review string."""
  return len(review.split())

# Per-review word counts for each split.
wc_train, wc_val, wc_test = (
    frame["r_text"].apply(_count_words) for frame in (df_train, df_val, df_test)
)

In [12]:
# Sequence length later passed to YelpDataset. np.max picks the largest of the
# three mean word counts and 511, so the result is never smaller than 511.
# NOTE(review): 511 acts as a floor here, not a cap. The counts are whitespace
# words rather than tokenizer tokens, and a mean above 511 would yield a longer
# sequence length — confirm whether a cap (e.g. np.min) was intended.
maxlen = np.max([np.mean(wc_train), np.mean(wc_val), np.mean(wc_test), 511]).astype(int)
maxlen

511

# Configuration

In [13]:
# Directory created below for this run's model output.
MODEL_OUT_DIR = "./bert_reg_model/"

## TRAINING + MODEL CONFIG
BATCH_SZ = 128                  # samples per DataLoader batch
LR = 1e-3                       # learning rate given to the Adam optimizer
EPOCHS=10                       # number of passes over the training loader
NUM_THREADS=1                   # used as DataLoader num_workers
MODEL_NAME='bert-base-uncased'  # checkpoint for config, tokenizer and model

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of `if not os.path.exists(...): os.mkdir(...)`; makedirs also creates any
# missing parent directories.
os.makedirs(MODEL_OUT_DIR, exist_ok=True)

# Dataset Class

In [14]:
class YelpDataset:
  """Map-style dataset turning review rows into fixed-length BERT inputs.

  Each item is built from the ``r_text`` column of ``data`` and, when present,
  the ``r_useful`` column (the regression target).

  Args:
    data: DataFrame with an ``r_text`` column and optionally ``r_useful``.
    maxlen: Length of every returned sequence, including the ``[CLS]`` and
      ``[SEP]`` markers. Must be at least 2.
    tokenizer: Object providing ``tokenize(text)`` and
      ``convert_tokens_to_ids(tokens)``.

  Raises:
    ValueError: If ``maxlen`` is smaller than 2.
  """

  def __init__(self, data, maxlen, tokenizer):
    if int(maxlen) < 2:
      raise ValueError(f"maxlen must be at least 2 to fit [CLS] and [SEP], got {maxlen}")
    self.df = data
    self.maxlen = maxlen
    self.tokenizer = tokenizer

  def __len__(self):
    """Number of rows in the underlying frame."""
    return self.df.shape[0]

  def __getitem__(self, idx:int):
    """Return ``(input_ids, attention_mask, target)`` for the row at position ``idx``.

    Returns:
      input_ids: int32 tensor of shape ``(maxlen,)``.
      attention_mask: int64 tensor of shape ``(maxlen,)``; 1 for real tokens
        (including ``[CLS]``/``[SEP]``), 0 for padding.
      target: float32 tensor of shape ``(1,)`` holding ``r_useful``, or 0 when
        the frame has no such column.
    """
    # Positional lookup: samplers hand out positions 0..len-1, which only
    # coincide with index labels when the frame has a default RangeIndex.
    text = self.df["r_text"].iloc[idx]

    # Frames without a label column fall back to a dummy target of 0;
    # any other lookup problem is allowed to surface.
    if "r_useful" in self.df.columns:
      label = self.df["r_useful"].iloc[idx]
    else:
      label = 0

    seq_len = int(self.maxlen)

    # Reserve two slots for the [CLS] / [SEP] markers, then pad to seq_len.
    pieces = self.tokenizer.tokenize(text)[:seq_len - 2]
    tokens = ["[CLS]"] + pieces + ["[SEP]"]
    n_real = len(tokens)
    tokens += ["[PAD]"] * (seq_len - n_real)

    # Build the integer tensor directly instead of round-tripping through float32.
    input_ids = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens), dtype=torch.int32)

    # Mask from the real token count, so it does not depend on the pad token's id.
    attention_mask = torch.zeros(seq_len, dtype=torch.long)
    attention_mask[:n_real] = 1

    target = torch.tensor([float(label)], dtype=torch.float32)

    return input_ids, attention_mask, target



# Regression Model

In [15]:
class BertRegressor(BertPreTrainedModel):
  """BERT encoder followed by a small feed-forward head with a single output.

  All encoder parameters have ``requires_grad`` switched off at construction;
  the head layers (``cls_layer``, ``ff1``, ``ff2``) keep the default setting.
  """

  def __init__(self, config):
    super().__init__(config)

    # Encoder with every parameter frozen.
    self.bert = BertModel(config)
    self.bert.requires_grad_(False)

    # Head: hidden_size -> 128 (ReLU) -> 128 (Tanh, Dropout) -> 1.
    self.cls_layer = nn.Linear(config.hidden_size, 128)
    self.relu = nn.ReLU()
    self.ff1 = nn.Linear(128, 128)
    self.tanh = nn.Tanh()
    self.drop = nn.Dropout(p=0.25)
    self.ff2 = nn.Linear(128, 1)

  def forward(self, input_ids, attention_mask):
    """Run the encoder and map the first-position hidden state to one value.

    Args:
      input_ids: Token ids passed to the encoder.
      attention_mask: Mask passed to the encoder.

    Returns:
      Output of the final linear layer (last dimension of size 1).
    """
    encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # Hidden state at sequence position 0 (where the dataset places [CLS]).
    first_token_state = encoded.last_hidden_state[:, 0, :]
    hidden = self.relu(self.cls_layer(first_token_state))
    hidden = self.drop(self.tanh(self.ff1(hidden)))
    return self.ff2(hidden)

# Training and Evaluation Functions

In [16]:
def evaluate(model, criterion, dataloader, device):
  """Compute the average per-batch loss of ``model`` over ``dataloader``.

  The model is switched to eval mode and gradients are disabled. The result is
  the unweighted mean of the per-batch criterion values.

  Args:
    model: Callable as ``model(input_ids, attention_mask)``.
    criterion: Loss called as ``criterion(prediction, target)``.
    dataloader: Iterable of ``(input_ids, attention_mask, target)`` batches.
    device: Device every batch tensor is moved to.

  Returns:
    Mean of the per-batch losses as a float.
  """
  model.eval()
  batch_losses = []
  with torch.no_grad():
    for batch in dataloader:
      input_ids, attention_mask, target = (tensor.to(device) for tensor in batch)
      prediction = model(input_ids, attention_mask)
      # Cast the target to the prediction's dtype before comparing.
      batch_losses.append(criterion(prediction, target.type_as(prediction)).item())

  return sum(batch_losses) / len(batch_losses)

def train(model, criterion, optimizer, train_loader, val_loader, epochs, device):
  """Train ``model`` for ``epochs`` epochs, validating after each one.

  Args:
    model: Callable as ``model(input_ids, attention_mask)``.
    criterion: Loss called as ``criterion(prediction, target)``.
    optimizer: Optimizer stepped once per training batch.
    train_loader: Sized iterable of ``(input_ids, attention_mask, target)``.
    val_loader: Batches passed to ``evaluate`` after every epoch.
    epochs: Number of epochs to run.
    device: Device every batch tensor is moved to.

  Returns:
    ``(train_losses, val_losses)``: two lists with one mean loss per epoch.
  """
  train_losses = []
  val_losses = []

  for epoch in trange(epochs, desc="Epoch"):
    model.train()
    running_loss = 0

    for batch in train_loader:
      optimizer.zero_grad()

      input_ids, attention_mask, target = (tensor.to(device) for tensor in batch)
      prediction = model(input_ids, attention_mask)

      batch_loss = criterion(prediction, target.type_as(prediction))
      batch_loss.backward()
      optimizer.step()

      running_loss += batch_loss.item()

    # Average over the number of batches in the loader.
    epoch_train_loss = running_loss / len(train_loader)
    print(f"Training Loss : {epoch_train_loss:.5f}")

    epoch_val_loss = evaluate(model=model,
                              criterion=criterion,
                              dataloader=val_loader,
                              device=device)
    print(f"Validation Loss : {epoch_val_loss:.5f}")

    train_losses.append(epoch_train_loss)
    val_losses.append(epoch_val_loss)

    print(f"Epoch-{epoch} complete!")

  return train_losses, val_losses


def predict(model, dataloader, device):
  """Collect model predictions and the matching targets for every sample.

  The model is put in eval mode for the duration of the call, so layers such as
  dropout do not perturb the predictions. Its previous train/eval mode is
  restored afterwards. Collected tensors are moved to the CPU so that results
  do not pile up in device memory.

  Args:
    model: ``torch.nn.Module`` callable as ``model(input_ids, attention_mask)``.
    dataloader: Iterable of ``(input_ids, attention_mask, target)`` batches.
    device: Device the inputs are moved to for the forward pass.

  Returns:
    A ``defaultdict(list)`` with keys ``'preds'`` and ``'actual'``, each a list
    holding one tensor per sample (the rows of the batch outputs/targets).
  """
  labels = defaultdict(list)

  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      for input_ids, attention_mask, target in dataloader:
        out = model(input_ids.to(device), attention_mask.to(device))

        # Iterating a batch tensor yields one row per sample.
        labels['preds'].extend(out.cpu())
        labels['actual'].extend(target.cpu())
  finally:
    # Leave the model in whatever mode the caller had it in.
    model.train(was_training)

  return labels

# Configure the Model, Loss, and Optimizer

In [17]:
# Pretrained configuration and tokenizer for the chosen checkpoint.
config = AutoConfig.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Load the checkpoint weights into the regressor and place it on the device.
model = BertRegressor.from_pretrained(MODEL_NAME, config=config).to(device)

# Mean-squared-error objective, optimized with Adam at the configured rate.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertRegressor: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertRegressor from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertRegressor from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertRegressor were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['ff1.weight', 'cl

# Prepare Data

In [18]:
# One dataset per split, all sharing the same sequence length and tokenizer.
train_dataset = YelpDataset(data=df_train, maxlen=maxlen, tokenizer=tokenizer)
val_dataset = YelpDataset(data=df_val, maxlen=maxlen, tokenizer=tokenizer)
test_dataset = YelpDataset(data=df_test, maxlen=maxlen, tokenizer=tokenizer)


# Reshuffle the training samples every epoch so batches are not always made of
# the same rows in the same order. Validation and test keep their fixed order
# so results stay aligned with the frames.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SZ, shuffle=True, num_workers=NUM_THREADS)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SZ, num_workers=NUM_THREADS)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SZ, num_workers=NUM_THREADS)

# Train the Model

In [None]:
# Run training; each returned list holds one mean loss value per epoch.
train_losses, val_losses = train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=EPOCHS,
    device=device,
)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors
