In [2]:
import csv
import random
import pickle
import os
from time import time
from tqdm import tqdm
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from collections import defaultdict
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import DistilBertModel

In [3]:
# Load the pretrained DistilBERT model
# here we user two distilbert models, one for recipes, one for reviews
distilbert_review = DistilBertModel.from_pretrained("distilbert-base-uncased")
distilbert_recipe = DistilBertModel.from_pretrained("distilbert-base-uncased")

# Freeze all parameters of the pretrained DistilBERT model
for param in distilbert_review.parameters():
    param.requires_grad = False

for param in distilbert_recipe.parameters():
    param.requires_grad = False

In [4]:
# freeze pretrained DistilBert, fine tune the newly-added fc layers
class TwoDistilBERT(nn.Module):
    def __init__(self, distilbert_review, distilbert_recipe, dropout=0.1, hidden_dim=25, output_dim=1):
        super(TwoDistilBERT, self).__init__()
        # Pretrained DistilBERT model
        self.distilbert_review = distilbert_review
        self.distilbert_recipe = distilbert_recipe
        self.dropout_review = nn.Dropout(dropout)
        self.dropout_recipe = nn.Dropout(dropout)

        # fine tune fc layers
        self.regressor_review = nn.Sequential(
                                    nn.Linear(768, hidden_dim),  # DistilBERT hidden size is 768
                                    nn.ReLU(),
                                )
        
        self.regressor_recipe = nn.Sequential(
                                    nn.Linear(768, hidden_dim),  # DistilBERT hidden size is 768
                                    nn.ReLU(),                                )

        # combine results of review and recipe
        self.combinator = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, review_ids, review_mask, recipe_ids, recipe_mask):
        # for review
        # Forward pass through DistilBERT
        review_outputs = self.distilbert_review(input_ids=review_ids, attention_mask=review_mask)
        
        # The last hidden state (batch_size, seq_len, hidden_dim)
        review_hidden_state = review_outputs.last_hidden_state
        
        # Use the [CLS] token representation (first token in sequence)
        review_cls_token_state = review_hidden_state[:, 0, :]  # (batch_size, hidden_dim)
        
        # Apply dropout for regularization
        review_cls_token_state = self.dropout_review(review_cls_token_state)
        
        # Pass through the custom linear layer
        review_output = self.regressor_review(review_cls_token_state)  # (batch_size, hidden_dim)

        # for recipe
        # Forward pass through DistilBERT
        recipe_outputs = self.distilbert_recipe(input_ids=recipe_ids, attention_mask=recipe_mask)
        
        # The last hidden state (batch_size, seq_len, hidden_dim)
        recipe_hidden_state = recipe_outputs.last_hidden_state
        
        # Use the [CLS] token representation (first token in sequence)
        recipe_cls_token_state = recipe_hidden_state[:, 0, :]  # (batch_size, hidden_dim)
        
        # Apply dropout for regularization
        recipe_cls_token_state = self.dropout_recipe(recipe_cls_token_state)
        
        # Pass through the custom linear layer
        recipe_output = self.regressor_recipe(recipe_cls_token_state)  # (batch_size, hidden_dim)

        # combine the results of review and recipe
        output = torch.cat((review_output, recipe_output), dim=-1)
        output = self.combinator(output)    # (batch_size, output_dim)
        
        return output

In [5]:
# tunable parameters
hidden_dim = 25
dropout = 0.1
learning_rate = 1e-3
max_epoch = 1
batch_size = 64

In [6]:
# dataset for rating prediction task
# here we use review_tokens, recipe_tokens as inputs, ratings as labels
class RegressionDataset(Dataset):
    def __init__(self, review_tokens, recipe_tokens, labels):
        self.review_tokens = review_tokens
        self.recipe_tokens = recipe_tokens
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        review_ids = torch.tensor(self.review_tokens[idx], dtype=torch.long)
        review_mask = torch.ones_like(review_ids, dtype=torch.long)
        recipe_ids = torch.tensor(self.recipe_tokens[idx], dtype=torch.long)
        recipe_mask = torch.ones_like(recipe_ids, dtype=torch.long)
        labels = torch.tensor(self.labels[idx], dtype=torch.float32)

        return review_ids, review_mask, recipe_ids, recipe_mask, labels


# padding for different seq_len
def collate_batch(batch):
    batch_review_ids, batch_review_mask, batch_recipe_ids, batch_recipe_mask, batch_labels = zip(*batch)
    batch_labels = torch.stack(batch_labels)

    # for review
    # max seq_len in this batch
    max_review_len = max([review_ids.shape[0] for review_ids in batch_review_ids])

    # pad each sequence to the max seq_len
    padded_batch_review_ids = [torch.cat((review_ids, torch.zeros(max_review_len - len(review_ids), dtype=torch.long))) for review_ids in batch_review_ids]
    padded_batch_review_ids = torch.stack(padded_batch_review_ids)

    padded_batch_review_mask = torch.ones_like(padded_batch_review_ids, dtype=torch.long)

    # for recipe
    # max seq_len in this batch
    max_recipe_len = max([recipe_ids.shape[0] for recipe_ids in batch_recipe_ids])

    # pad each sequence to the max seq_len
    padded_batch_recipe_ids = [torch.cat((recipe_ids, torch.zeros(max_recipe_len - len(recipe_ids), dtype=torch.long))) for recipe_ids in batch_recipe_ids]
    padded_batch_recipe_ids = torch.stack(padded_batch_recipe_ids)

    padded_batch_recipe_mask = torch.ones_like(padded_batch_recipe_ids, dtype=torch.long)

    return padded_batch_review_ids, padded_batch_review_mask, padded_batch_recipe_ids, padded_batch_recipe_mask, batch_labels

In [7]:
# read train+valid set from csv
def get_data(csv_file):
    df = pd.read_csv(csv_file, sep=',')
    all_review_tokens = []
    all_recipe_tokens = []
    all_labels = []
    for _, row in df.iterrows():
        review_tokens = eval(row['review_tokens'])
        recipe_tokens = eval(row['recipe_tokens'])
        rating = float(row['rating'])

        all_review_tokens.append(review_tokens)
        all_recipe_tokens.append(recipe_tokens)
        all_labels.append(rating)
    
    return all_review_tokens, all_recipe_tokens, all_labels

train_review_tokens, train_recipe_tokens, train_labels = get_data('train.csv')
valid_review_tokens, valid_recipe_tokens, valid_labels = get_data('valid.csv')

In [8]:
# build dataset and dataloader for train set
train_dataset = RegressionDataset(train_review_tokens, train_recipe_tokens, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)

valid_dataset = RegressionDataset(valid_review_tokens, valid_recipe_tokens, valid_labels)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)

In [9]:
# calculate MSE on valid/test set
def evaluate(model, dataloader):
    model.eval()

    total_loss = 0.0

    for review_ids, review_mask, recipe_ids, recipe_mask, labels in tqdm(dataloader):
        outputs = model(review_ids, review_mask, recipe_ids, recipe_mask)
        outputs = outputs.squeeze()

        loss = torch.nn.functional.mse_loss(outputs, labels, reduction='sum')
        total_loss += loss.item()
    
    mse = total_loss / len(dataloader.dataset)

    model.train()

    return mse

In [10]:
# early stop if validation loss starts to increase, check every 20 iterations
def train_step(model, train_dataloader, valid_dataloader, optimizer, loss_fn, pre_valid_mse=None):
    model.train()  # Set the model to training mode

    early_stop = False

    n_iter = 0
    valid_mse = 0.0

    for review_ids, review_mask, recipe_ids, recipe_mask, labels in tqdm(train_dataloader):
        n_iter += 1
        optimizer.zero_grad()  # Zero the gradients

        outputs = model(review_ids, review_mask, recipe_ids, recipe_mask)
        outputs = outputs.squeeze()

        loss = loss_fn(outputs, labels)

        loss.backward()  # Backward pass: compute gradients
        optimizer.step()  # Optimizer step: update weights

        # evaluate on valid set to check if we need to early stop, check every 20 iterations
        # if n_iter % 20 == 0:
        #     valid_mse = evaluate(model, valid_dataloader)
        #     print(f"valid_mse: {valid_mse:.4f}")
        #     if (pre_valid_mse is not None and pre_valid_mse < valid_mse) or np.isnan(valid_mse):
        #         early_stop = True
        #         break
        #     pre_valid_mse = valid_mse

    return early_stop, valid_mse

In [11]:
# Initialize the custom model
custom_model = TwoDistilBERT(distilbert_review, distilbert_recipe, dropout=dropout, hidden_dim=hidden_dim, output_dim=1)

# Loss function and optimizer
loss_fn = nn.MSELoss()
optimizer = torch.optim.AdamW(custom_model.parameters(), lr=learning_rate)

# train
start_time = time()

pre_valid_mse = None
for epoch in range(max_epoch):
    print(f"epoch {epoch + 1}")

    early_stop, valid_mse = train_step(custom_model, train_dataloader, valid_dataloader, optimizer, loss_fn, pre_valid_mse)

    if early_stop:
        print(f"Early stop at epoch {epoch + 1}, valid_mse = {valid_mse:.4f}")
        break
    pre_valid_mse = valid_mse

end_time = time()
print(f"training time: {end_time - start_time}")

# save model
model_path = 'two_distilbert.pth'
torch.save(custom_model, model_path)
print("model saved")

epoch 1


100%|██████████| 313/313 [2:23:48<00:00, 27.57s/it]  


training time: 8628.244009494781
model saved


In [12]:
# evaluate on test set
model_path = 'two_distilbert.pth'
if os.path.exists(model_path):
    model = torch.load(model_path)
    print("model loaded")

    print("evaluate on test set")
    test_review_tokens, test_recipe_tokens, test_labels = get_data('test.csv')
    test_dataset = RegressionDataset(test_review_tokens, test_recipe_tokens, test_labels)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)

    start_time = time()
    test_mse = evaluate(model, test_dataloader)
    end_time = time()
    
    print(f"test_mse: {test_mse:.4f}")
    print(f"evaluation time: {end_time - start_time}")
    
else:
    print("model doesn't exist")


model loaded
evaluate on test set


100%|██████████| 195/195 [1:31:46<00:00, 28.24s/it]

test_mse: 1.8542
evaluation time: 5506.724800109863





In [13]:
# evaluate on train set
model_path = 'two_distilbert.pth'
if os.path.exists(model_path):
    model = torch.load(model_path)
    print("model loaded")

    print("evaluate on training set")

    start_time = time()
    training_mse = evaluate(model, train_dataloader)
    end_time = time()
    
    print(f"training_mse: {training_mse:.4f}")
    print(f"evaluation time: {end_time - start_time}")
    
else:
    print("model doesn't exist")

model loaded
evaluate on training set


100%|██████████| 313/313 [2:09:10<00:00, 24.76s/it]  

training_mse: 0.9165
evaluation time: 7750.106264829636



