In [2]:
!pip install -q transformers torch scikit-learn tqdm emoji


In [3]:
import json
import re
import unicodedata
import emoji
import numpy as np
import pandas as pd
import torch

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [4]:
TRAIN_PATH = "/content/eng_restaurant_train_alltasks.jsonl"
TEST_PATH  = "/content/eng_restaurant_test_task1.jsonl"

def load_jsonl(path):
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line))
    return data

train_raw = load_jsonl(TRAIN_PATH)
test_raw  = load_jsonl(TEST_PATH)

print("Train reviews:", len(train_raw))
print("Test reviews :", len(test_raw))


Train reviews: 2284
Test reviews : 1000


In [5]:
def normalize_text(text):
    text = unicodedata.normalize("NFKC", text)
    text = text.replace("’", "'").replace("“", '"').replace("”", '"')
    text = re.sub(r"\s+", " ", text).strip()
    return text

def remove_urls_html(text):
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    return text

def handle_emojis(text):
    return emoji.demojize(text, delimiters=(" ", " "))

def clip_va(v, a):
    v = min(max(v, 1.0), 9.0)
    a = min(max(a, 1.0), 9.0)
    return round(v, 2), round(a, 2)


In [6]:
def preprocess_train(data):
    rows = []
    for item in data:
        if "Quadruplet" not in item:
            continue

        text = handle_emojis(remove_urls_html(normalize_text(item["Text"])))

        for quad in item["Quadruplet"]:
            aspect = quad["Aspect"]
            if aspect == "NULL":
                continue

            v, a = map(float, quad["VA"].split("#"))
            v, a = clip_va(v, a)

            rows.append({
                "text": text,
                "aspect": aspect.strip(),
                "valence": v,
                "arousal": a
            })
    return pd.DataFrame(rows)

def preprocess_test(data):
    rows = []
    for item in data:
        if "Aspect" not in item:
            continue

        text = handle_emojis(remove_urls_html(normalize_text(item["Text"])))

        for asp in item["Aspect"]:
            rows.append({
                "id": item["ID"],
                "text": text,
                "aspect": asp.strip()
            })
    return pd.DataFrame(rows)

train_df = preprocess_train(train_raw)
test_df  = preprocess_test(test_raw)

print("Train aspect samples:", len(train_df))
print("Test aspect samples :", len(test_df))

train_df.head()


Train aspect samples: 2779
Test aspect samples : 1504


Unnamed: 0,text,aspect,valence,arousal
0,"their sake list was extensive , but we were lo...",sake list,7.83,8.0
1,the spicy tuna roll was unusually good and the...,spicy tuna roll,7.5,7.62
2,the spicy tuna roll was unusually good and the...,rock shrimp tempura,8.25,8.38
3,we love th pink pony .,pink pony,7.17,7.0
4,this place has got to be the best japanese res...,place,7.88,8.12


In [7]:
train_df, temp_df = train_test_split(
    train_df, test_size=0.2, random_state=42
)

val_df, test_internal_df = train_test_split(
    temp_df, test_size=0.5, random_state=42
)

print("Train:", len(train_df))
print("Val  :", len(val_df))
print("Test :", len(test_internal_df))


Train: 2223
Val  : 278
Test : 278


In [8]:
MODEL_NAME = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class AspectVADataset(Dataset):
    def __init__(self, df, train=True):
        self.df = df.reset_index(drop=True)
        self.train = train

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        enc = tokenizer(
            row["text"],
            row["aspect"],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )

        item = {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0)
        }

        if self.train:
            item["labels"] = torch.tensor(
                [row["valence"], row["arousal"]],
                dtype=torch.float
            )
        return item


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
train_loader = DataLoader(
    AspectVADataset(train_df, train=True),
    batch_size=16,
    shuffle=True
)

val_loader = DataLoader(
    AspectVADataset(val_df, train=True),
    batch_size=16
)

test_internal_loader = DataLoader(
    AspectVADataset(test_internal_df, train=True),
    batch_size=16
)


In [10]:
class RobertaForVA(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(MODEL_NAME)
        self.regressor = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled = outputs.last_hidden_state[:, 0, :]
        return self.regressor(pooled)


In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = RobertaForVA().to(device)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    weight_decay=0.01
)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def rmse_va(y_true, y_pred):
    squared_diff = (y_pred - y_true) ** 2
    return np.sqrt(squared_diff.sum(axis=1).mean())


In [13]:
best_val_rmse = float("inf")
patience = 2
patience_counter = 0

MAX_EPOCHS = 10

for epoch in range(MAX_EPOCHS):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        preds = model(input_ids, attention_mask)
        loss = torch.nn.functional.smooth_l1_loss(preds, labels)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    model.eval()
    val_preds, val_golds = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            preds = model(input_ids, attention_mask)
            val_preds.append(preds.cpu().numpy())
            val_golds.append(labels.cpu().numpy())

    val_preds = np.vstack(val_preds)
    val_golds = np.vstack(val_golds)

    val_rmse = rmse_va(val_golds, val_preds)

    print(
        f"Epoch {epoch+1} | "
        f"Train Loss: {total_loss/len(train_loader):.4f} | "
        f"Val RMSE_VA: {val_rmse:.4f}"
    )

    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        patience_counter = 0
        torch.save(model.state_dict(), "best_roberta_va.pt")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered")
            break


100%|██████████| 139/139 [01:05<00:00,  2.13it/s]


Epoch 1 | Train Loss: 1.2944 | Val RMSE_VA: 2.1638


100%|██████████| 139/139 [00:51<00:00,  2.69it/s]


Epoch 2 | Train Loss: 0.4865 | Val RMSE_VA: 1.5500


100%|██████████| 139/139 [00:51<00:00,  2.70it/s]


Epoch 3 | Train Loss: 0.3082 | Val RMSE_VA: 1.2077


100%|██████████| 139/139 [00:51<00:00,  2.69it/s]


Epoch 4 | Train Loss: 0.2325 | Val RMSE_VA: 1.1279


100%|██████████| 139/139 [00:51<00:00,  2.69it/s]


Epoch 5 | Train Loss: 0.1931 | Val RMSE_VA: 1.1694


100%|██████████| 139/139 [00:50<00:00,  2.73it/s]


Epoch 6 | Train Loss: 0.1538 | Val RMSE_VA: 1.2285
Early stopping triggered


In [14]:
model.load_state_dict(torch.load("best_roberta_va.pt"))
model.eval()


RobertaForVA(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [15]:
test_preds, test_golds = [], []

with torch.no_grad():
    for batch in test_internal_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        preds = model(input_ids, attention_mask)
        test_preds.append(preds.cpu().numpy())
        test_golds.append(labels.cpu().numpy())

test_preds = np.vstack(test_preds)
test_golds = np.vstack(test_golds)

print("=== INTERNAL TEST PERFORMANCE ===")
print("Official RMSE_VA:", rmse_va(test_golds, test_preds))


=== INTERNAL TEST PERFORMANCE ===
Official RMSE_VA: 1.0885552


In [16]:
test_loader = DataLoader(
    AspectVADataset(test_df, train=False),
    batch_size=16
)

model.eval()
outputs = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        preds = model(input_ids, attention_mask)
        outputs.append(preds.cpu().numpy())

outputs = np.vstack(outputs)
outputs = np.clip(outputs, 1.0, 9.0)

test_df["VA"] = [f"{v:.2f}#{a:.2f}" for v, a in outputs]

# ---------- WRITE JSONL (ONE OBJECT PER LINE) ----------
output_path = "pred_eng_restaurant.jsonl"

with open(output_path, "w", encoding="utf-8") as f:
    for rid in test_df["id"].unique():   # preserves ID order
        group = test_df[test_df["id"] == rid]

        record = {
            "ID": rid,
            "Aspect_VA": [
                {"Aspect": row["aspect"], "VA": row["VA"]}
                for _, row in group.iterrows()
            ]
        }

        f.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"{output_path} generated ✅")


pred_eng_restaurant.jsonl generated ✅


In [17]:
from google.colab import files
files.download("pred_eng_restaurant.jsonl")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>