In [1]:
!pip install -q transformers torch scikit-learn tqdm emoji


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/608.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m604.2/608.4 kB[0m [31m22.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import json
import re
import unicodedata
import emoji
import numpy as np
import pandas as pd
import torch

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [37]:
TRAIN_PATH = "/content/zho_restaurant_train_alltasks.jsonl"
TEST_PATH  = "/content/zho_restaurant_test_task1.jsonl"

def load_jsonl(path):
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line))
    return data

train_raw = load_jsonl(TRAIN_PATH)
test_raw  = load_jsonl(TEST_PATH)

print("Train reviews:", len(train_raw))
print("Test reviews :", len(test_raw))


Train reviews: 6050
Test reviews : 1000


In [38]:
def normalize_text(text):
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def remove_urls_html(text):
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    return text

def handle_emojis(text):
    return emoji.demojize(text, delimiters=(" ", " "))

def clip_va(v, a):
    v = min(max(v, 1.0), 9.0)
    a = min(max(a, 1.0), 9.0)
    return round(v, 2), round(a, 2)


In [39]:
def mark_aspect_zh(text, aspect):
    if aspect in text:
        return text.replace(aspect, f"【{aspect}】")
    return text


In [40]:
def preprocess_train(data):
    rows = []
    for item in data:
        if "Quadruplet" not in item:
            continue

        text = normalize_text(item["Text"])
        text = remove_urls_html(text)
        text = handle_emojis(text)

        for quad in item["Quadruplet"]:
            aspect = quad["Aspect"]
            if aspect == "NULL":
                continue

            text_marked = mark_aspect_zh(text, aspect)

            v, a = map(float, quad["VA"].split("#"))
            v, a = clip_va(v, a)

            rows.append({
                "text": text_marked,
                "aspect": aspect,
                "valence": v,
                "arousal": a
            })
    return pd.DataFrame(rows)

def preprocess_test(data):
    rows = []
    for item in data:
        if "Aspect" not in item:
            continue

        text = normalize_text(item["Text"])
        text = remove_urls_html(text)
        text = handle_emojis(text)

        for asp in item["Aspect"]:
            text_marked = mark_aspect_zh(text, asp)

            rows.append({
                "id": item["ID"],
                "text": text_marked,
                "aspect": asp
            })
    return pd.DataFrame(rows)

train_df = preprocess_train(train_raw)
test_df  = preprocess_test(test_raw)

print("Train aspect samples:", len(train_df))
print("Test aspect samples :", len(test_df))

train_df.head()


Train aspect samples: 8354
Test aspect samples : 1929


Unnamed: 0,text,aspect,valence,arousal
0,【肉粿】沒有很焦脆。,肉粿,4.0,5.0
1,【肉粿】每一塊都好脆好恰好喜歡。,肉粿,6.25,6.0
2,【肉粿】每一塊都好脆好恰好喜歡。,肉粿,6.12,6.0
3,【肉粿】每一塊都好脆好恰好喜歡。,肉粿,6.62,6.62
4,【口感】有點微妙。,口感,4.75,4.75


In [41]:
train_df, temp_df = train_test_split(
    train_df, test_size=0.2, random_state=42
)

val_df, test_internal_df = train_test_split(
    temp_df, test_size=0.5, random_state=42
)

print("Train:", len(train_df))
print("Val  :", len(val_df))
print("Test :", len(test_internal_df))


Train: 6683
Val  : 835
Test : 836


In [42]:
MODEL_NAME = "hfl/chinese-roberta-wwm-ext"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


class AspectVADataset(Dataset):
    def __init__(self, df, train=True):
        self.df = df.reset_index(drop=True)
        self.train = train

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        enc = tokenizer(
            row["text"],
            row["aspect"],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )

        item = {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0)
        }

        if self.train:
            item["labels"] = torch.tensor(
                [row["valence"], row["arousal"]],
                dtype=torch.float
            )
        return item



In [43]:
train_loader = DataLoader(
    AspectVADataset(train_df, train=True),
    batch_size=16,
    shuffle=True
)

val_loader = DataLoader(
    AspectVADataset(val_df, train=True),
    batch_size=16
)

test_internal_loader = DataLoader(
    AspectVADataset(test_internal_df, train=True),
    batch_size=16
)


In [44]:
class RobertaForVA(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(MODEL_NAME)
        self.regressor = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled = outputs.last_hidden_state[:, 0, :]
        return self.regressor(pooled)


In [45]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = RobertaForVA().to(device)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    weight_decay=0.01
)


In [46]:
def rmse_va(y_true, y_pred):
    squared_diff = (y_pred - y_true) ** 2
    return np.sqrt(squared_diff.sum(axis=1).mean())


In [47]:
best_val_rmse = float("inf")
patience = 2
patience_counter = 0
MAX_EPOCHS = 10

for epoch in range(MAX_EPOCHS):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        preds = model(input_ids, attention_mask)
        loss = torch.nn.functional.smooth_l1_loss(preds, labels)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    model.eval()
    val_preds, val_golds = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            preds = model(input_ids, attention_mask)
            val_preds.append(preds.cpu().numpy())
            val_golds.append(labels.cpu().numpy())

    val_preds = np.vstack(val_preds)
    val_golds = np.vstack(val_golds)

    val_rmse = rmse_va(val_golds, val_preds)

    print(f"Epoch {epoch+1} | Train Loss: {total_loss/len(train_loader):.4f} | Val RMSE_VA: {val_rmse:.4f}")

    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        patience_counter = 0
        torch.save(model.state_dict(), "best_zh_roberta_va.pt")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered")
            break


100%|██████████| 418/418 [02:16<00:00,  3.07it/s]


Epoch 1 | Train Loss: 0.2527 | Val RMSE_VA: 0.7989


100%|██████████| 418/418 [02:22<00:00,  2.93it/s]


Epoch 2 | Train Loss: 0.0994 | Val RMSE_VA: 0.8013


100%|██████████| 418/418 [02:24<00:00,  2.90it/s]


Epoch 3 | Train Loss: 0.0830 | Val RMSE_VA: 0.9215
Early stopping triggered


In [1]:
model.load_state_dict(torch.load("best_zh_roberta_va.pt"))
model.eval()


NameError: name 'model' is not defined

In [32]:
test_preds, test_golds = [], []

with torch.no_grad():
    for batch in test_internal_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        preds = model(input_ids, attention_mask)
        test_preds.append(preds.cpu().numpy())
        test_golds.append(labels.cpu().numpy())

test_preds = np.vstack(test_preds)
test_golds = np.vstack(test_golds)

print("=== INTERNAL TEST PERFORMANCE ===")
print("Official RMSE_VA:", rmse_va(test_golds, test_preds))


=== INTERNAL TEST PERFORMANCE ===
Official RMSE_VA: 0.8975309


In [34]:
test_loader = DataLoader(
    AspectVADataset(test_df, train=False),
    batch_size=16
)

outputs = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        preds = model(input_ids, attention_mask)
        outputs.append(preds.cpu().numpy())

outputs = np.vstack(outputs)
outputs = np.clip(outputs, 1.0, 9.0)

test_df = test_df.reset_index(drop=True)
test_df["VA"] = [f"{v:.2f}#{a:.2f}" for v, a in outputs]

with open("pred_zho_laptop.jsonl", "w", encoding="utf-8") as f:
    for rid in test_df["id"].unique():
        group = test_df[test_df["id"] == rid]
        record = {
            "ID": rid,
            "Aspect_VA": [
                {"Aspect": row["aspect"], "VA": row["VA"]}
                for _, row in group.iterrows()
            ]
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

print("pred_zho_laptop.jsonl generated ✅")


pred_zho_laptop.jsonl generated ✅


In [36]:
from google.colab import files
files.download("pred_zho_laptop.jsonl")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>