In [1]:
!pip install -q transformers torch scikit-learn tqdm emoji


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/608.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m604.2/608.4 kB[0m [31m22.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import json
import re
import unicodedata
import emoji
import numpy as np
import pandas as pd
import torch

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [3]:
TRAIN_PATH = "/content/zho_finance_train_alltasks.jsonl"
TEST_PATH  = "/content/zho_finance_test_task1.jsonl"

def load_jsonl(path):
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line))
    return data

train_raw = load_jsonl(TRAIN_PATH)
test_raw  = load_jsonl(TEST_PATH)

print("Train reviews:", len(train_raw))
print("Test reviews :", len(test_raw))


Train reviews: 1000
Test reviews : 842


In [4]:
def normalize_text(text):
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def remove_urls_html(text):
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    return text

def handle_emojis(text):
    return emoji.demojize(text, delimiters=(" ", " "))

def clip_va(v, a):
    v = min(max(v, 1.0), 9.0)
    a = min(max(a, 1.0), 9.0)
    return round(v, 2), round(a, 2)


In [5]:
def mark_aspect_zh(text, aspect):
    if aspect in text:
        return text.replace(aspect, f"【{aspect}】")
    return text


In [7]:
def preprocess_train(data):
    rows = []

    for item in data:
        text = normalize_text(item["Text"])
        text = remove_urls_html(text)
        text = handle_emojis(text)

        for av in item["Aspect_VA"]:
            aspect = av["Aspect"]

            # mark aspect in Chinese text
            text_marked = mark_aspect_zh(text, aspect)

            v, a = map(float, av["VA"].split("#"))
            v, a = clip_va(v, a)

            rows.append({
                "text": text_marked,
                "aspect": aspect,
                "valence": v,
                "arousal": a
            })

    return pd.DataFrame(rows)


def preprocess_test(data):
    rows = []
    for item in data:
        if "Aspect" not in item:
            continue

        text = normalize_text(item["Text"])
        text = remove_urls_html(text)
        text = handle_emojis(text)

        for asp in item["Aspect"]:
            text_marked = mark_aspect_zh(text, asp)

            rows.append({
                "id": item["ID"],
                "text": text_marked,
                "aspect": asp
            })
    return pd.DataFrame(rows)

train_df = preprocess_train(train_raw)
test_df  = preprocess_test(test_raw)

print("Train aspect samples:", len(train_df))
print("Test aspect samples :", len(test_df))

train_df.head()


Train aspect samples: 2633
Test aspect samples : 2354


Unnamed: 0,text,aspect,valence,arousal
0,"人壽、證券及票券子公司之業務持續穩健成長,【全年稅後淨利】分別達13.64億元、7.42億元...",全年稅後淨利,6.17,5.33
1,"人壽、證券及票券子公司之業務持續穩健成長,全年稅後淨利分別達13.64億元、7.42億元及5...",人壽及證券子公司之稅後淨利,6.0,5.17
2,"人壽、證券及票券子公司之業務持續穩健成長,全年稅後淨利分別達13.64億元、7.42億元及5...",資產管理、創投及投信子公司稅後淨利,5.88,5.12
3,"【人壽、證券及票券子公司之業務】持續穩健成長,全年稅後淨利分別達13.64億元、7.42億元...",人壽、證券及票券子公司之業務,6.0,5.17
4,"優質成長,【營收】、獲利、本業、業外同步提升。",營收,6.25,5.62


In [8]:
train_df, temp_df = train_test_split(
    train_df, test_size=0.2, random_state=42
)

val_df, test_internal_df = train_test_split(
    temp_df, test_size=0.5, random_state=42
)

print("Train:", len(train_df))
print("Val  :", len(val_df))
print("Test :", len(test_internal_df))


Train: 2106
Val  : 263
Test : 264


In [9]:
MODEL_NAME = "hfl/chinese-roberta-wwm-ext"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


class AspectVADataset(Dataset):
    def __init__(self, df, train=True):
        self.df = df.reset_index(drop=True)
        self.train = train

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        enc = tokenizer(
            row["text"],
            row["aspect"],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )

        item = {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0)
        }

        if self.train:
            item["labels"] = torch.tensor(
                [row["valence"], row["arousal"]],
                dtype=torch.float
            )
        return item



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [10]:
train_loader = DataLoader(
    AspectVADataset(train_df, train=True),
    batch_size=16,
    shuffle=True
)

val_loader = DataLoader(
    AspectVADataset(val_df, train=True),
    batch_size=16
)

test_internal_loader = DataLoader(
    AspectVADataset(test_internal_df, train=True),
    batch_size=16
)


In [11]:
class RobertaForVA(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(MODEL_NAME)
        self.regressor = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled = outputs.last_hidden_state[:, 0, :]
        return self.regressor(pooled)


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = RobertaForVA().to(device)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    weight_decay=0.01
)


pytorch_model.bin:   0%|          | 0.00/412M [00:00<?, ?B/s]

In [13]:
def rmse_va(y_true, y_pred):
    squared_diff = (y_pred - y_true) ** 2
    return np.sqrt(squared_diff.sum(axis=1).mean())


In [14]:
best_val_rmse = float("inf")
patience = 2
patience_counter = 0
MAX_EPOCHS = 10

for epoch in range(MAX_EPOCHS):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        preds = model(input_ids, attention_mask)
        loss = torch.nn.functional.smooth_l1_loss(preds, labels)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    model.eval()
    val_preds, val_golds = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            preds = model(input_ids, attention_mask)
            val_preds.append(preds.cpu().numpy())
            val_golds.append(labels.cpu().numpy())

    val_preds = np.vstack(val_preds)
    val_golds = np.vstack(val_golds)

    val_rmse = rmse_va(val_golds, val_preds)

    print(f"Epoch {epoch+1} | Train Loss: {total_loss/len(train_loader):.4f} | Val RMSE_VA: {val_rmse:.4f}")

    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        patience_counter = 0
        torch.save(model.state_dict(), "best_zh_roberta_va.pt")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered")
            break


100%|██████████| 132/132 [00:42<00:00,  3.11it/s]


Epoch 1 | Train Loss: 0.3677 | Val RMSE_VA: 0.7003


100%|██████████| 132/132 [00:43<00:00,  3.01it/s]


Epoch 2 | Train Loss: 0.0899 | Val RMSE_VA: 0.6515


100%|██████████| 132/132 [00:44<00:00,  3.00it/s]


Epoch 3 | Train Loss: 0.0691 | Val RMSE_VA: 0.6665


100%|██████████| 132/132 [00:45<00:00,  2.93it/s]


Epoch 4 | Train Loss: 0.0504 | Val RMSE_VA: 0.6421


100%|██████████| 132/132 [00:45<00:00,  2.90it/s]


Epoch 5 | Train Loss: 0.0385 | Val RMSE_VA: 0.5298


100%|██████████| 132/132 [00:45<00:00,  2.88it/s]


Epoch 6 | Train Loss: 0.0312 | Val RMSE_VA: 0.6539


100%|██████████| 132/132 [00:46<00:00,  2.85it/s]


Epoch 7 | Train Loss: 0.0250 | Val RMSE_VA: 0.5969
Early stopping triggered


In [15]:
model.load_state_dict(torch.load("best_zh_roberta_va.pt"))
model.eval()


RobertaForVA(
  (roberta): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [16]:
test_preds, test_golds = [], []

with torch.no_grad():
    for batch in test_internal_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        preds = model(input_ids, attention_mask)
        test_preds.append(preds.cpu().numpy())
        test_golds.append(labels.cpu().numpy())

test_preds = np.vstack(test_preds)
test_golds = np.vstack(test_golds)

print("=== INTERNAL TEST PERFORMANCE ===")
print("Official RMSE_VA:", rmse_va(test_golds, test_preds))


=== INTERNAL TEST PERFORMANCE ===
Official RMSE_VA: 0.50409573


In [17]:
test_loader = DataLoader(
    AspectVADataset(test_df, train=False),
    batch_size=16
)

outputs = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        preds = model(input_ids, attention_mask)
        outputs.append(preds.cpu().numpy())

outputs = np.vstack(outputs)
outputs = np.clip(outputs, 1.0, 9.0)

test_df = test_df.reset_index(drop=True)
test_df["VA"] = [f"{v:.2f}#{a:.2f}" for v, a in outputs]

with open("pred_zho_finance.jsonl", "w", encoding="utf-8") as f:
    for rid in test_df["id"].unique():
        group = test_df[test_df["id"] == rid]
        record = {
            "ID": rid,
            "Aspect_VA": [
                {"Aspect": row["aspect"], "VA": row["VA"]}
                for _, row in group.iterrows()
            ]
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

print("pred_zho_finance.jsonl generated ✅")


pred_zho_finance.jsonl generated ✅


In [18]:
from google.colab import files
files.download("pred_zho_finance.jsonl")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>