# DimABSA 2026 – Subtask 1 (DimASR)
## Chinese Finance (zho_finance) – 4‑Epoch PyTorch Baseline

This notebook trains a DistilBERT multilingual regression model for **DimASR** on the **Chinese Finance** dataset:

- Train: `Chi_finance_train_task1.jsonl`
- Dev:   `Chi_finance_dev_task1.jsonl`

It outputs predictions in the required JSONL format for **Subtask 1** and saves them as:

```text
pred_zho_finance.jsonl
```

You can then place this file inside `subtask_1/` and zip it as `subtask_1.zip` for Codabench submission.


In [6]:

import os, json, torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel, DataCollatorWithPadding

print('CUDA available:', torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('Working directory:', os.getcwd())
print('Files:', os.listdir())


CUDA available: False
Working directory: c:\SemEval_task3\datasets\Chi
Files: ['Chi_finance_dev_task1.jsonl', 'Chi_finance_task1_notebook.ipynb', 'Chi_finance_train_task1.jsonl', 'Chi_laptop_dev_task1.jsonl', 'Chi_laptop_dev_task2.jsonl', 'Chi_laptop_dev_task3.jsonl', 'Chi_restaurant_dev_task1.jsonl', 'Chi_restaurant_dev_task2.jsonl', 'Chi_restaurant_dev_task3.jsonl']


In [7]:

TRAIN = 'Chi_finance_train_task1.jsonl'
DEV   = 'Chi_finance_dev_task1.jsonl'

def read_jsonl(path):
    data = []
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if line:
                data.append(json.loads(line))
    print(f'{path} -> {len(data)} records')
    return data

train_json = read_jsonl(TRAIN)
dev_json   = read_jsonl(DEV)


Chi_finance_train_task1.jsonl -> 1000 records
Chi_finance_dev_task1.jsonl -> 200 records


In [8]:
def build_train_df(data):
    rows = []
    for ex in data:
        tid = ex["ID"]
        text = ex["Text"]

        for pair in ex.get("Aspect_VA", []):
            asp = pair["Aspect"]
            v, a = pair["VA"].split("#")

            rows.append({
                "ID": tid,
                "Text": text,
                "Aspect": asp,
                "valence": float(v),
                "arousal": float(a)
            })

    print("TRAIN rows:", len(rows))
    return pd.DataFrame(rows)


# Dev DF from Aspect list
def build_dev_df(data):
    rows = []
    for ex in data:
        tid = ex['ID']
        text = ex['Text']
        for asp in ex['Aspect']:
            rows.append({
                'ID': tid,
                'Text': text,
                'Aspect': asp,
            })
    print('DEV rows:', len(rows))
    return pd.DataFrame(rows)

train_df = build_train_df(train_json)
dev_df   = build_dev_df(dev_json)

print('Train DF head:')
print(train_df.head())
print('\nDev DF head:')
print(dev_df.head())


TRAIN rows: 2633
DEV rows: 563
Train DF head:
             ID                                               Text  \
0  5880111:S010  人壽、證券及票券子公司之業務持續穩健成長，全年稅後淨利分別達13.64億元、7.42億元及5...   
1  5880111:S010  人壽、證券及票券子公司之業務持續穩健成長，全年稅後淨利分別達13.64億元、7.42億元及5...   
2  5880111:S010  人壽、證券及票券子公司之業務持續穩健成長，全年稅後淨利分別達13.64億元、7.42億元及5...   
3  5880111:S010  人壽、證券及票券子公司之業務持續穩健成長，全年稅後淨利分別達13.64億元、7.42億元及5...   
4  2365114:S011                              優質成長，營收、獲利、本業、業外同步提升。   

              Aspect  valence  arousal  
0             全年稅後淨利     6.17     5.33  
1      人壽及證券子公司之稅後淨利     6.00     5.17  
2  資產管理、創投及投信子公司稅後淨利     5.88     5.12  
3     人壽、證券及票券子公司之業務     6.00     5.17  
4                 營收     6.25     5.62  

Dev DF head:
             ID                                               Text     Aspect
0  3481114:S057  因應智慧城市與戶外顯示需求的快速發展，推動多領域顯示應用面板的全面升級，本公司積極將Mini...       智慧城市
1  3481114:S057  因應智慧城市與戶外顯示需求的快速發展，推動多領域顯示應用面板的全面升級，本公司積極將Mini...  多領域顯示應用面板
2  3481114:S057  因應智慧城市與戶外顯示需求的快速發展，推動多

In [9]:

from transformers import AutoTokenizer

MODEL_NAME = 'distilbert-base-multilingual-cased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def combine_text(text, aspect):
    return f"{text} [ASP] {aspect}"

class DimASRDataset(Dataset):
    def __init__(self, df, is_train=True):
        self.df = df.reset_index(drop=True)
        self.is_train = is_train

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        combined = combine_text(row['Text'], row['Aspect'])
        enc = tokenizer(
            combined,
            truncation=True,
            max_length=128,
            return_tensors='pt'
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item['ID'] = row['ID']
        item['Aspect'] = row['Aspect']
        if self.is_train:
            item['labels'] = torch.tensor(
                [row['valence'], row['arousal']],
                dtype=torch.float32
            )
        return item

train_ds = DimASRDataset(train_df, is_train=True)
dev_ds   = DimASRDataset(dev_df,   is_train=False)


In [10]:

from transformers import DataCollatorWithPadding

collator = DataCollatorWithPadding(tokenizer)

def collate_fn(batch):
    ids = [x['ID'] for x in batch]
    aspects = [x['Aspect'] for x in batch]
    for x in batch:
        x.pop('ID')
        x.pop('Aspect')
    padded = collator(batch)
    padded['ID'] = ids
    padded['Aspect'] = aspects
    return padded

train_loader = DataLoader(train_ds, batch_size=8, shuffle=True,  collate_fn=collate_fn)
dev_loader   = DataLoader(dev_ds,   batch_size=16, shuffle=False, collate_fn=collate_fn)

print('Train batches:', len(train_loader))
print('Dev batches:', len(dev_loader))


Train batches: 330
Dev batches: 36


In [11]:

class DimASRModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.base = AutoModel.from_pretrained(MODEL_NAME)
        self.reg  = nn.Linear(768, 2)  # valence, arousal

    def forward(self, input_ids, attention_mask):
        out = self.base(input_ids=input_ids, attention_mask=attention_mask)
        cls = out.last_hidden_state[:, 0]
        return self.reg(cls)

model = DimASRModel().to(device)
opt = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.MSELoss()


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [12]:

EPOCHS = 4
print('Starting training for', EPOCHS, 'epochs...')

for ep in range(EPOCHS):
    model.train()
    total_loss = 0.0
    for batch in tqdm(train_loader, desc=f'Epoch {ep+1}/{EPOCHS}'):
        ids  = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        y    = batch['labels'].to(device)

        preds = model(ids, mask)
        loss = loss_fn(preds, y)

        opt.zero_grad()
        loss.backward()
        opt.step()

        total_loss += loss.item()

    avg_loss = total_loss / max(1, len(train_loader))
    print(f'Epoch {ep+1} average loss: {avg_loss:.4f}')


Starting training for 4 epochs...


Epoch 1/4:   0%|          | 0/330 [00:00<?, ?it/s]

Epoch 1 average loss: 1.0080


Epoch 2/4:   0%|          | 0/330 [00:00<?, ?it/s]

Epoch 2 average loss: 0.2055


Epoch 3/4:   0%|          | 0/330 [00:00<?, ?it/s]

Epoch 3 average loss: 0.1711


Epoch 4/4:   0%|          | 0/330 [00:00<?, ?it/s]

Epoch 4 average loss: 0.1292


In [13]:

print('Running inference on dev set...')

model.eval()
preds = []
with torch.no_grad():
    for batch in tqdm(dev_loader, desc='Inference'):
        ids  = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        logits = model(ids, mask).cpu().numpy()

        for i, (ID, asp) in enumerate(zip(batch['ID'], batch['Aspect'])):
            v, a = logits[i]
            preds.append((ID, asp, f"{v:.2f}#{a:.2f}"))

print('Total predictions:', len(preds))


Running inference on dev set...


Inference:   0%|          | 0/36 [00:00<?, ?it/s]

Total predictions: 563


In [14]:

OUT = 'pred_zho_finance.jsonl'

sub = {}
for ID, asp, va in preds:
    sub.setdefault(ID, []).append({'Aspect': asp, 'VA': va})

with open(OUT, 'w', encoding='utf8') as f:
    for ex in dev_json:
        rec = {
            'ID': ex['ID'],
            'Aspect_VA': sub.get(ex['ID'], [])
        }
        f.write(json.dumps(rec, ensure_ascii=False) + '\n')

print('Saved predictions to', OUT)
print('Exists:', os.path.exists(OUT))
print('Size (bytes):', os.path.getsize(OUT))


Saved predictions to pred_zho_finance.jsonl
Exists: True
Size (bytes): 35433
