# DimABSA 2026 – Subtask 1 (DimASR)
## Chinese Laptop (zho_laptop) – 3-Epoch PyTorch Baseline

This notebook trains a multilingual DistilBERT regression model for **DimASR** on the **Chinese Laptop** dataset:

- Train: `Chi_laptop_train_alltasks.jsonl`  (Quadruplet format)
- Dev:   `Chi_laptop_dev_task1.jsonl`      (Aspect list format)

It outputs predictions in the required JSONL format for **Subtask 1** and saves them as:

```text
pred_zho_laptop.jsonl
```

Place this file inside `subtask_1/` as `pred_zho_laptop.jsonl` for Codabench submission.


In [2]:

import os, json, torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel, DataCollatorWithPadding

# Seed torch's global RNG before anything stochastic runs, so that layer
# initialisation and shuffled batch order drawn from that RNG are repeatable
# across "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

# Pick the compute device once; later cells move the model and batches onto it.
print('CUDA available:', torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Show the working directory and its contents so the relative data file names
# used by the notebook can be checked at a glance.
print('Working directory:', os.getcwd())
print('Files:', os.listdir())


CUDA available: False
Working directory: c:\SemEval_task3\datasets\Chi
Files: ['Chi_finance_dev_task1.jsonl', 'Chi_finance_task1_notebook.ipynb', 'Chi_finance_train_task1.jsonl', 'Chi_laptop_dev_task1.jsonl', 'Chi_laptop_dev_task2.jsonl', 'Chi_laptop_dev_task3.jsonl', 'Chi_laptop_task1_notebook.ipynb', 'Chi_laptop_train_alltasks.jsonl', 'Chi_restaurant_dev_task1.jsonl', 'Chi_restaurant_dev_task2.jsonl', 'Chi_restaurant_dev_task3.jsonl', 'Chi_restaurant_train_alltasks.jsonl']


In [3]:

# Input files, resolved relative to the notebook's working directory.
TRAIN = 'Chi_laptop_train_alltasks.jsonl'
DEV   = 'Chi_laptop_dev_task1.jsonl'

def read_jsonl(path):
    """Load a JSON Lines file into a list of parsed records.

    Blank (whitespace-only) lines are skipped. The record count is printed
    as a quick sanity check.

    Args:
        path: Path of the .jsonl file to read.

    Returns:
        list: One parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # 'utf-8-sig' reads plain UTF-8 unchanged but also drops a leading byte
    # order mark. With plain 'utf8' a BOM survives strip() and makes
    # json.loads() reject the first record.
    with open(path, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        data = [json.loads(line) for line in stripped_lines if line]
    print(f'{path} -> {len(data)} records')
    return data

# Parse the training split first, then the dev split; each call reports
# how many records it read.
train_json = read_jsonl(path=TRAIN)
dev_json = read_jsonl(path=DEV)


Chi_laptop_train_alltasks.jsonl -> 3490 records
Chi_laptop_dev_task1.jsonl -> 261 records


In [4]:

# Train DF from Quadruplet format
def build_train_df(data):
    """Flatten Quadruplet-format records into one row per (text, aspect).

    Each entry of a record's 'Quadruplet' list contributes one row. Its 'VA'
    string is split on '#' into valence and arousal floats. Entries whose
    'VA' (or 'Aspect') is missing or malformed are reported and skipped
    rather than aborting the whole build.

    Args:
        data: Iterable of dicts with 'ID' and 'Text' keys and an optional
            'Quadruplet' list of dicts carrying 'Aspect' and 'VA'.

    Returns:
        pandas.DataFrame with columns ID, Text, Aspect, valence, arousal.
        The columns are present even when no row was produced.

    Raises:
        KeyError: If a record lacks 'ID' or 'Text'.
    """
    columns = ['ID', 'Text', 'Aspect', 'valence', 'arousal']
    rows = []
    for ex in data:
        tid = ex['ID']
        text = ex['Text']
        for q in ex.get('Quadruplet', []):
            try:
                v_str, a_str = q['VA'].split('#')
                rows.append({
                    'ID': tid,
                    'Text': text,
                    'Aspect': q['Aspect'],
                    'valence': float(v_str),
                    'arousal': float(a_str),
                })
            # Only the failures this parse can produce: missing key, non-string
            # VA, wrong number of '#'-separated parts, or a non-numeric part.
            except (KeyError, ValueError, AttributeError, TypeError) as e:
                print('Skipping invalid VA in train:', q.get('VA'), 'error:', e)
    print('TRAIN rows:', len(rows))
    # Pinning the columns keeps the schema stable for empty input, so later
    # column lookups do not fail on a column-less frame.
    return pd.DataFrame(rows, columns=columns)

# Dev DF from Aspect list
def build_dev_df(data):
    """Flatten Aspect-list records into one row per (text, aspect).

    Args:
        data: Iterable of dicts with 'ID', 'Text' and an 'Aspect' list.

    Returns:
        pandas.DataFrame with columns ID, Text, Aspect. The columns are
        present even when no row was produced.

    Raises:
        KeyError: If a record lacks 'ID', 'Text' or 'Aspect'.
    """
    columns = ['ID', 'Text', 'Aspect']
    rows = []
    for ex in data:
        tid = ex['ID']
        text = ex['Text']
        for asp in ex['Aspect']:
            rows.append({
                'ID': tid,
                'Text': text,
                'Aspect': asp,
            })
    print('DEV rows:', len(rows))
    # Pinning the columns keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=columns)

# Build the per-aspect frames, then preview the first rows of each one.
train_df = build_train_df(train_json)
dev_df = build_dev_df(dev_json)

for heading, frame in (('Train DF head:', train_df), ('\nDev DF head:', dev_df)):
    print(heading)
    print(frame.head())


TRAIN rows: 6502
DEV rows: 431
Train DF head:
             ID                          Text   Aspect  valence  arousal
0  6700135:S006  恭喜入手新機，這台筆電規格不錯又輕巧，可惜螢幕只有FHD       規格     6.00     5.00
1  6700135:S006  恭喜入手新機，這台筆電規格不錯又輕巧，可惜螢幕只有FHD  螢幕只有FHD     4.17     5.00
2  6700135:S006  恭喜入手新機，這台筆電規格不錯又輕巧，可惜螢幕只有FHD       筆電     6.17     5.00
3  6699557:S020       重量真的輕，鍵盤分離還可開啟藍芽使用也很方便。       重量     6.83     6.00
4  6699557:S020       重量真的輕，鍵盤分離還可開啟藍芽使用也很方便。       鍵盤     5.75     4.75

Dev DF head:
             ID                     Text  Aspect
0  6952865:S310  ASUS筆電總是不會讓人失望！AI的時代來了！  ASUS筆電
1  6949617:S106   技嘉NB之前用過一次覺得還不錯~CP值算高!    技嘉NB
2  6949617:S106   技嘉NB之前用過一次覺得還不錯~CP值算高!     CP值
3  6931799:S046          筆電跟MD都很完美，五分奉上~      筆電
4  6946311:S117      鍵盤和觸控板設計與美感頗具吸引力...      鍵盤


In [5]:

# Checkpoint identifier passed to both AutoTokenizer and AutoModel in this notebook.
MODEL_NAME = 'distilbert-base-multilingual-cased'
# Tokenizer loaded for MODEL_NAME; the dataset class reads it as a module-level global.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def combine_text(text, aspect):
    """Return the text followed by the literal ' [ASP] ' marker and the aspect."""
    return "{} [ASP] {}".format(text, aspect)

class DimASRDataset(Dataset):
    """Tokenises one (text, aspect) row at a time for the regression model.

    Args:
        df: DataFrame with 'ID', 'Text' and 'Aspect' columns, plus 'valence'
            and 'arousal' when ``is_train`` is true. The index is reset so
            positional lookups are contiguous.
        is_train: When true, each item also carries a float32 'labels'
            tensor ``[valence, arousal]``.
        max_length: Token limit passed to the tokenizer's truncation.
            Defaults to 128, the value that was previously hard-coded.
    """

    def __init__(self, df, is_train=True, max_length=128):
        self.df = df.reset_index(drop=True)
        self.is_train = is_train
        self.max_length = max_length

    def __len__(self):
        """Number of (text, aspect) rows."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the encoded row ``idx`` as a dict.

        The dict holds the tokenizer outputs with the batch dimension
        squeezed away, the row's 'ID' and 'Aspect' values, and 'labels' when
        ``is_train`` is true.
        """
        row = self.df.iloc[idx]
        combined = combine_text(row['Text'], row['Aspect'])
        # NOTE(review): the aspect sits at the end of `combined`, so if the
        # tokenizer truncates from the right (presumably its default - confirm),
        # inputs longer than max_length would lose the aspect entirely.
        enc = tokenizer(
            combined,
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        # return_tensors='pt' adds a leading batch axis of size 1; drop it so
        # that padding/stacking can happen at collate time.
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item['ID'] = row['ID']
        item['Aspect'] = row['Aspect']
        if self.is_train:
            item['labels'] = torch.tensor(
                [row['valence'], row['arousal']],
                dtype=torch.float32
            )
        return item

# Training items carry 'labels'; dev items do not, because dev_df has no
# valence/arousal columns.
train_ds = DimASRDataset(train_df, is_train=True)
dev_ds   = DimASRDataset(dev_df,   is_train=False)


In [6]:

collator = DataCollatorWithPadding(tokenizer)

def collate_fn(batch):
    """Pad a list of dataset items into one batch, keeping ID/Aspect as lists.

    The string-valued 'ID' and 'Aspect' entries are set aside, the remaining
    entries are handed to the padding collator, and the two lists are then
    re-attached to the padded result.

    Args:
        batch: List of item dicts, each containing 'ID' and 'Aspect'.

    Returns:
        The collator's padded batch with added 'ID' and 'Aspect' lists, in
        the same order as ``batch``.
    """
    ids = [x['ID'] for x in batch]
    aspects = [x['Aspect'] for x in batch]
    # Build filtered copies instead of pop()-ing keys off the caller's dicts,
    # so items remain intact if they are ever cached or reused.
    features = [
        {k: v for k, v in x.items() if k not in ('ID', 'Aspect')}
        for x in batch
    ]
    padded = collator(features)
    padded['ID'] = ids
    padded['Aspect'] = aspects
    return padded

# Training batches are shuffled; the dev loader is left unshuffled.
train_loader = DataLoader(
    train_ds,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn,
)
dev_loader = DataLoader(
    dev_ds,
    batch_size=16,
    shuffle=False,
    collate_fn=collate_fn,
)

print(f'Train batches: {len(train_loader)}')
print(f'Dev batches: {len(dev_loader)}')


Train batches: 813
Dev batches: 27


In [7]:

class DimASRModel(nn.Module):
    """Pretrained encoder with a linear head that regresses two values.

    The head reads the hidden state at sequence position 0 and maps it to
    ``[valence, arousal]``.
    """

    def __init__(self):
        super().__init__()
        self.base = AutoModel.from_pretrained(MODEL_NAME)
        # Size the head from the loaded encoder's config rather than a literal
        # 768, so changing MODEL_NAME to a different-width encoder still works.
        self.reg  = nn.Linear(self.base.config.hidden_size, 2)  # valence, arousal

    def forward(self, input_ids, attention_mask):
        """Return one row of two regression outputs per input sequence."""
        out = self.base(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state of the first token of each sequence.
        cls = out.last_hidden_state[:, 0]
        return self.reg(cls)

# Encoder and head are trained together with a single learning rate and a
# mean-squared-error objective.
model = DimASRModel().to(device)
opt = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.MSELoss()


In [8]:

EPOCHS = 3
print('Starting training for', EPOCHS, 'epochs...')

for ep in range(EPOCHS):
    model.train()
    running_loss = 0.0
    progress = tqdm(train_loader, desc=f'Epoch {ep+1}/{EPOCHS}')
    for batch in progress:
        input_ids = batch['input_ids'].to(device)
        attn_mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        # Clear gradients left from the previous step, then run
        # forward -> loss -> backward -> parameter update.
        opt.zero_grad()
        loss = loss_fn(model(input_ids, attn_mask), targets)
        loss.backward()
        opt.step()

        running_loss += loss.item()

    # max(1, ...) guards the division when the loader yields no batches.
    n_batches = max(1, len(train_loader))
    avg_loss = running_loss / n_batches
    print(f'Epoch {ep+1} average loss: {avg_loss:.4f}')


Starting training for 3 epochs...


Epoch 1/3:   0%|          | 0/813 [00:00<?, ?it/s]

Epoch 1 average loss: 1.2178


Epoch 2/3:   0%|          | 0/813 [00:00<?, ?it/s]

Epoch 2 average loss: 0.4872


Epoch 3/3:   0%|          | 0/813 [00:00<?, ?it/s]

Epoch 3 average loss: 0.3411


In [9]:

# Valid score range for valence and arousal. This comes from the DimABSA task
# definition (1-9 scale), not from anything in this notebook - confirm against
# the official task description before relying on it.
VA_MIN, VA_MAX = 1.0, 9.0

print('Running inference on dev set...')

model.eval()
# Flat list of (ID, aspect, "valence#arousal") tuples in dev_loader order.
preds = []
with torch.no_grad():
    for batch in tqdm(dev_loader, desc='Inference'):
        ids  = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        # The linear head is unbounded, so clip its outputs into the valid
        # range; a clipped value is never further from an in-range target.
        scores = model(ids, mask).clamp(VA_MIN, VA_MAX).cpu().numpy()

        for ID, asp, (v, a) in zip(batch['ID'], batch['Aspect'], scores):
            preds.append((ID, asp, f"{v:.2f}#{a:.2f}"))

print('Total predictions:', len(preds))


Running inference on dev set...


Inference:   0%|          | 0/27 [00:00<?, ?it/s]

Total predictions: 431


In [10]:

OUT = 'pred_zho_laptop.jsonl'

# Bucket the flat prediction tuples by ID, preserving prediction order
# within each bucket.
sub = {}
for ID, asp, va in preds:
    if ID not in sub:
        sub[ID] = []
    sub[ID].append({'Aspect': asp, 'VA': va})

# Emit one JSON line per dev record, in dev-file order; a record without
# predictions gets an empty 'Aspect_VA' list.
with open(OUT, 'w', encoding='utf8') as f:
    f.writelines(
        json.dumps(
            {'ID': ex['ID'], 'Aspect_VA': sub.get(ex['ID'], [])},
            ensure_ascii=False,
        ) + '\n'
        for ex in dev_json
    )

print(f'Saved predictions to {OUT}')
print(f'Exists: {os.path.exists(OUT)}')
print(f'Size (bytes): {os.path.getsize(OUT)}')


Saved predictions to pred_zho_laptop.jsonl
Exists: True
Size (bytes): 30087
