# DimABSA Subtask1 â€” PyTorch Training (with padding + correct dev parsing + debug)

In [1]:

import os, json, torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel, DataCollatorWithPadding

print('CUDA available:', torch.cuda.is_available())
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('Working dir:', os.getcwd())
print('Files:', os.listdir())


CUDA available: False
Working dir: c:\SemEval_task3\datasets\eng
Files: ['1_e_res.ipynb', 'DimABSA_PyTorch_Debug.ipynb', 'eng_laptop_dev_task1.jsonl', 'eng_laptop_dev_task2.jsonl', 'eng_laptop_dev_task3.jsonl', 'eng_laptop_train_alltasks.jsonl', 'eng_restaurant_dev_task1.jsonl', 'eng_restaurant_dev_task2.jsonl', 'eng_restaurant_dev_task3.jsonl', 'eng_restaurant_train_alltasks.jsonl']


In [2]:

TRAIN='eng_laptop_train_alltasks.jsonl'
DEV='eng_laptop_dev_task1.jsonl'

def read_jsonl(p):
    out=[]
    with open(p,'r',encoding='utf8') as f:
        for l in f:
            l=l.strip()
            if l:
                out.append(json.loads(l))
    print(p, '-> records:', len(out))
    return out

train_json=read_jsonl(TRAIN)
dev_json=read_jsonl(DEV)


eng_laptop_train_alltasks.jsonl -> records: 4076
eng_laptop_dev_task1.jsonl -> records: 200


In [3]:

# TRAIN DF (from Quadruplets)
def build_train_df(data):
    rows=[]
    for ex in data:
        tid=ex['ID']; text=ex['Text']
        for q in ex.get("Quadruplet",[]):
            v,a=q['VA'].split('#')
            rows.append({
                'ID':tid,'Text':text,'Aspect':q['Aspect'],
                'valence':float(v),'arousal':float(a)
            })
    print("TRAIN rows:", len(rows))
    return pd.DataFrame(rows)

train_df=build_train_df(train_json)

# DEV DF (from Aspect list)
def build_dev_df(data):
    rows=[]
    for ex in data:
        for asp in ex["Aspect"]:
            rows.append({
                "ID":ex["ID"],
                "Text":ex["Text"],
                "Aspect":asp
            })
    print("DEV rows:", len(rows))
    return pd.DataFrame(rows)

dev_df=build_dev_df(dev_json)

print(train_df.head())
print(dev_df.head())


TRAIN rows: 5773
DEV rows: 275
                  ID                                               Text  \
0  laptop_quad_dev_1  this unit is ` ` pretty ` ` and stylish , so m...   
1  laptop_quad_dev_1  this unit is ` ` pretty ` ` and stylish , so m...   
2  laptop_quad_dev_2  for now i ' m okay with upping the experience ...   
3  laptop_quad_dev_3  seems unlikely but whatever , i ' ll go with it .   
4  laptop_quad_dev_4  this version has been my least favorite versio...   

    Aspect  valence  arousal  
0     unit     7.12     7.12  
1     unit     7.12     7.12  
2   device     5.50     5.25  
3     NULL     5.00     5.12  
4  version     3.30     6.60  
                      ID                                               Text  \
0  lap26_aspect_va_dev_1                    The touchscreen works very well   
1  lap26_aspect_va_dev_2                         I am so disappointed in HP   
2  lap26_aspect_va_dev_3  The keyboard is big enough to use for real typing   
3  lap26_aspect_

In [4]:

tokenizer=AutoTokenizer.from_pretrained('distilbert-base-uncased')

def combine(t,a): return f"{t} [ASP] {a}"

class DimSet(Dataset):
    def __init__(self,df,train=True):
        self.df=df; self.train=train
    def __len__(self): return len(self.df)
    def __getitem__(self,idx):
        r=self.df.iloc[idx]
        enc=tokenizer(combine(r["Text"],r["Aspect"]),
                      truncation=True,max_length=128,return_tensors='pt')
        out={k:v.squeeze(0) for k,v in enc.items()}
        out["ID"]=r["ID"]; out["Aspect"]=r["Aspect"]
        if self.train:
            out["labels"]=torch.tensor([r["valence"],r["arousal"]],dtype=torch.float32)
        return out

train_ds=DimSet(train_df,True)
dev_ds=DimSet(dev_df,False)


In [5]:

collator=DataCollatorWithPadding(tokenizer)

def collate(batch):
    IDs=[x["ID"] for x in batch]
    ASP=[x["Aspect"] for x in batch]
    for x in batch:
        x.pop("ID"); x.pop("Aspect")
    pad=collator(batch)
    pad["ID"]=IDs; pad["Aspect"]=ASP
    return pad

train_loader=DataLoader(train_ds,batch_size=8,shuffle=True,collate_fn=collate)
dev_loader=DataLoader(dev_ds,batch_size=16,shuffle=False,collate_fn=collate)

print("Train batches:", len(train_loader))
print("Dev batches:", len(dev_loader))


Train batches: 722
Dev batches: 18


In [6]:

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.base=AutoModel.from_pretrained('distilbert-base-uncased')
        self.reg=nn.Linear(768,2)
    def forward(self,i,m):
        out=self.base(input_ids=i,attention_mask=m)
        cls=out.last_hidden_state[:,0]
        return self.reg(cls)

model=Model().to(device)
opt=torch.optim.AdamW(model.parameters(),lr=2e-5)
loss_fn=nn.MSELoss()


In [7]:

EPOCHS=5
print("Training start...")

for ep in range(EPOCHS):
    model.train()
    for b in tqdm(train_loader):
        ids=b["input_ids"].to(device)
        mask=b["attention_mask"].to(device)
        y=b["labels"].to(device)

        pred=model(ids,mask)
        loss=loss_fn(pred,y)

        opt.zero_grad()
        loss.backward()
        opt.step()

    print("Epoch",ep+1,"loss:",loss.item())


Training start...


  0%|          | 0/722 [00:00<?, ?it/s]

Epoch 1 loss: 0.8658702969551086


  0%|          | 0/722 [00:00<?, ?it/s]

Epoch 2 loss: 0.4105672240257263


  0%|          | 0/722 [00:00<?, ?it/s]

Epoch 3 loss: 0.06414695084095001


  0%|          | 0/722 [00:00<?, ?it/s]

Epoch 4 loss: 0.23486857116222382


  0%|          | 0/722 [00:00<?, ?it/s]

Epoch 5 loss: 0.31302016973495483


In [8]:

print("Inference...")

model.eval()
preds=[]
with torch.no_grad():
    for b in tqdm(dev_loader):
        ids=b["input_ids"].to(device)
        mask=b["attention_mask"].to(device)
        logits=model(ids,mask).cpu().numpy()
        for i,(ID,A) in enumerate(zip(b["ID"],b["Aspect"])):
            v,a=logits[i]
            preds.append((ID,A,f"{v:.2f}#{a:.2f}"))

print("Total predictions:", len(preds))


Inference...


  0%|          | 0/18 [00:00<?, ?it/s]

Total predictions: 275


In [9]:

OUT='pred_eng_laptop.jsonl'
sub={}
for ID,ASP,VA in preds:
    sub.setdefault(ID,[]).append({"Aspect":ASP,"VA":VA})

with open(OUT,'w',encoding='utf8') as f:
    for ex in dev_json:
        f.write(json.dumps({"ID":ex["ID"],"Aspect_VA":sub.get(ex["ID"],[])})+"\n")

print("Saved:", OUT)
print("Exists:", os.path.exists(OUT))
print("Size:", os.path.getsize(OUT))


Saved: pred_eng_laptop.jsonl
Exists: True
Size: 22079
