# Tatar Restaurant – DimASR (Subtask 1) using XLM‑RoBERTa
This notebook fine-tunes XLM‑RoBERTa as a regression model that predicts a valence–arousal (VA) score pair for each aspect mentioned in a review.


In [1]:

import os, json, torch, torch.nn as nn
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel, DataCollatorWithPadding

# Seed torch's global RNG so that the training-loader shuffle order and the
# random initialisation of the regression head are repeatable across a
# Restart & Run All (GPU kernels may still introduce small nondeterminism).
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Input files, resolved relative to the notebook's working directory.
TRAIN = 'tat_restaurant_train_alltasks.jsonl'
DEV = 'tat_restaurant_dev_task1.jsonl'


In [2]:

def read_jsonl(path):
    """Load a JSON Lines file into a list of parsed records.

    Lines that are empty after stripping whitespace are ignored; every other
    line is decoded with ``json.loads``. A one-line summary with the path and
    the number of records is printed before returning.

    Args:
        path: Location of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        list: One parsed object per non-blank line, in file order.
    """
    with open(path, encoding='utf8') as handle:
        stripped = (raw.strip() for raw in handle)
        records = [json.loads(text) for text in stripped if text]
    print(path, '->', len(records), 'records')
    return records

# Parse both splits up front; each call prints its own record count.
train_json, dev_json = read_jsonl(TRAIN), read_jsonl(DEV)


tat_restaurant_train_alltasks.jsonl -> 1240 records
tat_restaurant_dev_task1.jsonl -> 56 records


In [3]:

def build_train_df(data):
    """Flatten training records into one row per usable quadruplet.

    Every record must provide ``ID`` and ``Text``. Records without a
    ``Quadruplet`` key contribute no rows. Within a record, quadruplets
    lacking a ``VA`` key are ignored; quadruplets whose ``VA`` is not of the
    form ``"<valence>#<arousal>"`` with two float-parsable parts, or which
    have no ``Aspect`` key, are skipped and counted.

    Args:
        data: Iterable of dict records as produced by ``read_jsonl``.

    Returns:
        pandas.DataFrame: Columns ``ID``, ``Text``, ``Aspect``, ``valence``,
        ``arousal`` (always present, even when no row was produced).
    """
    columns = ['ID', 'Text', 'Aspect', 'valence', 'arousal']
    rows = []
    skipped = 0
    for ex in data:
        tid, text = ex['ID'], ex['Text']
        for q in ex.get('Quadruplet', ()):
            if 'VA' not in q:
                continue
            try:
                valence, arousal = q['VA'].split('#')
                row = {
                    'ID': tid, 'Text': text, 'Aspect': q['Aspect'],
                    'valence': float(valence), 'arousal': float(arousal),
                }
            except (KeyError, ValueError, AttributeError, TypeError):
                # Malformed entry: keep the best-effort skip, but only for the
                # parsing errors we expect, and remember it so it is reported
                # instead of disappearing silently.
                skipped += 1
                continue
            rows.append(row)
    print('TRAIN rows:', len(rows))
    if skipped:
        print('TRAIN skipped (malformed VA/Aspect):', skipped)
    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=columns)

def build_dev_df(data):
    """Expand dev records into one row per listed aspect.

    Each record's ``Aspect`` value is iterated, and every element becomes a
    row that repeats the record's ``ID`` and ``Text``. The number of rows is
    printed before returning.

    Args:
        data: Iterable of dict records with ``ID``, ``Text`` and ``Aspect``.

    Returns:
        pandas.DataFrame: Built from the collected row dicts
        (keys ``ID``, ``Text``, ``Aspect``).
    """
    rows = [
        {'ID': ex['ID'], 'Text': ex['Text'], 'Aspect': asp}
        for ex in data
        for asp in ex['Aspect']
    ]
    print('DEV rows:', len(rows))
    return pd.DataFrame(rows)

# Materialise both splits as DataFrames, then preview the first rows of each.
train_df, dev_df = build_train_df(train_json), build_dev_df(dev_json)
(train_df.head(), dev_df.head())


TRAIN rows: 2487
DEV rows: 81


(         ID                                               Text  \
 0   225:3_0           Тәрәзә күренешләре мактаудан да биегрәк.   
 1   225:4_1                            Интерьер истә калмаган.   
 2  225:10_2  Меню бераз күңелсез булып тоелды, ләкин без ти...   
 3  225:13_3  Ит заказ биргәнгә караганда бераз көчлерәк кыз...   
 4  225:13_3  Ит заказ биргәнгә караганда бераз көчлерәк кыз...   
 
                Aspect  valence  arousal  
 0  Тәрәзә күренешләре     8.30     7.60  
 1            Интерьер     5.00     3.20  
 2                Меню     4.62     3.88  
 3                  Ит     7.00     6.20  
 4                  Ит     6.90     6.20  ,
          ID                                               Text     Aspect
 0  354:12_0  Хинкали ризыгы 4 данәдән тора, яхшы зурлыкта, ...    Хинкали
 1  1180:1_1  Барысы да бик ошады, музыка бәйләнчек түгел, б...     музыка
 2  1180:1_1  Барысы да бик ошады, музыка бәйләнчек түгел, б...        кыз
 3  1180:1_1  Барысы да бик ошады, му

In [4]:

# Checkpoint name; the same constant is used to load the tokenizer here and
# the encoder inside Model.
MODEL = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def combine(t, a):
    """Join a review text and one aspect into a single model input string.

    The two parts are separated by the literal marker `` [ASP] ``.
    NOTE(review): nothing in this notebook registers ``[ASP]`` with the
    tokenizer, so it is presumably tokenized as ordinary text — confirm this
    is intended.

    Args:
        t: Review text.
        a: Aspect term.

    Returns:
        str: ``"<t> [ASP] <a>"``.
    """
    joined = f"{t} [ASP] {a}"
    return joined

class DS(Dataset):
    """Row-wise dataset that tokenizes one (text, aspect) pair per item.

    Wraps a DataFrame with ``Text``, ``Aspect`` and ``ID`` columns. When
    ``train`` is true the frame must also have ``valence`` and ``arousal``
    columns, which are returned as a float32 ``labels`` tensor of length 2.
    Tokenization relies on the module-level ``tokenizer`` and ``combine``.
    """

    def __init__(self, df, train=True):
        self.df = df
        self.train = train

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the encoded item at position ``idx`` (positional, via iloc)."""
        row = self.df.iloc[idx]
        encoded = tokenizer(
            combine(row['Text'], row['Aspect']),
            truncation=True,
            max_length=128,
            return_tensors='pt',
        )
        # squeeze(0) removes the leading axis of each returned tensor so the
        # collate step receives per-example tensors.
        item = {}
        for key, tensor in encoded.items():
            item[key] = tensor.squeeze(0)
        # Metadata travels with the item so predictions can be keyed later.
        item['ID'] = row['ID']
        item['Aspect'] = row['Aspect']
        if self.train:
            target = [row['valence'], row['arousal']]
            item['labels'] = torch.tensor(target, dtype=torch.float32)
        return item

# Training items carry regression targets; dev items are inputs only.
train_ds = DS(train_df, train=True)
dev_ds = DS(dev_df, train=False)


In [5]:

# Padding collator bound to the tokenizer; collate() delegates the tensor
# fields of each batch to it.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

def collate(batch):
    """Collate dataset items into one batch, carrying ID/Aspect alongside.

    The ``ID`` and ``Aspect`` entries of every item are collected into plain
    Python lists. All remaining entries are handed to the module-level
    ``collator``, and the two lists are then attached to its result under the
    keys ``ID`` and ``Aspect``.

    Args:
        batch: List of item dicts, each containing ``ID`` and ``Aspect``.
            The items are not modified.

    Returns:
        Whatever ``collator`` returns, with ``ID`` and ``Aspect`` lists added.
    """
    ids = [item['ID'] for item in batch]
    aspects = [item['Aspect'] for item in batch]
    # Build filtered copies rather than popping from the caller's dicts, so
    # the input items stay intact and collate() can be called on them again.
    features = [
        {key: value for key, value in item.items() if key not in ('ID', 'Aspect')}
        for item in batch
    ]
    padded = collator(features)
    padded['ID'] = ids
    padded['Aspect'] = aspects
    return padded

# Training batches are reshuffled every epoch; dev batches are read in
# dataset order.
train_loader = DataLoader(
    train_ds,
    batch_size=8,
    shuffle=True,
    collate_fn=collate,
)
dev_loader = DataLoader(
    dev_ds,
    batch_size=16,
    shuffle=False,
    collate_fn=collate,
)


In [6]:

class Model(nn.Module):
    """Pretrained encoder followed by a 2-output linear regression head.

    ``base`` is loaded from the checkpoint named by ``MODEL``; ``reg`` maps the
    encoder's hidden size to two values.
    """

    def __init__(self):
        super().__init__()
        self.base = AutoModel.from_pretrained(MODEL)
        hidden = self.base.config.hidden_size
        self.reg = nn.Linear(hidden, 2)

    def forward(self, i, m):
        """Run the encoder and regress from the first token's hidden state.

        Args:
            i: Passed to the encoder as ``input_ids``.
            m: Passed to the encoder as ``attention_mask``.

        Returns:
            Output of the linear head applied to ``last_hidden_state[:, 0]``.
        """
        encoded = self.base(input_ids=i, attention_mask=m)
        first_token = encoded.last_hidden_state[:, 0]
        return self.reg(first_token)

# Instantiate the network on the selected device; every parameter (encoder
# and head) is handed to the optimizer.
model = Model().to(device)
opt = torch.optim.AdamW(params=model.parameters(), lr=2e-5)
# Mean squared error over both regression outputs.
loss_fn = nn.MSELoss()


In [7]:

EPOCHS = 4
for ep in range(EPOCHS):
    model.train()
    tot = 0  # running sum of per-batch losses for this epoch
    progress = tqdm(train_loader, desc=f'Epoch {ep+1}')
    for b in progress:
        ids = b['input_ids'].to(device)
        mask = b['attention_mask'].to(device)
        y = b['labels'].to(device)

        # Clear stale gradients, then forward / backward / update.
        opt.zero_grad()
        pred = model(ids, mask)
        loss = loss_fn(pred, y)
        loss.backward()
        opt.step()

        tot += loss.item()
    # Report the mean batch loss of the epoch.
    print('Epoch', ep+1, 'Loss', tot/len(train_loader))


Epoch 1:   0%|          | 0/311 [00:00<?, ?it/s]

Epoch 1 Loss 4.2996452675946655


Epoch 2:   0%|          | 0/311 [00:00<?, ?it/s]

Epoch 2 Loss 2.227132238327882


Epoch 3:   0%|          | 0/311 [00:00<?, ?it/s]

Epoch 3 Loss 2.120745369954891


Epoch 4:   0%|          | 0/311 [00:00<?, ?it/s]

Epoch 4 Loss 1.8823689600854059


In [8]:

model.eval()
preds = []
with torch.no_grad():
    for b in tqdm(dev_loader):
        ids = b['input_ids'].to(device)
        mask = b['attention_mask'].to(device)
        out = model(ids, mask).cpu().numpy()
        # Pair each output row with the ID/Aspect at the same batch position
        # and format the two values as "valence#arousal" with two decimals.
        for ID, A, (v, a) in zip(b['ID'], b['Aspect'], out):
            preds.append((ID, A, f"{v:.2f}#{a:.2f}"))
len(preds)


  0%|          | 0/6 [00:00<?, ?it/s]

81

In [9]:

OUT = 'pred_tat_restaurant.jsonl'

# Group the formatted predictions by record ID.
sub = {}
for ID, A, VA in preds:
    if ID not in sub:
        sub[ID] = []
    sub[ID].append({'Aspect': A, 'VA': VA})

# Write one JSON line per dev record, in dev-file order; a record with no
# predictions gets an empty list.
with open(OUT, 'w', encoding='utf8') as f:
    for ex in dev_json:
        record = {'ID': ex['ID'], 'Aspect_VA': sub.get(ex['ID'], [])}
        f.write(json.dumps(record, ensure_ascii=False) + '\n')

# Sanity check: the file exists and its size in bytes.
os.path.exists(OUT), os.path.getsize(OUT)


(True, 6347)