# Russian Restaurant – DimASR (Subtask 1) with XLM-RoBERTa
This notebook trains an `xlm-roberta-base` regression model for Russian restaurant data.
- Train: `rus_restaurant_train_alltasks.jsonl` (Quadruplet format)
- Dev:   `rus_restaurant_dev_task1.jsonl`     (Aspect list)
It outputs predictions in `pred_rus_restaurant.jsonl` ready for SemEval Subtask 1.


In [1]:

import os, json, torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel, DataCollatorWithPadding

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('CUDA available:', torch.cuda.is_available())

TRAIN = 'rus_restaurant_train_alltasks.jsonl'
DEV   = 'rus_restaurant_dev_task1.jsonl'


CUDA available: False


In [2]:

def read_jsonl(path):
    data = []
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if line:
                data.append(json.loads(line))
    print(path, '->', len(data), 'records')
    return data

train_json = read_jsonl(TRAIN)
dev_json   = read_jsonl(DEV)


rus_restaurant_train_alltasks.jsonl -> 1240 records
rus_restaurant_dev_task1.jsonl -> 56 records


In [3]:

def build_train_df(data):
    rows = []
    for ex in data:
        tid = ex.get('ID')
        text = ex.get('Text', '')
        # Prefer Quadruplet if available
        quads = ex.get('Quadruplet', [])
        if quads:
            for q in quads:
                va = q.get('VA')
                asp = q.get('Aspect')
                if not va or asp is None:
                    continue
                try:
                    v_str, a_str = va.split('#')
                    rows.append({
                        'ID': tid,
                        'Text': text,
                        'Aspect': asp,
                        'valence': float(v_str),
                        'arousal': float(a_str),
                    })
                except Exception as e:
                    print('Skipping invalid VA:', va, 'error:', e)
        else:
            # Fallback: some lines may have single Category/VA but no Quadruplet
            va = ex.get('VA')
            asp = ex.get('Aspect')
            if va and asp:
                try:
                    v_str, a_str = va.split('#')
                    rows.append({
                        'ID': tid,
                        'Text': text,
                        'Aspect': asp,
                        'valence': float(v_str),
                        'arousal': float(a_str),
                    })
                except Exception as e:
                    print('Skipping top-level VA:', va, 'error:', e)
    print('TRAIN rows:', len(rows))
    return pd.DataFrame(rows)

def build_dev_df(data):
    rows = []
    for ex in data:
        tid = ex['ID']
        text = ex['Text']
        for asp in ex['Aspect']:
            rows.append({
                'ID': tid,
                'Text': text,
                'Aspect': asp
            })
    print('DEV rows:', len(rows))
    return pd.DataFrame(rows)

train_df = build_train_df(train_json)
dev_df   = build_dev_df(dev_json)

print('Train DF head:')
print(train_df.head())
print('\nDev DF head:')
print(dev_df.head())


TRAIN rows: 2487
DEV rows: 81
Train DF head:
         ID                                               Text        Aspect  \
0   225:3_0                 Виды из окна - выше всяких похвал.  Виды из окна   
1   225:4_1                            Интерьер не запомнился.      Интерьер   
2  225:10_2  Меню показалось немного скучным, но мы быстро ...          Меню   
3  225:13_3  Мясо было прожарено чуть сильнее,чем заказывал...          Мясо   
4  225:13_3  Мясо было прожарено чуть сильнее,чем заказывал...          Мясо   

   valence  arousal  
0     8.30     7.60  
1     5.00     3.20  
2     4.62     3.88  
3     7.00     6.20  
4     6.90     6.20  

Dev DF head:
         ID                                               Text   Aspect
0  354:12_0  Блюдо хинкали состоит из 4 штук, хороших разме...  хинкали
1  1180:1_1  ОЧЕНЬ все понравилось, музыка ненавязчивая, де...   музыка
2  1180:1_1  ОЧЕНЬ все понравилось, музыка ненавязчивая, де...  девушка
3  1180:1_1  ОЧЕНЬ все понравилось, музы

In [4]:

MODEL = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def combine(text, aspect):
    return f"{text} [ASP] {aspect}"

class RusRestaurantDataset(Dataset):
    def __init__(self, df, is_train=True):
        self.df = df.reset_index(drop=True)
        self.is_train = is_train

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        combined = combine(row['Text'], row['Aspect'])
        enc = tokenizer(
            combined,
            truncation=True,
            max_length=128,
            return_tensors='pt'
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item['ID'] = row['ID']
        item['Aspect'] = row['Aspect']
        if self.is_train:
            item['labels'] = torch.tensor(
                [row['valence'], row['arousal']],
                dtype=torch.float32
            )
        return item

train_ds = RusRestaurantDataset(train_df, is_train=True)
dev_ds   = RusRestaurantDataset(dev_df,   is_train=False)


In [5]:

collator = DataCollatorWithPadding(tokenizer)

def collate_fn(batch):
    ids = [x['ID'] for x in batch]
    aspects = [x['Aspect'] for x in batch]
    for x in batch:
        x.pop('ID')
        x.pop('Aspect')
    padded = collator(batch)
    padded['ID'] = ids
    padded['Aspect'] = aspects
    return padded

train_loader = DataLoader(train_ds, batch_size=8, shuffle=True,  collate_fn=collate_fn)
dev_loader   = DataLoader(dev_ds,   batch_size=16, shuffle=False, collate_fn=collate_fn)

print('Train batches:', len(train_loader))
print('Dev batches:', len(dev_loader))


Train batches: 311
Dev batches: 6


In [6]:

class DimASRRusRestaurantModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.base = AutoModel.from_pretrained(MODEL)
        hidden = self.base.config.hidden_size
        self.reg = nn.Linear(hidden, 2)  # valence, arousal

    def forward(self, input_ids, attention_mask):
        out = self.base(input_ids=input_ids, attention_mask=attention_mask)
        cls = out.last_hidden_state[:, 0]
        return self.reg(cls)

model = DimASRRusRestaurantModel().to(device)
opt = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.MSELoss()


In [7]:

EPOCHS = 4
print('Starting training for', EPOCHS, 'epochs...')

for ep in range(EPOCHS):
    model.train()
    total_loss = 0.0
    for batch in tqdm(train_loader, desc=f'Epoch {ep+1}/{EPOCHS}'):
        ids  = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        y    = batch['labels'].to(device)

        preds = model(ids, mask)
        loss = loss_fn(preds, y)

        opt.zero_grad()
        loss.backward()
        opt.step()

        total_loss += loss.item()

    avg_loss = total_loss / max(1, len(train_loader))
    print(f'Epoch {ep+1} average loss: {avg_loss:.4f}')


Starting training for 4 epochs...


Epoch 1/4:   0%|          | 0/311 [00:00<?, ?it/s]

Epoch 1 average loss: 5.2418


Epoch 2/4:   0%|          | 0/311 [00:00<?, ?it/s]

Epoch 2 average loss: 1.6595


Epoch 3/4:   0%|          | 0/311 [00:00<?, ?it/s]

Epoch 3 average loss: 1.1681


Epoch 4/4:   0%|          | 0/311 [00:00<?, ?it/s]

Epoch 4 average loss: 0.9563


In [8]:

print('Running inference on dev set...')

model.eval()
preds = []
with torch.no_grad():
    for batch in tqdm(dev_loader, desc='Inference'):
        ids  = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        logits = model(ids, mask).cpu().numpy()

        for i, (ID, asp) in enumerate(zip(batch['ID'], batch['Aspect'])):
            v, a = logits[i]
            preds.append((ID, asp, f"{v:.2f}#{a:.2f}"))

print('Total predictions:', len(preds))


Running inference on dev set...


Inference:   0%|          | 0/6 [00:00<?, ?it/s]

Total predictions: 81


In [9]:

OUT = 'pred_rus_restaurant.jsonl'
sub = {}

for ID, asp, va in preds:
    sub.setdefault(ID, []).append({'Aspect': asp, 'VA': va})

with open(OUT, 'w', encoding='utf8') as f:
    for ex in dev_json:
        rec = {
            'ID': ex['ID'],
            'Aspect_VA': sub.get(ex['ID'], [])
        }
        f.write(json.dumps(rec, ensure_ascii=False) + '\n')

print('Saved predictions to', OUT)
print('Exists:', os.path.exists(OUT))
print('Size (bytes):', os.path.getsize(OUT))


Saved predictions to pred_rus_restaurant.jsonl
Exists: True
Size (bytes): 6284
