In [1]:
!pip install -q transformers torch scikit-learn tqdm emoji


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/608.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m604.2/608.4 kB[0m [31m31.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import json
import re
import unicodedata
import emoji
import numpy as np
import pandas as pd
import torch

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [18]:
TRAIN_PATH = "/content/ukr_restaurant_train_alltasks.jsonl"
TEST_PATH  = "/content/ukr_restaurant_test_task1.jsonl"

def load_jsonl(path):
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line))
    return data

train_raw = load_jsonl(TRAIN_PATH)
test_raw  = load_jsonl(TEST_PATH)

print("Train reviews:", len(train_raw))
print("Test reviews :", len(test_raw))


Train reviews: 1240
Test reviews : 1072


In [19]:
def normalize_text(text):
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def remove_urls_html(text):
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    return text

def handle_emojis(text):
    return emoji.demojize(text, delimiters=(" ", " "))

def clip_va(v, a):
    v = min(max(v, 1.0), 9.0)
    a = min(max(a, 1.0), 9.0)
    return round(v, 2), round(a, 2)


In [20]:
def mark_aspect(text, aspect):
    return f"<ASP>{aspect}</ASP> {text}"


In [21]:
def preprocess_train(data):
    rows = []

    for item in data:
        if "Quadruplet" not in item:
            continue

        text = normalize_text(item["Text"])
        text = remove_urls_html(text)
        text = handle_emojis(text)

        for quad in item["Quadruplet"]:
            aspect = quad.get("Aspect", "NULL")
            if aspect == "NULL":
                continue

            text_marked = mark_aspect(text, aspect)

            v, a = map(float, quad["VA"].split("#"))
            v, a = clip_va(v, a)

            rows.append({
                "text": text_marked,
                "aspect": aspect,
                "valence": v,
                "arousal": a
            })

    return pd.DataFrame(rows)


def preprocess_test(data):
    rows = []

    for item in data:
        text = normalize_text(item["Text"])
        text = remove_urls_html(text)
        text = handle_emojis(text)

        for asp in item["Aspect"]:
            text_marked = mark_aspect(text, asp)

            rows.append({
                "id": item["ID"],
                "text": text_marked,
                "aspect": asp
            })

    return pd.DataFrame(rows)

train_df = preprocess_train(train_raw)
test_df  = preprocess_test(test_raw)

print("Train aspect samples:", len(train_df))
print("Test aspect samples :", len(test_df))

train_df.head()




Train aspect samples: 2487
Test aspect samples : 1637


Unnamed: 0,text,aspect,valence,arousal
0,<ASP>Види з вікна</ASP> Види з вікна-вище всяк...,Види з вікна,8.3,7.6
1,<ASP>Інтер'єр</ASP> Інтер'єр не запам'ятався.,Інтер'єр,5.0,3.2
2,"<ASP>Меню</ASP> Меню здалося трохи нудним, але...",Меню,4.62,3.88
3,<ASP>М'ясо</ASP> М'ясо було просмажене трохи с...,М'ясо,7.0,6.2
4,<ASP>М'ясо</ASP> М'ясо було просмажене трохи с...,М'ясо,6.9,6.2


In [22]:
train_df, temp_df = train_test_split(
    train_df, test_size=0.2, random_state=42
)

val_df, test_internal_df = train_test_split(
    temp_df, test_size=0.5, random_state=42
)

print("Train:", len(train_df))
print("Val  :", len(val_df))
print("Test :", len(test_internal_df))


Train: 1989
Val  : 249
Test : 249


In [23]:
MODEL_NAME = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class AspectVADataset(Dataset):
    def __init__(self, df, train=True):
        self.df = df.reset_index(drop=True)
        self.train = train

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        enc = tokenizer(
            row["text"],
            row["aspect"],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )

        item = {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0)
        }

        if self.train:
            item["labels"] = torch.tensor(
                [row["valence"], row["arousal"]],
                dtype=torch.float
            )

        return item




In [24]:
train_loader = DataLoader(
    AspectVADataset(train_df, train=True),
    batch_size=16,
    shuffle=True
)

val_loader = DataLoader(
    AspectVADataset(val_df, train=True),
    batch_size=16
)

test_internal_loader = DataLoader(
    AspectVADataset(test_internal_df, train=True),
    batch_size=16
)


In [25]:
class XLMRForVA(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(MODEL_NAME)
        self.regressor = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled = outputs.last_hidden_state[:, 0, :]
        return self.regressor(pooled)


In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = XLMRForVA().to(device)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    weight_decay=0.01
)


In [27]:
def rmse_va(y_true, y_pred):
    squared_diff = (y_pred - y_true) ** 2
    return np.sqrt(squared_diff.sum(axis=1).mean())


In [31]:
best_val_rmse = float("inf")
patience = 2
patience_counter = 0
MAX_EPOCHS = 10

for epoch in range(MAX_EPOCHS):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        preds = model(input_ids, attention_mask)
        loss = torch.nn.functional.smooth_l1_loss(preds, labels)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    model.eval()
    val_preds, val_golds = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            preds = model(input_ids, attention_mask)
            val_preds.append(preds.cpu().numpy())
            val_golds.append(labels.cpu().numpy())

    val_preds = np.vstack(val_preds)
    val_golds = np.vstack(val_golds)

    val_rmse = rmse_va(val_golds, val_preds)

    print(
        f"Epoch {epoch+1} | "
        f"Train Loss: {total_loss/len(train_loader):.4f} | "
        f"Val RMSE_VA: {val_rmse:.4f}"
    )

    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        patience_counter = 0
        torch.save(model.state_dict(), "best_ru_xlmr_va.pt")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered")
            break


100%|██████████| 125/125 [00:52<00:00,  2.40it/s]


Epoch 1 | Train Loss: 0.6341 | Val RMSE_VA: 2.2257


100%|██████████| 125/125 [00:51<00:00,  2.40it/s]


Epoch 2 | Train Loss: 0.6165 | Val RMSE_VA: 2.1524


100%|██████████| 125/125 [00:51<00:00,  2.41it/s]


Epoch 3 | Train Loss: 0.5891 | Val RMSE_VA: 2.0912


100%|██████████| 125/125 [00:52<00:00,  2.40it/s]


Epoch 4 | Train Loss: 0.4895 | Val RMSE_VA: 2.2236


100%|██████████| 125/125 [00:51<00:00,  2.43it/s]


Epoch 5 | Train Loss: 0.4024 | Val RMSE_VA: 1.5485


100%|██████████| 125/125 [00:51<00:00,  2.41it/s]


Epoch 6 | Train Loss: 0.3137 | Val RMSE_VA: 1.4302


100%|██████████| 125/125 [00:51<00:00,  2.41it/s]


Epoch 7 | Train Loss: 0.2658 | Val RMSE_VA: 1.6469


100%|██████████| 125/125 [00:51<00:00,  2.43it/s]


Epoch 8 | Train Loss: 0.2340 | Val RMSE_VA: 1.5242
Early stopping triggered


In [32]:
model.load_state_dict(torch.load("best_ru_xlmr_va.pt"))
model.eval()


XLMRForVA(
  (encoder): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
          

In [33]:
test_preds, test_golds = [], []

with torch.no_grad():
    for batch in test_internal_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        preds = model(input_ids, attention_mask)
        test_preds.append(preds.cpu().numpy())
        test_golds.append(labels.cpu().numpy())

test_preds = np.vstack(test_preds)
test_golds = np.vstack(test_golds)

print("=== INTERNAL TEST PERFORMANCE ===")
print("Official RMSE_VA:", rmse_va(test_golds, test_preds))


=== INTERNAL TEST PERFORMANCE ===
Official RMSE_VA: 1.479094


In [35]:
test_loader = DataLoader(
    AspectVADataset(test_df, train=False),
    batch_size=16
)

outputs = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        preds = model(input_ids, attention_mask)
        outputs.append(preds.cpu().numpy())

outputs = np.vstack(outputs)
outputs = np.clip(outputs, 1.0, 9.0)

test_df = test_df.reset_index(drop=True)
test_df["VA"] = [f"{v:.2f}#{a:.2f}" for v, a in outputs]

with open("pred_ukr_restaurant.jsonl", "w", encoding="utf-8") as f:
    for rid in test_df["id"].unique():
        group = test_df[test_df["id"] == rid]
        record = {
            "ID": rid,
            "Aspect_VA": [
                {"Aspect": row["aspect"], "VA": row["VA"]}
                for _, row in group.iterrows()
            ]
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

print("pred_ukr_restaurant.jsonl generated ✅")


pred_ukr_restaurant.jsonl generated ✅


In [36]:
from google.colab import files
files.download("pred_ukr_restaurant.jsonl")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>