In [1]:
!pip install -q transformers torch scikit-learn tqdm emoji seqeval


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [9]:
!pip install fugashi ipadic unidic_lite

Collecting unidic_lite
  Downloading unidic-lite-1.0.8.tar.gz (47.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.4/47.4 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: unidic_lite
  Building wheel for unidic_lite (setup.py) ... [?25l[?25hdone
  Created wheel for unidic_lite: filename=unidic_lite-1.0.8-py3-none-any.whl size=47658817 sha256=ab11e22df6eb506b9974fe918bbbb24cfdf6f7822c6429d0ecea7c2b331c3429
  Stored in directory: /root/.cache/pip/wheels/5e/1f/0f/4d43887e5476d956fae828ee9b6687becd5544d68b51ed633d
Successfully built unidic_lite
Installing collected packages: unidic_lite
Successfully installed unidic_lite-1.0.8


In [2]:
import json
import re
import unicodedata
import emoji
import numpy as np
import torch

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [58]:
TRAIN_PATH = "/content/ukr_restaurant_train_alltasks.jsonl"
TEST_PATH  = "/content/ukr_restaurant_test_task2.jsonl"


In [59]:
def load_jsonl(path):
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line))
    return data

train_raw = load_jsonl(TRAIN_PATH)
test_raw  = load_jsonl(TEST_PATH)

print("Train reviews:", len(train_raw))
print("Test reviews :", len(test_raw))


Train reviews: 1240
Test reviews : 630


In [60]:
def clean_text(text):
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    text = emoji.demojize(text, delimiters=(" ", " "))
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [61]:
# %%
from natasha import Segmenter, MorphVocab, NewsEmbedding, NewsMorphTagger, Doc

segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

def lemmatize_uk(text):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    return " ".join(t.lemma for t in doc.tokens if t.lemma.isalpha())


In [62]:
# %%
MODEL_NAME = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

label2id = {"O":0, "B-ASP":1, "I-ASP":2, "B-OPN":3, "I-OPN":4}
id2label = {v:k for k,v in label2id.items()}


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [63]:
# %%
def build_bio(tokens, aspects, opinions):
    labels = ["O"] * len(tokens)

    def mark(span, tag):
        s_toks = span.split()
        for i in range(len(tokens) - len(s_toks) + 1):
            if tokens[i:i+len(s_toks)] == s_toks:
                labels[i] = f"B-{tag}"
                for j in range(1, len(s_toks)):
                    labels[i+j] = f"I-{tag}"

    for a in aspects:
        mark(a, "ASP")
    for o in opinions:
        mark(o, "OPN")

    return labels


In [64]:
# %%
def preprocess_train(data):
    rows = []
    for it in data:
        text = clean_text(it["Text"])
        aspects, opinions = [], []

        for q in it["Quadruplet"]:
            if q["Aspect"] != "NULL":
                aspects.append(q["Aspect"])
                opinions.append(q["Opinion"])

        tokens = text.split()
        if not tokens:
            continue

        labels = build_bio(tokens, aspects, opinions)
        if len(labels) != len(tokens):
            labels = ["O"] * len(tokens)

        rows.append({"text": text, "labels": labels})

    return rows

train_data = preprocess_train(train_raw)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

print("Train:", len(train_data), "Val:", len(val_data))


Train: 992 Val: 248


In [65]:
# %%
test_data = [{"id": x["ID"], "text": clean_text(x["Text"])} for x in test_raw]


In [66]:
# %%
class DimASTEDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        it = self.data[idx]
        enc = tokenizer(
            it["text"],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )
        labels = [label2id[l] for l in it["labels"]] + [0]*(128-len(it["labels"]))

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(labels[:128])
        }


In [67]:
# %%
class DimASTEModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(MODEL_NAME)
        self.classifier = torch.nn.Linear(768, len(label2id))

    def forward(self, ids, mask):
        return self.classifier(self.encoder(ids, mask).last_hidden_state)


In [68]:
# %%
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DimASTEModel().to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
class_weights = torch.tensor([0.02,1.5,1.5,1.5,1.5], device=device)

loader = DataLoader(DimASTEDataset(train_data), batch_size=8, shuffle=True)

for e in range(8):
    model.train()
    total = 0
    for b in tqdm(loader):
        optimizer.zero_grad()
        logits = model(b["input_ids"].to(device), b["attention_mask"].to(device))
        loss = torch.nn.functional.cross_entropy(
            logits.view(-1, len(label2id)),
            b["labels"].to(device).view(-1),
            weight=class_weights
        )
        loss.backward()
        optimizer.step()
        total += loss.item()
    print(f"BIO Epoch {e+1} | Loss {total/len(loader):.4f}")


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

100%|██████████| 124/124 [00:34<00:00,  3.55it/s]


BIO Epoch 1 | Loss 1.0002


100%|██████████| 124/124 [00:33<00:00,  3.68it/s]


BIO Epoch 2 | Loss 0.8010


100%|██████████| 124/124 [00:33<00:00,  3.66it/s]


BIO Epoch 3 | Loss 0.7064


100%|██████████| 124/124 [00:34<00:00,  3.62it/s]


BIO Epoch 4 | Loss 0.6065


100%|██████████| 124/124 [00:34<00:00,  3.60it/s]


BIO Epoch 5 | Loss 0.5074


100%|██████████| 124/124 [00:34<00:00,  3.57it/s]


BIO Epoch 6 | Loss 0.4362


100%|██████████| 124/124 [00:33<00:00,  3.65it/s]


BIO Epoch 7 | Loss 0.3659


100%|██████████| 124/124 [00:34<00:00,  3.58it/s]

BIO Epoch 8 | Loss 0.2951





In [69]:
# %%
def decode_with_smoothing(probs, b_th, i_th):
    labels = []
    prev = "O"
    for p in probs:
        if p[label2id["B-ASP"]] > b_th:
            lab = "B-ASP"
        elif p[label2id["B-OPN"]] > b_th:
            lab = "B-OPN"
        elif prev.startswith("B") and p[label2id["I-"+prev[2:]]] > i_th:
            lab = "I-"+prev[2:]
        else:
            lab = "O"
        labels.append(lab)
        prev = lab
    return labels


In [83]:
# %% [UKRAINIAN STRING NORMALIZATION]

import string
import re

def normalize_uk_span(s):
    if s is None:
        return ""
    s = s.strip().lower()
    s = re.sub(r"\s+", " ", s)
    s = s.strip(string.punctuation + "«»“”„")
    return s


In [87]:
# %% [SPAN VALIDATION — ADD THIS HERE]

def valid_span(s):
    if s is None:
        return False
    s = s.strip()
    if len(s) < 3:
        return False
    if not any(c.isalpha() for c in s):
        return False
    return True


In [96]:
def extract_triplets(text, probs, b_th, i_th):
    enc = tokenizer(
        text,
        return_offsets_mapping=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    offsets = enc["offset_mapping"][0].tolist()
    labels = decode_with_smoothing(probs, b_th, i_th)

    aspects, opinions = [], []
    cur_text = ""
    cur_type = None
    last_end = None

    # -------- Span reconstruction --------
    for lab, (s, e) in zip(labels, offsets):
        if s == e:
            continue

        token_text = text[s:e]

        if lab.startswith("B-"):
            if cur_text:
                if cur_type == "ASP":
                    aspects.append(cur_text.strip())
                else:
                    opinions.append(cur_text.strip())

            cur_text = token_text
            cur_type = lab[2:]
            last_end = e

        elif lab.startswith("I-") and cur_type == lab[2:]:
            if last_end == s:
                cur_text += token_text
            else:
                cur_text += " " + token_text
            last_end = e

        else:
            if cur_text:
                if cur_type == "ASP":
                    aspects.append(cur_text.strip())
                else:
                    opinions.append(cur_text.strip())
            cur_text = ""
            cur_type = None
            last_end = None

    if cur_text:
        if cur_type == "ASP":
            aspects.append(cur_text.strip())
        else:
            opinions.append(cur_text.strip())

    # -------- Drop junk spans --------
    aspects  = [a for a in aspects  if valid_span(a)]
    opinions = [o for o in opinions if valid_span(o)]

    # -------- Nearest-aspect pairing (ONLY) --------
    pairs = []

    for o in opinions:
        o_pos = text.find(o)
        if o_pos == -1:
            continue

        best_a = None
        best_dist = float("inf")

        for a in aspects:
            a_pos = text.find(a)
            if a_pos == -1:
                continue

            dist = abs(o_pos - a_pos)
            if dist < best_dist:
                best_dist = dist
                best_a = a

        if best_a is not None:
            pairs.append({
                "Aspect": normalize_uk_span(best_a),
                "Opinion": normalize_uk_span(o)
            })

    return pairs


In [90]:
# %%
class AspectVAModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(MODEL_NAME)
        self.regressor = torch.nn.Linear(768, 2)

    def forward(self, ids, mask):
        pooled = self.encoder(ids, mask).last_hidden_state[:,0]
        return self.regressor(pooled)


In [91]:
# %%
def preprocess_va(data):
    rows = []
    for it in data:
        text = clean_text(it["Text"])
        for q in it["Quadruplet"]:
            if q["Aspect"] != "NULL":
                v,a = map(float, q["VA"].split("#"))
                rows.append({
                    "text": f"<ASP>{lemmatize_uk(q['Aspect'])}</ASP> {text}",
                    "va": torch.tensor([v,a], dtype=torch.float)
                })
    return rows

va_data = preprocess_va(train_raw)
va_model = AspectVAModel().to(device)
va_opt = torch.optim.AdamW(va_model.parameters(), lr=2e-5)

va_loader = DataLoader(va_data, batch_size=16, shuffle=True)

for e in range(3):
    va_model.train()
    for b in tqdm(va_loader):
        va_opt.zero_grad()
        enc = tokenizer(b["text"], padding=True, truncation=True, return_tensors="pt")
        preds = va_model(enc["input_ids"].to(device), enc["attention_mask"].to(device))
        loss = torch.nn.functional.smooth_l1_loss(preds, b["va"].to(device))
        loss.backward()
        va_opt.step()
    print(f"VA Epoch {e+1} done")


100%|██████████| 156/156 [00:55<00:00,  2.84it/s]


VA Epoch 1 done


100%|██████████| 156/156 [00:53<00:00,  2.90it/s]


VA Epoch 2 done


100%|██████████| 156/156 [00:53<00:00,  2.91it/s]

VA Epoch 3 done





In [97]:
# %%
model.eval(); va_model.eval()

with open("pred_ukr_restaurant.jsonl", "w", encoding="utf-8") as f:
    with torch.no_grad():
        for it in test_data:
            enc = tokenizer(it["text"], return_tensors="pt", truncation=True)
            probs = torch.softmax(
                model(enc["input_ids"].to(device), enc["attention_mask"].to(device))[0], -1
            )

            trips = extract_triplets(
                it["text"], probs, b_th=0.30, i_th=0.25
            )

            out = []
            for t in trips:
                enc2 = tokenizer(
                    f"<ASP>{t['Aspect']}</ASP> {it['text']}",
                    return_tensors="pt", truncation=True
                )
                va = va_model(enc2["input_ids"].to(device), enc2["attention_mask"].to(device))[0].cpu().numpy()
                va = np.clip(va, 1.5, 8.5)

                out.append({
                    "Aspect": t["Aspect"],
                    "Opinion": t["Opinion"],
                    "VA": f"{va[0]:.2f}#{va[1]:.2f}"
                })

            f.write(json.dumps({"ID": it["id"], "Triplet": out}, ensure_ascii=False) + "\n")

print("pred_ukr_restaurant.jsonl generated ✅")


pred_ukr_restaurant.jsonl generated ✅


In [98]:
# %% [OFFICIAL cF1 EVALUATION — UKRAINIAN]

import numpy as np
import torch

# -------------------------
# Official metric helpers
# -------------------------
D_MAX = 128.0

def va_distance(pred_va, gold_va):
    return ((pred_va[0]-gold_va[0])**2 + (pred_va[1]-gold_va[1])**2) / D_MAX

def ctp(pred_va, gold_va):
    return max(0.0, 1.0 - va_distance(pred_va, gold_va))

# -------------------------
# Build GOLD triplets
# -------------------------
gold_triplets = {}

for item in train_raw:
    text = clean_text(item["Text"])
    trips = []

    for q in item["Quadruplet"]:
        if q["Aspect"] == "NULL":
            continue
        v, a = map(float, q["VA"].split("#"))
        trips.append({
            "Aspect": normalize_uk_span(q["Aspect"]),
            "Opinion": normalize_uk_span(q["Opinion"]),
            "VA": np.array([v, a])
        })

    gold_triplets[text] = trips

# -------------------------
# Predict triplets on VAL
# -------------------------
def predict_triplets_val(val_data):
    preds = {}
    model.eval()
    va_model.eval()

    with torch.no_grad():
        for item in val_data:
            text = item["text"]

            enc = tokenizer(text, return_tensors="pt", truncation=True)
            probs = torch.softmax(
                model(
                    enc["input_ids"].to(device),
                    enc["attention_mask"].to(device)
                )[0],
                dim=-1
            )

            triplets = extract_triplets(
                text,
                probs,
                b_th=0.30,
                i_th=0.25
            )

            out = []
            for t in triplets:
                enc2 = tokenizer(
                    f"<ASP>{t['Aspect']}</ASP> {text}",
                    return_tensors="pt",
                    truncation=True
                )
                va = va_model(
                    enc2["input_ids"].to(device),
                    enc2["attention_mask"].to(device)
                )[0].cpu().numpy()

                out.append({
                    "Aspect": t["Aspect"],
                    "Opinion": t["Opinion"],
                    "VA": va
                })

            preds[text] = out

    return preds

pred_triplets = predict_triplets_val(val_data)

# -------------------------
# Compute official cF1
# -------------------------
def compute_cF1(preds, golds):
    ctp_sum = 0.0
    pred_count = 0
    gold_count = 0

    for text in golds:
        gold_trips = golds[text]
        pred_trips = preds.get(text, [])

        gold_count += len(gold_trips)
        pred_count += len(pred_trips)

        used = set()
        for p in pred_trips:
            for i, g in enumerate(gold_trips):
                if i in used:
                    continue
                if p["Aspect"] == g["Aspect"] and p["Opinion"] == g["Opinion"]:
                    ctp_sum += ctp(p["VA"], g["VA"])
                    used.add(i)
                    break

    if pred_count == 0 or gold_count == 0:
        return 0.0, 0.0, 0.0

    cP = ctp_sum / pred_count
    cR = ctp_sum / gold_count
    cF1 = 2 * cP * cR / (cP + cR) if (cP + cR) > 0 else 0.0

    return cP, cR, cF1

# -------------------------
# Run evaluation
# -------------------------
cP, cR, cF1 = compute_cF1(pred_triplets, gold_triplets)

print("=== OFFICIAL Subtask-2 Validation (cF1) — UKRAINIAN ===")
print(f"cPrecision : {cP:.4f}")
print(f"cRecall    : {cR:.4f}")
print(f"cF1        : {cF1:.4f}")

# -------------------------
# Coverage diagnostics
# -------------------------
total_pred = sum(len(v) for v in pred_triplets.values())
total_gold = sum(len(v) for v in gold_triplets.values())

print("\nTriplet coverage:")
print("Predicted triplets:", total_pred)
print("Gold triplets     :", total_gold)
print("Recall ceiling    :", total_pred / max(1, total_gold))


=== OFFICIAL Subtask-2 Validation (cF1) — UKRAINIAN ===
cPrecision : 0.0013
cRecall    : 0.0003
cF1        : 0.0005

Triplet coverage:
Predicted triplets: 640
Gold triplets     : 2487
Recall ceiling    : 0.25733815842380375


In [99]:
from google.colab import files
files.download("pred_ukr_restaurant.jsonl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>