In [1]:
!pip install -q transformers torch scikit-learn tqdm emoji seqeval


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [9]:
!pip install fugashi ipadic unidic_lite

Collecting unidic_lite
  Downloading unidic-lite-1.0.8.tar.gz (47.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.4/47.4 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: unidic_lite
  Building wheel for unidic_lite (setup.py) ... [?25l[?25hdone
  Created wheel for unidic_lite: filename=unidic_lite-1.0.8-py3-none-any.whl size=47658817 sha256=ab11e22df6eb506b9974fe918bbbb24cfdf6f7822c6429d0ecea7c2b331c3429
  Stored in directory: /root/.cache/pip/wheels/5e/1f/0f/4d43887e5476d956fae828ee9b6687becd5544d68b51ed633d
Successfully built unidic_lite
Installing collected packages: unidic_lite
Successfully installed unidic_lite-1.0.8


In [2]:
import json
import re
import unicodedata
import emoji
import numpy as np
import torch

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [21]:
TRAIN_PATH = "/content/rus_restaurant_train_alltasks.jsonl"
TEST_PATH  = "/content/rus_restaurant_test_task2.jsonl"


In [22]:
def load_jsonl(path):
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line))
    return data

train_raw = load_jsonl(TRAIN_PATH)
test_raw  = load_jsonl(TEST_PATH)

print("Train reviews:", len(train_raw))
print("Test reviews :", len(test_raw))


Train reviews: 1240
Test reviews : 630


In [23]:
def clean_text(text):
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    text = emoji.demojize(text, delimiters=(" ", " "))
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [42]:
# %% [RUSSIAN LEMMATIZATION — PYTHON 3.12 SAFE]
!pip install -q natasha
from natasha import Segmenter, MorphVocab, NewsEmbedding, NewsMorphTagger, Doc

segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

def lemmatize_ru(text):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    return " ".join(t.lemma for t in doc.tokens if t.lemma.isalpha())


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.7/46.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [24]:
# %%
MODEL_NAME = "DeepPavlov/rubert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

label2id = {"O":0, "B-ASP":1, "I-ASP":2, "B-OPN":3, "I-OPN":4}
id2label = {v:k for k,v in label2id.items()}


tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [25]:
# %%
def build_bio(tokens, aspects, opinions):
    labels = ["O"] * len(tokens)

    def mark(span, tag):
        span_tokens = span.split()
        for i in range(len(tokens) - len(span_tokens) + 1):
            if tokens[i:i+len(span_tokens)] == span_tokens:
                labels[i] = f"B-{tag}"
                for j in range(1, len(span_tokens)):
                    labels[i+j] = f"I-{tag}"

    for a in aspects:
        mark(a, "ASP")
    for o in opinions:
        mark(o, "OPN")

    return labels


In [26]:
# %%
def preprocess_train(data):
    rows = []
    for it in data:
        text = clean_text(it["Text"])
        aspects, opinions = [], []

        for q in it["Quadruplet"]:
            if q["Aspect"] != "NULL":
                aspects.append(q["Aspect"])
                opinions.append(q["Opinion"])

        tokens = text.split()
        if not tokens:
            continue

        labels = build_bio(tokens, aspects, opinions)
        if len(labels) != len(tokens):
            labels = ["O"] * len(tokens)

        rows.append({"text": text, "labels": labels})

    return rows

train_data = preprocess_train(train_raw)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

print("Train:", len(train_data), "Val:", len(val_data))




Train: 992 Val: 248


In [27]:
# %%
test_data = [{"id": x["ID"], "text": clean_text(x["Text"])} for x in test_raw]


In [28]:
# %%
class DimASTEDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        it = self.data[idx]
        enc = tokenizer(
            it["text"],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )

        labels = [label2id[l] for l in it["labels"]] + [0]*(128-len(it["labels"]))

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(labels[:128])
        }


In [29]:
# %%
class DimASTEModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(MODEL_NAME)
        self.classifier = torch.nn.Linear(768, len(label2id))

    def forward(self, ids, mask):
        return self.classifier(self.encoder(ids, mask).last_hidden_state)


In [30]:
# %%
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DimASTEModel().to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
class_weights = torch.tensor([0.02,1.5,1.5,1.5,1.5], device=device)

loader = DataLoader(DimASTEDataset(train_data), batch_size=8, shuffle=True)

BIO_EPOCHS = 8
for e in range(BIO_EPOCHS):
    model.train()
    total = 0
    for b in tqdm(loader):
        optimizer.zero_grad()
        logits = model(b["input_ids"].to(device), b["attention_mask"].to(device))
        loss = torch.nn.functional.cross_entropy(
            logits.view(-1, len(label2id)),
            b["labels"].to(device).view(-1),
            weight=class_weights
        )
        loss.backward()
        optimizer.step()
        total += loss.item()
    print(f"BIO Epoch {e+1}/{BIO_EPOCHS} | Loss {total/len(loader):.4f}")


pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  2%|▏         | 2/124 [00:00<00:31,  3.92it/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

100%|██████████| 124/124 [00:30<00:00,  4.03it/s]


BIO Epoch 1/8 | Loss 0.9589


100%|██████████| 124/124 [00:28<00:00,  4.30it/s]


BIO Epoch 2/8 | Loss 0.6377


100%|██████████| 124/124 [00:30<00:00,  4.04it/s]


BIO Epoch 3/8 | Loss 0.4644


100%|██████████| 124/124 [00:30<00:00,  4.12it/s]


BIO Epoch 4/8 | Loss 0.3432


100%|██████████| 124/124 [00:29<00:00,  4.26it/s]


BIO Epoch 5/8 | Loss 0.2475


100%|██████████| 124/124 [00:28<00:00,  4.30it/s]


BIO Epoch 6/8 | Loss 0.1745


100%|██████████| 124/124 [00:30<00:00,  4.05it/s]


BIO Epoch 7/8 | Loss 0.1317


100%|██████████| 124/124 [00:28<00:00,  4.28it/s]

BIO Epoch 8/8 | Loss 0.0999





In [31]:
# %%
def decode_with_smoothing(probs, b_th, i_th):
    labels = []
    prev = "O"
    for p in probs:
        if p[label2id["B-ASP"]] > b_th:
            lab = "B-ASP"
        elif p[label2id["B-OPN"]] > b_th:
            lab = "B-OPN"
        elif prev.startswith("B") and p[label2id["I-"+prev[2:]]] > i_th:
            lab = "I-"+prev[2:]
        else:
            lab = "O"
        labels.append(lab)
        prev = lab
    return labels


In [51]:
# %% [EXTRACT TRIPLETS — WINDOW + LEMMA]

def extract_triplets(text, labels):
    tokens = text.split()
    aspects, opinions = [], []
    cur, cur_t = [], None

    for tok, lab in zip(tokens, labels):
        if lab.startswith("B-"):
            if cur:
                if cur_t == "ASP":
                    aspects.append(" ".join(cur))
                else:
                    opinions.append(" ".join(cur))
            cur = [tok]
            cur_t = lab[2:]
        elif lab.startswith("I-") and cur_t == lab[2:]:
            cur.append(tok)
        else:
            if cur:
                if cur_t == "ASP":
                    aspects.append(" ".join(cur))
                else:
                    opinions.append(" ".join(cur))
            cur, cur_t = [], None

    if cur:
        if cur_t == "ASP":
            aspects.append(" ".join(cur))
        else:
            opinions.append(" ".join(cur))

    pairs = []
    for o in opinions:
        o_pos = text.find(o)
        if o_pos == -1:
            continue

        best_a = None
        best_dist = 1e9

        for a in aspects:
            a_pos = text.find(a)
            if a_pos == -1:
                continue

            dist = abs(o_pos - a_pos)
            if dist < best_dist:
                best_dist = dist
                best_a = a

        if best_a is not None:
            pairs.append({
                "Aspect": lemmatize_ru(best_a),
                "Opinion": lemmatize_ru(o)
            })


    return pairs


In [44]:
# %%
best_th, best_f1 = 0.3, 0

model.eval()
with torch.no_grad():
    for th in np.arange(0.15, 0.45, 0.05):
        tp = fp = fn = 0
        for it in val_data:
            enc = tokenizer(it["text"], return_tensors="pt", truncation=True)
            probs = torch.softmax(
                model(enc["input_ids"].to(device), enc["attention_mask"].to(device))[0], -1
            )
            labs = decode_with_smoothing(probs, th-0.05, th-0.10)
            preds = {(p["Aspect"], p["Opinion"]) for p in extract_triplets(it["text"], labs)}

            gold = set()
            for r in train_raw:
                if clean_text(r["Text"]) == it["text"]:
                    gold = {(q["Aspect"], q["Opinion"]) for q in r["Quadruplet"] if q["Aspect"]!="NULL"}
                    break

            tp += len(preds & gold)
            fp += len(preds - gold)
            fn += len(gold - preds)

        f1 = 2*tp/(2*tp+fp+fn+1e-9)
        if f1 > best_f1:
            best_f1, best_th = f1, th

print("Best threshold:", best_th)


Best threshold: 0.15


In [45]:
# %%
class AspectVAModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(MODEL_NAME)
        self.regressor = torch.nn.Linear(768, 2)

    def forward(self, ids, mask):
        pooled = self.encoder(ids, mask).last_hidden_state[:,0]
        return self.regressor(pooled)


In [46]:
# %%
def preprocess_va(data):
    rows = []
    for it in data:
        text = clean_text(it["Text"])
        for q in it["Quadruplet"]:
            if q["Aspect"] != "NULL":
                v,a = map(float, q["VA"].split("#"))
                rows.append({
                    "text": f"<ASP>{q['Aspect']}</ASP> {text}",
                    "va": torch.tensor([v,a], dtype=torch.float)
                })
    return rows

va_data = preprocess_va(train_raw)


In [47]:
# %%
class VADataset(Dataset):
    def __init__(self, data): self.data = data
    def __len__(self): return len(self.data)
    def __getitem__(self, idx):
        it = self.data[idx]
        enc = tokenizer(it["text"], truncation=True, padding="max_length", max_length=128, return_tensors="pt")
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": it["va"]
        }


In [48]:
# %%
va_model = AspectVAModel().to(device)
va_opt = torch.optim.AdamW(va_model.parameters(), lr=2e-5)

va_loader = DataLoader(VADataset(va_data), batch_size=16, shuffle=True)

VA_EPOCHS = 3
for e in range(VA_EPOCHS):
    va_model.train()
    total = 0
    for b in tqdm(va_loader):
        va_opt.zero_grad()
        preds = va_model(b["input_ids"].to(device), b["attention_mask"].to(device))
        loss = torch.nn.functional.smooth_l1_loss(preds, b["labels"].to(device))
        loss.backward()
        va_opt.step()
        total += loss.item()
    print(f"VA Epoch {e+1}/{VA_EPOCHS} | Loss {total/len(va_loader):.4f}")


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 156/156 [01:01<00:00,  2.52it/s]


VA Epoch 1/3 | Loss 0.9064


100%|██████████| 156/156 [01:01<00:00,  2.53it/s]


VA Epoch 2/3 | Loss 0.4272


100%|██████████| 156/156 [01:01<00:00,  2.56it/s]

VA Epoch 3/3 | Loss 0.2754





In [54]:
# %%
model.eval(); va_model.eval()

with open("pred_rus_restaurant.jsonl", "w", encoding="utf-8") as f:
    with torch.no_grad():
        for it in test_data:
            enc = tokenizer(it["text"], return_tensors="pt", truncation=True)
            probs = torch.softmax(
                model(enc["input_ids"].to(device), enc["attention_mask"].to(device))[0], -1
            )
            labs = decode_with_smoothing(probs, best_th+0.05, best_th)
            trips = extract_triplets(it["text"], labs)

            out = []
            for t in trips:
                enc2 = tokenizer(f"<ASP>{t['Aspect']}</ASP> {it['text']}", return_tensors="pt", truncation=True)
                va = va_model(enc2["input_ids"].to(device), enc2["attention_mask"].to(device))[0].cpu().numpy()
                va = np.clip(va, 1.5, 8.5)

                out.append({
                    "Aspect": t["Aspect"],
                    "Opinion": t["Opinion"],
                    "VA": f"{va[0]:.2f}#{va[1]:.2f}"
                })

            f.write(json.dumps({"ID": it["id"], "Triplet": out}, ensure_ascii=False) + "\n")

print("pred_rus_restaurant.jsonl generated ✅")


pred_rus_restaurant.jsonl generated ✅


In [56]:
# %% [OFFICIAL cF1 EVALUATION — RUSSIAN]

import numpy as np
import torch

# -------------------------
# Official metric helpers
# -------------------------
D_MAX = 128.0

def va_distance(pred_va, gold_va):
    return ((pred_va[0] - gold_va[0])**2 + (pred_va[1] - gold_va[1])**2) / D_MAX

def ctp(pred_va, gold_va):
    return max(0.0, 1.0 - va_distance(pred_va, gold_va))

# -------------------------
# Build GOLD triplets
# -------------------------
gold_triplets = {}

for item in train_raw:
    text = clean_text(item["Text"])
    trips = []

    for q in item["Quadruplet"]:
        if q["Aspect"] == "NULL":
            continue
        v, a = map(float, q["VA"].split("#"))
        trips.append({
            "Aspect": lemmatize_ru(q["Aspect"]),
            "Opinion": lemmatize_ru(q["Opinion"]),
            "VA": np.array([v, a])
        })

    gold_triplets[text] = trips

# -------------------------
# Predict triplets on VAL
# -------------------------
def predict_triplets_val(val_data):
    preds = {}
    model.eval()
    va_model.eval()

    with torch.no_grad():
        for item in val_data:
            text = item["text"]

            enc = tokenizer(text, return_tensors="pt", truncation=True)
            logits = model(
                enc["input_ids"].to(device),
                enc["attention_mask"].to(device)
            )

            probs = torch.softmax(logits[0], dim=-1)
            labels = decode_with_smoothing(
                probs,
                best_th +0.05,
                best_th
            )

            triplets = extract_triplets(text, labels)

            out = []
            for t in triplets:
                enc2 = tokenizer(
                    f"<ASP>{t['Aspect']}</ASP> {text}",
                    return_tensors="pt",
                    truncation=True
                )
                va = va_model(
                    enc2["input_ids"].to(device),
                    enc2["attention_mask"].to(device)
                )[0].cpu().numpy()

                out.append({
                    "Aspect": t["Aspect"],
                    "Opinion": t["Opinion"],
                    "VA": va
                })

            preds[text] = out

    return preds

pred_triplets = predict_triplets_val(val_data)

# -------------------------
# Compute official cF1
# -------------------------
def compute_cF1(preds, golds):
    ctp_sum = 0.0
    pred_count = 0
    gold_count = 0

    for text in golds:
        gold_trips = golds[text]
        pred_trips = preds.get(text, [])

        gold_count += len(gold_trips)
        pred_count += len(pred_trips)

        used = set()
        for p in pred_trips:
            for i, g in enumerate(gold_trips):
                if i in used:
                    continue
                if p["Aspect"] == g["Aspect"] and p["Opinion"] == g["Opinion"]:
                    ctp_sum += ctp(p["VA"], g["VA"])
                    used.add(i)
                    break

    if pred_count == 0 or gold_count == 0:
        return 0.0, 0.0, 0.0

    cP = ctp_sum / pred_count
    cR = ctp_sum / gold_count
    cF1 = 2 * cP * cR / (cP + cR) if (cP + cR) > 0 else 0.0

    return cP, cR, cF1

# -------------------------
# Run evaluation
# -------------------------
cP, cR, cF1 = compute_cF1(pred_triplets, gold_triplets)

print("=== OFFICIAL Subtask-2 Validation (cF1) — RUSSIAN ===")
print(f"cPrecision : {cP:.4f}")
print(f"cRecall    : {cR:.4f}")
print(f"cF1        : {cF1:.4f}")

# -------------------------
# Coverage diagnostics
# -------------------------
total_pred = sum(len(v) for v in pred_triplets.values())
total_gold = sum(len(v) for v in gold_triplets.values())

print("\nTriplet coverage:")
print("Predicted triplets:", total_pred)
print("Gold triplets     :", total_gold)
print("Recall ceiling    :", total_pred / max(1, total_gold))


=== OFFICIAL Subtask-2 Validation (cF1) — RUSSIAN ===
cPrecision : 0.0458
cRecall    : 0.0164
cF1        : 0.0242

Triplet coverage:
Predicted triplets: 892
Gold triplets     : 2487
Recall ceiling    : 0.3586650583031765


In [57]:
from google.colab import files
files.download("pred_rus_restaurant.jsonl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>