In [1]:
!pip install -q transformers torch scikit-learn tqdm emoji seqeval


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [2]:
import json
import re
import unicodedata
import emoji
import numpy as np
import torch

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [49]:
TRAIN_PATH = "/content/eng_laptop_train_alltasks.jsonl"
TEST_PATH  = "/content/eng_laptop_test_task2.jsonl"


In [50]:
def load_jsonl(path):
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line))
    return data

train_raw = load_jsonl(TRAIN_PATH)
test_raw  = load_jsonl(TEST_PATH)

print("Train reviews:", len(train_raw))
print("Test reviews :", len(test_raw))


Train reviews: 4076
Test reviews : 1000


In [5]:
def clean_text(text):
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    text = emoji.demojize(text, delimiters=(" ", " "))
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [6]:
def build_bio(text, aspects, opinions):
    tokens = text.split()
    labels = ["O"] * len(tokens)

    def mark(span, tag):
        span_toks = span.split()
        for i in range(len(tokens)):
            if tokens[i:i+len(span_toks)] == span_toks:
                labels[i] = f"B-{tag}"
                for j in range(1, len(span_toks)):
                    labels[i+j] = f"I-{tag}"

    for a in aspects:
        mark(a, "ASP")
    for o in opinions:
        mark(o, "OPN")

    return labels


In [51]:
def preprocess_train(data):
    samples = []
    for item in data:
        text = clean_text(item["Text"])
        aspects, opinions = [], []

        for q in item["Quadruplet"]:
            if q["Aspect"] != "NULL":
                aspects.append(q["Aspect"])
                opinions.append(q["Opinion"])

        labels = build_bio(text, aspects, opinions)
        samples.append({"text": text, "labels": labels})

    return samples

train_data = preprocess_train(train_raw)
train_data, val_data = train_test_split(
    train_data, test_size=0.2, random_state=42
)

print("Train samples:", len(train_data))
print("Val samples  :", len(val_data))


Train samples: 3260
Val samples  : 816


In [52]:
test_data = [
    {"id": x["ID"], "text": clean_text(x["Text"])}
    for x in test_raw
]


In [53]:
MODEL_NAME = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

label2id = {"O":0, "B-ASP":1, "I-ASP":2, "B-OPN":3, "I-OPN":4}
id2label = {v:k for k,v in label2id.items()}


In [54]:
class DimASTEDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        enc = tokenizer(
            item["text"],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )

        labels = [label2id[l] for l in item["labels"]]
        labels += [0] * (128 - len(labels))

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(labels[:128])
        }


In [55]:
class DimASTEModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(MODEL_NAME)
        self.classifier = torch.nn.Linear(768, len(label2id))

    def forward(self, input_ids, attention_mask):
        out = self.encoder(input_ids, attention_mask)
        return self.classifier(out.last_hidden_state)


In [56]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DimASTEModel().to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

class_weights = torch.tensor(
    [0.02, 1.5, 1.5, 1.5, 1.5],
    device=device
)

train_loader = DataLoader(
    DimASTEDataset(train_data),
    batch_size=8,
    shuffle=True
)

for epoch in range(8):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(ids, mask)
        loss = torch.nn.functional.cross_entropy(
            logits.view(-1, len(label2id)),
            labels.view(-1),
            weight=class_weights
        )

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1} | BIO Loss: {total_loss/len(train_loader):.4f}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 408/408 [01:25<00:00,  4.79it/s]


Epoch 1 | BIO Loss: 0.4547


100%|██████████| 408/408 [01:23<00:00,  4.91it/s]


Epoch 2 | BIO Loss: 0.2638


100%|██████████| 408/408 [01:22<00:00,  4.92it/s]


Epoch 3 | BIO Loss: 0.1960


100%|██████████| 408/408 [01:22<00:00,  4.92it/s]


Epoch 4 | BIO Loss: 0.1450


100%|██████████| 408/408 [01:22<00:00,  4.92it/s]


Epoch 5 | BIO Loss: 0.1193


100%|██████████| 408/408 [01:22<00:00,  4.92it/s]


Epoch 6 | BIO Loss: 0.0970


100%|██████████| 408/408 [01:22<00:00,  4.92it/s]


Epoch 7 | BIO Loss: 0.0778


100%|██████████| 408/408 [01:22<00:00,  4.92it/s]

Epoch 8 | BIO Loss: 0.0719





In [57]:
!pip install -q spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m133.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [58]:
import spacy
nlp = spacy.load("en_core_web_sm")


In [40]:
def aspect_pos_ok(span, doc):
    span = span.lower()
    for token in doc:
        if token.text.lower() in span.split():
            # Always allow proper nouns (brands, models)
            if token.pos_ == "PROPN":
                return True
            # Allow nouns and verbs as before
            if token.pos_ in {"NOUN", "VERB"}:
                return True
    return False



def opinion_pos_ok(span, doc):
    span = span.lower()
    for token in doc:
        if token.text.lower() in span.split():
            if token.pos_ in {"ADJ", "ADV"}:
                return True
            if token.pos_ == "VERB" and token.dep_ not in {"aux", "auxpass"}:
                return True
    return False



In [59]:
STOPWORDS = set("""
a an the for of to in on and but or with as at by from
""".split())


def valid_aspect_basic(a):
    a = a.strip()
    if len(a) < 2:
        return False
    if not any(c.isalpha() for c in a):
        return False
    if a.lower() in STOPWORDS:
        return False
    return True


def extract_triplets(text, labels):
    tokens = text.split()

    aspects = []
    opinions = []

    cur = []
    cur_type = None

    # ---- BIO span extraction (with I-without-B recovery) ----
    for t, l in zip(tokens, labels):
        if l.startswith("B-"):
            if cur:
                if cur_type == "ASP":
                    aspects.append(" ".join(cur))
                elif cur_type == "OPN":
                    opinions.append(" ".join(cur))
            cur = [t]
            cur_type = l[2:]

        elif l.startswith("I-"):
            if cur_type == l[2:]:
                cur.append(t)
            else:
                cur = [t]
                cur_type = l[2:]

        else:
            if cur:
                if cur_type == "ASP":
                    aspects.append(" ".join(cur))
                elif cur_type == "OPN":
                    opinions.append(" ".join(cur))
            cur = []
            cur_type = None

    if cur:
        if cur_type == "ASP":
            aspects.append(" ".join(cur))
        elif cur_type == "OPN":
            opinions.append(" ".join(cur))

    if not aspects or not opinions:
        return []

    # ---- spaCy processing ----
    doc = nlp(text)

    # ---- Aspect filtering: noun OR verb ----
    aspects = [
        a for a in aspects
        if valid_aspect_basic(a) and aspect_pos_ok(a, doc)
    ]

    # ---- Opinion filtering: adjective OR adverb ----
    opinions = [
        o for o in opinions
        if opinion_pos_ok(o, doc)
    ]

    if not aspects or not opinions:
        return []

    # ---- Nearest-aspect pairing (stable heuristic) ----
    triplets = []

    for o in opinions:
        o_pos = text.find(o)
        if o_pos == -1:
            continue

        closest_a = None
        min_dist = float("inf")

        for a in aspects:
            a_pos = text.find(a)
            if a_pos == -1:
                continue
            dist = abs(o_pos - a_pos)
            if dist < min_dist:
                min_dist = dist
                closest_a = a

        if closest_a is not None:
            triplets.append({
                "Aspect": closest_a,
                "Opinion": o
            })

    return triplets


In [60]:
class AspectVAModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(MODEL_NAME)
        self.regressor = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask):
        out = self.encoder(input_ids, attention_mask)
        pooled = out.last_hidden_state[:,0,:]
        return self.regressor(pooled)


In [61]:
def preprocess_va(data):
    rows = []
    for item in data:
        text = clean_text(item["Text"])
        for q in item["Quadruplet"]:
            if q["Aspect"] != "NULL":
                v, a = map(float, q["VA"].split("#"))
                rows.append({
                    "text": f"<ASP>{q['Aspect']}</ASP> {text}",
                    "va": [v, a]
                })
    return rows

va_data = preprocess_va(train_raw)


In [62]:
class VADataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self): return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        enc = tokenizer(
            item["text"],
            truncation=True,
            padding="max_length",
            max_length=128,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(item["va"], dtype=torch.float)
        }


In [63]:
va_model = AspectVAModel().to(device)
va_optimizer = torch.optim.AdamW(va_model.parameters(), lr=2e-5)

va_loader = DataLoader(VADataset(va_data), batch_size=16, shuffle=True)

for epoch in range(3):
    va_model.train()
    for batch in tqdm(va_loader):
        va_optimizer.zero_grad()
        preds = va_model(
            batch["input_ids"].to(device),
            batch["attention_mask"].to(device)
        )
        loss = torch.nn.functional.smooth_l1_loss(
            preds, batch["labels"].to(device)
        )
        loss.backward()
        va_optimizer.step()

    print(f"VA Epoch {epoch+1} done")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 283/283 [01:37<00:00,  2.91it/s]


VA Epoch 1 done


100%|██████████| 283/283 [01:36<00:00,  2.92it/s]


VA Epoch 2 done


100%|██████████| 283/283 [01:36<00:00,  2.93it/s]

VA Epoch 3 done





In [64]:
def predict_va(text, aspect):
    inp = f"<ASP>{aspect}</ASP> {text}"
    enc = tokenizer(inp, return_tensors="pt", truncation=True)
    with torch.no_grad():
        va = va_model(
            enc["input_ids"].to(device),
            enc["attention_mask"].to(device)
        )[0].cpu().numpy()
    return np.clip(va, 1.0, 9.0)

model.eval()
va_model.eval()

with open("pred_eng_laptop.jsonl", "w", encoding="utf-8") as f:
    for item in test_data:
        enc = tokenizer(item["text"], return_tensors="pt")
        logits = model(
            enc["input_ids"].to(device),
            enc["attention_mask"].to(device)
        )

        probs = torch.softmax(logits, dim=-1)[0]
        labels = []
        for p in probs:
            if p[label2id["B-ASP"]] > 0.30:
                labels.append("B-ASP")
            elif p[label2id["B-OPN"]] > 0.30:
                labels.append("B-OPN")
            else:
                labels.append("O")

        triplets = extract_triplets(item["text"], labels)

        out = []
        for t in triplets:
            va = predict_va(item["text"], t["Aspect"])
            out.append({
                "Aspect": t["Aspect"],
                "Opinion": t["Opinion"],
                "VA": f"{va[0]:.2f}#{va[1]:.2f}"
            })

        f.write(json.dumps({"ID": item["id"], "Triplet": out}) + "\n")

print("pred_eng_laptop.jsonl generated ✅")


pred_eng_laptop.jsonl generated ✅


In [65]:
# =========================
# cF1 VALIDATION CELL
# =========================

D_MAX = 128.0

def va_distance(pred_va, gold_va):
    return ((pred_va[0]-gold_va[0])**2 + (pred_va[1]-gold_va[1])**2) / D_MAX

def ctp(pred_va, gold_va):
    return max(0.0, 1.0 - va_distance(pred_va, gold_va))


# ---- Build GOLD triplets from train_raw ----
gold_triplets = {}

for item in train_raw:
    text = clean_text(item["Text"])
    trips = []

    for q in item["Quadruplet"]:
        if q["Aspect"] == "NULL":
            continue
        v, a = map(float, q["VA"].split("#"))
        trips.append({
            "Aspect": q["Aspect"].strip(),
            "Opinion": q["Opinion"].strip(),
            "VA": np.array([v, a])
        })

    gold_triplets[text] = trips


# ---- Predict triplets on VAL data ----
def predict_triplets_val(val_data):
    preds = {}

    model.eval()
    va_model.eval()

    with torch.no_grad():
        for item in val_data:
            text = item["text"]

            enc = tokenizer(text, return_tensors="pt", truncation=True)
            logits = model(
                enc["input_ids"].to(device),
                enc["attention_mask"].to(device)
            )

            probs = torch.softmax(logits, dim=-1)[0]
            labels = []

            for p in probs:
                if p[label2id["B-ASP"]] > 0.25:
                    labels.append("B-ASP")
                elif p[label2id["B-OPN"]] > 0.25:
                    labels.append("B-OPN")
                else:
                    labels.append("O")

            triplets = extract_triplets(text, labels)

            out = []
            for t in triplets:
                va = predict_va(text, t["Aspect"])
                out.append({
                    "Aspect": t["Aspect"],
                    "Opinion": t["Opinion"],
                    "VA": va
                })

            preds[text] = out

    return preds


pred_triplets = predict_triplets_val(val_data)


# ---- Compute official cF1 ----
def compute_cF1(preds, golds):
    ctp_sum = 0.0
    pred_count = 0
    gold_count = 0

    for text in golds:
        gold_trips = golds[text]
        pred_trips = preds.get(text, [])

        gold_count += len(gold_trips)
        pred_count += len(pred_trips)

        used = set()
        for p in pred_trips:
            for i, g in enumerate(gold_trips):
                if i in used:
                    continue
                if p["Aspect"] == g["Aspect"] and p["Opinion"] == g["Opinion"]:
                    ctp_sum += ctp(p["VA"], g["VA"])
                    used.add(i)
                    break

    if pred_count == 0 or gold_count == 0:
        return 0.0, 0.0, 0.0

    cP = ctp_sum / pred_count
    cR = ctp_sum / gold_count
    cF1 = 2 * cP * cR / (cP + cR) if (cP + cR) > 0 else 0.0

    return cP, cR, cF1


# ---- Run evaluation ----
cP, cR, cF1 = compute_cF1(pred_triplets, gold_triplets)

print("=== Subtask-2 Validation Performance (cF1) ===")
print(f"cPrecision : {cP:.4f}")
print(f"cRecall    : {cR:.4f}")
print(f"cF1        : {cF1:.4f}")

# ---- Debug coverage ----
total_pred = sum(len(v) for v in pred_triplets.values())
total_gold = sum(len(v) for v in gold_triplets.values())

print("\nTriplet coverage:")
print("Predicted triplets:", total_pred)
print("Gold triplets     :", total_gold)
print("Recall ceiling    :", total_pred / total_gold)


=== Subtask-2 Validation Performance (cF1) ===
cPrecision : 0.2437
cRecall    : 0.0514
cF1        : 0.0849

Triplet coverage:
Predicted triplets: 949
Gold triplets     : 4500
Recall ceiling    : 0.21088888888888888


In [66]:
from google.colab import files
files.download("pred_eng_laptop.jsonl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>