In [1]:
!pip install -q transformers torch scikit-learn tqdm emoji seqeval


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [9]:
!pip install fugashi ipadic unidic_lite

Collecting unidic_lite
  Downloading unidic-lite-1.0.8.tar.gz (47.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.4/47.4 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: unidic_lite
  Building wheel for unidic_lite (setup.py) ... [?25l[?25hdone
  Created wheel for unidic_lite: filename=unidic_lite-1.0.8-py3-none-any.whl size=47658817 sha256=ab11e22df6eb506b9974fe918bbbb24cfdf6f7822c6429d0ecea7c2b331c3429
  Stored in directory: /root/.cache/pip/wheels/5e/1f/0f/4d43887e5476d956fae828ee9b6687becd5544d68b51ed633d
Successfully built unidic_lite
Installing collected packages: unidic_lite
Successfully installed unidic_lite-1.0.8


In [2]:
import json
import re
import unicodedata
import emoji
import numpy as np
import torch

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [4]:
TRAIN_PATH = "/content/jpn_hotel_train_alltasks.jsonl"
TEST_PATH  = "/content/jpn_hotel_test_task2.jsonl"


In [5]:
def load_jsonl(path):
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line))
    return data

train_raw = load_jsonl(TRAIN_PATH)
test_raw  = load_jsonl(TEST_PATH)

print("Train reviews:", len(train_raw))
print("Test reviews :", len(test_raw))


Train reviews: 1600
Test reviews : 800


In [7]:
def clean_text(text):
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    text = emoji.demojize(text, delimiters=(" ", " "))
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [10]:
MODEL_NAME = "cl-tohoku/bert-base-japanese-v3"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


label2id = {"O":0, "B-ASP":1, "I-ASP":2, "B-OPN":3, "I-OPN":4}
id2label = {v:k for k,v in label2id.items()}

In [12]:
def build_bio(tokens, aspects, opinions):
  labels = ["O"] * len(tokens)


def mark(span, tag):
  span_tokens = tokenizer.tokenize(span)
  for i in range(len(tokens)):
    if tokens[i:i+len(span_tokens)] == span_tokens:
      labels[i] = f"B-{tag}"
    for j in range(1, len(span_tokens)):
      labels[i+j] = f"I-{tag}"


    for a in aspects:
      mark(a, "ASP")
    for o in opinions:
      mark(o, "OPN")


  return labels

In [13]:
def preprocess_train(data):
    samples = []
    for item in data:
        text = clean_text(item["Text"])
        aspects, opinions = [], []

        for q in item["Quadruplet"]:
            if q["Aspect"] != "NULL":
                aspects.append(q["Aspect"])
                opinions.append(q["Opinion"])

        labels = build_bio(text, aspects, opinions)
        samples.append({"text": text, "labels": labels})

    return samples

train_data = preprocess_train(train_raw)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)


print("Train samples:", len(train_data))
print("Val samples :", len(val_data))


Train samples: 1280
Val samples : 320


In [14]:
test_data = [
    {"id": x["ID"], "text": clean_text(x["Text"])}
    for x in test_raw
]


In [None]:
class DimASTEDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        enc = tokenizer(
            item["text"],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )

        labels = [label2id[l] for l in item["labels"]]
        labels += [0] * (128 - len(labels))

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(labels[:128])
        }


In [18]:
# %% [CELL 7] SAFE BIO construction

def build_bio(tokens, aspects, opinions):
    labels = ["O"] * len(tokens)

    def mark(span, tag):
        if not span:
            return
        span_tokens = tokenizer.tokenize(span)
        if not span_tokens:
            return
        for i in range(len(tokens) - len(span_tokens) + 1):
            if tokens[i:i+len(span_tokens)] == span_tokens:
                labels[i] = f"B-{tag}"
                for j in range(1, len(span_tokens)):
                    labels[i+j] = f"I-{tag}"

    for a in aspects:
        mark(a, "ASP")
    for o in opinions:
        mark(o, "OPN")

    return labels

# %% [CELL 8] Preprocess train (SAFE)

def preprocess_train(data):
    rows = []
    for it in data:
        text = clean_text(it['Text'])
        aspects, opinions = [], []
        for q in it['Quadruplet']:
            if q['Aspect'] != 'NULL':
                aspects.append(q['Aspect'])
                opinions.append(q['Opinion'])

        tokens = tokenizer.tokenize(text)
        if len(tokens) == 0:
            continue

        labels = build_bio(tokens, aspects, opinions)
        if labels is None or len(labels) != len(tokens):
            labels = ["O"] * len(tokens)

        rows.append({'text': text, 'labels': labels})
    return rows

train_data = preprocess_train(train_raw)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)
print("Train samples:", len(train_data), "Val samples:", len(val_data))

# %% [CELL 9] Test data
test_data = [{'id':x['ID'], 'text':clean_text(x['Text'])} for x in test_raw]

# %% [CELL 10] Dataset (SAFE)
class DimASTEDataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        it = self.data[idx]
        enc = tokenizer(it['text'], padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        labels = it.get('labels', []) or []
        label_ids = [label2id.get(l, 0) for l in labels]
        label_ids += [0] * (128 - len(label_ids))
        return {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
            'labels': torch.tensor(label_ids[:128])
        }

# %% [CELL 11] BIO model
class DimASTEModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(MODEL_NAME)
        self.classifier = torch.nn.Linear(768, len(label2id))
    def forward(self, ids, mask):
        return self.classifier(self.encoder(ids, mask).last_hidden_state)

# %% [CELL 12] Train BIO
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DimASTEModel().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
class_weights = torch.tensor([0.02,1.5,1.5,1.5,1.5], device=device)
loader = DataLoader(DimASTEDataset(train_data), batch_size=8, shuffle=True)
BIO_EPOCHS = 8
for e in range(BIO_EPOCHS):
    model.train(); total = 0
    for b in tqdm(loader):
        optimizer.zero_grad()
        logits = model(b['input_ids'].to(device), b['attention_mask'].to(device))
        loss = torch.nn.functional.cross_entropy(
            logits.view(-1, len(label2id)),
            b['labels'].to(device).view(-1),
            weight=class_weights
        )
        loss.backward(); optimizer.step()
        total += loss.item()
    print(f"BIO Epoch {e+1}/{BIO_EPOCHS} | Loss: {total/len(loader):.4f}")

# %% [CELL 13] Decode with smoothing

def decode_with_smoothing(probs, b_th, i_th):
    labels = []
    prev = 'O'
    for p in probs:
        if p[label2id['B-ASP']] > b_th:
            lab = 'B-ASP'
        elif p[label2id['B-OPN']] > b_th:
            lab = 'B-OPN'
        elif prev.startswith('B') and p[label2id['I-'+prev[2:]]] > i_th:
            lab = 'I-' + prev[2:]
        else:
            lab = 'O'
        labels.append(lab)
        prev = lab
    return labels



# %% [CELL 14] Extract triplets (NEAREST-PAIRING – CRITICAL)

def extract_triplets(text, labels):
    toks = tokenizer.tokenize(text)

    aspects = []
    opinions = []

    cur = []
    cur_type = None

    # ---- BIO span extraction ----
    for tok, lab in zip(toks, labels):
        if lab.startswith("B-"):
            if cur:
                if cur_type == "ASP":
                    aspects.append("".join(cur))
                elif cur_type == "OPN":
                    opinions.append("".join(cur))
            cur = [tok]
            cur_type = lab[2:]

        elif lab.startswith("I-") and cur_type == lab[2:]:
            cur.append(tok)

        else:
            if cur:
                if cur_type == "ASP":
                    aspects.append("".join(cur))
                elif cur_type == "OPN":
                    opinions.append("".join(cur))
            cur = []
            cur_type = None

    if cur:
        if cur_type == "ASP":
            aspects.append("".join(cur))
        elif cur_type == "OPN":
            opinions.append("".join(cur))

    if not aspects or not opinions:
        return []

    # ---- NEAREST PAIRING ----
    pairs = []
    for o in opinions:
        o_pos = text.find(o)
        if o_pos == -1:
            continue

        best_a = None
        best_dist = 1e9

        for a in aspects:
            a_pos = text.find(a)
            if a_pos == -1:
                continue

            dist = abs(o_pos - a_pos)
            if dist < best_dist:
                best_dist = dist
                best_a = a

        if best_a is not None:
            pairs.append({"Aspect": best_a, "Opinion": o})

    return pairs


# %% [CELL 15] Threshold tuning (VAL)
best_th, best_f1 = 0.3, 0
model.eval()
with torch.no_grad():
    for th in np.arange(0.15, 0.45, 0.05):
        tp = fp = fn = 0
        for it in val_data:
            enc = tokenizer(it['text'], return_tensors='pt', truncation=True)
            probs = torch.softmax(model(enc['input_ids'].to(device), enc['attention_mask'].to(device))[0], -1)
            labs = decode_with_smoothing(probs, th, th-0.05)
            preds = {(p['Aspect'], p['Opinion']) for p in extract_triplets(it['text'], labs)}
            gold = set()
            for r in train_raw:
                if clean_text(r['Text']) == it['text']:
                    gold = {(q['Aspect'], q['Opinion']) for q in r['Quadruplet'] if q['Aspect']!='NULL'}
                    break
            tp += len(preds & gold)
            fp += len(preds - gold)
            fn += len(gold - preds)
        f1 = 2*tp/(2*tp+fp+fn+1e-9)
        if f1 > best_f1:
            best_f1, best_th = f1, th
print("Best threshold:", best_th)

# %% [CELL 16] VA model
class AspectVAModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(MODEL_NAME)
        self.regressor = torch.nn.Linear(768, 2)
    def forward(self, ids, mask):
        return self.regressor(self.encoder(ids, mask).last_hidden_state[:,0])

# %% [CELL 17] VA data
def preprocess_va(data):
    rows = []
    for it in data:
        text = clean_text(it['Text'])
        for q in it['Quadruplet']:
            if q['Aspect'] != 'NULL':
                v,a = map(float, q['VA'].split('#'))
                rows.append({'text': f"<ASP>{q['Aspect']}</ASP> {text}", 'va': torch.tensor([v,a])})
    return rows
va_data = preprocess_va(train_raw)

# %% [CELL 18] VA Dataset
class VADataset(Dataset):
    def __init__(self, data): self.data=data
    def __len__(self): return len(self.data)
    def __getitem__(self, idx):
        it = self.data[idx]
        enc = tokenizer(it['text'], truncation=True, padding='max_length', max_length=128, return_tensors='pt')
        return {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
            'labels': it['va']
        }

# %% [CELL 19] Train VA
va_model = AspectVAModel().to(device)
va_opt = torch.optim.AdamW(va_model.parameters(), lr=2e-5)
va_loader = DataLoader(VADataset(va_data), batch_size=16, shuffle=True)
VA_EPOCHS = 3
for e in range(VA_EPOCHS):
    va_model.train(); total = 0
    for b in tqdm(va_loader):
        va_opt.zero_grad()
        preds = va_model(b['input_ids'].to(device), b['attention_mask'].to(device))
        loss = torch.nn.functional.smooth_l1_loss(preds, b['labels'].to(device))
        loss.backward(); va_opt.step(); total += loss.item()
    print(f"VA Epoch {e+1}/{VA_EPOCHS} | Loss: {total/len(va_loader):.4f}")

# %% [CELL 20] Final prediction JSONL
model.eval(); va_model.eval()
with open('pred_jpn_hotel.jsonl', 'w', encoding='utf-8') as f:
    with torch.no_grad():
        for it in test_data:
            enc = tokenizer(it['text'], return_tensors='pt', truncation=True)
            probs = torch.softmax(model(enc['input_ids'].to(device), enc['attention_mask'].to(device))[0], -1)
            labs = decode_with_smoothing(probs, best_th, best_th-0.05)
            trips = extract_triplets(it['text'], labs)
            out = []
            for t in trips:
                enc2 = tokenizer(f"<ASP>{t['Aspect']}</ASP> {it['text']}", return_tensors='pt', truncation=True)
                va = va_model(enc2['input_ids'].to(device), enc2['attention_mask'].to(device))[0].cpu().numpy()
                va = np.clip(va, 1.0, 9.0)
                out.append({'Aspect':t['Aspect'], 'Opinion':t['Opinion'], 'VA':f"{va[0]:.2f}#{va[1]:.2f}"})
            f.write(json.dumps({'ID':it['id'], 'Triplet':out}, ensure_ascii=False)+'\n')
print('pred_jpn_hotel.jsonl generated ✅')


Best threshold: 0.45000000000000007


100%|██████████| 158/158 [00:58<00:00,  2.69it/s]


VA Epoch 1/3 | Loss: 0.5093


100%|██████████| 158/158 [00:57<00:00,  2.73it/s]


VA Epoch 2/3 | Loss: 0.1524


100%|██████████| 158/158 [00:58<00:00,  2.72it/s]


VA Epoch 3/3 | Loss: 0.1019
pred_jpn_hotel.jsonl generated ✅


In [20]:
from google.colab import files
files.download("pred_jpn_hotel.jsonl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [19]:
# %% [OFFICIAL cF1 EVALUATION CELL – SUBTASK 2]

import numpy as np
import torch

# -------------------------
# Official metric helpers
# -------------------------
D_MAX = 128.0

def va_distance(pred_va, gold_va):
    return ((pred_va[0] - gold_va[0])**2 + (pred_va[1] - gold_va[1])**2) / D_MAX

def ctp(pred_va, gold_va):
    return max(0.0, 1.0 - va_distance(pred_va, gold_va))

# -------------------------
# Build GOLD triplets (VAL)
# -------------------------
gold_triplets = {}

for item in train_raw:
    text = clean_text(item["Text"])
    trips = []

    for q in item["Quadruplet"]:
        if q["Aspect"] == "NULL":
            continue
        v, a = map(float, q["VA"].split("#"))
        trips.append({
            "Aspect": q["Aspect"].strip(),
            "Opinion": q["Opinion"].strip(),
            "VA": np.array([v, a])
        })

    gold_triplets[text] = trips

# -------------------------
# Predict triplets on VAL
# -------------------------
def predict_triplets_val(val_data):
    preds = {}
    model.eval()
    va_model.eval()

    with torch.no_grad():
        for item in val_data:
            text = item["text"]

            enc = tokenizer(text, return_tensors="pt", truncation=True)
            logits = model(
                enc["input_ids"].to(device),
                enc["attention_mask"].to(device)
            )

            probs = torch.softmax(logits[0], dim=-1)
            labels = decode_with_smoothing(probs, best_th, best_th - 0.05)
            triplets = extract_triplets(text, labels)

            out = []
            for t in triplets:
                enc2 = tokenizer(
                    f"<ASP>{t['Aspect']}</ASP> {text}",
                    return_tensors="pt",
                    truncation=True
                )
                va = va_model(
                    enc2["input_ids"].to(device),
                    enc2["attention_mask"].to(device)
                )[0].cpu().numpy()

                out.append({
                    "Aspect": t["Aspect"],
                    "Opinion": t["Opinion"],
                    "VA": va
                })

            preds[text] = out

    return preds

pred_triplets = predict_triplets_val(val_data)

# -------------------------
# Compute official cF1
# -------------------------
def compute_cF1(preds, golds):
    ctp_sum = 0.0
    pred_count = 0
    gold_count = 0

    for text in golds:
        gold_trips = golds[text]
        pred_trips = preds.get(text, [])

        gold_count += len(gold_trips)
        pred_count += len(pred_trips)

        used = set()
        for p in pred_trips:
            for i, g in enumerate(gold_trips):
                if i in used:
                    continue
                if p["Aspect"] == g["Aspect"] and p["Opinion"] == g["Opinion"]:
                    ctp_sum += ctp(p["VA"], g["VA"])
                    used.add(i)
                    break

    if pred_count == 0 or gold_count == 0:
        return 0.0, 0.0, 0.0

    cP = ctp_sum / pred_count
    cR = ctp_sum / gold_count
    cF1 = 2 * cP * cR / (cP + cR) if (cP + cR) > 0 else 0.0

    return cP, cR, cF1

# -------------------------
# Run evaluation
# -------------------------
cP, cR, cF1 = compute_cF1(pred_triplets, gold_triplets)

print("=== OFFICIAL Subtask-2 Validation (cF1) ===")
print(f"cPrecision : {cP:.4f}")
print(f"cRecall    : {cR:.4f}")
print(f"cF1        : {cF1:.4f}")

# Coverage diagnostics
total_pred = sum(len(v) for v in pred_triplets.values())
total_gold = sum(len(v) for v in gold_triplets.values())

print("\nTriplet coverage:")
print("Predicted triplets:", total_pred)
print("Gold triplets     :", total_gold)
print("Recall ceiling    :", total_pred / max(1, total_gold))


=== OFFICIAL Subtask-2 Validation (cF1) ===
cPrecision : 0.2808
cRecall    : 0.0540
cF1        : 0.0906

Triplet coverage:
Predicted triplets: 483
Gold triplets     : 2512
Recall ceiling    : 0.19227707006369427
