In [1]:
!pip install transformers peft datasets sentencepiece accelerate tqdm -q


In [2]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used
# below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from peft import LoraConfig, get_peft_model

# JSONL inputs on the mounted Drive: the training split and the dev split
# that predictions are generated for.
train_path = "/content/drive/MyDrive/Colab Notebooks/eng_restaurant_train_alltasks.jsonl"
dev_path   = "/content/drive/MyDrive/Colab Notebooks/eng_restaurant_dev_task3.jsonl"

def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at the end of the file) carry
            # no record; json.loads("") would raise, so skip them.
            if not line:
                continue
            rows.append(json.loads(line))
    return rows

# Parse both splits in one go (train first, then dev); the trailing tuple is
# the cell's displayed value and shows how many rows each split contains.
train_data, dev_data = (load_jsonl(split_path) for split_path in (train_path, dev_path))

(len(train_data), len(dev_data))


(2284, 200)

In [4]:
def extract_quadruplets(item):
    """Return the usable quadruplets of one training record.

    A record without a "Quadruplet" key yields an empty list. Entries whose
    "Opinion" equals "NULL" (compared case-insensitively) are skipped. Every
    kept entry is copied down to exactly the four keys
    "Aspect", "Category", "Opinion" and "VA".
    """
    if "Quadruplet" not in item:
        return []

    wanted_keys = ("Aspect", "Category", "Opinion", "VA")
    return [
        {key: entry[key] for key in wanted_keys}
        for entry in item["Quadruplet"]
        if entry.get("Opinion", "").upper() != "NULL"
    ]


# Keep only the training rows that still have at least one quadruplet after
# filtering, reduced to the three fields used downstream.
train_for_sub3 = [
    {"ID": example["ID"], "Text": example["Text"], "Quadruplet": kept_quads}
    for example, kept_quads in (
        (candidate, extract_quadruplets(candidate)) for candidate in train_data
    )
    if len(kept_quads) > 0
]

print("Original training rows:", len(train_data))
print("Usable Subtask-3 rows:", len(train_for_sub3))


Original training rows: 2284
Usable Subtask-3 rows: 1812


In [None]:
def quad_to_text(quads):
    """Serialize quadruplets into the flat target string the model is trained on.

    Each quadruplet becomes
    "<aspect> A <category> C <opinion> O <va> V </s>" and the pieces are joined
    by single spaces. An empty input yields "".

    NOTE(review): "</s>" is also the end-of-sequence token string of T5-style
    tokenizers, so it may be encoded as EOS inside the target and be stripped
    again by decode(skip_special_tokens=True) — confirm this separator is
    intended before changing the format; the parser must change in step.
    """
    pieces = [
        f"<aspect> {quad['Aspect']} "
        f"<category> {quad['Category']} "
        f"<opinion> {quad['Opinion']} "
        f"<va> {quad['VA']} </s>"
        for quad in quads
    ]
    return " ".join(pieces)


In [None]:
# Tokenizer for the google/mt5-base checkpoint; used as a module-level global
# by `collate` and by the inference loop.
# NOTE(review): the recorded output warns that the checkpoint declares
# 'T5Tokenizer' while MT5Tokenizer is used here — confirm tokenization is as expected.
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-base")

class QuadDataset(Dataset):
    """Dataset yielding (source text, serialized quadruplet target) pairs."""

    def __init__(self, data):
        # Indexable collection of records; each record is read through its
        # "Text" and "Quadruplet" keys in __getitem__.
        self.data = data

    def __len__(self):
        """Number of records in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the record's text and its quadruplets rendered by quad_to_text."""
        record = self.data[idx]
        source_text = record["Text"]
        target_text = quad_to_text(record["Quadruplet"])
        return source_text, target_text

def collate(batch):
    """Turn a list of (source, target) string pairs into one model-ready batch.

    Sources and targets are tokenized separately with the module-level
    `tokenizer` (padded to the longest item, truncated to 256 tokens, returned
    as PyTorch tensors). Padding positions in the target ids are replaced by
    -100 and the result is stored under "labels" in the source encoding, which
    is returned.
    """
    sources, targets = zip(*batch)
    tok_kwargs = dict(padding=True, truncation=True, max_length=256, return_tensors="pt")

    features = tokenizer(list(sources), **tok_kwargs)
    target_ids = tokenizer(list(targets), **tok_kwargs).input_ids

    # -100 is the default ignore index of PyTorch's cross-entropy loss.
    features["labels"] = target_ids.masked_fill(target_ids == tokenizer.pad_token_id, -100)
    return features

# Shuffled mini-batches of 4 examples over the filtered training rows;
# `collate` tokenizes each batch on the fly.
train_dataset = QuadDataset(train_for_sub3)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
You are using the default legacy behaviour of the <class 'transformers.models.mt5.tokenization_mt5.MT5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Report the choice up front; the recorded output of this cell shows this line.
print("Device:", device)

# Pretrained base model that the LoRA adapters are attached to.
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-base")

# LoRA adapter settings: rank 16, alpha 32, 5% dropout, applied to the modules
# named "q" and "v" (presumably the attention query/value projections —
# TODO confirm against the model's module names).
lora = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    task_type="SEQ_2_SEQ_LM"
)

# Wrap the base model with the adapters, move it to the chosen device and
# print how many parameters remain trainable.
model = get_peft_model(model, lora)
model.to(device)
model.print_trainable_parameters()


Device: cuda


pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 1,769,472 || all params: 584,170,752 || trainable%: 0.3029


In [None]:
# Optimize only the parameters that still require gradients.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=2e-4)

epochs = 20
model.train()

for ep in range(epochs):
    # Running SUM of the per-batch losses for this epoch (not the mean).
    total = 0.0
    for batch in tqdm(train_loader, desc=f"Epoch {ep+1}/{epochs}"):
        # Move every tensor of the batch to the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        out = model(**batch)
        out.loss.backward()
        optimizer.step()
        total += out.loss.item()
    # Name the epoch in the log line so successive epochs can be told apart;
    # this is the format of the recorded output ("Epoch 1 Loss: 3699.1533").
    print(f"Epoch {ep + 1} Loss: {total:.4f}")


Epoch 1/20:   0%|          | 0/453 [00:00<?, ?it/s]



Epoch 1 Loss: 3699.1533


Epoch 2/20:   0%|          | 0/453 [00:00<?, ?it/s]

Epoch 2 Loss: 1986.6254


Epoch 3/20:   0%|          | 0/453 [00:00<?, ?it/s]

Epoch 3 Loss: 1160.8622


Epoch 4/20:   0%|          | 0/453 [00:00<?, ?it/s]

Epoch 4 Loss: 758.5693


Epoch 5/20:   0%|          | 0/453 [00:00<?, ?it/s]

Epoch 5 Loss: 589.2099


Epoch 6/20:   0%|          | 0/453 [00:00<?, ?it/s]

Epoch 6 Loss: 488.2555


Epoch 7/20:   0%|          | 0/453 [00:00<?, ?it/s]

Epoch 7 Loss: 435.8868


Epoch 8/20:   0%|          | 0/453 [00:00<?, ?it/s]

Epoch 8 Loss: 406.0157


Epoch 9/20:   0%|          | 0/453 [00:00<?, ?it/s]

Epoch 9 Loss: 382.8647


Epoch 10/20:   0%|          | 0/453 [00:00<?, ?it/s]

Epoch 10 Loss: 356.7390


Epoch 11/20:   0%|          | 0/453 [00:00<?, ?it/s]

Epoch 11 Loss: 340.7737


Epoch 12/20:   0%|          | 0/453 [00:00<?, ?it/s]

Epoch 12 Loss: 320.1462


Epoch 13/20:   0%|          | 0/453 [00:00<?, ?it/s]

Epoch 13 Loss: 310.4466


Epoch 14/20:   0%|          | 0/453 [00:00<?, ?it/s]

Epoch 14 Loss: 299.7041


Epoch 15/20:   0%|          | 0/453 [00:00<?, ?it/s]

Epoch 15 Loss: 283.8623


Epoch 16/20:   0%|          | 0/453 [00:00<?, ?it/s]

Epoch 16 Loss: 276.3731


Epoch 17/20:   0%|          | 0/453 [00:00<?, ?it/s]

Epoch 17 Loss: 268.6099


Epoch 18/20:   0%|          | 0/453 [00:00<?, ?it/s]

Epoch 18 Loss: 257.9635


Epoch 19/20:   0%|          | 0/453 [00:00<?, ?it/s]

Epoch 19 Loss: 251.2515


Epoch 20/20:   0%|          | 0/453 [00:00<?, ?it/s]

Epoch 20 Loss: 248.0381


In [None]:
# Gather every non-empty category label found in the training quadruplets,
# upper-cased, into a set; rows without a "Quadruplet" key contribute nothing.
category_set = {
    quad.get("Category").upper()
    for example in train_data
    for quad in example.get("Quadruplet", [])
    if quad.get("Category")
}

len(category_set)


In [None]:
# Set of accepted category labels; parse_quads discards any predicted
# quadruplet whose category is not in it.
VALID_CATEGORY_SET=category_set
def snap_to_text(text, phrase):
    """Locate `phrase` in `text` and return it exactly as `text` spells it.

    Args:
        text: The string to search in.
        phrase: The span to look for.

    Returns:
        `phrase` itself when it occurs verbatim; otherwise the first slice of
        `text` (same length as `phrase`) that equals `phrase` when both are
        lower-cased; `None` when there is no such slice or `phrase` is empty.
    """
    # An empty phrase is trivially "contained" in any text but is not a usable
    # span, so report it as not found.
    if not phrase:
        return None
    if phrase in text:
        return phrase

    wanted = phrase.lower()
    width = len(phrase)
    # Compare window by window on the ORIGINAL text. Searching text.lower() and
    # reusing its index is wrong when lower-casing changes the string length
    # (e.g. "İ" lower-cases to two code points), which shifts every later index.
    for start in range(len(text) - width + 1):
        window = text[start:start + width]
        if window.lower() == wanted:
            return window
    return None

def valid_va(va):
    """Check that `va` is a well-formed "V#A" score pair.

    Args:
        va: Expected to be a string of two numbers separated by "#".

    Returns:
        True when both numbers parse as floats and each lies in [1, 9];
        False for any malformed value (wrong type, wrong number of parts,
        non-numeric parts, out-of-range numbers).
    """
    try:
        valence, arousal = map(float, va.split("#"))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: `va` is not a str; ValueError: wrong number
        # of parts or a part that is not a number. Anything else (e.g.
        # KeyboardInterrupt) is deliberately not swallowed.
        return False
    return 1 <= valence <= 9 and 1 <= arousal <= 9

In [None]:
def parse_quads(text, original_text, max_quads=3):
    """Parse generated text back into validated quadruplet dictionaries.

    The expected shape of each quadruplet is
    "<aspect> A <category> C <opinion> O <va> V", optionally followed by
    "</s>". Segmentation is done on the "<aspect>" marker rather than on
    "</s>", because "</s>" is removed from the text when it is decoded with
    skip_special_tokens=True; with "</s>" gone, several quadruplets would
    otherwise be read as one segment and fail VA validation.

    A quadruplet is kept only if its upper-cased category is in
    VALID_CATEGORY_SET, its aspect and opinion can both be located in
    `original_text` by snap_to_text, and its VA string passes valid_va.
    NOTE(review): this also drops placeholder aspects/opinions (such as a
    literal "NULL") that do not occur in the review text — confirm that is
    intended.

    Args:
        text: Decoded model output.
        original_text: The input review that aspect/opinion spans must occur in.
        max_quads: Maximum number of quadruplets returned (default 3, the
            previous fixed cap); None disables the cap.

    Returns:
        A list of dicts with keys "Aspect", "Category", "Opinion", "VA",
        de-duplicated on (Aspect, Opinion, Category) in first-seen order.
    """
    quads = []

    # Everything before the first "<aspect>" belongs to no quadruplet.
    for seg in text.split("<aspect>")[1:]:
        # Drop a trailing separator if it survived decoding.
        seg = seg.split("</s>")[0]
        try:
            asp, rest = seg.split("<category>", 1)
            cat, rest = rest.split("<opinion>", 1)
            opn, va = rest.split("<va>", 1)
        except ValueError:
            # A marker is missing or out of order: skip the malformed segment.
            continue

        cat = cat.strip().upper()
        if cat not in VALID_CATEGORY_SET:
            continue

        # Replace the generated spans by their exact spelling in the review.
        asp = snap_to_text(original_text, asp.strip())
        opn = snap_to_text(original_text, opn.strip())
        if asp is None or opn is None:
            continue

        va = va.strip()
        if not valid_va(va):
            continue

        quads.append({
            "Aspect": asp,
            "Category": cat,
            "Opinion": opn,
            "VA": va
        })

    # Deduplicate (first occurrence wins), then cap.
    seen, uniq = set(), []
    for q in quads:
        k = (q["Aspect"], q["Opinion"], q["Category"])
        if k not in seen:
            seen.add(k)
            uniq.append(q)

    return uniq[:max_quads]


In [None]:
# Switch to inference mode before generating.
model.eval()
preds = []

for item in tqdm(dev_data, desc="Generating"):
    # Truncate exactly as in training (collate uses truncation=True,
    # max_length=256) so that inference inputs match what the model was
    # trained on.
    inp = tokenizer(item["Text"], return_tensors="pt",
                    truncation=True, max_length=256).to(device)
    # Beam search with 5 beams, at most 256 generated tokens.
    out = model.generate(**inp, max_length=256, num_beams=5)
    decoded = tokenizer.decode(out[0], skip_special_tokens=True)

    # Turn the generated string into validated quadruplets for this review.
    quads = parse_quads(decoded, item["Text"])
    preds.append({"ID": item["ID"], "Quadruplet": quads})

# One JSON object per line, keeping non-ASCII characters readable.
# NOTE(review): the file name says "laptop" while the input paths are the
# restaurant splits; the name is kept because the download cell refers to it.
with open("pred_eng_laptop.jsonl", "w", encoding="utf-8") as f:
    for p in preds:
        f.write(json.dumps(p, ensure_ascii=False) + "\n")

"Saved pred_eng_laptop.jsonl"


Generating:   0%|          | 0/200 [00:00<?, ?it/s]

'Saved pred_eng_laptop.jsonl'

In [15]:
# Trigger a browser download of the predictions file written by the inference
# cell (Colab only).
from google.colab import files
files.download("pred_eng_laptop.jsonl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>