In [18]:
!pip install -q transformers torch scikit-learn tqdm emoji fugashi ipadic


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/13.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/13.4 MB[0m [31m166.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━[0m [32m11.1/13.4 MB[0m [31m158.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m13.4/13.4 MB[0m [31m156.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m99.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m694.9/694.9 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for ipadic (setup.py) ... [?25l[?25hdone


In [19]:
import json
import re
import unicodedata
import emoji
import numpy as np
import pandas as pd
import torch

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [20]:
TRAIN_PATH = "/content/jpn_hotel_train_alltasks.jsonl"
TEST_PATH  = "/content/jpn_hotel_test_task1.jsonl"

def load_jsonl(path):
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line))
    return data

train_raw = load_jsonl(TRAIN_PATH)
test_raw  = load_jsonl(TEST_PATH)

print("Train reviews:", len(train_raw))
print("Test reviews :", len(test_raw))


Train reviews: 1600
Test reviews : 800


In [21]:
def normalize_text(text):
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def remove_urls_html(text):
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    return text

def handle_emojis(text):
    return emoji.demojize(text, delimiters=(" ", " "))

def clip_va(v, a):
    v = min(max(v, 1.0), 9.0)
    a = min(max(a, 1.0), 9.0)
    return round(v, 2), round(a, 2)


In [22]:
def mark_aspect_ja(text, aspect):
    if aspect in text:
        return text.replace(aspect, f"<ASP>{aspect}</ASP>")
    return text


In [23]:
def preprocess_train(data):
    rows = []

    for item in data:
        # --- normalize Japanese text ---
        text = normalize_text(item["Text"])
        text = remove_urls_html(text)
        text = handle_emojis(text)

        # --- Quadruplet-based training ---
        if "Quadruplet" not in item:
            continue

        for quad in item["Quadruplet"]:
            aspect = quad.get("Aspect", "NULL")

            # Skip implicit aspects
            if aspect == "NULL":
                continue

            # Mark aspect in Japanese text
            text_marked = mark_aspect_ja(text, aspect)

            # Parse VA
            v, a = map(float, quad["VA"].split("#"))
            v, a = clip_va(v, a)

            rows.append({
                "text": text_marked,
                "aspect": aspect,
                "valence": v,
                "arousal": a
            })

    return pd.DataFrame(rows)


def preprocess_test(data):
    rows = []

    for item in data:
        text = normalize_text(item["Text"])
        text = remove_urls_html(text)
        text = handle_emojis(text)

        for asp in item["Aspect"]:
            text_marked = mark_aspect_ja(text, asp)

            rows.append({
                "id": item["ID"],
                "text": text_marked,
                "aspect": asp
            })

    return pd.DataFrame(rows)


train_df = preprocess_train(train_raw)
test_df  = preprocess_test(test_raw)

print("Train aspect samples:", len(train_df))
print("Test aspect samples :", len(test_df))

train_df.head()



Train aspect samples: 2515
Test aspect samples : 1092


Unnamed: 0,text,aspect,valence,arousal
0,<ASP>スタッフ</ASP>も親切でまた近いうちに伺いますね,スタッフ,6.33,6.17
1,ちょっぴり残念だったのは、2泊目の朝食...2泊目は、<ASP>品数</ASP>少なく、固形...,品数,3.5,5.38
2,ちょっぴり残念だったのは、2泊目の<ASP>朝食</ASP>...2泊目は、品数少なく、固形...,朝食,3.0,5.67
3,ちょっぴり残念だったのは、2泊目の<ASP>朝食</ASP>...2泊目は、品数少なく、固形...,朝食,2.62,5.88
4,お弁当が以前と少し変わったようで、<ASP>おかず</ASP>が増えた気がします,おかず,6.75,5.62


In [24]:
train_df, temp_df = train_test_split(
    train_df, test_size=0.2, random_state=42
)

val_df, test_internal_df = train_test_split(
    temp_df, test_size=0.5, random_state=42
)

print("Train:", len(train_df))
print("Val  :", len(val_df))
print("Test :", len(test_internal_df))


Train: 2012
Val  : 251
Test : 252


In [26]:
!pip install -q unidic-lite fugashi


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.4/47.4 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for unidic-lite (setup.py) ... [?25l[?25hdone


In [27]:
MODEL_NAME = "cl-tohoku/bert-base-japanese"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)



class AspectVADataset(Dataset):
    def __init__(self, df, train=True):
        self.df = df.reset_index(drop=True)
        self.train = train

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        enc = tokenizer(
            row["text"],
            row["aspect"],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )

        item = {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0)
        }

        if self.train:
            item["labels"] = torch.tensor(
                [row["valence"], row["arousal"]],
                dtype=torch.float
            )

        return item



In [28]:
train_loader = DataLoader(
    AspectVADataset(train_df, train=True),
    batch_size=16,
    shuffle=True
)

val_loader = DataLoader(
    AspectVADataset(val_df, train=True),
    batch_size=16
)

test_internal_loader = DataLoader(
    AspectVADataset(test_internal_df, train=True),
    batch_size=16
)


In [29]:
class BertForVA(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = AutoModel.from_pretrained(MODEL_NAME)
        self.regressor = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled = outputs.last_hidden_state[:, 0, :]
        return self.regressor(pooled)


In [30]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertForVA().to(device)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    weight_decay=0.01
)


pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

In [31]:
def rmse_va(y_true, y_pred):
    squared_diff = (y_pred - y_true) ** 2
    return np.sqrt(squared_diff.sum(axis=1).mean())


In [32]:
best_val_rmse = float("inf")
patience = 2
patience_counter = 0
MAX_EPOCHS = 10

for epoch in range(MAX_EPOCHS):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        preds = model(input_ids, attention_mask)
        loss = torch.nn.functional.smooth_l1_loss(preds, labels)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    model.eval()
    val_preds, val_golds = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            preds = model(input_ids, attention_mask)
            val_preds.append(preds.cpu().numpy())
            val_golds.append(labels.cpu().numpy())

    val_preds = np.vstack(val_preds)
    val_golds = np.vstack(val_golds)

    val_rmse = rmse_va(val_golds, val_preds)

    print(
        f"Epoch {epoch+1} | "
        f"Train Loss: {total_loss/len(train_loader):.4f} | "
        f"Val RMSE_VA: {val_rmse:.4f}"
    )

    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        patience_counter = 0
        torch.save(model.state_dict(), "best_ja_bert_va.pt")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered")
            break


100%|██████████| 126/126 [00:43<00:00,  2.92it/s]


Epoch 1 | Train Loss: 0.5915 | Val RMSE_VA: 1.1603


100%|██████████| 126/126 [00:43<00:00,  2.93it/s]


Epoch 2 | Train Loss: 0.1752 | Val RMSE_VA: 0.9054


100%|██████████| 126/126 [00:44<00:00,  2.84it/s]


Epoch 3 | Train Loss: 0.1130 | Val RMSE_VA: 0.8689


100%|██████████| 126/126 [00:45<00:00,  2.79it/s]


Epoch 4 | Train Loss: 0.0866 | Val RMSE_VA: 0.8807


100%|██████████| 126/126 [00:45<00:00,  2.79it/s]


Epoch 5 | Train Loss: 0.0730 | Val RMSE_VA: 0.7974


100%|██████████| 126/126 [00:45<00:00,  2.78it/s]


Epoch 6 | Train Loss: 0.0674 | Val RMSE_VA: 0.8945


100%|██████████| 126/126 [00:45<00:00,  2.78it/s]


Epoch 7 | Train Loss: 0.0557 | Val RMSE_VA: 0.8296
Early stopping triggered


In [33]:
model.load_state_dict(torch.load("best_ja_bert_va.pt"))
model.eval()


BertForVA(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [34]:
test_preds, test_golds = [], []

with torch.no_grad():
    for batch in test_internal_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        preds = model(input_ids, attention_mask)
        test_preds.append(preds.cpu().numpy())
        test_golds.append(labels.cpu().numpy())

test_preds = np.vstack(test_preds)
test_golds = np.vstack(test_golds)

print("=== INTERNAL TEST PERFORMANCE ===")
print("Official RMSE_VA:", rmse_va(test_golds, test_preds))


=== INTERNAL TEST PERFORMANCE ===
Official RMSE_VA: 0.8007909


In [35]:
test_loader = DataLoader(
    AspectVADataset(test_df, train=False),
    batch_size=16
)

outputs = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        preds = model(input_ids, attention_mask)
        outputs.append(preds.cpu().numpy())

outputs = np.vstack(outputs)
outputs = np.clip(outputs, 1.0, 9.0)

test_df = test_df.reset_index(drop=True)
test_df["VA"] = [f"{v:.2f}#{a:.2f}" for v, a in outputs]

with open("pred_jpn_hotel.jsonl", "w", encoding="utf-8") as f:
    for rid in test_df["id"].unique():
        group = test_df[test_df["id"] == rid]
        record = {
            "ID": rid,
            "Aspect_VA": [
                {"Aspect": row["aspect"], "VA": row["VA"]}
                for _, row in group.iterrows()
            ]
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

print("pred_jpn_hotel.jsonl generated ✅")


pred_jpn_hotel.jsonl generated ✅


In [36]:
from google.colab import files
files.download("pred_jpn_hotel.jsonl")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>