# 質問をスコア化

### Setup

In [1]:
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text
import numpy as np
import onnx

import wandb
from pytorch_lightning.loggers import WandbLogger

import os
import datetime
import json
from sklearn.model_selection import train_test_split
import pandas as pd

2024-12-12 09:20:52.149157: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-12-12 09:20:52.149172: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


### Config

In [2]:
# EDIT ME!
now = datetime.datetime.now()
CURRENT_ID = int(now.strftime("%Y%m%d%H%M"))
# CURRENT_ID = 202412092216
CURRENT_ID

202412120920

In [3]:
DIR_ROOT = "../../"
DIR_DATA = os.path.join(DIR_ROOT, "data")   
DIR_QUESTIONS = os.path.join(DIR_DATA, "questions")
DIR_SAVED_MODELS = os.path.join(DIR_DATA, "saved_models")

QUESTION_ID = 202405311421

# DATA CONFIG
TRAIN_RATIO = 0.8

# MODEL CONFIG
MAX_EPOCHS = 30
BATCH_SIZE = 128
LEARNING_RATE = 1e-3
ALPHA = 10.0 # Non-Question Loss(0~0.05)
BETA = 1.0 # Ranking Loss
MARGIN = 0.5
DIM_USE = 512
HIDDEN_DIMS = [DIM_USE, 256]

### Model

In [4]:
class Model(pl.LightningModule):
    def __init__(
        self,
        use_model_url="https://www.kaggle.com/models/google/universal-sentence-encoder/TensorFlow2/multilingual/2",
        use_dim=DIM_USE, 
        hidden_dims=HIDDEN_DIMS,
        learning_rate=LEARNING_RATE,
        alpha=ALPHA,
        beta=BETA,
        margin=MARGIN
    ):
        super().__init__()
        self.save_hyperparameters()
        
        # Universal Sentence Encoder
        with tf.device("/CPU:0"):
            self.use = hub.load(use_model_url)
        
        # MLP layers
        layers = []
        input_dim = use_dim 
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(input_dim, hidden_dim),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_dim),
                nn.Dropout(0.2)
            ])
            input_dim = hidden_dim
        
        # 最終出力層
        layers.append(nn.Linear(input_dim, 1))
        
        self.mlp = nn.Sequential(*layers)
        
    def encode_text(self, texts):
        # USEでテキストをエンコード
        embeddings = self.use(texts)
        return torch.tensor(embeddings.numpy(), device=self.device)
    
    def forward(self, x):
        # x: バッチのテキスト
        embeddings = self.encode_text(x)
        scores = self.mlp(embeddings).squeeze(-1)
        return scores
    
    def training_step(self, batch, batch_idx):
        texts, levels = batch
        scores = self(texts)
        loss = self.compute_loss(scores, levels, stage='train')
        
        self.log('train/loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        texts, levels = batch
        scores = self(texts)
        loss = self.compute_loss(scores, levels, stage='val')   
        
        self.log('val/loss', loss)
        return loss
    
    def compute_loss(self, y_pred, y_true, stage):
        # 基本的なMSE損失
        base_loss = F.mse_loss(y_pred, y_true.float())
        
        # 非疑問文に対するヒンジ損失
        is_non_question = (y_true <= 0).float()
        non_question_loss = torch.mean(
            is_non_question * torch.maximum(
                torch.tensor(0.0, device=self.device),
                y_pred + self.hparams.margin
            )
        )
        
        # 順序関係を保持するためのランキング損失
        level_diff = y_true.unsqueeze(1) - y_true.unsqueeze(0)
        pred_diff = y_pred.unsqueeze(1) - y_pred.unsqueeze(0)
        
        ranking_loss = torch.mean(
            torch.maximum(
                torch.tensor(0.0, device=self.device),
                -pred_diff * torch.sign(level_diff) + 
                self.hparams.margin * torch.abs(level_diff)
            )
        )
        
        # 最終的な損失は各項の重み付き和
        total_loss = (
            base_loss +
            self.hparams.alpha * non_question_loss +
            self.hparams.beta * ranking_loss
        )
        
        # 各損失項もログに記録
        self.log(f'{stage}/base_loss', base_loss)
        self.log(f'{stage}/non_question_loss', non_question_loss)
        self.log(f'{stage}/ranking_loss', ranking_loss)
        
        return total_loss
    
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(
            self.parameters(),
            lr=self.hparams.learning_rate
        )
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=0.5,
            patience=5,
            verbose=True
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "monitor": "val/loss"
            }
        }


### Dataset

In [5]:
class QuestionDataset(torch.utils.data.Dataset):
    def __init__(self, texts, levels):
        self.texts = texts
        self.levels = levels
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return self.texts[idx], self.levels[idx]

In [6]:
class QuestionDataModule(pl.LightningDataModule):
    def __init__(
        self,
        train_texts,
        train_levels,
        val_texts,
        val_levels,
        batch_size=32
    ):
        super().__init__()
        self.train_texts = train_texts
        self.train_levels = train_levels
        self.val_texts = val_texts
        self.val_levels = val_levels
        self.batch_size = batch_size
    
    def setup(self, stage=None):
        self.train_dataset = QuestionDataset(
            self.train_texts,
            self.train_levels
        )
        self.val_dataset = QuestionDataset(
            self.val_texts,
            self.val_levels
        )
    
    def train_dataloader(self):
        return torch.utils.data.DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4
        )
    
    def val_dataloader(self):
        return torch.utils.data.DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=4
        )

### 訓練

In [7]:
with open(os.path.join(DIR_QUESTIONS, f"{QUESTION_ID}.json")) as f:
    data = json.load(f)

texts = [] 
labels = []

for questions_of_theme in data.values():
    for level_id, questions_of_level in enumerate(questions_of_theme.values()):
        for question in questions_of_level:
            texts.append(question)
            labels.append(int(level_id))

train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, 
    labels, 
    train_size=TRAIN_RATIO,
    random_state=42
)

In [None]:
# データモジュールの準備
data_module = QuestionDataModule(
    train_texts,
    train_labels,
    val_texts,
    val_labels,
    batch_size=BATCH_SIZE,
)

# モデルの初期化
model = Model()

In [None]:
config = {
    "architecture": "USE_MLP",
    "hidden_dims": HIDDEN_DIMS,
    "learning_rate": LEARNING_RATE,
    "alpha": ALPHA,
    "beta": BETA,
    "margin": MARGIN,
    "batch_size": BATCH_SIZE,
    "project_id": CURRENT_ID,
    "question_id": QUESTION_ID
}

wandb_logger = WandbLogger(
    project="gyuwan-question-level-prediction",
    config=config,
    log_model=True
)

In [None]:
trainer = pl.Trainer(
    max_epochs=30,
    accelerator='auto',
    devices=1,
    logger=wandb_logger,
    callbacks=[
        pl.callbacks.EarlyStopping(
            monitor='val/loss',
            patience=10,
            mode='min'
        ),
        pl.callbacks.ModelCheckpoint(
            monitor='val/loss',
            dirpath=os.path.join(DIR_SAVED_MODELS, "question_score", f"{CURRENT_ID}"),
            filename='question-scorer-{epoch:02d}-{val_loss:.2f}',
            save_top_k=3,
            mode='min'
        )
    ]
)

# 学習の実行
trainer.fit(model, data_module)

In [None]:
trainer.save_checkpoint(os.path.join(DIR_SAVED_MODELS, "question_score", f"{CURRENT_ID}", "final.ckpt"))

### テスト

In [9]:
MODEL_ID = 202412111510

In [10]:
model = Model.load_from_checkpoint(os.path.join(DIR_SAVED_MODELS, "question_score", f"{MODEL_ID}", "final.ckpt"))
model.eval()

2024-12-12 09:21:13.231211: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-12-12 09:21:13.231346: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-12-12 09:21:13.231376: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2024-12-12 09:21:13.231397: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2024-12-12 09:21:13.231417: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Co

Model(
  (mlp): Sequential(
    (0): Linear(in_features=512, out_features=512, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=512, out_features=256, bias=True)
    (5): ReLU()
    (6): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.2, inplace=False)
    (8): Linear(in_features=256, out_features=1, bias=True)
  )
)

In [11]:
model.eval()
predictions = []

for i in range(0, len(val_texts), BATCH_SIZE):
    batch_texts = val_texts[i:i+BATCH_SIZE]
    with torch.no_grad():
        scores = model(batch_texts)
    predictions.extend(scores.cpu().numpy())

df = pd.DataFrame({
    "text": val_texts,
    "label": val_labels,
    "prediction": predictions
})

# +- 1の範囲で正解とする
df["is_correct"] = (df["label"] - 1 <= df["prediction"]) & (df["prediction"] <= df["label"] + 1)

# 正解率
accuracy = df["is_correct"].mean()
accuracy

0.8537466717383035

In [None]:
# index 以外の値を表示
# ランダムに10件を表示
df.sample(5)

In [None]:
sample_questions = [
    "顔認譍技術を使って、写真に写る人物の情報を自動的に解析する方法は考えられますか?",
    "どうやったら燃費が良くなるように運転できますか?",
    "今のどういう意味か分かった人いる？",
    "これってどういう意味？",
    "なるほど？",
    "なるほど",
    "よくわからん",
    "納豆ネバネバ",
    "これは面白いですね!",
    "何が起きているのかわかりません。",
    "もっと詳しく説明してください。",
    "えっ、本当ですか?",
    "私もやってみたいです!",
    "私にはこの意味がわかりません。",
    "なるほど、なるほど。",
    "いつからこんなことが始まっていたんですか?",
    "もっと情報が欲しいです。",
    "驚いた! とても興味深いです。"
]
sample_scores = model(sample_questions)

pd.DataFrame({
    "text": sample_questions,
    "prediction": sample_scores.detach().numpy()
}).sort_values("prediction", ascending=False)

### Export to ONNX

In [13]:
# ONNX形式で保存
onnx_path = os.path.join(DIR_SAVED_MODELS, "question_score", f"{MODEL_ID}", "model.onnx")
dummy_input = ("What is the meaning of life?",)
torch.onnx.export(
    model,
    dummy_input,
    onnx_path,
    input_names=["text"],
    output_names=["score"],
    dynamic_axes={
        "text": {0: "batch"},
        "score": {0: "batch"}
    }
)

  return torch.tensor(embeddings.numpy(), device=self.device)
