In [11]:
%load_ext autoreload
%autoreload 2

import json
import logging
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import random
import sys
from dataclasses import dataclass, field
from typing import Optional
import datasets
import numpy as np
import transformers
from datasets import load_dataset
from scipy.stats import pearsonr, spearmanr
from transformers import (
    AutoConfig,
    AutoTokenizer,
    EvalPrediction,
    HfArgumentParser,
    PrinterCallback,
    Trainer,
    AutoModel
)
from transformers import TrainingArguments as HFTrainingArguments
from transformers import default_data_collator, set_seed
from transformers.trainer_utils import get_last_checkpoint

from progress_logger import LogCallback
from sts2.modeling_utils import DataCollatorForBiEncoder, get_model
import torch
from sts2.dataset_preprocessing import get_preprocessing_function, parse_dict, batch_get_preprocessing_function
from sts2.modeling_encoders import BiEncoderForClassification
from sts2.clf_trainer import CustomTrainer
from sts2.metrics import compute_metrics


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# カスタムモデルのテスト

### 初期化

In [1]:
# デフォルトのconfig, ModelArgumentsの設定
# model_path = "bert-base-uncased"
# model_path = "mixedbread-ai/mxbai-embed-large-v1"
model_path = "Qwen/Qwen3-Embedding-0.6B"
config = AutoConfig.from_pretrained(model_path)
print(config)
config.model_name_or_path = model_path  # ここを追加
tokenizer = AutoTokenizer.from_pretrained(model_path)
config.attn_implementation = "sdpa" # sdpaのエラー対策
# config.device_map = "auto"
config.device_map = "cuda"
config.config_name = None
config.tokenizer_name = None
config.cache_dir = None
config.use_fast_tokenizer = True
config.model_revision = "main"
config.use_auth_token = None
config.torch_dtype= "bfloat16"  # データ型をbfloat16に設定
# config.torch_dtype = "float16"  # データ型をfloat16に設定
# config.attn_implementation = "flash_attention_2"  
config.objective = "mse"
config.encoding_type = "bi_encoder"
config.pooler_type = "last"
# config.pooler_type = "avg"
config.freeze_encoder = True
config.transform = False
config.triencoder_head = "hadamard"
config.classifier_save_directory = "./output_test"

clf_configs = {
# "Concepts": {"type": "linear", "objective": "regression", "distance":"dot_product","output_dim": 128, "dropout": 0.1, "layer": 9},
# "Frames": {"type": "mlp2", "objective": "regression", "distance":"cosine","intermediate_dim": 512, "bottleneck_dim": 256, "output_dim": 64, "dropout": 0.1, "layer": 10},
"apt_label": {"type": "linear", "objective": "binary_classification", "distance":"cosine", "output_dim": 128, "dropout": 0.1, "layer": 11},
"emotion": {"type": "contrastive_logit", "objective": "contrastive_logit", "distance": "cosine", "intermediate_dim": 256, "output_dim": 6,  "dropout": 0.1, "layer": 12}
# "argpairs_score": {"type":"linear","objective":"regression","distance":"cosine","output_dim":256,"dropout":0.1,"layer": 23},
# "bws_score": {"type":"linear","objective":"regression","distance":"cosine","output_dim":256,"dropout":0.1,"layer": 23}
}

NameError: name 'AutoConfig' is not defined

In [5]:
from sts2.modeling_encoders import BiEncoderForClassification
model = BiEncoderForClassification(model_config=config, classifier_configs=clf_configs)
# 全パラメータを凍結
for param in model.parameters():
    param.requires_grad = False

# 確認
print("勾配計算を行うパラメータ数：",
      sum(p.requires_grad for p in model.parameters()),
      "/", len(list(model.parameters())))

dtype: torch.bfloat16, device: cuda:0
勾配計算を行うパラメータ数： 0 / 316


In [6]:
model

BiEncoderForClassification(
  (backbone): Qwen3Model(
    (embed_tokens): Embedding(151669, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
        (post_attention_la

In [23]:
for name, param in model.backbone.named_parameters():
    if torch.isnan(param).any():
        print(f"{name} に NaN が含まれています")

In [7]:
sent1 = tokenizer("An airplane is taking off", return_tensors="pt")
sent2 = tokenizer("A plane is taking off", return_tensors="pt")
sent3 = tokenizer("A large plane is landing", return_tensors="pt")
sent1.to(model.device)
sent2.to(model.device)
sent3.to(model.device)

{'input_ids': tensor([[    32,   3460,  11031,    374,  20327, 151643]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [8]:
model(
      input_ids=sent1["input_ids"], 
      attention_mask=sent1["attention_mask"], 
      token_type_ids=None, 
      input_ids_2=sent2["input_ids"],
      attention_mask_2=sent2["attention_mask"],
      token_type_ids_2=None,
      input_ids_3=None,
      attention_mask_3=None,
      token_type_ids_3=None,
      )

{'apt_label': tensor([0.6602], device='cuda:0', dtype=torch.bfloat16),
 'emotion': tensor([0.7461], device='cuda:0', dtype=torch.bfloat16),
 'overall_similarity': tensor([0.9297], device='cuda:0', dtype=torch.bfloat16)}

In [9]:
model(
      input_ids=sent1["input_ids"], 
      attention_mask=sent1["attention_mask"], 
      token_type_ids=None, 
      input_ids_2=sent2["input_ids"],
      attention_mask_2=sent2["attention_mask"],
      token_type_ids_2=None,
      input_ids_3=sent3["input_ids"],
      attention_mask_3=sent3["attention_mask"],
      token_type_ids_3=None
      )

{'apt_label_pos_similarity': tensor([0.9453], device='cuda:0', dtype=torch.bfloat16),
 'apt_label_neg_similarity': tensor([0.9258], device='cuda:0', dtype=torch.bfloat16),
 'emotion_pos_similarity': tensor([0.9336], device='cuda:0', dtype=torch.bfloat16),
 'emotion_neg_similarity': tensor([0.9414], device='cuda:0', dtype=torch.bfloat16),
 'emotion_anchor_prob': tensor([[0.0835, 0.1270, 0.2930, 0.1914, 0.1387, 0.1670]], device='cuda:0',
        dtype=torch.bfloat16),
 'emotion_positive_prob': tensor([[0.0786, 0.1147, 0.3145, 0.1387, 0.1416, 0.2119]], device='cuda:0',
        dtype=torch.bfloat16),
 'emotion_negative_prob': tensor([[0.1011, 0.1289, 0.3027, 0.1377, 0.1416, 0.1875]], device='cuda:0',
        dtype=torch.bfloat16),
 'overall_pos_similarity': tensor([0.9336], device='cuda:0', dtype=torch.bfloat16),
 'overall_neg_similarity': tensor([0.7383], device='cuda:0', dtype=torch.bfloat16)}

In [26]:
model.embedding_classifiers.items()

dict_items([('argpairs_score', LinearLayer(
  (linear): Sequential(
    (0): Dropout(p=0.1, inplace=False)
    (1): Linear(in_features=1024, out_features=256, bias=True)
  )
)), ('bws_score', LinearLayer(
  (linear): Sequential(
    (0): Dropout(p=0.1, inplace=False)
    (1): Linear(in_features=1024, out_features=256, bias=True)
  )
))])

In [None]:
for name, classifier in model.embedding_classifiers.items():
    print(name)
    print(classifier)
    print("=====")

APT
LinearLayer(
  (linear): Sequential(
    (0): Dropout(p=0.1, inplace=False)
    (1): Linear(in_features=2560, out_features=128, bias=True)
  )
)
=====


In [10]:
raw_datasets1 = load_dataset("csv", data_files="/works/data3/users/yama11235/yama11235/SLBERT/utils/data_style/ELSA_joy-sadness-anger-surprise-love-fear_test.csv")
raw_datasets2 = load_dataset("csv", data_files="data/APT_train.csv")
raw_datasets = datasets.concatenate_datasets([raw_datasets1["train"].select(range(5)), raw_datasets2["train"].select(range(5))])
# raw_datasets1 = load_dataset("csv", data_files="/works/data3/users/yama11235/yama11235/SLBERT/utils/data_preprocessed/ArgPairs_test.csv")
# raw_datasets2 = load_dataset("csv", data_files="/works/data3/users/yama11235/yama11235/SLBERT/utils/data_preprocessed/BWS_test.csv")
# raw_datasets = datasets.concatenate_datasets([raw_datasets1["train"].select(range(16)), raw_datasets2["train"].select(range(16))])

In [11]:
raw_datasets[0]

{'sentence1': 'I’m really grateful that my needs have been met and my ideas respected!',
 'sentence2': 'As I flipped through some amusing content, I stumbled upon a quirky chart that claimed to reveal how your birth sign could lead you and your partner to the most entertaining zones on your body, a lighthearted insight from astrologer Darryl Gaines.',
 'sentence3': 'I feel disgusted really I feel kind of let down',
 'emotion': 0,
 'apt_label': None}

In [12]:
raw_datasets["sentence1"]

['I’m really grateful that my needs have been met and my ideas respected!',
 'In a moment of uncertainty, I felt like a music box that had been roughly handled, plagued by the fear that my melody was now flawed.',
 'It hit me like a ton of bricks when I heard Jazmine share her feelings on homosexuality; I never expected that!',
 'I seem to have reverted to a prior condition marked by strife with others, in a relentless quest for acknowledgment, while grappling with an overwhelming sense of inadequacy.',
 "I try to keep things here in the bol positive and to be perfectly honest I'm not feeling so positive lately",
 'Sgt. ernest bucklew, 33, was coming home from iraq to bury his mother in pennsylvania.',
 'Sgt. ernest bucklew, 33, was coming home from iraq to bury his mother in pennsylvania.',
 'Sgt. ernest bucklew, 33, was coming home from iraq to bury his mother in pennsylvania.',
 'Sgt. ernest bucklew, 33, was coming home from iraq to bury his mother in pennsylvania.',
 'Sgt. ernest b

In [32]:
preprocessing_function = batch_get_preprocessing_function(
    tokenizer=tokenizer,
    sentence1_key="sentence1",
    sentence2_key="sentence2",
    sentence3_key="sentence3",
    sentence3_flag=True,
    aspect_key=['emotion', 'apt_label'],
    # aspect_key=['argpairs_score', 'bws_score'],
    padding="max_length",
    max_seq_length=32,
    model_args=config,
    scale=None,
)

In [33]:
train_datasets = raw_datasets.map(
    preprocessing_function,
    batched=True,
    remove_columns=raw_datasets.column_names,
)

Map: 100%|██████████| 10/10 [00:00<00:00, 678.03 examples/s]


In [34]:
train_datasets[:]

{'input_ids': [[40,
   4249,
   2167,
   25195,
   429,
   847,
   3880,
   614,
   1012,
   2270,
   323,
   847,
   6708,
   30287,
   0,
   151643,
   151643,
   151643,
   151643,
   151643,
   151643,
   151643,
   151643,
   151643,
   151643,
   151643,
   151643,
   151643,
   151643,
   151643,
   151643,
   151643],
  [641,
   264,
   4445,
   315,
   26826,
   11,
   358,
   6476,
   1075,
   264,
   4627,
   3745,
   429,
   1030,
   1012,
   17267,
   17608,
   11,
   65302,
   553,
   279,
   8679,
   429,
   847,
   61584,
   572,
   1431,
   46908,
   13,
   151643,
   151643,
   151643],
  [2132,
   4201,
   752,
   1075,
   264,
   8766,
   315,
   49037,
   979,
   358,
   6617,
   619,
   1370,
   5967,
   4332,
   1059,
   15650,
   389,
   52351,
   26,
   358,
   2581,
   3601,
   429,
   0,
   151643,
   151643,
   151643,
   151643,
   151643,
   151643,
   151643],
  [40,
   2803,
   311,
   614,
   93593,
   311,
   264,
   4867,
   2971,
   12864,
   553,
  

In [35]:
data_collator = DataCollatorForBiEncoder(
    tokenizer=tokenizer,
    padding="max_length",
    pad_to_multiple_of=None
)

data = data_collator(train_datasets)

You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# data["active_heads"]のうち、"apt_label"をもつかどうかのバイナリリスト
head_idx = [True if head == "apt_label" else False for head in data["active_heads"]]
head_idx

[False, False, False, False, False, True, True, True, True, True]

In [36]:
id2head = {i: head for i, head in enumerate(clf_configs.keys())}
id2head

{0: 'apt_label', 1: 'emotion'}

In [43]:
from sts2.clf_trainer import CustomTrainer
from functools import partial
from sts2.metrics import compute_metrics

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # CUDAの同期を有効にしてデバッグしやすくする

compute_fn = partial(
    compute_metrics,
    classifier_configs=clf_configs,
    id2_head=id2head,
)

data_collator = DataCollatorForBiEncoder(
    tokenizer=tokenizer,
    padding="max_length",
    pad_to_multiple_of=None
)

corr_labels = {"apt_label": {"emotion": 0.1}}
# corr_labels = {
#   "argpairs_score": {"bws_score": 0.6}
# }

# CUDAをリセット
import torch
torch.cuda.empty_cache()

from sts2.modeling_encoders import BiEncoderForClassification
model = BiEncoderForClassification(model_config=config, classifier_configs=clf_configs)
# 全パラメータを凍結
for param in model.parameters():
    param.requires_grad = False

for param in model.backbone.parameters():
    param.requires_grad = False

trainer = CustomTrainer(
    model=model,
    args=HFTrainingArguments(
        output_dir="./output_test",
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        eval_strategy="epoch",
        logging_strategy="epoch",
        save_strategy="epoch",
        logging_dir="./logs",
        per_device_train_batch_size=4,
        per_device_eval_batch_size=32,
        num_train_epochs=1,
        weight_decay=0.01,
        seed=42,
        learning_rate=1e-6,
        remove_unused_columns=False,
        report_to="wandb",  # wandbを使用する場合はここを設定
    ),
    classifier_configs=clf_configs,
    data_collator=data_collator,
    train_dataset=train_datasets,
    eval_dataset=train_datasets,  # ここは適切な検証データセットに置き換えてください
    tokenizer=tokenizer,
    compute_metrics=compute_fn,
    corr_labels=corr_labels,
)

dtype: torch.bfloat16, device: cuda:0


  super().__init__(*args, **kwargs)


In [55]:
eval_dataloader = trainer.get_eval_dataloader()

# 最初のバッチだけ取り出して確認
batch = next(iter(eval_dataloader))
# batch

In [44]:
trainer.evaluate(eval_dataset=train_datasets)

preds: {'apt_label': array([      nan,       nan,       nan,       nan,       nan, 0.7265625,
       0.7265625, 0.7265625, 0.7265625, 0.71875  ], dtype=float32), 'emotion': array([       nan,        nan,        nan,        nan,        nan,
       0.97265625, 0.9765625 , 0.96875   , 0.9765625 , 0.9453125 ],
      dtype=float32), 'overall_similarity': array([       nan,        nan,        nan,        nan,        nan,
       0.94921875, 0.94140625, 0.93359375, 0.9375    , 0.9375    ],
      dtype=float32), 'apt_label_pos_similarity': array([0.90625   , 0.9296875 , 0.8671875 , 0.94140625, 0.8984375 ,
              nan,        nan,        nan,        nan,        nan],
      dtype=float32), 'apt_label_neg_similarity': array([0.91015625, 0.91796875, 0.890625  , 0.9453125 , 0.92578125,
              nan,        nan,        nan,        nan,        nan],
      dtype=float32), 'emotion_pos_similarity': array([0.89453125, 0.9375    , 0.875     , 0.94921875, 0.921875  ,
              nan,        na

{'eval_loss': 4.407558441162109,
 'eval_model_preparation_time': 0.004,
 'eval_apt_label_best-threshold': 0.0,
 'eval_apt_label_best-accuracy': 0.2,
 'eval_apt_label_best-f1': 0.3333333333333333,
 'eval_apt_label_auc': 0.0,
 'eval_apt_label_mse': 0.438134765625,
 'eval_emotion_triplet_accuracy': 0.2,
 'eval_emotion_avg_positive_similarity': 0.915625,
 'eval_emotion_avg_negative_similarity': 0.92578125,
 'eval_emotion_anchor_accuracy': 0.2,
 'eval_emotion_anchor_macro_f1': 0.2,
 'eval_apt_label_vs_emotion_pearson': -0.5973093854732312,
 'eval_apt_label_vs_emotion_spearman': -0.3345438017134364,
 'eval_runtime': 0.249,
 'eval_samples_per_second': 40.156,
 'eval_steps_per_second': 4.016}

In [92]:
trainer.train()

Processing correlation: argpairs_score vs bws_score, 4, 4, corr: 0.9342879056930542, target: 0.6000000238418579


Epoch,Training Loss,Validation Loss,Argpairs Score Mse,Argpairs Score Pearson,Bws Score Mse,Bws Score Pearson,Argpairs Score Vs Bws Score Pearson,Argpairs Score Vs Bws Score Spearman
1,0.5954,0.103479,0.336996,0.482103,0.207029,0.547772,0.921682,0.787035


Processing correlation: argpairs_score vs bws_score, 4, 4, corr: -0.004522493574768305, target: 0.6000000238418579
Processing correlation: argpairs_score vs bws_score, 4, 4, corr: -0.09636474400758743, target: 0.6000000238418579
Processing correlation: argpairs_score vs bws_score, 4, 4, corr: 0.8299806714057922, target: 0.6000000238418579
Processing correlation: argpairs_score vs bws_score, 4, 4, corr: -0.3418736457824707, target: 0.6000000238418579
Processing correlation: argpairs_score vs bws_score, 4, 4, corr: -0.5798406600952148, target: 0.6000000238418579
Processing correlation: argpairs_score vs bws_score, 4, 4, corr: -0.5565418601036072, target: 0.6000000238418579
Processing correlation: argpairs_score vs bws_score, 4, 4, corr: 0.962778627872467, target: 0.6000000238418579
Processing correlation: argpairs_score vs bws_score, 32, 32, corr: 0.9216816425323486, target: 0.6000000238418579
Head: argpairs_score, vector length: 32
Head: bws_score, vector length: 32


TrainOutput(global_step=8, training_loss=0.5954198241233826, metrics={'train_runtime': 1.7263, 'train_samples_per_second': 18.536, 'train_steps_per_second': 4.634, 'total_flos': 43351302733824.0, 'train_loss': 0.5954198241233826, 'epoch': 1.0})

### 途中から

In [4]:
train_list = ['ArgPairs']

classifier_list = ["argpairs_score"]

path = "output/mixedbread-ai/mxbai-embed-large-v1"
# path = "output/Qwen/Qwen3-Embedding-8B"
layer_num = 23
# layer_num = 35
configs = {}
clf_path = []
for train_name, classifier_path in zip(train_list, classifier_list):
    with open(os.path.join(path, train_name, f"lr:1e-4_seed:42_layer:{layer_num}", f"{train_name}.json")) as f:
        data = json.load(f)
        configs[classifier_path] = list(data.values())[0]
        clf_path.append(os.path.join(path, train_name, f"lr:1e-4_seed:42_layer:{layer_num}", 
        f"{list(data.values())[0]['type']}_layer:{list(data.values())[0]['layer']}_dim:{list(data.values())[0]['output_dim']}",
        f"{classifier_path}_classifier.bin")
        )


In [5]:
clf_path

['output/mixedbread-ai/mxbai-embed-large-v1/ArgPairs/lr:1e-4_seed:42_layer:23/linear_layer:23_dim:256/argpairs_score_classifier.bin']

In [6]:
model_path = "mixedbread-ai/mxbai-embed-large-v1"
# model_path = "/works/data3/users/yama11235/yama11235/model/hoo-cache/huggingface/hub/models--mixedbread-ai--mxbai-embed-large-v1/snapshots/db9d1fe0f31addb4978201b2bf3e577f3f8900d2"
config = AutoConfig.from_pretrained(model_path)
config.model_name_or_path = model_path  # ここを追加
tokenizer = AutoTokenizer.from_pretrained(model_path)
config.attn_implementation = "eager" # sdpaのエラー対策
# config.device_map = "auto"
config.device_map = "cuda"
# config.device_map = None
config.config_name = None
config.tokenizer_name = None
config.cache_dir = None
config.use_fast_tokenizer = True
config.model_revision = "main"
config.use_auth_token = None
# config.torch_dtype= "bfloat16"  # データ型をbfloat16に設定
config.torch_dtype = torch.float32  # データ型をfloat32に設定
# config.torch_dtype = torch.float16  # データ型をfloat16に設定
# config.attn_implementation = "flash_attention_2"  
config.objective = "mse"
config.encoding_type = "bi_encoder"
# config.pooler_type = "last"
config.pooler_type = "avg"
config.freeze_encoder = True
config.transform = False
config.triencoder_head = "hadamard"
config.classifier_save_directory = "./output_test"

model_config = config
config

BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "attn_implementation": "eager",
  "cache_dir": null,
  "classifier_dropout": null,
  "classifier_save_directory": "./output_test",
  "config_name": null,
  "device_map": "cuda",
  "encoding_type": "bi_encoder",
  "freeze_encoder": true,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_name_or_path": "mixedbread-ai/mxbai-embed-large-v1",
  "model_revision": "main",
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "objective": "mse",
  "pad_token_id": 0,
  "pooler_type": "avg",
  "position_embedding_type": "absolute",
  "tokenizer_name": null,
  "torch_dtype": "float32",
  "transform": false,
  "transformers_version": "4.53.0",
  "triencoder_head": "hadamard",
  "type_vocab

In [7]:
model = BiEncoderForClassification.from_pretrained(
    model_path,
    model_config,
    clf_path,
    configs,
    # classifier_freeze=[]
    )
model.eval()
    

dtype: torch.float32, device: cuda:0


BiEncoderForClassification(
  (backbone): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024

In [9]:
# raw_datasets1 = load_dataset("csv", data_files="/works/data3/users/yama11235/yama11235/SLBERT/utils/data_style/ELSA_joy-sadness-anger-surprise-love-fear_test.csv")
# raw_datasets2 = load_dataset("csv", data_files="data/APT_train.csv")
# raw_datasets = datasets.concatenate_datasets([raw_datasets1["train"].select(range(5)), raw_datasets2["train"].select(range(5))])
raw_datasets1 = load_dataset("csv", data_files="data_preprocessed/ArgPairs_test.csv")
# raw_datasets2 = load_dataset("csv", data_files="/works/data3/users/yama11235/yama11235/SLBERT/utils/data_preprocessed/BWS_test.csv")
# raw_datasets = datasets.concatenate_datasets([raw_datasets1["train"].select(range(16)), raw_datasets2["train"].select(range(16))])

Generating train split: 0 examples [00:00, ? examples/s]

In [12]:
preprocessing_function = get_preprocessing_function(
    tokenizer=tokenizer,
    sentence1_key="sentence1",
    sentence2_key="sentence2",
    sentence3_key="sentence3",
    sentence3_flag=False,
    # aspect_key=['emotion', 'apt_label'],
    # aspect_key=['argpairs_score', 'bws_score'],
    aspect_key=['argpairs_score'],
    padding="max_length",
    max_seq_length=512,
    model_args=config,
    scale=None,
)

In [13]:
train_datasets = raw_datasets1.map(
    preprocessing_function,
    batched=False,
    remove_columns=raw_datasets1["train"].column_names,
)

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

In [14]:
data_collator = DataCollatorForBiEncoder(
    tokenizer=tokenizer,
    padding="max_length",
    pad_to_multiple_of=None
)

# data = data_collator(train_datasets["train"])

In [15]:
train_datasets = train_datasets["train"]

In [18]:
id2head = {i: head for i, head in enumerate(configs.keys())}
id2head

{0: 'argpairs_score'}

In [19]:
from sts2.clf_trainer import CustomTrainer
from functools import partial
from sts2.metrics import compute_metrics

compute_fn = partial(
    compute_metrics,
    classifier_configs=configs,
    id2_head=id2head,
)

data_collator = DataCollatorForBiEncoder(
    tokenizer=tokenizer,
    padding="max_length",
    pad_to_multiple_of=None
)

# corr_labels = {"apt_label": {"emotion": 0.1}}
corr_labels = {
  "argpairs_score": {"bws_score": 0.6}
}

trainer = CustomTrainer(
    model=model,
    args=HFTrainingArguments(
        output_dir="./output_test",
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        eval_strategy="epoch",
        logging_strategy="epoch",
        save_strategy="epoch",
        logging_dir="./logs",
        per_device_train_batch_size=4,
        per_device_eval_batch_size=64,
        num_train_epochs=1,
        weight_decay=0.01,
        seed=42,
        learning_rate=1e-6,
        remove_unused_columns=False,
        report_to="wandb",  # wandbを使用する場合はここを設定
    ),
    classifier_configs=configs,
    data_collator=data_collator,
    train_dataset=train_datasets,
    eval_dataset=train_datasets,  # ここは適切な検証データセットに置き換えてください
    tokenizer=tokenizer,
    compute_metrics=compute_fn,
    corr_labels=corr_labels,
)

  super().__init__(*args, **kwargs)


In [20]:
trainer.evaluate(eval_dataset=train_datasets)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




predictions keys: dict_keys(['argpairs_score', 'overall_similarity', 'argpairs_score_pos_similarity', 'argpairs_score_neg_similarity', 'overall_pos_similarity', 'overall_neg_similarity'])
label_dict keys: dict_keys(['labels', 'active_heads'])
predictions shape: 6, label_dict shape: 2
preds: {'argpairs_score': array([0.23437788, 0.80355227, 0.70648766, ..., 0.27411968, 0.6079304 ,
       0.6051959 ], shape=(1200,), dtype=float32), 'overall_similarity': array([0.7324028 , 0.8853478 , 0.8460119 , ..., 0.78702384, 0.8311864 ,
       0.74303734], shape=(1200,), dtype=float32), 'argpairs_score_pos_similarity': array([nan, nan, nan, ..., nan, nan, nan], shape=(1200,), dtype=float32), 'argpairs_score_neg_similarity': array([nan, nan, nan, ..., nan, nan, nan], shape=(1200,), dtype=float32), 'overall_pos_similarity': array([nan, nan, nan, ..., nan, nan, nan], shape=(1200,), dtype=float32), 'overall_neg_similarity': array([nan, nan, nan, ..., nan, nan, nan], shape=(1200,), dtype=float32)}
labels:

[34m[1mwandb[0m: Currently logged in as: [33m2959648335[0m ([33m2959648335-university-of-tokyo[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


{'eval_loss': 0.030230019241571426,
 'eval_model_preparation_time': 0.0029,
 'eval_argpairs_score_mse': 0.030230020552310773,
 'eval_argpairs_score_pearson': 0.7272617397508163,
 'eval_argpairs_score_spearman': 0.7212626764282644,
 'eval_runtime': 27.5583,
 'eval_samples_per_second': 43.544,
 'eval_steps_per_second': 0.689}

In [None]:
train_datasets

Dataset({
    features: ['sentence3', 'input_ids', 'token_type_ids', 'attention_mask', 'input_ids_2', 'attention_mask_2', 'token_type_ids_2', 'active_heads', 'labels'],
    num_rows: 1200
})

In [24]:
output_tensor = torch.tensor([], dtype=torch.float32)
labels_tensor = torch.tensor([], dtype=torch.float32)
with torch.no_grad():
    eval_dataloader = trainer.get_eval_dataloader()
    for batch in eval_dataloader:
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            token_type_ids=None,
            input_ids_2=batch["input_ids_2"],
            attention_mask_2=batch["attention_mask_2"],
            token_type_ids_2=None,
            input_ids_3=None,
            attention_mask_3=None,
            token_type_ids_3=None,
        )
        output_tensor = torch.cat((output_tensor.cpu(), outputs['argpairs_score'].detach().clone().cpu()), dim=0)
        labels_tensor = torch.cat((labels_tensor.cpu(), batch["labels"].flatten().detach().clone().cpu()), dim=0)


In [27]:
pearson_corr = pearsonr(output_tensor.numpy(), labels_tensor.numpy())
pearson_corr

PearsonRResult(statistic=0.68120724, pvalue=1.895030045385178e-164)

In [28]:
spearman_corr = spearmanr(output_tensor.numpy(), labels_tensor.numpy())
spearman_corr

SignificanceResult(statistic=0.6782949622952499, pvalue=1.5636668291218919e-162)

In [65]:
sent1_list = []
sent2_list = []
labels_list = []

with torch.no_grad():
    eval_dataloader = trainer.get_eval_dataloader()
    for batch in eval_dataloader:
        sent1 = model.encode(
            input_ids=batch["input_ids"].to(model.device),
            attention_mask=batch["attention_mask"].to(model.device),
            token_type_ids=None,
        )
        sent1_list.append(sent1["argpairs_score"].cpu().numpy())
        sent2 = model.encode(
            input_ids=batch["input_ids_2"].to(model.device),
            attention_mask=batch["attention_mask_2"].to(model.device),
            token_type_ids=None,
        )
        sent2_list.append(sent2["argpairs_score"].cpu().numpy())
        labels_list.append(batch["labels"].cpu().numpy())

In [35]:
sent1_list[0]

array([[ 0.00360563,  0.25461003,  0.20472933, ..., -0.40114874,
         0.57263595, -0.09370866],
       [-0.14023702, -0.10293227,  0.61865985, ..., -0.24134655,
        -0.3402528 ,  0.38966277],
       [ 0.05375183,  0.59631807,  0.42281276, ..., -0.13983706,
         0.2536421 ,  0.12120374],
       ...,
       [-0.20158575, -0.1763052 ,  0.32370704, ..., -0.14507358,
        -0.41976887, -0.3233777 ],
       [-0.36450148, -0.00383214,  0.45837742, ..., -0.25293654,
        -0.05994258,  0.1226934 ],
       [-0.35413396, -0.09773616,  0.56367236, ..., -0.31176034,
         0.2788598 ,  0.45976478]], dtype=float32)

In [36]:
len(sent1_list)

19

In [66]:
flattened1 = np.concatenate(sent1_list, axis=0)
flattened2 = np.concatenate(sent2_list, axis=0)
labels = np.concatenate(labels_list, axis=0).flatten()

In [67]:
similarity_list = []
for i in range(len(flattened1)):
    similarity = torch.cosine_similarity(
        torch.tensor(flattened1[i]), 
        torch.tensor(flattened2[i]), 
        dim=0
    ).item()
    similarity_list.append(similarity)

In [68]:
pearsonr(similarity_list, labels)

PearsonRResult(statistic=0.7272617260896164, pvalue=4.898315025461355e-198)

In [69]:
spearmanr(similarity_list, labels)

SignificanceResult(statistic=0.7212626764282644, pvalue=2.808785295498658e-193)

In [53]:
model.device

device(type='cuda', index=0)

In [64]:
model.to("cuda")
# モデル全体のパラメータが GPU 上か
print(next(model.parameters()).device)  
# 分類ヘッドの重みが GPU 上か
print(model.embedding_classifiers['argpairs_score'].linear[1].weight.device)


cuda:0
cuda:0


In [60]:
model

BiEncoderForClassification(
  (backbone): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024