# train.py フロー検証 (nGPT classifier)

`Sentiment-Circle/utils/demo2.ipynb` と同じ方針で、`train.py` の実行ステップを 1 つずつ追跡しながら **nGPT 分類器** をテストできるようにした検証ノートブックです。

## 0. 準備

- `train.py` で定義されているデータセット前処理・トレーナー初期化の関数を直接呼び出し、フローをそのまま再現します。
- Weights & Biases 連携はデバッグ用途なので無効化しています (`WANDB_MODE=disabled`)。
- `Train_df.csv` / `Valid_df.csv` / `Test_df.csv` から少数サンプルを取り、計算負荷を抑えます。

In [None]:
# Environment setup cell: enables autoreload, imports dependencies, configures
# environment variables and project paths, and seeds the random generators.

# IPython autoreload (mode 2) so edits to the imported project modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import json
import logging
import os
import pathlib
import random
import sys
from functools import partial

import numpy as np
import torch
from transformers import AutoConfig, AutoTokenizer, PrinterCallback

# setdefault keeps any value that is already exported in the environment;
# otherwise tokenizer parallelism and Weights & Biases are switched off.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
os.environ.setdefault("WANDB_MODE", "disabled")
os.environ.setdefault("WANDB_DISABLED", "true")

# Paths are resolved relative to the parent of the current working directory.
PROJECT_ROOT = pathlib.Path('..').resolve()
UTILS_DIR = PROJECT_ROOT / 'utils'
DATASET_DIR = PROJECT_ROOT / 'dataset'
OUTPUT_ROOT = PROJECT_ROOT / 'outputs'
OUTPUT_ROOT.mkdir(exist_ok=True)

# The project modules below are imported by bare name, so the utils directory
# must be on sys.path before these imports run.
sys.path.append(str(UTILS_DIR))

from train import (
    ModelArguments,
    DataTrainingArguments,
    TrainingArguments,
    load_raw_datasets,
    prepare_label_mappings,
)
from dataset_preprocessing import batch_get_preprocessing_function, get_preprocessing_function
from model.modeling_utils import DataCollatorForBiEncoder, get_model
from clf_trainer import CustomTrainer
from progress_logger import LogCallback
from model.nGPT_model import NGPTWeightNormCallback
from metrics import compute_metrics

logging.basicConfig(level=logging.INFO)

# Seed the Python, NumPy and PyTorch random generators with the same value.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

print(f"Project root: {PROJECT_ROOT}")
print(f"CUDA available: {torch.cuda.is_available()}")

## 1. ハイパーパラメータとクラス分類器設定

`train.sh` のデフォルト値 (学習率・エポック数など) を参考にしつつ、デバッグしやすいようにバッチサイズとサンプル数だけ縮小しています。

In [None]:
# Hyper-parameter cell: defines the run constants, writes the nGPT classifier
# head configuration to disk, and builds the three argument objects that the
# train.py helpers expect.

# --- Backbone and optimisation settings -------------------------------------
MODEL_NAME = "mixedbread-ai/mxbai-embed-large-v1"
POOLER_TYPE = "avg"
MAX_SEQ_LENGTH = 512
LEARNING_RATE = 1e-4
TRAIN_BATCH_SIZE = 32  # reduced: the 128 used in train.sh strains memory
EVAL_BATCH_SIZE = 64  # reduced from the 256 used in train.sh
NUM_EPOCHS = 1
GRAD_ACCUM = 1
LOGGING_STEPS = 5
EVAL_STEPS = 5

# --- Per-split sample caps to keep the debug run small -----------------------
MAX_TRAIN_SAMPLES = 64
MAX_EVAL_SAMPLES = 64
MAX_PRED_SAMPLES = 64

# --- Classifier head definition ----------------------------------------------
classifier_config = {
    "sentiment": {
        "type": "nGPT",
        "layer": -1,
        "objective": "infoNCE",
        "distance": "cosine",
        "output_dim": 256,
        "dropout": 0.1,
        "bias": False,
        "base_scale": 0.03125
    }
}

# ModelArguments receives a file path for the classifier configuration, so the
# dictionary is serialised to JSON first.
CLASSIFIER_CONFIG_PATH = OUTPUT_ROOT / "ngpt_classifier_config.json"
CLASSIFIER_CONFIG_PATH.write_text(json.dumps(classifier_config, indent=2))

# --- Argument objects --------------------------------------------------------
model_args = ModelArguments(
    model_name_or_path=MODEL_NAME,
    pooler_type=POOLER_TYPE,
    encoding_type="bi_encoder",
    freeze_encoder=True,
    classifier_configs=str(CLASSIFIER_CONFIG_PATH),
)

data_args = DataTrainingArguments(
    max_seq_length=MAX_SEQ_LENGTH,
    max_train_samples=MAX_TRAIN_SAMPLES,
    max_eval_samples=MAX_EVAL_SAMPLES,
    max_predict_samples=MAX_PRED_SAMPLES,
    train_file=[str(DATASET_DIR / "Train_df.csv")],
    validation_file=[str(DATASET_DIR / "Valid_df.csv")],
    test_file=[str(DATASET_DIR / "Test_df.csv")],
)

training_args = TrainingArguments(
    output_dir=str(OUTPUT_ROOT / "ngpt_debug_run"),
    overwrite_output_dir=True,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    lr_scheduler_type="constant",
    logging_steps=LOGGING_STEPS,
    eval_steps=EVAL_STEPS,
    evaluation_strategy="steps",
    save_strategy="no",
    do_train=True,
    do_eval=True,
    do_predict=True,
    report_to=["none"],
    wandb_project_name="sentiment_info_nce_ngpt_demo",
    wandb_project="sentiment_circle",
    seed=42,
)
# Set after construction, exactly as the original cell does.
training_args.remove_unused_columns = False

# Show the fully resolved arguments for inspection.
for _args in (model_args, data_args, training_args):
    print(_args)

## 2. データセット読み込み

`load_raw_datasets` で `Train/Valid/Test` を読み込み、必要であれば `sentence1` 列へリネームします。

In [None]:
# Load the dataset splits through train.py's own loader so the notebook
# follows the same code path as the training script.
raw_datasets, sentence3_flag = load_raw_datasets(
    model_args=model_args,
    data_args=data_args,
    training_args=training_args,
    seed=training_args.seed,
)

# Inspect the loaded splits, the flag returned by the loader, and the first
# raw training row.
print(raw_datasets)
print(f"sentence3 flag: {sentence3_flag}")
print(raw_datasets["train"][0])

## 3. ラベルマッピング & クラス分類器辞書

CSV の `labels` 列を `sentiment` に付け替え、`nGPT` 分類器設定を `prepare_label_mappings` に渡します。

In [None]:
# Build label mappings and classifier dictionaries with train.py's helper.
# The helper returns ten values; raw_datasets is rebound to the returned
# dataset object, and the rest are consumed by later cells.
(
    raw_datasets,
    labels,  # passed to AutoConfig (num_labels=len(labels))
    id2label,  # passed to AutoConfig
    label2id,  # passed to AutoConfig
    aspect_key,  # passed to the preprocessing function factory
    classifier_configs,  # passed to the model constructor
    classifier_configs_for_trainer,  # passed to CustomTrainer and compute_metrics
    corr_labels,  # passed to CustomTrainer
    corr_weights,  # passed to CustomTrainer
    label_name_mappings,  # passed to CustomTrainer as tsne_label_mappings
) = prepare_label_mappings(
    raw_datasets=raw_datasets,
    model_args=model_args,
    data_args=data_args,
)

print(f"labels: {labels}")
print(f"aspect_key: {aspect_key}")
print(f"classifier configs: {json.dumps(classifier_configs, indent=2)}")

## 4. Config / Tokenizer / モデル (nGPT 判定込み)

ここから `train.py` と同様に `AutoConfig` / `AutoTokenizer` をロードし、nGPT ブロック検出によって最適化条件を調整します。

In [None]:
# Config / tokenizer / model cell: loads the Hugging Face config and
# tokenizer, instantiates the project model class, optionally freezes the
# backbone, and records whether the model reports nGPT blocks.

# Explicit config/tokenizer names take precedence; otherwise fall back to the
# model checkpoint name.
config_source = model_args.config_name or model_args.model_name_or_path
tokenizer_source = model_args.tokenizer_name or model_args.model_name_or_path

config = AutoConfig.from_pretrained(
    config_source,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_source,
    use_fast=model_args.use_fast_tokenizer,
)

model_cls = get_model(model_args)

# Copy the project-specific model options onto the config object that is
# handed to the model constructor.
extra_config_fields = {
    "freeze_encoder": model_args.freeze_encoder,
    "model_name_or_path": model_args.model_name_or_path,
    "pooler_type": model_args.pooler_type,
    "transform": model_args.transform,
    "attn_implementation": model_args.use_flash_attention,
    "device_map": model_args.device_map,
}
config.update(extra_config_fields)

# Index -> classifier head name, in the iteration order of the trainer configs.
labels_for_heads = list(classifier_configs_for_trainer)
id2_head = dict(enumerate(labels_for_heads))

model = model_cls(model_config=config, classifier_configs=classifier_configs)

# With a frozen encoder, no backbone parameter receives gradients.
if model_args.freeze_encoder:
    for param in model.backbone.parameters():
        param.requires_grad = False

# Models that do not define use_ngpt_blocks are treated as not using them.
use_ngpt_riemann = bool(getattr(model, "use_ngpt_blocks", False))
print(f"use_ngpt_blocks: {use_ngpt_riemann}")

## 5. トークナイズと特徴量生成

`get_preprocessing_function` / `batch_get_preprocessing_function` を選び、`DatasetDict.map` で `tokenizer` を実行します。

In [None]:
# Tokenisation cell: picks the preprocessing function that matches
# sentence3_flag, maps it over every split, and exposes the three processed
# splits under the names the trainer cell uses.

# Padding strategy handed to the preprocessing function.
if data_args.pad_to_max_length:
    padding = "longest"
else:
    padding = False

# Never request more tokens than the tokenizer reports it can handle.
max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

# The batched factory is used when sentence3_flag is truthy, the per-example
# factory otherwise; both receive the same keyword arguments.
batched = bool(sentence3_flag)
preprocess_factory = (
    batch_get_preprocessing_function if batched else get_preprocessing_function
)
preprocess_function = preprocess_factory(
    tokenizer=tokenizer,
    sentence1_key="sentence1",
    sentence2_key="sentence2",
    sentence3_key="sentence3",
    sentence3_flag=sentence3_flag,
    aspect_key=aspect_key,
    padding=padding,
    max_seq_length=max_seq_length,
    model_args=model_args,
    scale=None,
)

# The raw columns listed for the train split are dropped, and the cache is
# bypassed so the mapping is recomputed on every run.
processed_datasets = raw_datasets.map(
    preprocess_function,
    batched=batched,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
    remove_columns=raw_datasets["train"].column_names,
)

train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation"]
predict_dataset = processed_datasets["test"]
print(train_dataset[0].keys())

## 6. DataCollator / Trainer 構築

`CustomTrainer` を初期化し、nGPT 用の正規化コールバックやメトリクス関数を登録します。

In [None]:
# Collator / trainer cell: builds the data collator, the metric closures, the
# nGPT weight-normalisation callback, and the CustomTrainer itself.

# getattr's default only applies when the attribute is missing. A config can
# carry torch_dtype with the value None, so fall back to float32 explicitly in
# that case as well.
collator_dtype = getattr(config, "torch_dtype", None)
if collator_dtype is None:
    collator_dtype = torch.float32

data_collator = DataCollatorForBiEncoder(
    tokenizer=tokenizer,
    padding="max_length",
    pad_to_multiple_of=None,
    dtype=collator_dtype,
)

# The metric closures need the trainer, which does not exist until after they
# are handed to its constructor; this mutable holder is filled in afterwards.
trainer_ref = {"trainer": None}


def train_centroid_getter():
    """Return the trainer's train-label centroids, or {} if no trainer is set."""
    trainer_obj = trainer_ref["trainer"]
    if trainer_obj is None:
        return {}
    return trainer_obj.get_train_label_centroids()


def compute_fn(eval_pred):
    """Forward eval_pred to compute_metrics with this notebook's settings.

    The embedding mode is "original" when the registered trainer has a truthy
    use_original_eval_embeddings attribute, and "classifier" otherwise
    (including when no trainer has been registered yet).
    """
    trainer_obj = trainer_ref["trainer"]
    embedding_mode = "classifier"
    if trainer_obj is not None and getattr(trainer_obj, "use_original_eval_embeddings", False):
        embedding_mode = "original"
    return compute_metrics(
        eval_pred,
        classifier_configs=classifier_configs_for_trainer,
        id2_head=id2_head,
        train_centroid_getter=train_centroid_getter,
        embedding_eval_mode=embedding_mode,
    )


# The callback is enabled only when the model reported nGPT blocks.
ngpt_callback = NGPTWeightNormCallback(enabled=use_ngpt_riemann)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    classifier_configs=classifier_configs_for_trainer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_fn,
    tokenizer=tokenizer,
    callbacks=[LogCallback, ngpt_callback],
    dtype=collator_dtype,
    corr_labels=corr_labels,
    corr_weights=corr_weights,
    tsne_save_dir=os.path.join(training_args.output_dir, "tsne_plots"),
    tsne_label_mappings=label_name_mappings,
)

# Register the trainer so the metric closures above can reach it.
trainer_ref["trainer"] = trainer

# Drop the PrinterCallback; LogCallback is registered above for logging.
trainer.remove_callback(PrinterCallback)

## 7. 評価→学習→テスト

`train.py` と同様に、初期 `evaluate` → `train` → `test (evaluate on test split)` の順に実行してログを確認します。

In [None]:
# Evaluate on the validation split before any training step to get a baseline.
baseline_metrics = trainer.evaluate(eval_dataset=eval_dataset)
# Last expression of the cell, so the notebook displays the metrics.
baseline_metrics

In [None]:
# Run the training loop and collect the metrics it reports.
train_result = trainer.train()
train_metrics = train_result.metrics
# Record how many training examples were used alongside the reported metrics.
train_metrics["train_samples"] = len(train_dataset)
# Last expression of the cell, so the notebook displays the metrics.
train_metrics

In [None]:
# Evaluate the trained model on the test split by reusing trainer.evaluate.
# NOTE(review): no metric prefix is passed, so the keys presumably match those
# of the validation run; confirm whether CustomTrainer.evaluate accepts a
# separate prefix for test metrics.
test_metrics = trainer.evaluate(eval_dataset=predict_dataset)
# Last expression of the cell, so the notebook displays the metrics.
test_metrics