In [1]:
DATA_PATH = "./input"
MODEL_NAME = "Qwen/Qwen2.5-14B"
OUTPUT_PATH = "."
MODEL_OUTPUT_PATH = f"{OUTPUT_PATH}/output_retrieval"

RETRIEVE_NUM = 25
SEED = 0
EPOCH = 10
LR = 4e-05
BS = 32

TRAINING = True
DEBUG = False
WANDB = False
REPORT_TO = "none"

In [2]:
import os
import random

import numpy as np
import pandas as pd
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from sentence_transformers import (SentenceTransformer,
                                   SentenceTransformerTrainer,
                                   SentenceTransformerTrainingArguments)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from transformers import AutoTokenizer, BitsAndBytesConfig

os.environ["TOKENIZERS_PARALLELISM"] = "false"

  from .autonotebook import tqdm as notebook_tqdm
2024-12-09 15:31:22.933207: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-09 15:31:22.954283: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-09 15:31:22.978652: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-09 15:31:22.986134: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-09 15:31:23.0

In [3]:
import torch
from torch import nn
from transformers import Qwen2Config, Qwen2Model, Qwen2PreTrainedModel
from transformers.models.qwen2.modeling_qwen2 import (
    Qwen2Attention,
    Qwen2DecoderLayer,
    Qwen2FlashAttention2,
    Qwen2MLP, Qwen2RMSNorm,
    Qwen2RotaryEmbedding,
    Qwen2SdpaAttention)


class ModifiedQwen2Attention(Qwen2Attention):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.is_causal = False


class ModifiedQwen2FlashAttention2(Qwen2FlashAttention2):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.is_causal = False


class ModifiedQwen2SdpaAttention(Qwen2SdpaAttention):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.is_causal = False


QWEN2_ATTENTION_CLASSES = {
    "eager": ModifiedQwen2Attention,
    "flash_attention_2": ModifiedQwen2FlashAttention2,
    "sdpa": ModifiedQwen2SdpaAttention,
}


class ModifiedQwen2DecoderLayer(Qwen2DecoderLayer):
    def __init__(self, config: Qwen2Config, layer_idx: int):
        nn.Module.__init__(self)
        self.hidden_size = config.hidden_size

        self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](
            config=config, layer_idx=layer_idx
        )

        self.mlp = Qwen2MLP(config)
        self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Qwen2RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )


class Qwen2BiModel(Qwen2Model):
    _no_split_modules = ["ModifiedQwen2DecoderLayer"]

    def __init__(self, config: Qwen2Config):
        Qwen2PreTrainedModel.__init__(self, config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(
            config.vocab_size, config.hidden_size, self.padding_idx
        )
        self.layers = nn.ModuleList(
            [
                ModifiedQwen2DecoderLayer(config, layer_idx)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self._attn_implementation = config._attn_implementation
        self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = Qwen2RotaryEmbedding(config=config)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

In [4]:
NUM_PROC = os.cpu_count() // 2
print(NUM_PROC)

def seed_everything(seed: int) -> None:
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)

seed_everything(SEED)

13


In [5]:
df = pd.read_csv(f"{DATA_PATH}/train_5folds_with_llm_infer.csv")
print(df.shape)

(4370, 23)


In [6]:
# 洪さんLLM synthetic_data
df_synth = pd.read_csv(f"{DATA_PATH}/synthetic_questions_render_with_answer_render_v1.csv")
df_synth = df_synth[~df_synth.isna().any(axis=1)].reset_index(drop=True)

df_synth = df_synth.sample(n=4000, random_state=0).reset_index(drop=True)

# 3rd subject nameを利用
df_synth = df_synth.rename({"ThirdSubjectName": "SubjectName"}, axis=1)
df_synth = df_synth.rename({"MisconceptionName": "Misconception"}, axis=1)

df_synth["fold"] = -1
print(df_synth.shape)

(4000, 26)


In [7]:
# 洪さんLLM synthetic_data
df_gpt = pd.read_csv(f"{DATA_PATH}/gpt-4o-mini-q-a_v2_render_v1.csv")

# rename
df_gpt = df_gpt.rename({"ConstructName-qwen25-72b-instruct": "ConstructName"}, axis=1)
df_gpt = df_gpt.rename({"MisconceptionName": "Misconception"}, axis=1)

# Qualityで絞り込み
df_gpt = df_gpt[df_gpt["quality-gpt4o-mini"] > 2].reset_index(drop=True)

df_gpt["fold"] = -2
print(df_gpt.shape)

(2185, 22)


In [8]:
# 洪さんLLM synthetic_data
df_synth2 = pd.read_csv(f"{DATA_PATH}/synthetic-round2-render.csv")
df_synth2 = df_synth2[~df_synth2.isna().any(axis=1)].reset_index(drop=True)

df_synth2 = df_synth2.rename({"ConstructName-qwen25-72b-instruct": "ConstructName"}, axis=1)
df_synth2 = df_synth2.rename({"MisconceptionName": "Misconception"}, axis=1)

df_synth2 = df_synth2[df_synth2["quality-gpt4o-mini"] > 2].reset_index(drop=True)

df_synth2["fold"] = -3
print(df_synth2.shape)

(31868, 26)


In [9]:
df = pd.concat([df, df_synth, df_gpt, df_synth2], axis=0).reset_index(drop=True)
# df = df.sample(100).reset_index(drop=True)
print(df.shape)

(42423, 41)


In [10]:
df_mapping = pd.read_csv(f"{DATA_PATH}/misconception_mapping_with_paragraph_v3.csv")
df_mapping['MisconceptionName'] = df_mapping['MisconceptionName'] +\
df_mapping['a000-llama3-mega-misconception-aug-seed201_misunderstanding']

In [11]:
df = df.merge(
    df_mapping[["MisconceptionId", "MisconceptionName"]],
    on="MisconceptionId",
    how="left",
)

In [12]:
df[["SubjectName", "ConstructName", "QuestionText", "CorrectAnswerText", "AnswerText", "Misconception", "MisconceptionName"]].isnull().sum(0)

SubjectName          0
ConstructName        0
QuestionText         0
CorrectAnswerText    0
AnswerText           0
Misconception        0
MisconceptionName    0
dtype: int64

In [13]:
def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery: {query}'

In [14]:
task_description = 'Given a math question and a misconcepte incorrect answer, please retrieve the most accurate reason for the misconception.'

In [15]:
def get_query_text(row):
    query_text = f"###question###:{row['SubjectName']}-{row['ConstructName']}-{row['QuestionText']}\n###Correct Answer###:{row['CorrectAnswerText']}\n###Misconcepte Incorrect answer###:{row['AnswerText']}"
    return query_text

In [16]:
df["InputQuery"] = df.apply(get_query_text, axis=1)

In [17]:
print(df["InputQuery"].values[0])

###question###:BIDMAS-Use the order of operations to carry out calculations involving powers-\[
3 \times 2+4-5
\]
Where do the brackets need to go to make the answer equal \( 13 \) ?
###Correct Answer###:\( 3 \times(2+4)-5 \)
###Misconcepte Incorrect answer###:Does not need brackets


In [18]:
df['InputText'] = df["InputQuery"].map(lambda x: get_detailed_instruct(task_description, x))

In [19]:
df = df[df["InputText"].map(len) < 2000].reset_index(drop=True)

In [20]:
print(df.shape)

(42361, 44)


# Dataset

In [21]:
train_ds = Dataset.from_pandas(df)

# Model

In [22]:
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

In [23]:
base_model = Qwen2BiModel.from_pretrained(
    MODEL_NAME,
    device_map="auto", #if torch.cuda.is_available() else "cpu",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)

Downloading shards: 100%|██████████| 8/8 [11:43<00:00, 87.93s/it]
Loading checkpoint shards: 100%|██████████| 8/8 [00:09<00:00,  1.22s/it]


In [24]:
base_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [25]:
config = LoraConfig(
    r=64,
    lora_alpha=128,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    bias="none",
    lora_dropout=0.05,
    task_type="FEATURE_EXTRACTION",
)

In [26]:
base_model = prepare_model_for_kbit_training(base_model)
base_model = get_peft_model(base_model, config)
# base_model = get_peft_model(base_model.model, config)
base_model.print_trainable_parameters()

trainable params: 275,251,200 || all params: 14,266,717,184 || trainable%: 1.9293


In [27]:
model = SentenceTransformer("Qwen/Qwen2.5-0.5B-Instruct", trust_remote_code=True)

No sentence-transformers model found with name Qwen/Qwen2.5-0.5B-Instruct. Creating a new one with mean pooling.


In [28]:
model._first_module().auto_model = base_model
model._first_module().tokenizer = base_tokenizer

In [29]:
model[1].pooling_mode_mean_tokens = False
model[1].pooling_mode_lasttoken = True

In [30]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 32768, 'do_lower_case': False}) with Transformer model: PeftModelForFeatureExtraction 
  (1): Pooling({'word_embedding_dimension': 896, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': True, 'include_prompt': True})
)

In [31]:
del base_model
torch.cuda.empty_cache()

# Training

In [32]:
# loss = CachedMultipleNegativesRankingLoss(model, mini_batch_size=28)
loss = MultipleNegativesRankingLoss(model)

In [33]:
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir=MODEL_OUTPUT_PATH,
    # Optional training parameters:
    optim="paged_adamw_8bit",
    num_train_epochs=EPOCH,
    dataloader_num_workers=NUM_PROC,
    per_device_train_batch_size=BS,
    # per_device_eval_batch_size=BS,
    # learning_rate=LR,
    warmup_ratio=0.0,
    fp16=False,
    bf16=True,
    # batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    # Optional tracking/debugging parameters:
    # lr_scheduler_type="cosine_with_restarts",
    # eval_strategy="epoch",
    # eval_steps=8,
    save_strategy="epoch",
    save_steps=8,
    save_total_limit=1,
    logging_steps=8,
    report_to=REPORT_TO,  # Will be used in W&B if `wandb` is installed
    metric_for_best_model="eval_cosine_map@25", # eval_cosine_recall@25
    do_eval=False,
    push_to_hub=False,
    # load_best_model_at_end=True,
    # gradient_checkpointing_kwargs=True
)

In [34]:
trainer = SentenceTransformerTrainer(
        model=model,
        args=args,
        train_dataset=train_ds.select_columns(
            ["InputText", "MisconceptionName"]
        ),
        loss=loss,
    )

In [None]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)


Step,Training Loss
8,3.7548
16,2.8196
24,0.8816
32,0.4191
40,0.4134
48,0.354
56,0.2421
64,0.2083
72,0.2297
80,0.2363


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
