In [None]:
# variables setup

import os
import subprocess
import random
import torch

# ---------------------------------------------------------------------------
# Global configuration shared by every cell of this notebook.
# Lines marked with 👀 must be edited to match the local setup.
# ---------------------------------------------------------------------------

PATH_PRE = "/"  # 👀👀👀 change this to root folder of the code

# --- hardware / cross-validation -------------------------------------------
num_gpu = torch.cuda.device_count()  # Number of GPUs for prediction
CV_fold = 5      # set to 0 to train on full data
valid_fold = 3   # also embedded in MODEL_VERSION below

# --- training data ----------------------------------------------------------
# Pickle file name of the training frame. Alternatives kept from the original:
#   train_data = f"df_ret_long_new_{valid_fold}.pkl"
#   train_data = f"df_ret_long_new_{valid_fold}alt.pkl"
train_data = "df_ret_long.pkl"

# --- inference / negative-mining settings forwarded to get_data.py ----------
infer_batch = 64
infer_max_len = 512
neg_cnt_1 = 80  # appears in the recall training file name (..._recall_top_{neg_cnt_1}.jsonl)
neg_cnt_2 = 40  # appears in the rerank training file name (..._recall_top_{neg_cnt_2}_for_rerank.jsonl)
sentence_pooling_method = "last"  # "mean", "cls"

# --- LoRA settings for the recall model -------------------------------------
lora_r = 32
lora_alpha = 64
lora_target_modules = ",".join(
    ("q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj")
)

# Tag used in every artefact name written or read by the cells below.
MODEL_VERSION = f"fold{valid_fold}_round0"

############################## recall model ##############################
MODEL_PATH = f"{PATH_PRE}/Qwen2.5-14B-Instruct/Qwen2.5-14B-Instruct"  # 👀👀👀 change this to where you save the pre-trained model
lora_path = "none"

############################## rerank model ##############################
rank_model_path = f"{PATH_PRE}/Qwen2.5-32B-Instruct/Qwen2.5-32B-Instruct"  # 👀👀👀 change this to where you save the pre-trained model
rank_lora_path = "none"


In [None]:
# get data
# ==============================

import gc
gc.collect()
torch.cuda.empty_cache()

# 字典 {MisconceptionId1: embedding1, MisconceptionId2: embedding2, ... } 存于 f"{PATH_PRE}/data/{MODEL_VERSION}_misc.pkl"
# jsonl文件 格式{'query':row['lora_query'], 'pos':row['MisconceptionId'], 'neg':row['new_hard_recall_pids'][:100]} 存于 f"{PATH_PRE}/data/{MODEL_VERSION}_recall_top_200.jsonl"
# jsonl文件 格式{'query':query,'pos':pos,'neg':neg,'prompt':"Given a query with a relevant body,along with a title and abstract of paper,determine whether the paper is pertinent to the query by providing a prediction of either 'Yes' or 'No'."}  存于 f"{PATH_PRE}/data/{MODEL_VERSION}_recall_top_100_for_rank.jsonl"
# pickle文件 valid_df

os.chdir(f"{PATH_PRE}/train")
!python -u get_data.py \
    {PATH_PRE} \
    {MODEL_VERSION} \
    {MODEL_PATH} \
    {train_data} \
    {lora_path} \
    {lora_r} \
    {lora_alpha} \
    {lora_target_modules} \
    {CV_fold} \
    {valid_fold} \
    {num_gpu} \
    {infer_batch} \
    {infer_max_len} \
    {sentence_pooling_method}\
    {neg_cnt_1} \
    {neg_cnt_2} \

In [None]:
####### TRAIN RECALL MODEL

import gc
# Free Python garbage and cached CUDA memory before spawning the trainer.
gc.collect()
torch.cuda.empty_cache()

ZERO_STAGE = 2  # DeepSpeed ZeRO (Zero Redundancy Optimizer) stage, forwarded as --zero_stage
MODEL_USE = MODEL_VERSION
OUTPUT = f"{PATH_PRE}/model_save/{MODEL_USE}_recall"
os.makedirs(OUTPUT, exist_ok=True)

# Random rendezvous port, forwarded as --master_port.
MASTER_PORT = random.randint(10000, 65535)
print(f"Master Port: {MASTER_PORT}")

# Build the deepspeed --include spec "localhost:0,1,...,num_gpu-1".
include_devices = ",".join(str(i) for i in range(num_gpu))
include_param = f"localhost:{include_devices}"

# effective train batch size = {batch_size}*{num_gpu}*{grad_accumulation_steps}
batch_size=8  # per-device batch size (train and eval); originally 8
grad_accumulation_steps=2  # number of steps between parameter updates
train_group_size=4  # total positives + negatives per batch (per the original note); forwarded as --train_group_size

lora_learning_rate=5e-5 #default 5e-4 #1e-4,5e-5, 2e-5

os.chdir(f"{PATH_PRE}/train")
# Build the command string. The trailing backslashes inside the f-string join
# all lines into one shell command line.
# NOTE(review): --learning_rate is hard-coded to 1e-4 while --lora_learning_rate
# carries `lora_learning_rate`; how run_recall.py uses each one is not visible here.
command = f"""
deepspeed --master_port {MASTER_PORT} --include {include_param} run_recall.py \
       --path_pre {PATH_PRE} \
       --train_data {PATH_PRE}/data/{MODEL_USE}_recall_top_{neg_cnt_1}.jsonl \
       --CV_fold {CV_fold} \
       --valid_fold {valid_fold} \
       --model_name_or_path {MODEL_PATH} \
       --per_device_train_batch_size {batch_size} \
       --per_device_eval_batch_size {batch_size} \
       --train_group_size {train_group_size} \
       --gradient_accumulation_steps {grad_accumulation_steps} \
       --query_max_len 512 \
       --misc_max_len 256 \
       --earystop 0 \
       --eary_stop_epoch 5 \
       --save_batch_steps 20 \
       --save_per_epoch 1 \
       --num_train_epochs 20 \
       --learning_rate 1e-4 \
       --num_warmup_steps 100 \
       --weight_decay 0.01 \
       --lr_scheduler_type cosine \
       --seed 1236 \
       --zero_stage {ZERO_STAGE} \
       --deepspeed \
       --output_dir {OUTPUT} \
       --gradient_checkpointing \
       --lora_learning_rate {lora_learning_rate} \
       --lora_path {lora_path} \
       --lora_r {lora_r} \
       --lora_alpha {lora_alpha} \
       --lora_target_modules {lora_target_modules}
"""

# Run the command and fail loudly: os.system only *returns* the exit status, so
# without this check a crashed training run would look like a successful cell.
recall_exit_status = os.system(command)
if recall_exit_status != 0:
    raise RuntimeError(
        f"Recall training command failed (os.system returned {recall_exit_status})"
    )

In [None]:
####### TRAIN RERANK MODEL

import os
import subprocess
import random
import torch
import gc
# Free Python garbage and cached CUDA memory before spawning the trainer.
gc.collect()
torch.cuda.empty_cache()

num_gpu = torch.cuda.device_count() # Number of GPUs for prediction
print("GPU count: ", num_gpu)

ZERO_STAGE = 2  # DeepSpeed ZeRO (Zero Redundancy Optimizer) stage; selects stage{ZERO_STAGE}.json below
MODEL_USE = MODEL_VERSION
RERANK_OUTPUT = f"{PATH_PRE}/model_save/{MODEL_USE}_rerank"
os.makedirs(RERANK_OUTPUT, exist_ok=True)

# effective train batch size = {batch_size}*{num_gpu}*{grad_accumulation_steps}
rank_batch_size=4  # per-device train batch size
rank_grad_accumulation_steps=8  # number of steps between parameter updates
rank_train_group_size=4  # total positives + negatives per batch (per the original note); forwarded as --train_group_size
rank_learning_rate=5e-4 #default 1e-3
rank_lora_r=32  # originally 64
rank_lora_alpha=64  # originally 128
rank_lora_target_modules="q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj,lm_head"
# NOTE: the AWQ/merge cell hard-codes r=32, lora_alpha=64 and the same target
# modules in its LoraConfig; keep the two in sync when changing these values.

# Inherited by the torchrun child processes launched below.
os.environ["NCCL_IB_DISABLE"] = "1"
os.environ["NCCL_P2P_DISABLE"] = "1"

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
os.chdir(f"{PATH_PRE}/train/reranker")

# The trailing backslashes inside the f-string join all lines into one shell
# command line.
command = f"""
torchrun --nproc_per_node {num_gpu} \
-m run_reranker \
--path_pre {PATH_PRE} \
--output_dir {RERANK_OUTPUT} \
--overwrite_output_dir \
--model_name_or_path {rank_model_path} \
--train_data {PATH_PRE}/data/{MODEL_USE}_recall_top_{neg_cnt_2}_for_rerank.jsonl \
--learning_rate {rank_learning_rate} \
--num_train_epochs 4 \
--per_device_train_batch_size {rank_batch_size} \
--do_train True \
--gradient_accumulation_steps {rank_grad_accumulation_steps} \
--dataloader_drop_last False \
--query_max_len 512 \
--misc_max_len 256 \
--train_group_size {rank_train_group_size} \
--logging_steps 1 \
--save_strategy epoch \
--save_steps 1 \
--save_total_limit 50 \
--ddp_find_unused_parameters False \
--gradient_checkpointing \
--report_to "none" \
--warmup_ratio 0.05 \
--bf16 \
--lora_rank {rank_lora_r} \
--lora_alpha {rank_lora_alpha} \
--use_flash_attn False \
--target_modules {rank_lora_target_modules} \
--lora_path {rank_lora_path} \
--deepspeed stage{ZERO_STAGE}.json \
"""

# Run the command and fail loudly: os.system only *returns* the exit status, so
# without this check a crashed training run would look like a successful cell.
rerank_exit_status = os.system(command)
if rerank_exit_status != 0:
    raise RuntimeError(
        f"Rerank training command failed (os.system returned {rerank_exit_status})"
    )

In [None]:
# AWQ

import os, glob
import math, re
import torch
import pickle
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
from peft import (LoraConfig, get_peft_model, get_peft_model_state_dict)
from safetensors.torch import safe_open, load_file, save_file

# --- inputs ------------------------------------------------------------------
# 👀👀👀 change this to where you save the pre-trained model
model_path = f"{PATH_PRE}/Qwen2.5-32B-Instruct/Qwen2.5-32B-Instruct"
# 👀👀👀 change this to the path of the lora you want to merge
# NOTE(review): this rebinds `lora_path`, which the first cell sets to "none"
# for the recall model; re-running the data/recall cells after this one would
# pick up this adapter path instead.
lora_path = f"{PATH_PRE}/model_save/rerank_Qwen_4epochs/checkpoint-332/adapter.bin"

# --- outputs -----------------------------------------------------------------
merged_model_path = f"{PATH_PRE}/model_save/reranker_merge"  # where merged model params locate
quant_path = f"{PATH_PRE}/model_save/reranker_AWQ"  # where quantized model params locate
for _out_dir in (merged_model_path, quant_path):
    os.makedirs(_out_dir, exist_ok=True)

# --- hyper-parameters ----------------------------------------------------------
# Wise-FT hyper-parameter balancing zero-shot and fine-tuned weights, range [0, 1].
alpha = 0.8
device = "cuda:0"
# Passed as quant_config to AutoAWQ's quantize().
quant_config = dict(zero_point=True, q_group_size=128, w_bit=4, version="GEMM")


# ⬇️⬇️⬇️ load lora into model and merge and save
print("Load lora into model and merge and save...")

from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
)

# Adapter layout used to wrap the base model so the saved LoRA weights have
# matching parameters to load into. r / lora_alpha / target_modules mirror the
# values set in the rerank training cell (rank_lora_r, rank_lora_alpha,
# rank_lora_target_modules); keep them in sync.
loraConfig = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj","lm_head"],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True)
# Wrap the base model with (freshly initialised) LoRA layers described by loraConfig.
model = get_peft_model(model, loraConfig)

# Load the trained adapter weights. map_location="cpu" keeps the tensors off
# whatever GPU they were saved from; the base model above is loaded without a
# device_map, and load_state_dict copies values into the model's own parameters.
# SECURITY: torch.load unpickles the file -- only load checkpoints you trust.
d = torch.load(lora_path, map_location="cpu")

# Wise-ft implementation: scaling lora_A and lora_B each by sqrt(alpha) scales
# their product (the LoRA update) by alpha; all other tensors are left as-is.
sqrt_alpha = math.sqrt(alpha)
scaled_d = {
    key: value * sqrt_alpha if ("lora_A" in key or "lora_B" in key) else value
    for key, value in d.items()
}

# strict=False is required because the checkpoint holds only adapter tensors,
# so base-model keys are reported as missing. It also silently skips any
# checkpoint key the model does not have, so verify that no LoRA tensor was
# dropped -- otherwise the "merged" model would not contain the trained adapter.
load_result = model.load_state_dict(scaled_d, strict=False)
unloaded_lora_keys = [key for key in load_result.unexpected_keys if "lora_" in key]
if unloaded_lora_keys:
    raise RuntimeError(
        f"{len(unloaded_lora_keys)} LoRA tensors from {lora_path} did not match any "
        f"model parameter (e.g. {unloaded_lora_keys[:3]}); refusing to merge."
    )

# Fold the LoRA update into the base weights and save the plain model.
model = model.merge_and_unload()
model.save_pretrained(merged_model_path, save_embedding_layers=True)
print("Trained model bf16 version merging and saving done!")



# ⬇️⬇️⬇️ calibration data prep
print("calibration data prep...")
sample = 128  # number of training rows drawn as calibration examples

# Fixed random_state so the same calibration rows are drawn on every run.
# NOTE: pd.read_pickle unpickles the file -- only read data you trust.
df = pd.read_pickle(f'{PATH_PRE}/data/{train_data}').sample(n=sample, random_state=42)
# Array of misconception names, indexed positionally by MisconceptionId below.
misconceptions = pd.read_csv(f'{PATH_PRE}/data/misconception_mapping.csv').MisconceptionName.values


def _calibration_chat(row):
    """Build the system/user/assistant chat for one sampled training row."""
    user_text = (
        row["rerank_query"]
        + misconceptions[row["MisconceptionId"]]
        + "\n\nPlease respond with only 'Yes' or 'No'."
    )
    return [
        {"role": "system", "content": "You are a Mathematics teacher. "},
        {"role": "user", "content": user_text},
        {"role": "assistant", "content": "Yes."},
    ]


msgs = [_calibration_chat(row) for _, row in df.iterrows()]

# Render each chat to plain text with the model's chat template.
tokenizer = AutoTokenizer.from_pretrained(model_path)
data = [
    tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=False).strip()
    for msg in msgs
]

    

# ⬇️⬇️⬇️ autoAWQ
print("AWQ...")

import gc

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Drop the reference to the bf16 merged model before loading it again for
# quantization. Without this, the old object stays alive until `model` is
# rebound, i.e. while the second copy is being loaded.
model = None
gc.collect()
torch.cuda.empty_cache()

# Load your tokenizer and model with AutoAWQ
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoAWQForCausalLM.from_pretrained(merged_model_path, device_map="auto", safetensors=True)

# Quantize with the calibration texts in `data`, then write the quantized
# weights and the tokenizer side by side in quant_path.
model.quantize(tokenizer, quant_config=quant_config, calib_data=data)
model.save_quantized(quant_path, safetensors=True, shard_size="4GB")
tokenizer.save_pretrained(quant_path)
print("model saved!!!!!!!!!!!!!!!")