In [1]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq,AutoTokenizer
import torch
import re
from tqdm import tqdm
import json



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_dir = "/home/un/桌面/QC/qwen2_5/Qwen2.5-7B-Instruct"

### 加载模型和分词器
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True)

Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.45it/s]


### 加载数据集

In [3]:
# train_dir="/home/un/桌面/QC/2024_全国大数据智能大赛/复赛_code/data/step_1_5000_submit_cot_without_rule_id_with_problem.json"
# train_dir="/home/un/桌面/QC/2024_全国大数据智能大赛/new_复赛_code/data/72b_train_data.json"
train_dir="/home/un/桌面/QC/2024_全国大数据智能大赛/new_复赛_code/data/72b+o1_train_data1.json"

In [4]:
def process_func(example):
    """
    将数据集进行预处理
    """
    # global i
    MAX_LENGTH = 512 
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(
        f"<|system|>\n你是一个逻辑推理专家，擅长解决逻辑推理问题。以下是一个逻辑推理的题目，形式为单项选择题。所有的问题都是（close-world assumption）闭世界假设，即未观测事实都为假。请逐步分析问题并在最后一行输出答案，最后一行的格式为:答案是：A。<|endoftext|>\n<|user|>\n{example['question_text']}<|endoftext|>\n<|assistant|>\n",
        add_special_tokens=False,
    )
    response = tokenizer(f"{example['answer']}", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = (
        instruction["attention_mask"] + response["attention_mask"] + [1]
    )
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
    if len(input_ids) > MAX_LENGTH:  # 做一个截断
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
        print("1111")
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}   

In [5]:
model.enable_input_require_grads()

In [6]:
import pandas as pd
from datasets import Dataset
train_df = pd.read_json(train_dir)
train_ds = Dataset.from_pandas(train_df)

In [7]:
# import pandas as pd
# from datasets import Dataset
# data_name="step1"
# if data_name=="step1":
#     train_dir="/home/un/桌面/QC/2024_全国大数据智能大赛/复赛_code/data/step_1_5000_submit_cot_without_rule_id_with_problem.json"
#     train_df = pd.read_json(train_dir)
#     train_df=train_df.loc[:4499].sample(int(4500*0.7))
#     print(len(train_df))
#     train_df=pd.concat([train_df,pd.read_json(train_data_dir)])
#     train_df.reset_index(inplace=True)
#     train_df.drop("index",axis=1,inplace=True)
# if data_name=="all_90%_data":
#     train_dir="/home/un/桌面/QC/2024_全国大数据智能大赛/复赛_code/train_models/all_datas.json"
#     train_df = pd.read_json(train_dir)
#     train_df=train_df.loc[500:].sample(int(4500*0.7))
#     print(len(train_df))
#     train_df=pd.concat([train_df,pd.read_json(train_data_dir)])
#     train_df.reset_index(inplace=True)
#     train_df.drop("index",axis=1,inplace=True)

In [8]:
train_df

Unnamed: 0,question_id,question_text,answer
0,1,问题：在某大型国际机场，一架注册号为B-7389的民用航空客机，在进行长途飞行任务中，遇到突...,D
1,2,问题：在关于风暴潮、海浪、海啸和海冰灾害的应急响应启动前期工作中，某海洋管理机构收到了关于预...,B
2,3,问题：设想一个大型地震震中位于人口稠密的城市，造成了严重的人员伤亡和财产损失。地震发生后，地...,D
3,4,问题：在一次事故中，一架民用航空器不幸坠落在距市中心5公里的开阔地带。机上共有乘客和机组人员...,C
4,5,问题：在2023年6月20日，东方石化企业发生了严重的危险化学品泄漏事故，泄漏的化学品因其高...,C
...,...,...,...
9995,4996,问题：在一次发生在2023年5月的重大海上溢油事件中，需要紧急调动各种资源进行处置工作。根据...,C
9996,4997,问题：某城市突然遭遇了一场沙尘暴灾害，这场沙尘暴广泛影响了城市及其周边地区。根据初步统计，此...,B
9997,4998,问题：在一座人口密集的大城市中，突然爆发了一种肺鼠疫病例，这种疾病是由Yersinia pe...,A
9998,4999,问题：在江南省发生了8.0级的地震，地震影响范围广泛，造成严重的人员伤亡和财产损失。根据应急...,B


In [9]:
train_ds = Dataset.from_pandas(train_df)

In [10]:
train_dataset = train_ds.map(process_func, remove_columns=train_ds.column_names)

Map: 100%|██████████| 10000/10000 [00:35<00:00, 281.73 examples/s]


In [11]:
from peft import LoraConfig, TaskType, get_peft_model
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

### 加载微调模型

In [12]:
model = get_peft_model(model, config)

### 配置超参数

In [13]:
args = TrainingArguments(
    output_dir="72b+o1_model2/qwen2_5_7b_lora", #记得每一次修改文件
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    logging_steps=2000,
    num_train_epochs=2,
    save_steps=2000,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True,
    report_to="none",
    # fp16=True,
    save_total_limit=2,
    # seed=2024
)

### 开始训练

In [14]:
from transformers import DataCollatorWithPadding
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

trainer.train()

[2024-11-08 21:01:05,092] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  @autocast_custom_fwd
  @autocast_custom_bwd
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
2000,0.371
4000,0.2601
6000,0.2411
8000,0.2095
10000,0.1851
12000,0.0555
14000,0.0642
16000,0.0672
18000,0.0659
20000,0.0572


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enab

TrainOutput(global_step=20000, training_loss=0.15767269325256347, metrics={'train_runtime': 2927.3831, 'train_samples_per_second': 6.832, 'train_steps_per_second': 6.832, 'total_flos': 2.3195489831339213e+17, 'train_loss': 0.15767269325256347, 'epoch': 2.0})

In [15]:
def predict(messages, model, tokenizer):
    device = "cuda"
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=384,
        pad_token_id=tokenizer.eos_token_id
        # do_sample=False,
        # temperature=0.7
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    
    # print(response)
     
    return response

### 验证训练集ACC

In [16]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq,AutoTokenizer
import torch
import re
from tqdm import tqdm
import json


# from peft import PeftModel
# model_dir = "/home/un/桌面/QC/qwen2_5/Qwen2.5-7B-Instruct"
# tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True)
# model1 = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True)
# model = PeftModel.from_pretrained(model1, model_id="/home/un/桌面/QC/2024_全国大数据智能大赛/复赛_code/train_models/new_prompts_model/qwen2_5_7b/checkpoint-10000")

In [17]:
import pandas as pd
train_dir="/home/un/桌面/QC/2024_全国大数据智能大赛/data/dev.json"
test_df = pd.read_json(train_dir)

In [18]:
rag_data=pd.read_json("/home/un/桌面/QC/2024_全国大数据智能大赛/data/rules1.json")

In [19]:
rag_data.loc[[10,20,30]]

Unnamed: 0,rule_id,rule_text
10,11,化学品登记中心的职责：负责建立化学品基本数据库，为事故救援和调查处理提供相关化学品基本数据与信息。
20,21,《国家危险化学品事故灾难应急预案》的启动条件：事故等级达到Ⅱ级或省级人民政府应急预案启动后，...
30,31,当确定危险化学品事故为爆炸事故后，现场紧急处置的具体方案为：（1）确定爆炸地点；（2）确定爆...


In [20]:
test_pred_list = []
test_label_list=[]
k=0
for index, row in tqdm(test_df.iterrows()):
    instruction = "你是一位经验丰富的应急响应专家，擅长解决应急场景的问题。以下是一个逻辑推理的题目，形式为单项选择题。"
    input_value = row['question_text']
    rag_prompt="\n以下是相关上下文：\n"
    # print(row["rule_id"])
    rag_data_index=row["rule_id"]
    # # print(rag_data_index)
    # # print(len(rag_data_index))
    # # content_value= rag_data.loc[int()-1].values[1]
    rag_datas=[rag_data.loc[int(i)-1].values[1] for i in rag_data_index]
    # print('\n'.join(rag_datas))
    test_label_list.append(row["answer"])
    # messages = [
    #     {"role": "system", "content": f"{instruction}"},
    #     {"role": "user", "content": f"{input_value+rag_prompt+'\n'.join(rag_datas)}"}
    # ]
    messages = [
        {"role": "system", "content": f"{instruction}"},
        {"role": "user", "content": f"{input_value}"}
    ]
    response = predict(messages, model, tokenizer)
    test_pred_list.append(response)

0it [00:00, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


  return fn(*args, **kwargs)
500it [00:43, 11.57it/s]


In [21]:
set(test_pred_list)

{'A', 'B', 'C', 'D'}

In [22]:
from sklearn.metrics import accuracy_score
accuracy_score(test_label_list,test_pred_list)

0.95

: 