## 查看数据

In [1]:
import pandas as pd
import os
import re

In [2]:
train_dir='data/train'
train_data=pd.read_csv('data/train.csv')
pattern_file = r'^file_\d+\.txt$'
pattern_dir=r'^article_\d+$'
for path in os.listdir(train_dir):
    if re.match(pattern_dir, path):
        article_path=os.path.join(train_dir,path)
        # print(article_path)
        for filename in os.listdir(article_path):
            if re.match(pattern_file, filename):
                # print(filename,int(article_path.split('_')[-1]))
                with open(os.path.join(article_path,filename),'r') as f:
                    train_data.loc[int(article_path.split('_')[-1]),filename.split('.')[0]]=f.read()
train_data.to_json('data/train.json',orient='records',lines=True)

In [3]:
test_dir='data/test'
test_data=pd.DataFrame()
pattern_file = r'^file_\d+\.txt$'
pattern_dir=r'^article_\d+$'
for path in os.listdir(test_dir):
    if re.match(pattern_dir, path):
        article_path=os.path.join(test_dir,path)
        # print(article_path)
        for filename in os.listdir(article_path):
            if re.match(pattern_file, filename):
                # print(filename,int(article_path.split('_')[-1]))
                with open(os.path.join(article_path,filename),'r') as f:
                    test_data.loc[int(article_path.split('_')[-1]),filename.split('.')[0]]=f.read()
test_data.to_json('data/test.json',orient='records',lines=True)

## 准备构造微调数据集

In [4]:
train_data.head()

Unnamed: 0,id,real_text_id,file_1,file_2
0,0,1,The VIRSA (Visible Infrared Survey Telescope A...,The China relay network has released a signifi...
1,1,2,China\nThe goal of this project involves achie...,The project aims to achieve an accuracy level ...
2,2,1,Scientists can learn about how galaxies form a...,Dinosaur eggshells offer clues about what dino...
3,3,2,China\nThe study suggests that multiple star s...,The importance for understanding how stars evo...
4,4,2,Dinosaur Rex was excited about his new toy set...,Analyzing how fast stars rotate within a galax...


In [5]:
sys_prompt=''
with open('sys_prompt.txt','r') as f:
    sys_prompt=f.read()

In [6]:
sys_prompt

'Your task: Compare Text 1 and Text 2, then determine which one is factually correct (true) and which has been modified (false).\n\nText 1:\n<text1>\n{TEXT1}\n</text1>\n\nText 2:\n<text2>\n{TEXT2}\n</text2>\n\nEvaluation criteria:\n- Check for factual accuracy\n- Verify logical consistency\n- Cross-reference with common knowledge\n\nIMPORTANT: Your final answer MUST be EITHER "1" OR "2". No explanations, comments, or additional information. Only the number.'

In [7]:
train_data['prompt'] = train_data.apply(lambda row: sys_prompt.format(TEXT1=row['file_1'],TEXT2=row['file_2']), axis=1)
train_data=train_data.rename(columns={'real_text_id':'completion'})
train_data['completion']=train_data['completion'].astype(str)
train_data=train_data[['prompt','completion']]
train_data.to_json('data/train_processed.json',orient='records',lines=True)

In [8]:
train_data.head()

Unnamed: 0,prompt,completion
0,"Your task: Compare Text 1 and Text 2, then det...",1
1,"Your task: Compare Text 1 and Text 2, then det...",2
2,"Your task: Compare Text 1 and Text 2, then det...",1
3,"Your task: Compare Text 1 and Text 2, then det...",2
4,"Your task: Compare Text 1 and Text 2, then det...",2


In [9]:
longest_idx = train_data['prompt'].str.len().idxmax()
print("最长的prompt，id为：",longest_idx)
print("最长的prompt，长度为：",len(train_data.loc[longest_idx, 'prompt']))

最长的prompt，id为： 83
最长的prompt，长度为： 42413


## 先尝试推理一波

In [14]:
model_dir='/data/download-model'
model_name='Qwen3-0.6B'

In [15]:
import os

In [16]:
model_path=os.path.join(model_dir,model_name)

In [17]:
# from unsloth import FastLanguageModel
# import torch

# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = model_path,
#     max_seq_length = 40960,   # Context length - can be longer, but uses more memory
#     # load_in_4bit = True,     # 4bit uses much less memory
#     # load_in_8bit = False,    # A bit more accurate, uses 2x memory
#     # full_finetuning = False, # We have full finetuning now!
#     # token = "hf_...",      # use one if using gated models
# )

In [25]:
from transformers import pipeline,AutoModelForCausalLM,AutoTokenizer
from peft import PeftModel

tokenizer=AutoTokenizer.from_pretrained(model_path)
model=AutoModelForCausalLM.from_pretrained(model_path).to('cuda:7')

# model=PeftModel.from_pretrained(model,'model/sft')
# model=model.merge_and_unload()

In [26]:
generator = pipeline(
    task="text-generation",  # 指定任务类型
    model=model,             # 传入预加载的模型
    max_new_tokens=4,         # 其他参数
    tokenizer=tokenizer,
    device=7
)

Device set to use cuda:7


In [27]:
import pandas as pd

train_data=pd.read_json('data/train_processed.json',orient='records',lines=True)

print(train_data.columns)
train_data['text']=train_data.apply(
    lambda row:
        tokenizer.apply_chat_template(
            [
                {'role':'user','content':row['prompt']}
            ],
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False
        ),
    axis=1
)

Index(['prompt', 'completion'], dtype='object')


In [28]:
outputs=generator(train_data['text'].tolist())

In [29]:
outputs=[row[0]['generated_text'][len(train_data.loc[idx,'text']):] for idx,row in enumerate(outputs)]

In [30]:
train_data['predict']=outputs

In [31]:
train_data['completion']=train_data['completion'].astype(str)
accuracy = (train_data['predict'] == train_data['completion']).mean()
print(f"准确率: {accuracy:.5f}")

准确率: 0.46316


直接使用0.6B的模型，在训练集只有0.46316的准确率，如果是测试集的话，大概更低一点，距离最高分0.93153还差得远呢
- sft， 0.48421？这不是基本没有提升吗？

## 尝试直接用sft微调

In [1]:
from trl import SFTConfig,SFTTrainer
from datasets import load_dataset
from peft import LoraConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_ds=load_dataset('json',data_files='data/train_processed.json',split='train')

In [3]:
train_ds

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 95
})

In [32]:
import torch
import os

model_dir='/data/download-model'
model_name='Qwen3-4B'
model_path=os.path.join(model_dir,model_name)

In [33]:
# from unsloth import FastLanguageModel

# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = model_path,
#     # max_seq_length = 40960,   # Context length - can be longer, but uses more memory
#     # load_in_4bit = True,     # 4bit uses much less memory
#     # load_in_8bit = False,    # A bit more accurate, uses 2x memory
#     # full_finetuning = False, # We have full finetuning now!
#     # token = "hf_...",      # use one if using gated models
# )

In [34]:
from transformers import pipeline,AutoModelForCausalLM,AutoTokenizer
from peft import get_peft_model

tokenizer=AutoTokenizer.from_pretrained(model_path)
model=AutoModelForCausalLM.from_pretrained(model_path).to('cuda:7')

Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  2.33it/s]


In [35]:
# 源码里有apply_chat_template
# train_ds=train_ds.map(lambda example:{
#     'prompt':tokenizer.apply_chat_template(
#         [
#             {'role':'user','content':example['prompt']}
#         ],
#         tokenize=False,
#         add_generation_prompt=True,
#         enable_thinking=False
#     )}
# )

In [36]:
lora_config=LoraConfig(
    r = 16,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0.1,
)

In [37]:
import swanlab

swanlab.config.update({
    "model": model_name
})

In [38]:
sft_config = SFTConfig(
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1, # Use GA to mimic batch size!
    warmup_steps = 5,
    num_train_epochs = 10, 
    learning_rate = 5e-5, 
    report_to = "swanlab", 
    completion_only_loss=True
)

average_tokens_across_devices is set to True but it is invalid when world size is1. Turn it to False automatically.


In [39]:
trainer=SFTTrainer(
    model = model,
    processing_class = tokenizer,
    train_dataset = train_ds,
    args=sft_config,
    peft_config=lora_config,
)

Tokenizing train dataset: 100%|██████████| 95/95 [00:00<00:00, 226.87 examples/s]
Truncating train dataset: 100%|██████████| 95/95 [00:00<00:00, 26855.76 examples/s]
No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [40]:
trainer.train()

OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/parallel/parallel_apply.py", line 97, in _worker
    output = module(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/peft/peft_model.py", line 818, in forward
    return self.get_base_model()(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/transformers/utils/generic.py", line 969, in wrapper
    output = func(self, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 730, in forward
    outputs: BaseModelOutputWithPast = self.model(
                                       ^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/transformers/utils/generic.py", line 969, in wrapper
    output = func(self, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 463, in forward
    layer_outputs = decoder_layer(
                    ^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/transformers/modeling_layers.py", line 48, in __call__
    return super().__call__(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 300, in forward
    hidden_states = self.mlp(hidden_states)
                    ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 90, in forward
    down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/peft/tuners/lora/layer.py", line 727, in forward
    result = result + lora_B(lora_A(dropout(x))) * scaling
                                    ^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/dropout.py", line 70, in forward
    return F.dropout(input, self.p, self.training, self.inplace)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/functional.py", line 1425, in dropout
    _VF.dropout_(input, p, training) if inplace else _VF.dropout(input, p, training)
                                                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 38.00 MiB. GPU 0 has a total capacity of 94.99 GiB of which 7.31 MiB is free. Process 2156823 has 75.40 GiB memory in use. Including non-PyTorch memory, this process has 19.57 GiB memory in use. Of the allocated memory 18.42 GiB is allocated by PyTorch, and 175.58 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


In [13]:
trainer.save_model('model/sft/')