## 查看数据

In [43]:
import pandas as pd
import os
import re

In [2]:
# Merge the per-article text files into the label table and save it as JSON Lines.
train_dir='data/train'
train_data=pd.read_csv('data/train.csv')
pattern_file = r'^file_\d+\.txt$'
pattern_dir=r'^article_\d+$'
for path in os.listdir(train_dir):
    if not re.match(pattern_dir, path):
        continue
    article_path=os.path.join(train_dir,path)
    # Directory names look like "article_<n>"; <n> is used as the row label in train_data.
    # NOTE(review): this relies on the row labels of train.csv matching the article numbers — confirm.
    article_id=int(path.split('_')[-1])
    for filename in os.listdir(article_path):
        if not re.match(pattern_file, filename):
            continue
        # Decode explicitly so the result does not depend on the platform's default encoding
        # (assumes the corpus is UTF-8 — confirm).
        with open(os.path.join(article_path,filename),'r',encoding='utf-8') as f:
            # The column name is the file stem, e.g. "file_1".
            train_data.loc[article_id,filename.split('.')[0]]=f.read()
train_data.to_json('data/train.json',orient='records',lines=True)

In [44]:
# Collect the test articles into one row per article and save them as JSON Lines.
test_dir='data/test'
pattern_file = r'^file_\d+\.txt$'
pattern_dir=r'^article_\d+$'
records=[]
for path in os.listdir(test_dir):
    if not re.match(pattern_dir, path):
        continue
    article_path=os.path.join(test_dir,path)
    # Directory names look like "article_<n>"; <n> becomes the integer id of the row.
    record={'id':int(path.split('_')[-1])}
    for filename in os.listdir(article_path):
        if re.match(pattern_file, filename):
            # Decode explicitly so the result does not depend on the platform's default encoding
            # (assumes the corpus is UTF-8 — confirm).
            with open(os.path.join(article_path,filename),'r',encoding='utf-8') as f:
                record[filename.split('.')[0]]=f.read()
    # As before, an article directory without any matching file contributes no row.
    if len(record)>1:
        records.append(record)
# Build the frame once from the collected records instead of enlarging it cell by cell:
# this is linear in the number of files and keeps `id` as an integer column.
# Rows stay labelled by article id, as they were with the .loc-based construction.
test_data=pd.DataFrame(records,index=[record['id'] for record in records])
test_data.to_json('data/test.json',orient='records',lines=True)

## 准备构造微调数据集

In [91]:
# Load the paired extra training set (JSON Lines, one record per line) and preview the first rows.
train_data=pd.read_json('data/extra_train.json',orient='records',lines=True)
train_data.head()

Unnamed: 0,file_1,file_2,real_text_id,id
0,"Believe it or not, unless you directly contac...",This could include filing a lawsuit against t...,1,0
1,"Sanskrit, as one of the oldest Indo-European ...",Medically speaking a seizure is when the elec...,2,1
2,A Shirley Temple short subject would likely c...,- If you criminalize alcohol / drug use or sm...,2,2
3,It 's three fold : * Stuff is cheaper to mass...,Miami's vibrant music scene has long been a m...,1,3
4,That 's the problem of consciousness : we jus...,Different religions have different ways of pr...,1,4


In [92]:
train_data['real_text_id'].value_counts()

real_text_id
1    6546
2    6546
Name: count, dtype: int64

In [93]:
# Read the prompt template; its {TEXT1} / {TEXT2} placeholders are filled in later with str.format.
# The encoding is given explicitly so the template is decoded the same way on every platform
# (assumes the file is UTF-8 — confirm).
with open('sys_prompt.txt','r',encoding='utf-8') as f:
    sys_prompt=f.read()

In [94]:
sys_prompt

'Your task: Compare Text 1 and Text 2, then determine which one is factually correct (true) and which has been modified (false).\n\nText 1:\n<text1>\n{TEXT1}\n</text1>\n\nText 2:\n<text2>\n{TEXT2}\n</text2>\n\nEvaluation criteria:\n- Check for factual accuracy\n- Verify logical consistency\n- Cross-reference with common knowledge\n\nIMPORTANT: Your final answer MUST be EITHER "1" OR "2". No explanations, comments, or additional information. Only the number.'

In [95]:
def _fill_template(row):
    """Render the prompt template with the two candidate texts of one row."""
    return sys_prompt.format(TEXT1=row['file_1'],TEXT2=row['file_2'])

# One prompt per pair; the id of the real text becomes the (string) completion target.
train_data['prompt'] = train_data.apply(_fill_template, axis=1)
train_data = (
    train_data
    .rename(columns={'real_text_id':'completion'})
    .astype({'completion': str})
    [['prompt','completion']]
)
train_data.to_json('data/extra_train_processed.json',orient='records',lines=True)

In [60]:
## Test set
# Reload the merged test articles. `records` is the orient value pandas documents and the one
# used by every other read_json / to_json call in this notebook.
test_data=pd.read_json('data/test.json',orient='records',lines=True)

In [61]:
def _render_test_prompt(row):
    """Render the prompt template with the two candidate texts of one test row."""
    return sys_prompt.format(TEXT1=row['file_1'],TEXT2=row['file_2'])

test_data['prompt'] = test_data.apply(_render_test_prompt, axis=1)
# Keep only what inference needs, ordered by article id.
test_data = test_data[['id','prompt']].sort_values(by='id')
test_data.to_csv('data/test_processed.csv',header=True,index=False)

In [59]:
test_data

Unnamed: 0,id,prompt
0,581,"Your task: Compare Text 1 and Text 2, then det..."
1,207,"Your task: Compare Text 1 and Text 2, then det..."
2,656,"Your task: Compare Text 1 and Text 2, then det..."
3,134,"Your task: Compare Text 1 and Text 2, then det..."
4,360,"Your task: Compare Text 1 and Text 2, then det..."
...,...,...
1063,516,"Your task: Compare Text 1 and Text 2, then det..."
1064,448,"Your task: Compare Text 1 and Text 2, then det..."
1065,947,"Your task: Compare Text 1 and Text 2, then det..."
1066,820,"Your task: Compare Text 1 and Text 2, then det..."


In [8]:
train_data.head()

Unnamed: 0,prompt,completion
0,"Your task: Compare Text 1 and Text 2, then det...",1
1,"Your task: Compare Text 1 and Text 2, then det...",2
2,"Your task: Compare Text 1 and Text 2, then det...",1
3,"Your task: Compare Text 1 and Text 2, then det...",2
4,"Your task: Compare Text 1 and Text 2, then det...",2


In [9]:
# Find the longest prompt (in characters) to get a feel for the required context length.
prompt_lengths = train_data['prompt'].str.len()
longest_idx = prompt_lengths.idxmax()
print("最长的prompt，id为：",longest_idx)
print("最长的prompt，长度为：",int(prompt_lengths.loc[longest_idx]))

最长的prompt，id为： 83
最长的prompt，长度为： 42413


## 先尝试推理一波

In [14]:
# Directory holding the locally downloaded checkpoints, and the checkpoint used for this baseline.
model_dir='/data/download-model'
model_name='Qwen3-0.6B'

In [15]:
import os

In [16]:
model_path=os.path.join(model_dir,model_name)

In [17]:
# from unsloth import FastLanguageModel
# import torch

# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = model_path,
#     max_seq_length = 40960,   # Context length - can be longer, but uses more memory
#     # load_in_4bit = True,     # 4bit uses much less memory
#     # load_in_8bit = False,    # A bit more accurate, uses 2x memory
#     # full_finetuning = False, # We have full finetuning now!
#     # token = "hf_...",      # use one if using gated models
# )

In [25]:
from transformers import pipeline,AutoModelForCausalLM,AutoTokenizer
from peft import PeftModel

# Load the tokenizer and causal-LM weights from the local checkpoint directory.
tokenizer=AutoTokenizer.from_pretrained(model_path)
# The model is moved to GPU index 7; the device is hard-coded here.
model=AutoModelForCausalLM.from_pretrained(model_path).to('cuda:7')

# Disabled: attach the LoRA adapter stored under model/sft and merge it into the base weights.
# model=PeftModel.from_pretrained(model,'model/sft')
# model=model.merge_and_unload()

In [26]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline on GPU 7.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=7,
    max_new_tokens=4,   # the prompt asks for a bare "1" or "2", so a few new tokens are enough
)

Device set to use cuda:7


In [27]:
import pandas as pd


def _as_chat_text(row):
    """Wrap one row's prompt as a single user turn and render it with the chat template.

    The template is returned as text (not token ids), ends with the generation
    prompt, and has thinking disabled.
    """
    conversation = [{'role':'user','content':row['prompt']}]
    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )


train_data=pd.read_json('data/train_processed.json',orient='records',lines=True)

print(train_data.columns)
train_data['text']=train_data.apply(_as_chat_text, axis=1)

Index(['prompt', 'completion'], dtype='object')


In [28]:
# Generate a continuation for every rendered prompt; all prompts are passed to the pipeline as one list.
outputs=generator(train_data['text'].tolist())

In [29]:
# Keep only the newly generated part of each result by cutting off as many leading
# characters as the corresponding prompt has (generated_text is expected to start with the prompt).
prompt_texts = train_data['text']
outputs = [
    generations[0]['generated_text'][len(prompt_texts.loc[position]):]
    for position, generations in enumerate(outputs)
]

In [30]:
train_data['predict']=outputs

In [31]:
train_data['completion']=train_data['completion'].astype(str)
# Compare on whitespace-stripped text so that a prediction such as "1\n" is not counted as wrong
# merely because of surrounding whitespace in the generated continuation.
predicted_label = train_data['predict'].astype(str).str.strip()
expected_label = train_data['completion'].str.strip()
accuracy = (predicted_label == expected_label).mean()
print(f"准确率: {accuracy:.5f}")

准确率: 0.46316


直接使用未微调的 Qwen3-0.6B 模型，在训练集上的准确率只有 0.46316，甚至低于二分类随机猜测的 0.5；测试集上的表现估计还会更低一些，距离最高分 0.93153 还差得很远。
- SFT 微调后的准确率为 0.48421，相比未微调基本没有提升。

## 尝试直接用sft微调

In [1]:
from trl import SFTConfig,SFTTrainer
from datasets import load_dataset
from peft import LoraConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the prompt/completion records as a Hugging Face dataset (single "train" split).
# NOTE(review): no visible cell writes data/train_processed.json (the preprocessing cell earlier
# in this notebook writes data/extra_train_processed.json) — confirm where this file comes from.
train_ds=load_dataset('json',data_files='data/train_processed.json',split='train')

In [3]:
train_ds

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 95
})

In [32]:
import torch
import os

# Point the fine-tuning run at the Qwen3-4B checkpoint in the local model directory.
model_dir='/data/download-model'
model_name='Qwen3-4B'
model_path=os.path.join(model_dir,model_name)

In [33]:
# from unsloth import FastLanguageModel

# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = model_path,
#     # max_seq_length = 40960,   # Context length - can be longer, but uses more memory
#     # load_in_4bit = True,     # 4bit uses much less memory
#     # load_in_8bit = False,    # A bit more accurate, uses 2x memory
#     # full_finetuning = False, # We have full finetuning now!
#     # token = "hf_...",      # use one if using gated models
# )

In [34]:
from transformers import pipeline,AutoModelForCausalLM,AutoTokenizer
from peft import get_peft_model

# Load the tokenizer and base weights for fine-tuning; the model is moved to GPU index 7.
# NOTE(review): the OOM traceback recorded for trainer.train() later in this notebook goes through
# torch.nn.parallel and reports "replica 0 on device 0", so training does not appear to stay on
# cuda:7. Restricting the visible GPUs (e.g. CUDA_VISIBLE_DEVICES) before CUDA is initialised may
# be needed — confirm.
tokenizer=AutoTokenizer.from_pretrained(model_path)
model=AutoModelForCausalLM.from_pretrained(model_path).to('cuda:7')

Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  2.33it/s]


In [35]:
# 源码里有apply_chat_template
# train_ds=train_ds.map(lambda example:{
#     'prompt':tokenizer.apply_chat_template(
#         [
#             {'role':'user','content':example['prompt']}
#         ],
#         tokenize=False,
#         add_generation_prompt=True,
#         enable_thinking=False
#     )}
# )

In [36]:
# LoRA adapter settings for the SFT run: rank-16 adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down), with dropout on the adapter input.
lora_config=LoraConfig(
    r = 16,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0.1,
)

In [37]:
import swanlab

# Record which base model this run uses in the swanlab experiment config.
swanlab.config.update({
    "model": model_name
})

In [38]:
# Training hyper-parameters for the SFT run. No output_dir or maximum sequence length is set here,
# so the library defaults apply to both.
# NOTE(review): the trainer log below shows a "Truncating train dataset" step, while the longest
# prompt measured earlier is 42413 characters. If truncation removes the end of a sequence, the
# completion (the only part that contributes to the loss) could be cut off — confirm the effective
# maximum length for the installed TRL version.
sft_config = SFTConfig(
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1, # Use GA to mimic batch size!
    warmup_steps = 5,
    num_train_epochs = 10, 
    learning_rate = 5e-5, 
    report_to = "swanlab", 
    completion_only_loss=True  # compute the loss on the completion part only
)

average_tokens_across_devices is set to True but it is invalid when world size is1. Turn it to False automatically.


In [39]:
# Assemble the SFT trainer: base model, tokenizer, prompt/completion dataset, training arguments,
# and the LoRA configuration (passed as peft_config rather than wrapping the model beforehand).
trainer=SFTTrainer(
    model = model,
    processing_class = tokenizer,
    train_dataset = train_ds,
    args=sft_config,
    peft_config=lora_config,
)

Tokenizing train dataset: 100%|██████████| 95/95 [00:00<00:00, 226.87 examples/s]
Truncating train dataset: 100%|██████████| 95/95 [00:00<00:00, 26855.76 examples/s]
No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [40]:
trainer.train()

OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/parallel/parallel_apply.py", line 97, in _worker
    output = module(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/peft/peft_model.py", line 818, in forward
    return self.get_base_model()(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/transformers/utils/generic.py", line 969, in wrapper
    output = func(self, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 730, in forward
    outputs: BaseModelOutputWithPast = self.model(
                                       ^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/transformers/utils/generic.py", line 969, in wrapper
    output = func(self, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 463, in forward
    layer_outputs = decoder_layer(
                    ^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/transformers/modeling_layers.py", line 48, in __call__
    return super().__call__(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 300, in forward
    hidden_states = self.mlp(hidden_states)
                    ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 90, in forward
    down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/peft/tuners/lora/layer.py", line 727, in forward
    result = result + lora_B(lora_A(dropout(x))) * scaling
                                    ^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/modules/dropout.py", line 70, in forward
    return F.dropout(input, self.p, self.training, self.inplace)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fine/uv/transformers/lib/python3.11/site-packages/torch/nn/functional.py", line 1425, in dropout
    _VF.dropout_(input, p, training) if inplace else _VF.dropout(input, p, training)
                                                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 38.00 MiB. GPU 0 has a total capacity of 94.99 GiB of which 7.31 MiB is free. Process 2156823 has 75.40 GiB memory in use. Including non-PyTorch memory, this process has 19.57 GiB memory in use. Of the allocated memory 18.42 GiB is allocated by PyTorch, and 175.58 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


In [13]:
# Save the trained model to model/sft/ (the directory the disabled PeftModel loader in the
# inference section points at).
trainer.save_model('model/sft/')

## 改为对两段文本分别打分，得分高的为true

In [4]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer,AutoTokenizer

In [43]:
# Local checkpoint for the classification experiment.
model_path='/data/download-model/Qwen3-0.6B'
# The last path component doubles as the run name and output sub-directory further down.
model_name=model_path.rsplit('/',1)[-1]

In [None]:
# Load the tokenizer and the base model with a two-way sequence-classification head.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(
    model_path, 
    num_labels=2,  # binary classification
)
# The classification head pools the hidden state of the last non-padding token, which it locates
# through config.pad_token_id. The inputs in this notebook are padded to a fixed max_length, so
# without a pad id the head would read a padding position instead of the end of the text.
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at /data/download-model/Qwen3-0.6B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [54]:
from peft import LoraConfig,get_peft_model

# LoRA adapter settings for the classifier: rank-16 adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down).
lora_config=LoraConfig(
    # Declaring the task lets PEFT treat the classification head (`score`) as a module to train
    # and to save with the adapter. Without it only the LoRA matrices are trainable, and the
    # freshly initialised head reported in the loading warning stays frozen and is never saved.
    task_type = "SEQ_CLS",
    r = 16,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0.1,
)

In [55]:
# Wrap the classifier with the LoRA adapters; `model` is rebound to the PEFT-wrapped model.
model=get_peft_model(model,lora_config)

In [7]:
import pandas as pd
from datasets import Dataset

In [30]:
data_path='data/train.json'

In [48]:
train_pd=pd.read_json('data/train.json',orient='records',lines=True)

In [49]:
train_pd

Unnamed: 0,id,real_text_id,file_1,file_2
0,0,1,The VIRSA (Visible Infrared Survey Telescope A...,The China relay network has released a signifi...
1,1,2,China\nThe goal of this project involves achie...,The project aims to achieve an accuracy level ...
2,2,1,Scientists can learn about how galaxies form a...,Dinosaur eggshells offer clues about what dino...
3,3,2,China\nThe study suggests that multiple star s...,The importance for understanding how stars evo...
4,4,2,Dinosaur Rex was excited about his new toy set...,Analyzing how fast stars rotate within a galax...
...,...,...,...,...
90,90,2,A main focus of modern cosmology is to underst...,A key focus of modern cosmology is to understa...
91,91,1,"APEX, as its name suggests, serves as a guide ...","APEX, as its name suggests, serves as a guide ..."
92,92,2,FORS1 and FORS2 are early instruments of the V...,FORS1 and FORS2 are early instruments of the V...
93,93,2,The observations of the Pluto-Charon system an...,The observations of the Pluto-Charon binary an...


In [50]:
# Per-file binary targets: 1 when that file is the real text of the pair, 0 otherwise.
real_ids = train_pd['real_text_id']
train_pd['file_1_label'] = real_ids.eq(1).astype(int)
train_pd['file_2_label'] = real_ids.eq(2).astype(int)

In [51]:
# Reshape the paired frame into one (text, labels) row per file:
# all file_1 rows first, followed by all file_2 rows, with a fresh 0..N-1 index.
_long_parts = [
    train_pd[[text_col, label_col]].rename(columns={text_col: 'text', label_col: 'labels'})
    for text_col, label_col in (('file_1', 'file_1_label'), ('file_2', 'file_2_label'))
]
train_ds = pd.concat(_long_parts, ignore_index=True)

In [53]:
train_ds.to_json('data/train_text_labels.json',orient='records',lines=True)

In [37]:
# Find the longest single text (in characters) among the per-file rows.
text_lengths = train_ds['text'].str.len()
longest_idx = text_lengths.idxmax()
print("最长的prompt，id为：",longest_idx)
print("最长的prompt，长度为：",int(text_lengths.loc[longest_idx]))

最长的prompt，id为： 83
最长的prompt，长度为： 40316


In [38]:
# Convert the pandas frame into a Hugging Face Dataset. The name train_ds is rebound from the
# DataFrame to the Dataset, so this cell cannot simply be re-run on its own output.
train_ds = Dataset.from_pandas(train_ds)

In [39]:
train_ds

Dataset({
    features: ['text', 'labels'],
    num_rows: 190
})

In [40]:
def preprocess_function(examples):
    """Tokenize the ``text`` entry of a (batched) example mapping.

    Args:
        examples: mapping with a ``"text"`` entry, as handed over by ``Dataset.map``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    # Truncation and padding both use a fixed length of 40960.
    # NOTE(review): padding every example to that length makes all sequences as long as the
    # maximum, which is costly in memory — confirm this is intended.
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 40960,
    }
    return tokenizer(examples["text"], **tokenizer_options)

In [41]:
# Tokenize the whole dataset in batches; the tokenizer outputs are added as new columns.
tokenized_dataset = train_ds.map(preprocess_function, batched=True)

Map: 100%|██████████| 190/190 [00:02<00:00, 72.21 examples/s]


In [42]:
tokenized_dataset

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 190
})

In [44]:
import swanlab

# Record which base model this classification run uses in the swanlab experiment config.
swanlab.config.update({
    "model": model_name
})

In [45]:
# Training hyper-parameters for the classifier run; metrics are reported to swanlab and
# checkpoints go to a per-model directory under /data/finetuning/tof.
training_args = TrainingArguments(
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1, # Use GA to mimic batch size!
    warmup_steps = 5,
    num_train_epochs = 10, 
    learning_rate = 5e-5, 
    report_to = "swanlab", 
    run_name=model_name,
    output_dir=f"/data/finetuning/tof/{model_name}/cls",
)

In [46]:
# Create the Trainer instance. Only the model, the arguments and the tokenized dataset are
# passed; no tokenizer or data collator is given, so the Trainer's defaults are used for batching.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

[2025-07-12 11:37:08,343] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


[2025-07-12 11:37:09,717] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False


In [47]:
trainer.train()

[1m[34mswanlab[0m[0m: swanlab version 0.6.6 is available!  Upgrade: `pip install -U swanlab`    
[1m[34mswanlab[0m[0m: Tracking run with swanlab version 0.6.4                                   
[1m[34mswanlab[0m[0m: Run data will be saved locally in [35m[1m/home/fine/work/fake-or-real/swanlog/run-20250712_113737-a3b1799d[0m[0m
[1m[34mswanlab[0m[0m: 👋 Hi [1m[39mzhangdw156[0m[0m, welcome to swanlab!
[1m[34mswanlab[0m[0m: Syncing run [33mQwen3-0.6B[0m to the cloud
[1m[34mswanlab[0m[0m: 🏠 View project at [34m[4mhttps://swanlab.cn/@zhangdw156/fake-or-real[0m[0m
[1m[34mswanlab[0m[0m: 🚀 View run at [34m[4mhttps://swanlab.cn/@zhangdw156/fake-or-real/runs/02vqs1cv4nocgz77re690[0m[0m


OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB. GPU 3 has a total capacity of 94.99 GiB of which 7.38 MiB is free. Process 3956245 has 92.28 GiB memory in use. Including non-PyTorch memory, this process has 2.69 GiB memory in use. Of the allocated memory 1.54 GiB is allocated by PyTorch, and 65.23 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Save the trained classifier under model/<model_name>/cls/ (the location the validation
# section's lora_path refers to for Qwen3-0.6B).
trainer.save_model(f'model/{model_name}/cls/')

## 验证分类模型

In [13]:
import os
import argparse
from transformers import pipeline,AutoModelForSequenceClassification,AutoTokenizer
from peft import PeftModel
import logging
import pandas as pd

In [14]:
# Base checkpoint, its short name (last path component), and the LoRA adapter directory.
model_path="/data/download-model/Qwen3-0.6B"
model_name=model_path.rsplit('/',1)[-1]
lora_path="model/Qwen3-0.6B/cls"

In [25]:
# Load the tokenizer and the base model with a two-way classification head for validation.
tokenizer=AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(
    model_path, 
    num_labels=2,  # binary classification
)

# NOTE(review): loading the LoRA adapter is disabled below, so the scores computed in this section
# come from the base model with the newly initialised classification head reported in the loading
# warning, not from a fine-tuned classifier.
# model=PeftModel.from_pretrained(model,lora_path)
# model=model.merge_and_unload()

Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at /data/download-model/Qwen3-0.6B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Text-classification pipeline around the loaded model and tokenizer.
# NOTE(review): later cells read the label-1 score positionally (row[1]); newer transformers
# releases document `top_k=None` in place of `return_all_scores`, which may order results by
# score instead — confirm against the installed version before switching.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,  # return the scores of every class (the probabilities of labels 0 and 1)
)

Device set to use cuda:0


In [27]:
train_df=pd.read_json('data/train_text_labels.json',orient='records',lines=True)

In [28]:
# Replace empty strings with a single space before the texts are handed to the pipeline.
train_df['text'] = train_df['text'].replace('', ' ')
# train_text_labels.json stores all file_1 rows first and all file_2 rows second, in equal
# numbers, so the split point is half the frame length rather than a hard-coded row count.
half = len(train_df)//2
train_df_1=train_df.iloc[:half]
train_df_2=train_df.iloc[half:]

In [29]:
# Score the file_1 texts and the file_2 texts separately; each call returns one result per text.
outpus_1=classifier(train_df_1['text'].tolist())
outpus_2=classifier(train_df_2['text'].tolist())

In [30]:
def _second_entry_scores(pipeline_outputs):
    """Return, for every text, the score of the second entry in its result list (label 1)."""
    return [scores[1]['score'] for scores in pipeline_outputs]

predict_1 = _second_entry_scores(outpus_1)
predict_2 = _second_entry_scores(outpus_2)

In [31]:
train_csv=pd.read_csv('data/train.csv')

In [32]:
# Attach the label-1 scores of file_1 and file_2 to the label table, row by row, and display it.
train_csv['predict_1']=predict_1
train_csv['predict_2']=predict_2
train_csv

Unnamed: 0,id,real_text_id,predict_1,predict_2
0,0,1,0.995256,0.976252
1,1,2,0.992228,0.980944
2,2,1,0.993001,0.994762
3,3,2,0.943174,0.978689
4,4,2,0.997313,0.988094
...,...,...,...,...
90,90,2,0.979684,0.980632
91,91,1,0.992949,0.226817
92,92,2,0.759456,0.980790
93,93,2,0.997498,0.995895


In [33]:
import numpy as np

# Predict 1 when file_1 scores strictly higher than file_2, otherwise 2 (ties go to 2).
first_scores_higher = train_csv['predict_1'].gt(train_csv['predict_2'])
train_csv['predict'] = np.where(first_scores_higher, 1, 2)
train_csv

Unnamed: 0,id,real_text_id,predict_1,predict_2,predict
0,0,1,0.995256,0.976252,1
1,1,2,0.992228,0.980944,1
2,2,1,0.993001,0.994762,2
3,3,2,0.943174,0.978689,2
4,4,2,0.997313,0.988094,1
...,...,...,...,...,...
90,90,2,0.979684,0.980632,2
91,91,1,0.992949,0.226817,1
92,92,2,0.759456,0.980790,2
93,93,2,0.997498,0.995895,1


In [34]:
# Share of pairs for which the predicted id equals the true id.
train_csv['real_text_id']=train_csv['real_text_id'].astype(int)
is_correct = train_csv['real_text_id'].eq(train_csv['predict'])
accuracy = is_correct.mean()
print(f"在训练集上的准确率为: {accuracy:.5f}")

在训练集上的准确率为: 0.57895


## 用sft模型进行推理

## 把额外数据构造成本比赛的格式

注意：本节生成的 `data/extra_train.json` 会被前面“准备构造微调数据集”一节读取，因此需要先运行本节，再运行那一节。

In [67]:
import pandas as pd
import numpy as np
import os

In [68]:
extra_path='data/extra_train.jsonl'

In [69]:
extra_data=pd.read_json(extra_path,orient='records',lines=True)

In [70]:
extra_data

Unnamed: 0,text,label
0,S. cities. Here's more about Seattle's snowfa...,1
1,This paper delves into the interpretability a...,1
2,You could sell this toy to all your friends a...,1
3,"My time to shine . The term "" Rockefeller Rep...",0
4,This case raised several ethical and legal qu...,1
...,...,...
27995,Each letter of the alphabet is assigned to a ...,1
27996,That's because people need to make sure that ...,1
27997,"As a general rule, you must choose between a ...",0
27998,This can be used for international trade and ...,1


In [71]:
# Split the extra corpus by label into two independent copies.
# Judging by the variable names, label 0 is human-written and label 1 is AI-generated — confirm.
extra_labels = extra_data['label']
man_extra_data = extra_data.loc[extra_labels.eq(0)].copy()  # every row with label == 0
ai_extra_data = extra_data.loc[extra_labels.eq(1)].copy()  # every row with label == 1

In [77]:
# Maximum number of pairs per pairing type: half the size of the smaller label pool, so that
# two pairing types can each take this many rows from either pool.
max_pairs_per_type = min(len(man_extra_data), len(ai_extra_data))//2  # maximum count for each pairing type

In [78]:
max_pairs_per_type

6546

In [85]:
# Build balanced (file_1, file_2, real_text_id) pairs from the two label pools.
PAIR_SEED = 42  # fixed so that the generated pairs are reproducible across runs

# Shuffle each pool once. The two pairing types then take non-overlapping slices of the shuffled
# pools, so no row is used in more than one pair (independent draws from the full pools could
# pick the same row for both types).
man_texts = man_extra_data['text'].sample(frac=1, random_state=PAIR_SEED).values
ai_texts = ai_extra_data['text'].sample(frac=1, random_state=PAIR_SEED + 1).values
n_pairs = max_pairs_per_type
assert 2 * n_pairs <= min(len(man_texts), len(ai_texts)), "not enough rows for disjoint pairs"

# Type 1: file_1 is a label-0 text, file_2 is a label-1 text -> the real text is file 1.
pairs_type1 = pd.DataFrame({
    'file_1': man_texts[:n_pairs],
    'file_2': ai_texts[:n_pairs],
    'real_text_id':1
})

# Type 2: file_1 is a label-1 text, file_2 is a label-0 text -> the real text is file 2.
pairs_type2 = pd.DataFrame({
    'file_1': ai_texts[n_pairs:2 * n_pairs],
    'file_2': man_texts[n_pairs:2 * n_pairs],
    'real_text_id':2
})

# Combine both pairing types and shuffle the row order.
final_pairs = pd.concat([pairs_type1, pairs_type2], ignore_index=True)
final_pairs = final_pairs.sample(frac=1, random_state=PAIR_SEED + 2).reset_index(drop=True)

In [86]:
# 查看结果
final_pairs

Unnamed: 0,file_1,file_2,real_text_id
0,"Believe it or not, unless you directly contac...",This could include filing a lawsuit against t...,1
1,"Sanskrit, as one of the oldest Indo-European ...",Medically speaking a seizure is when the elec...,2
2,A Shirley Temple short subject would likely c...,- If you criminalize alcohol / drug use or sm...,2
3,It 's three fold : * Stuff is cheaper to mass...,Miami's vibrant music scene has long been a m...,1
4,That 's the problem of consciousness : we jus...,Different religions have different ways of pr...,1
...,...,...,...
13087,"The deadline to mail is February 15. However,...","For years, he was a ghost, a rumor whispered ...",1
13088,"There are many different systems , but the ba...",The company has reportedly pushed back the re...,1
13089,Barrels breathe . They expand and contract wi...,Here’s how you might continue: 1. Reservation...,1
13090,"An activity tracker, also known as a fitness ...",While you haven't mentioned the title of the ...,1


In [87]:
final_pairs["id"] = final_pairs.index

In [89]:
# Persist the pairs as JSON Lines; this is the file the fine-tuning data preparation section reads.
final_pairs.to_json('data/extra_train.json',orient='records',lines=True)

## 未完待续

In [96]:
import time

In [103]:
begin_time=time.time()

In [104]:
end_time=time.time()

In [105]:
f"处理时间为: {(end_time-begin_time):.2f}"

'处理时间为: 24.07'