In [None]:
import os
import re
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl.trainer import GRPOConfig, GRPOTrainer
# --- Run configuration ---
# Path of the base model checkpoint and the dataset identifier. Only the last
# "/"-separated component of each is used when building names below.
model_name = "/mnt/d/program/pycharmProgram/models/Qwen2.5-0.5B-Instruct"
dataset_name = "_ABSA_/SFT_RL"
# In the cells of this notebook, this flag only affects the `way` label used in run_name.
full_finetuning = True
way = 'full' if full_finetuning else 'lora'
output_dir = f"outputs/{model_name.split('/')[-1]}-GRPO"
# The last word of the GPU name tags the run. Querying the device name raises
# when CUDA is unavailable, so fall back to "cpu" to keep this config cell
# runnable on any host (e.g. when only inspecting data or reward functions).
device_tag = (
    torch.cuda.get_device_name(torch.device('cuda:0')).split(' ')[-1]
    if torch.cuda.is_available()
    else 'cpu'
)
run_name = f"{os.name}-{model_name.split('/')[-1]}-{dataset_name.split('/')[-1]}-{device_tag}-{way}"

In [None]:
# CUDA allocator setting (memory-related), exported through the environment.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

# Length limits, shared with the (currently disabled) vLLM settings below.
max_prompt_length = 2000
max_completion_length = 3000

training_args = GRPOConfig(
    # Naming / output location
    output_dir=output_dir,
    run_name=run_name,

    # Optimizer and learning-rate schedule
    optim="adamw_8bit",
    learning_rate=1e-5,
    adam_beta1=0.9,
    adam_beta2=0.99,
    weight_decay=0.1,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    max_grad_norm=0.1,

    # GRPO-specific settings
    beta=0.005,
    num_generations=4,  # group size
    max_prompt_length=max_prompt_length,
    max_completion_length=max_completion_length,

    # Batch shape and training length
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=1,

    # Precision and memory
    bf16=True,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},

    # Logging and checkpointing
    logging_steps=1,
    save_steps=100,
    report_to="none",
    log_on_each_node=False,

    # Generation backend: vLLM is turned off; the settings once used with it
    # are kept below for reference.
    use_vllm=False,
    # vllm_init_kwargs={
    #     "device": "cuda:0",
    #     "gpu_memory_utilization": 0.3,
    #     "max_model_len": max_prompt_length + max_completion_length,
    #     "dtype": "half",
    #     # "enable_chunked_prefill": True,
    #     # "max_num_batched_tokens": 2048,
    # },
    # logit_computation_mini_batch_size=1,
    # enable_profiling=False
)

In [None]:
# Tokenizer for the base checkpoint; its maximum length is tied to the
# completion-length limit configured in training_args.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    model_max_length=training_args.max_completion_length,
)
# Use the end-of-sequence token for padding as well.
tokenizer.pad_token = tokenizer.eos_token

# Policy model, loaded in bfloat16 with FlashAttention-2 attention.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
    use_cache=False,  # original note: set when using flash_attn
)

In [None]:
import re
from datasets import load_dataset, Dataset

# Load and prep dataset
# System-prompt pieces; both are currently empty. They are joined with a
# newline to form the system message of every prompt.
R1_STYLE_SYSTEM_PROMPT = ""

TASK_SPECIFIC_INSTRUCTIONS = ""

def get_ABSA_questions(split="train") -> Dataset:
    """Load the ABSA JSON data and attach a chat-style prompt to every record.

    Reads ``_train.json`` with the ``json`` loader and maps each batch to two
    columns:

    * ``prompt`` - a four-message conversation: the system message
      (``R1_STYLE_SYSTEM_PROMPT`` + newline + ``TASK_SPECIFIC_INSTRUCTIONS``),
      a fixed user/assistant example turn, and the stripped ``question``
      as the final user message.
    * ``answer`` - the record's ``answer`` values, copied through.

    Args:
        split: Accepted for interface compatibility; not used by the body.

    Returns:
        The result of ``.map`` on the loaded data.
        NOTE(review): ``load_dataset`` is called without a split and the caller
        indexes the result with ``['train']``, so this is presumably a
        split-keyed mapping rather than a single ``Dataset`` - confirm.
    """

    def build_conversation(question):
        # One fresh message list per question.
        return [
            {'role': 'system', 'content': R1_STYLE_SYSTEM_PROMPT + "\n" + TASK_SPECIFIC_INSTRUCTIONS},
            {'role': 'user', 'content': ""},
            {'role': 'assistant', 'content': "<reasoning></answer>"},
            {'role': 'user', 'content': question.strip()},
        ]

    def process_batch(batch):
        return {
            'prompt': [build_conversation(question) for question in batch['question']],
            'answer': list(batch['answer']),
        }

    # Replace the file name with your own JSON data.
    raw_data = load_dataset('json', data_files='_train.json')
    return raw_data.map(process_batch, batched=True, batch_size=100)

def answer_correctness_func(prompts, completions, answer, **kwargs):
    """Exact-match correctness reward.

    For every completion, takes the ``content`` of its first message, extracts
    the answer with ``extract_xml_answer`` and compares it to the reference.

    Args:
        prompts: Conversations that produced the completions; the last message
            of the first one is printed for inspection.
        completions: One message list per sample; only ``[0]['content']`` is read.
        answer: Reference answers, paired with ``completions`` by position.
        **kwargs: Ignored.

    Returns:
        A list with ``1.0`` where the extracted answer equals the reference
        and ``0.0`` elsewhere.
    """
    responses = [messages[0]['content'] for messages in completions]
    extracted_responses = list(map(extract_xml_answer, responses))
    reward_score = [
        1.0 if predicted == reference else 0.0
        for predicted, reference in zip(extracted_responses, answer)
    ]
    # Debug trace of the first sample in the batch.
    print(f"Question: {prompts[0][-1]['content']}\nAnswer: {answer[0]}\nResponse: {responses[0]}\nExtracted: {extracted_responses[0]}")
    return reward_score

# Training split of the prompt-augmented ABSA data.
dataset = get_ABSA_questions()["train"]


def extract_xml_answer(text: str) -> str:
    """Return the stripped text inside the last ``<answer>...</answer>`` pair.

    Everything up to and including the last ``<answer>`` is dropped, then
    everything from the first following ``</answer>`` onward. A missing tag
    simply leaves that side of the text untouched.
    """
    _, _, after_open_tag = text.rpartition("<answer>")
    inner, _, _ = after_open_tag.partition("</answer>")
    return inner.strip()


# F1 reward
# strict F1 reward func
# compare strict F1 and soft F1

def format_reward_func(completions, **kwargs) -> list[float]:
    """Reward completions that follow the strict reasoning/answer layout.

    A completion scores ``1.0`` only if its whole text is
    ``<reasoning>...</reasoning>``, a single newline, then
    ``<answer>...</answer>``, where neither section contains its own closing
    tag. Anything else scores ``0.0``.

    Args:
        completions: One message list per sample; only ``[0]["content"]`` is read.
        **kwargs: Ignored.

    Returns:
        One score (``1.0`` or ``0.0``) per completion, in order.
    """
    pattern = r"^<reasoning>(?:(?!</reasoning>).)*</reasoning>\n<answer>(?:(?!</answer>).)*</answer>$"
    responses = [completion[0]["content"] for completion in completions]
    # re.DOTALL is required: without it "." stops at newlines, so any
    # multi-line reasoning or answer could never match and always scored 0.
    return [1.0 if re.match(pattern, r, flags=re.DOTALL) else 0.0 for r in responses]

def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Lenient format reward: partial credit for a loosely correct layout.

    A completion scores ``0.5`` if it starts with a
    ``<reasoning>...</reasoning>`` section followed, after optional
    whitespace, by an ``<answer>...</answer>`` section. Unlike the strict
    check, any amount of whitespace may separate the sections and trailing
    text after ``</answer>`` is tolerated. Anything else scores ``0.0``.

    Args:
        completions: One message list per sample; only ``[0]["content"]`` is read.
        **kwargs: Ignored.

    Returns:
        One score (``0.5`` or ``0.0``) per completion, in order.
    """
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    # re.DOTALL lets ".*?" span newlines; without it multi-line reasoning or
    # answers never matched and this reward was always 0.
    return [0.5 if re.match(pattern, r, flags=re.DOTALL) else 0.0 for r in responses]

In [None]:
# Initialize trainer
# Rewards: the strict layout check plus the exact-match answer check.
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[
        format_reward_func,
        # This list previously named `strict_pair_correctness_reward_func`,
        # which is not defined in any cell of this notebook and raised a
        # NameError on a fresh run. `answer_correctness_func` is the
        # correctness reward the notebook actually defines; swap in a
        # pair-level (F1) reward here once it is implemented.
        answer_correctness_func,
    ],
    args=training_args,
    train_dataset=dataset,
)
trainer.train()