In [1]:
from datasets import load_dataset
import torch

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
torch.cuda.is_available()

True

In [4]:
ds = load_dataset("qiaojin/PubMedQA", "pqa_unlabeled")


In [5]:
print(ds)

DatasetDict({
    train: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer'],
        num_rows: 61249
    })
})


In [6]:
print(ds['train'][0]['context']['contexts'])

['Although the use of alternative medicine in the United States is increasing, no published studies have documented the effectiveness of naturopathy for treatment of menopausal symptoms compared to women receiving conventional therapy in the clinical setting.', 'To compare naturopathic therapy with conventional medical therapy for treatment of selected menopausal symptoms.', 'A retrospective cohort study, using abstracted data from medical charts.', 'One natural medicine and six conventional medical clinics at Community Health Centers of King County, Washington, from November 1, 1996, through July 31, 1998.', 'Women aged 40 years of age or more with a diagnosis of menopausal symptoms documented by a naturopathic or conventional physician.', 'Improvement in selected menopausal symptoms.', 'In univariate analyses, patients treated with naturopathy for menopausal symptoms reported higher monthly incomes ($1848.00 versus $853.60), were less likely to be smokers (11.4% versus 41.9%), exerci

In [7]:
SYSTEM_PROMPT = (
    "A conversation between User and Assistant. The user asks a question, and given a context, the Assistant solves it. The assistant "
    "first thinks about the context, then reasoning process in the mind and then provides the user with the answer. The reasoning "
    "process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
    "<think> reasoning process here </think><answer> answer here </answer>"
)

def make_conversation(example):
    return {
        "prompt": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": "".join(example["context"]['contexts'])},
            {"role": "user", "content": example["question"]},
        ],
    }

dds = ds.map(make_conversation)

In [8]:
dds

DatasetDict({
    train: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer', 'prompt'],
        num_rows: 61249
    })
})

In [9]:
dds['train'][0]['prompt']

[{'content': 'A conversation between User and Assistant. The user asks a question, and given a context, the Assistant solves it. The assistant first thinks about the context, then reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>',
  'role': 'system'},
 {'content': 'Although the use of alternative medicine in the United States is increasing, no published studies have documented the effectiveness of naturopathy for treatment of menopausal symptoms compared to women receiving conventional therapy in the clinical setting.To compare naturopathic therapy with conventional medical therapy for treatment of selected menopausal symptoms.A retrospective cohort study, using abstracted data from medical charts.One natural medicine and six conventional medical clinics at Community Health C

In [10]:
dds['train'][1]

{'pubid': 14499049,
 'question': 'Can randomised trials rely on existing electronic data?',
 'context': {'contexts': ['To estimate the feasibility, utility and resource implications of electronically captured routine data for health technology assessment by randomised controlled trials (RCTs), and to recommend how routinely collected data could be made more effective for this purpose.',
   'Four health technology assessments that involved patients under care at five district general hospitals in the UK using four conditions from distinct classical specialties: inflammatory bowel disease, obstructive sleep apnoea, female urinary incontinence, and total knee replacement. Patient-identifiable, electronically stored routine data were sought from the administration and clinical database to provide the routine data.',
   'Four RCTs were replicated using routine data in place of the data already collected for the specific purpose of the assessments. This was done by modelling the research pro

In [11]:
train_dataset = dds.remove_columns(['pubid', 'question', 'context'])
print(train_dataset)

DatasetDict({
    train: Dataset({
        features: ['long_answer', 'prompt'],
        num_rows: 61249
    })
})


In [12]:

from transformers import AutoModelForCausalLM

model_id = "Qwen/Qwen2-0.5B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="auto",
)

In [13]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
)

model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

trainable params: 540,672 || all params: 494,573,440 || trainable%: 0.1093


In [14]:
import re

def format_reward(completions, **kwargs):
    pattern = r"^<think>.*?</think>\s*<answer>.*?</answer>$"
    completion_contents = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, content) for content in completion_contents]
    rewards_list = [1.0 if match else 0.0 for match in matches]
    return [1.0 if match else 0.0 for match in matches]

In [15]:
from sentence_transformers import SentenceTransformer, util

# Load a pretrained SBERT model
encoding_model = SentenceTransformer('all-MiniLM-L6-v2')

def get_similarity(paragraph1, paragraph2):
    embedding1 = encoding_model.encode(paragraph1, convert_to_tensor=True)
    embedding2 = encoding_model.encode(paragraph2, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(embedding1, embedding2).item()
    return similarity

def reward_function(completions, **kwargs):
    long_answers = kwargs["long_answer"]
    completion_contents = [completion[0]["content"] for completion in completions]
    rewards = []
    for content, long_answer in zip(completion_contents, long_answers):
        similarity = get_similarity(content, long_answer)
        if similarity > 0.9:
            rewards.append(1.)
        elif similarity > 0.7:
            rewards.append(0.5)
        elif similarity > 0.5:
            rewards.append(0.0)
        else:
            rewards.append(-1.0)
    return rewards

In [16]:
from trl import GRPOConfig

# Configure training arguments using GRPOConfig
training_args = GRPOConfig(
    output_dir="Qwen2-0.5B-GRPO-test",
    learning_rate=1e-5,
    remove_unused_columns=False, # to access the solution column in accuracy_reward
    gradient_accumulation_steps=16,
    num_train_epochs=1,
    bf16=True,

    # Parameters that control de data preprocessing
    max_completion_length=64, # default: 256
    num_generations=4, # default: 8
    max_prompt_length=128, # default: 512

    # Parameters related to reporting and saving
    report_to=["tensorboard"],
    logging_steps=10,
    push_to_hub=True,
    save_strategy="steps",
    save_steps=10,
)

In [19]:
from trl import GRPOTrainer

trainer = GRPOTrainer(
    model=model,
    reward_funcs=[format_reward, reward_function],
    args=training_args,
    train_dataset=train_dataset['train']
)

In [20]:
trainer.train()

Step,Training Loss
10,0.0


KeyboardInterrupt: 

In [None]:
trainer.save_model(training_args.output_dir)