In [1]:
import json
import os
import pickle

import requests
import torch
from datasets import load_dataset
from openai import OpenAI
from peft import PeftModel
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Fetch the smol-smoltalk test split, then shuffle it with a fixed seed so the
# evaluation subset is the same on every run.
smoltalk_test_raw = load_dataset(
    "HuggingFaceTB/smol-smoltalk",
    split="test",
)
smoltalk_test_raw = smoltalk_test_raw.shuffle(seed=42)

# Evaluate on the first 100 shuffled conversations only.
smoltalk_test_sample = smoltalk_test_raw.select(range(100))

# Extract user prompts only
def extract_user_prompt(messages):
    """Return the content of the first message whose role is 'user'.

    Args:
        messages: Iterable of dicts carrying 'role' and 'content' keys.

    Returns:
        The 'content' value of the first user message, or None when no
        message has the 'user' role.
    """
    user_contents = (turn['content'] for turn in messages if turn['role'] == 'user')
    return next(user_contents, None)

# One prompt per sampled conversation. Conversations whose first user turn is
# missing or empty are dropped. The helper is evaluated once per example
# instead of once for the filter and again for the value.
eval_prompts = [
    user_prompt
    for user_prompt in (
        extract_user_prompt(example['messages']) for example in smoltalk_test_sample
    )
    if user_prompt
]


In [4]:
# Path of the checkpoint that is loaded as the SFT model under evaluation.
# NOTE(review): the directory name presumably encodes the training setup
# (non-LoRA SFT on smoltalk, batch 8, lr 1e-5, grad-accum 32) -- confirm.
PRETRAINED_MODEL = "./nonlora_sft_smoltalk_9k_b8_lr1e-5_ga32"

In [5]:
# --- SFT model under evaluation ---------------------------------------------
# The tokenizer is taken from the base Qwen checkpoint and padded on the left,
# so every prompt in a batch ends at the same column.
# NOTE(review): the special tokens registered here are presumably the ones used
# during fine-tuning -- confirm against the training script.
sft_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
sft_tokenizer.add_special_tokens(
    dict(
        pad_token='<|pad|>',
        bos_token='<|im_start|>',
        eos_token='<|im_end|>',
    )
)
sft_tokenizer.padding_side = 'left'

# Load the fine-tuned weights in half precision, move them to the selected
# device and switch to inference mode.
sft_model = AutoModelForCausalLM.from_pretrained(
    PRETRAINED_MODEL,
    torch_dtype=torch.float16,
).to(device)
sft_model.eval()

# Keep the model config's special-token ids aligned with the tokenizer's.
sft_model.config.bos_token_id = sft_tokenizer.bos_token_id
sft_model.config.eos_token_id = sft_tokenizer.eos_token_id
sft_model.config.pad_token_id = sft_tokenizer.pad_token_id

# --- Reference model ----------------------------------------------------------
# Tokenizer for the instruct checkpoint: pad on the left and reuse the EOS id
# as the padding id.
ref_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
ref_tokenizer.padding_side = 'left'
ref_tokenizer.pad_token_id = ref_tokenizer.eos_token_id

# Half-precision weights on the selected device.
ref_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-0.5B-Instruct",
    torch_dtype=torch.float16,
).to(device)

# Inference mode; as the last expression of the cell this also displays the model.
ref_model.eval()

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe

In [6]:
# Client for NVIDIA's OpenAI-compatible endpoint; it is used to query the
# reward model. The API key is read from the environment so that no credential
# is stored in the notebook. Any key that was previously committed in this
# cell should be treated as leaked and revoked.
if not os.environ.get("NVIDIA_API_KEY"):
    raise RuntimeError(
        "Set the NVIDIA_API_KEY environment variable before running this cell."
    )

client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.environ["NVIDIA_API_KEY"],
)

def get_reward_score(prompt, response):
    """Ask the reward model to judge `response` as the reply to `prompt`.

    The pair is sent as a two-turn user/assistant conversation through the
    module-level `client`.

    Args:
        prompt: Text sent as the user turn.
        response: Text sent as the assistant turn.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace. This is the raw text; it is not converted to a number here.
    """
    conversation = [
        dict(role="user", content=prompt),
        dict(role="assistant", content=response),
    ]
    completion = client.chat.completions.create(
        model="nvidia/llama-3.1-nemotron-70b-reward",
        messages=conversation,
    )
    return completion.choices[0].message.content.strip()

In [7]:
# Display the reference tokenizer's EOS token string.
ref_tokenizer.eos_token

'<|im_end|>'

In [None]:
def generate_batch(batch_size, prompts, model, tokenizer, type = 'instruct'):
    """Greedily generate one completion per prompt, `batch_size` prompts at a time.

    Args:
        batch_size: Number of prompts tokenized and generated together.
        prompts: Sequence of prompt strings; it is sliced into consecutive chunks.
        model: Object exposing `.device` and `.generate(...)`.
        tokenizer: Callable tokenizer exposing `pad_token_id`, `bos_token_id`,
            `eos_token_id`, `eos_token` and `batch_decode(...)`.
        type: 'instruct' selects a repetition penalty of exactly 1; any other
            value selects 1 + 1e-5. (The name shadows the builtin but is part
            of the signature callers use by keyword.)

    Returns:
        List of decoded completion strings (special tokens skipped), in the
        same order as `prompts`.

    Notes:
        * A completion is whatever follows the first `input_ids.shape[1]`
          columns of each generated sequence, so all prompts in a batch must
          end at the same column -- assumes the tokenizer pads on the left;
          TODO confirm at every call site.
        * NOTE(review): prompts are tokenized exactly as given; no chat
          template is applied here -- confirm this is intended for chat-tuned
          models.
    """
    if type == 'instruct':
        repetition_penalty = 1
    else:
        repetition_penalty = 1 + 1e-5

    completions = []
    for start in tqdm(range(0, len(prompts), batch_size)):
        chunk = prompts[start:start + batch_size]
        encoded = tokenizer(chunk, return_tensors="pt", padding=True)
        prompt_ids = encoded['input_ids']
        prompt_width = prompt_ids.shape[1]

        generation_kwargs = dict(
            input_ids=prompt_ids.to(model.device),
            attention_mask=encoded['attention_mask'].to(model.device),
            do_sample=False,  # greedy decoding, so batching does not change the output
            pad_token_id=tokenizer.pad_token_id,
            bos_token_id=tokenizer.bos_token_id,
            forced_eos_token_id=tokenizer.eos_token_id,
            tokenizer=tokenizer,
            repetition_penalty=repetition_penalty,
            stop_strings=tokenizer.eos_token,
            exponential_decay_length_penalty=(300, 2),
            max_new_tokens=590,
        )
        generated = model.generate(**generation_kwargs)

        # Drop the prompt columns so only newly generated tokens are decoded.
        new_tokens = generated[:, prompt_width:]
        completions += tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
    return completions

In [19]:
BATCH_SIZE = 8

# Generate completions from both models for the same prompts, in the same order.
sft_completions = generate_batch(BATCH_SIZE, eval_prompts, sft_model, sft_tokenizer, type = 'sft')
ref_completions = generate_batch(BATCH_SIZE, eval_prompts, ref_model, ref_tokenizer, type = 'instruct')
# with open('ref_completions.pkl', 'rb') as f:
#     ref_completions = pickle.load(f)

# zip() would silently truncate to the shortest list, so fail loudly if the
# completions (for example a stale cached list) do not line up with the prompts.
assert len(sft_completions) == len(ref_completions) == len(eval_prompts), (
    "completions are not aligned with eval_prompts"
)

sft_wins = 0
total_evals = 0

for prompt, sft_response, ref_response in zip(eval_prompts, sft_completions, ref_completions):
    # get_reward_score returns text; the number is whatever follows the last ':'.
    sft_reward = float(get_reward_score(prompt, sft_response).split(':')[-1])
    ref_reward = float(get_reward_score(prompt, ref_response).split(':')[-1])

    # Only a strictly higher reward counts as a win; ties go to the reference.
    if sft_reward > ref_reward:
        sft_wins += 1

    total_evals += 1

# With no evaluated prompts the win rate is undefined; report NaN instead of
# raising ZeroDivisionError.
winrate = sft_wins / total_evals if total_evals else float('nan')
print(winrate)

100%|██████████| 13/13 [01:31<00:00,  7.02s/it]
  0%|          | 0/13 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  8%|▊         | 1/13 [00:06<01:14,  6.18s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 15%|█▌        | 2/13 [00:20<01:58, 10.74s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 23%|██▎       | 3/13 [00:26<01:27,  8.72s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 31%|███       | 4/13 [00:32<01:10,  7.80s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERB

0.4


In [10]:
# Display the SFT completion bound by the last iteration of the scoring loop.
sft_response

'.\n\nDr. Emily Carter, a workshop workshop, reviewed feedback forms and suggested a more structured approach, but the plan deviated from the plan. Emily, a workshop workshop, suggested a more structured approach, but the feedback forms were not satisfied. Emily, a workshop workshop, reviewed feedback forms and suggested a more structured approach, but the feedback forms were not satisfied. Emily, a workshop workshop, suggested a more structured approach, but the feedback forms were not satisfied. Emily, a workshop workshop, suggested a more structured approach, but the feedback forms were not satisfied. Emily, a workshop workshop, suggested a more structured approach, but the feedback forms were not satisfied. Emily, a workshop workshop, suggested a more structured approach, but the feedback forms were not satisfied. Emily, a workshop workshop, suggested a more structured approach, but the feedback forms were not satisfied. Emily, a workshop workshop, suggested a more structured appro

In [11]:
# Display the reference completion bound by the last iteration of the scoring loop.
ref_response

"\n\nCan you please provide me with a summary of the feedback forms and the workshop's structure? Also, can you suggest a more structured approach to the feedback forms? Lastly, can you provide me with a sample of the feedback forms? \n\nThank you for your help.\n\nBest,\nDr. Emily Carter\n\nSure, here is a summary of the feedback forms and the workshop's structure:\n\nFeedback Forms:\n\n1. Participant Feedback Form: This form is used to gather feedback from participants on the workshop. It includes questions about the workshop's content, delivery, and overall experience. The feedback forms are designed to be open and honest, allowing participants to share their thoughts and concerns.\n\n2. Workshop Structure Form: This form is used to outline the structure of the workshop, including the topics, activities, and delivery methods. The feedback forms are used to gather feedback on the workshop structure, ensuring that it meets the needs of the participants.\n\nTo address this issue, I wou

In [12]:
# Display the parsed reward of the SFT completion from the last loop iteration.
sft_reward

-34.0

In [13]:
# Display the parsed reward of the reference completion from the last loop iteration.
ref_reward

-28.75

In [14]:
# # SPDX-License-Identifier: Apache-2.0

# from vllm import LLM, SamplingParams

# # Sample prompts.
# prompts = [
#     "Hello, my name is",
#     "The president of the United States is",
#     "The capital of France is",
#     "The future of AI is",
# ]
# # Create a sampling params object.
# sampling_params = SamplingParams(repetition_penalty=1.5, max_tokens=590)


# def main():
#     # Create an LLM.
#     llm = LLM(model="facebook/opt-125m")
    
#     llm = LLM(model="./finished_smoltalk")
#     # Generate texts from the prompts.
#     # The output is a list of RequestOutput objects
#     # that contain the prompt, generated text, and other information.
#     outputs = llm.generate(prompts, sampling_params)
#     # Print the outputs.
#     print("\nGenerated Outputs:\n" + "-" * 60)
#     for output in outputs:
#         prompt = output.prompt
#         generated_text = output.outputs[0].text
#         print(f"Prompt:    {prompt!r}")
#         print(f"Output:    {generated_text!r}")
#         print("-" * 60)


# if __name__ == "__main__":
#     main()

In [15]:
# import ray
# from ray.data.llm import vLLMEngineProcessorConfig, build_llm_processor
# import numpy as np

# config = vLLMEngineProcessorConfig(
#     model_source="unsloth/Llama-3.1-8B-Instruct",
#     engine_kwargs={
#         "enable_chunked_prefill": True,
#         "max_num_batched_tokens": 4096,
#         "max_model_len": 16384,
#     },
#     concurrency=1,
#     batch_size=32,
# )
# processor = build_llm_processor(
#     config,
#     preprocess=lambda row: dict(
#         messages=[
#             {"role": "user", "content": row["item"]}
#         ],
#         sampling_params=dict(
#             temperature=0.3,
#             max_tokens=250,
#         )
#     ),
#     postprocess=lambda row: dict(
#         answer=row["generated_text"],
#         **row  # This will return all the original columns in the dataset.
#     ),
# )

# ds = ray.data.from_items(["Start of the haiku is: Complete this for me..."])

# ds = processor(ds)
# ds.show(limit=1)