# Imports

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load model 

In [None]:
def load_raw_model(model_path, model_type, device="cuda", checkpoint_path=None):
    """Load a causal LM and its tokenizer, optionally overlaying checkpoint weights.

    Args:
        model_path: Path or hub id passed to ``from_pretrained`` for both the
            model and the tokenizer.
        model_type: When equal to ``"checkpoint"``, the state dict stored at
            ``checkpoint_path`` is loaded on top of the pretrained weights.
            Any other value loads the pretrained weights only.
        device: Value forwarded as ``device_map`` to ``from_pretrained``.
        checkpoint_path: File given to ``torch.load``; required when
            ``model_type == "checkpoint"``.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If ``model_type == "checkpoint"`` but no
            ``checkpoint_path`` was supplied.
    """
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map=device,
        attn_implementation="flash_attention_2",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Batched generation needs a padding token; fall back to EOS when the
    # tokenizer does not define one.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    if model_type == "checkpoint":
        if checkpoint_path is None:
            raise ValueError(
                "checkpoint_path must be provided when model_type == 'checkpoint'"
            )
        # NOTE(security): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        # map_location="cpu" avoids depending on the device the checkpoint was
        # saved from; load_state_dict copies into the model's own parameters.
        state_dict = torch.load(checkpoint_path, map_location="cpu")
        # strict=False: keys missing from / unexpected in the checkpoint are
        # tolerated rather than raising.
        model.load_state_dict(state_dict, strict=False)

    # The original implementation forgot to hand the loaded objects back.
    return model, tokenizer

In [None]:

def do_test_sample(model, tokenizer, questions, max_new_tokens=2048, device="cuda"):
    """Sample one completion per prompt and return the decoded continuations.

    Args:
        model: Object exposing ``generate(...)`` (a causal LM).
        tokenizer: Tokenizer used to encode ``questions`` and decode outputs.
        questions: Prompt string(s) passed directly to the tokenizer.
        max_new_tokens: Maximum number of tokens to generate *beyond* the
            prompt for each sequence.
        device: Device the encoded inputs are moved to before generation.

    Returns:
        List of decoded strings, one per prompt, containing only the newly
        generated text (prompt tokens are stripped).
    """
    # Decoder-only models must be left-padded for batched generation so that
    # every prompt ends at the same position; this is also what makes the
    # `[:, prompt_length:]` slice below select only generated tokens.
    # The tokenizer's own setting is restored afterwards.
    previous_side = getattr(tokenizer, "padding_side", None)
    tokenizer.padding_side = "left"
    try:
        inputs = tokenizer(questions, return_tensors="pt", padding=True)
    finally:
        if previous_side is not None:
            tokenizer.padding_side = previous_side

    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)
    prompt_length = input_ids.shape[1]

    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            # `max_new_tokens` already excludes the prompt, so the prompt
            # length must not be added to it.
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.7,
            temperature=0.95,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Drop the (padded) prompt prefix and keep only the continuation.
    outputs = generation_output[:, prompt_length:]
    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return decoded_outputs

# Prepare the checkpoint

Download:
    1. the raw model from ModelScope
    2. the SFT model from ModelScope
    3. the RL checkpoint from training

and place them in a local path of your choice.

In [None]:
# Local locations of the three model variants; replace with your own paths.
# raw model, e.g. "local_path/to/your/raw/model"
raw_model_path = "/home/fit/alex/.cache/modelscope/hub/models/Qwen/Qwen3-8B"
# SFT model, e.g. "local_path/to/your/sft/model"
sft_model_path = "/WORK/fit/alex/Kaisen/checkpoints/qwen/openr1-2/global_step_1000/huggingface"
# RL model exported in Hugging Face format
rl_model_path = "/home/fit/alex/Kaisen.Yang/CoT Decomposition/temp_huggingface"

# Variant used by the cells below.
model_type = "rl"
model_path = rl_model_path
device = "cuda"

In [None]:
# Load the selected model in bfloat16 onto `device`, using the
# FlashAttention-2 attention implementation.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map=device,
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
)
# The tokenizer is read from the same location as the model weights.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# Single-prompt demo: build a chat prompt, sample a completion, print it.

# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

prompt = "Find the solution to the problem: If x^2 +x + 1 = 2, what is the value of x?"  # input text

messages = [
    {"role": "user", "content": prompt},
]
# Render the conversation as plain text (not token ids), ending with the
# generation prompt for the assistant turn.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,
)
text += "<EXPLORATION>"  # must be appended manually for the SFT and RL models
# Alternative: append "<EXPLORATION></EXPLORATION>" instead to make the model
# go straight to execution without exploration.

# Named `model_inputs` rather than `input` so the builtin `input()` is not
# shadowed for the rest of the session.
model_inputs = tokenizer(text, return_tensors="pt").to(device)
with torch.no_grad():
    generation_output = model.generate(
        input_ids=model_inputs["input_ids"],
        attention_mask=model_inputs["attention_mask"],
        max_new_tokens=4000,
        do_sample=True,
        top_p=0.7,
        temperature=0.95,
        pad_token_id=tokenizer.eos_token_id,
    )
# Keep only the tokens generated after the prompt.
output = generation_output[0, model_inputs["input_ids"].shape[1]:]
decoded_output = tokenizer.decode(output, skip_special_tokens=True)
print(decoded_output)
