In [7]:
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

### HumanEval #103 Pass@k Test Prompt ###

# Initialize model and tokenizer
model_name = "finegptproject/humaneval_SFTTrainer_model"
tokenizer = LlamaTokenizer.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(model_name)

def generate_outputs(prompt, model, tokenizer, top_k=5):
    """Generate multiple outputs for a given prompt."""
    inputs_encoded = tokenizer(prompt, return_tensors='pt')
    with torch.no_grad():
        outputs = model.generate(
            **inputs_encoded,
            max_length=200,
            num_return_sequences=top_k,
            do_sample=True,
            top_k=50,
            top_p=0.95
        )
    return [tokenizer.decode(output, skip_special_tokens=True).strip() for output in outputs]

def pass_at_k(test_cases, k=5):
    """Evaluate Pass@k metric on the given test cases."""
    pass_at_k_count = 0
    total_cases = len(test_cases)
    
    for n, m, expected in test_cases:
        prompt = (
            f"def rounded_avg(n, m):\n"
            f"    \"\"\"\n"
            f"    You are given two positive integers n and m, and your task is to compute the average of the integers "
            f"from n through m (including n and m). Round the answer to the nearest integer and convert that to binary. "
            f"If n is greater than m, return -1.\n"
            f"    Example:\n"
            f"    rounded_avg(1, 5) => \"0b11\"\n"
            f"    rounded_avg(7, 5) => -1\n"
            f"    rounded_avg(10, 20) => \"0b1111\"\n"
            f"    rounded_avg(20, 33) => \"0b11010\"\n"
            f"    \"\"\"\n\n"
            f"# Test Case: n={n}, m={m}\n"
        )
        
        outputs = generate_outputs(prompt, model, tokenizer, top_k=k)
        expected_str = str(expected)  # Convert expected to string for comparison
        if any(expected_str in output for output in outputs):
            pass_at_k_count += 1
    
    pass_at_k_score = pass_at_k_count / total_cases
    print(f"Pass@{k}: {pass_at_k_score:.2f}")

# Define test cases as tuples of (n, m, expected_output)
test_cases = [
    (1, 5, "0b11"),
    (7, 13, "0b1010"),
    (964, 977, "0b1111001010"),
    (996, 997, "0b1111100100"),
    (560, 851, "0b1011000010"),
    (185, 546, "0b101101110"),
    (362, 496, "0b110101101"),
    (350, 902, "0b1001110010"),
    (197, 233, "0b11010111"),
    (7, 5, -1),
    (5, 1, -1),
    (5, 5, "0b101")
]

# Run Pass@k test
pass_at_k(test_cases, k=5)



Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Pass@5: 0.25


In [8]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

### HumanEval #103 GPT-2 Pass@k Test Prompt ###

# Initialize model and tokenizer
model_name = "gpt2" 
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)


def generate_outputs(prompt, model, tokenizer, top_k=5):
    """Generate multiple outputs for a given prompt."""
    inputs_encoded = tokenizer(prompt, return_tensors='pt')
    with torch.no_grad():
        outputs = model.generate(
            **inputs_encoded,
            max_length=200,
            num_return_sequences=top_k,
            do_sample=True,
            top_k=50,
            top_p=0.95
        )
    return [tokenizer.decode(output, skip_special_tokens=True).strip() for output in outputs]

def pass_at_k(test_cases, k=5):
    """Evaluate Pass@k metric on the given test cases."""
    pass_at_k_count = 0
    total_cases = len(test_cases)
    
    for n, m, expected in test_cases:
        prompt = (
            f"def rounded_avg(n, m):\n"
            f"    \"\"\"\n"
            f"    You are given two positive integers n and m, and your task is to compute the average of the integers "
            f"from n through m (including n and m). Round the answer to the nearest integer and convert that to binary. "
            f"If n is greater than m, return -1.\n"
            f"    Example:\n"
            f"    rounded_avg(1, 5) => \"0b11\"\n"
            f"    rounded_avg(7, 5) => -1\n"
            f"    rounded_avg(10, 20) => \"0b1111\"\n"
            f"    rounded_avg(20, 33) => \"0b11010\"\n"
            f"    \"\"\"\n\n"
            f"# Test Case: n={n}, m={m}\n"
        )
        
        outputs = generate_outputs(prompt, model, tokenizer, top_k=k)
        expected_str = str(expected)  # Convert expected to string for comparison
        if any(expected_str in output for output in outputs):
            pass_at_k_count += 1
    
    pass_at_k_score = pass_at_k_count / total_cases
    print(f"Pass@{k}: {pass_at_k_score:.2f}")

# Define test cases as tuples of (n, m, expected_output)
test_cases = [
    (1, 5, "0b11"),
    (7, 13, "0b1010"),
    (964, 977, "0b1111001010"),
    (996, 997, "0b1111100100"),
    (560, 851, "0b1011000010"),
    (185, 546, "0b101101110"),
    (362, 496, "0b110101101"),
    (350, 902, "0b1001110010"),
    (197, 233, "0b11010111"),
    (7, 5, -1),
    (5, 1, -1),
    (5, 5, "0b101")
]

# Run Pass@k test
pass_at_k(test_cases, k=5)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Pass@5: 0.25
