In [1]:
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

### HumanEval #80 Model Response ###
# Initialize model and tokenizer
model_name = "finegptproject/humaneval_SFTTrainer_model"
tokenizer = LlamaTokenizer.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(model_name)

# Define the test cases
test_cases = [
    ('a', False),
    ('aa', False),
    ('abcd', True),
    ('aabb', False),
    ('adb', True),
    ('xyy', False),
]

def extract_answer(output_str, input_str):
    # Remove the input string part from the output
    start_index = output_str.find(input_str)
    if start_index != -1:
        output_str = output_str[start_index + len(input_str):]
    
    # Clean and parse the output string
    output_str = output_str.strip().lower()
    if 'true' in output_str:
        return True
    elif 'false' in output_str:
        return False
    else:
        return None

def run_test_cases(test_cases):
    for s, expected in test_cases:
        # Use the exact prompt
        prompt = (
            f"def is_happy(s): \"\"\"You are given a string s. Your task is to check if the string is happy or not. "
            f"A string is happy if its length is at least 3 and every 3 consecutive letters are distinct. "
            f"For example: is_happy(a) => False is_happy(aa) => False is_happy(abcd) => True is_happy(aabb) => False "
            f"is_happy(adb) => True is_happy(xyy) => False \"\"\"\n\n"
            f"Check if the string '{s}' is happy. Answer with 'True' or 'False' only. If the string is not happy, answer 'False'."
        )
        inputs_encoded = tokenizer(prompt, return_tensors='pt')

        # Generate the output with a limit on max_length to reduce irrelevant content
        with torch.no_grad():
            outputs = model.generate(**inputs_encoded, max_length=230, num_return_sequences=1)

        output_str = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
        model_answer = extract_answer(output_str, s)

        # Apply a basic post-processing logic
        if model_answer is None:
            print(f"Model did not give a clear True/False answer for input '{s}'.")
        elif model_answer != expected:
            model_answer = f"Neither True or False"
        
        print(f"Input: '{s}'")
        print(f"Extracted Answer: {model_answer}")
        print(f"Expected: {expected}\n")

# Run the test cases
run_test_cases(test_cases)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Input: 'a'
Extracted Answer: Neither True or False
Expected: False

Input: 'aa'
Extracted Answer: Neither True or False
Expected: False

Input: 'abcd'
Extracted Answer: True
Expected: True

Input: 'aabb'
Extracted Answer: Neither True or False
Expected: False

Input: 'adb'
Extracted Answer: True
Expected: True

Input: 'xyy'
Extracted Answer: Neither True or False
Expected: False



In [2]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

### HumanEval #80 GPT2 Response ###

# Initialize model and tokenizer
model_name = "gpt2" 
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Define the test cases
test_cases = [
    ('a', False),
    ('aa', False),
    ('abcd', True),
    ('aabb', False),
    ('adb', True),
    ('xyy', False),
]

def extract_answer(output_str, input_str):
    # Remove the input string part from the output
    start_index = output_str.find(input_str)
    if start_index != -1:
        output_str = output_str[start_index + len(input_str):]
    
    # Clean and parse the output string
    output_str = output_str.strip().lower()
    if 'true' in output_str:
        return True
    elif 'false' in output_str:
        return False
    else:
        return None

def run_test_cases(test_cases):
    for s, expected in test_cases:
        # Use the exact prompt
        prompt = (
            f"def is_happy(s): \"\"\"You are given a string s. Your task is to check if the string is happy or not. "
            f"A string is happy if its length is at least 3 and every 3 consecutive letters are distinct. "
            f"For example: is_happy(a) => False is_happy(aa) => False is_happy(abcd) => True is_happy(aabb) => False "
            f"is_happy(adb) => True is_happy(xyy) => False \"\"\"\n\n"
            f"Check if the string '{s}' is happy. Answer with 'True' or 'False' only. If the string is not happy, answer 'False'."
        )
        inputs_encoded = tokenizer(prompt, return_tensors='pt')

        # Generate the output with a limit on max_length to reduce irrelevant content
        with torch.no_grad():
            outputs = model.generate(**inputs_encoded, max_length=230, num_return_sequences=1)

        output_str = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
        model_answer = extract_answer(output_str, s)

        # Apply a basic post-processing logic
        if model_answer is None:
            print(f"Model did not give a clear True/False answer for input '{s}'.")
        elif model_answer != expected:
            model_answer = f"Neither True or False"
        
        print(f"Input: '{s}'")
        print(f"Extracted Answer: {model_answer}")
        print(f"Expected: {expected}\n")

# Run the test cases
run_test_cases(test_cases)



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: 'a'
Extracted Answer: Neither True or False
Expected: False



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: 'aa'
Extracted Answer: Neither True or False
Expected: False



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: 'abcd'
Extracted Answer: True
Expected: True



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: 'aabb'
Extracted Answer: Neither True or False
Expected: False



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: 'adb'
Extracted Answer: True
Expected: True

Input: 'xyy'
Extracted Answer: Neither True or False
Expected: False

