In [3]:
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

### ### HumanEval #95 Framing Effect Model Response ######

# Initialize model and tokenizer
model_name = "finegptproject/humaneval_SFTTrainer_model"
tokenizer = LlamaTokenizer.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(model_name)

# Define the test cases for `check_dict_case_framed`
test_cases = [
    ({"a": "apple", "b": "banana"}, True),
    ({"a": "apple", "A": "banana", "B": "banana"}, False),
    ({"a": "apple", 8: "banana", "a": "apple"}, False),
    ({"Name": "John", "Age": "36", "City": "Houston"}, False),
    ({"STATE": "NC", "ZIP": "12345"}, True),
    ({"fruit": "Orange", "taste": "Sweet"}, True),
    ({}, False),
]

def extract_answer(output_str):
    # Normalize the output string to handle various formatting
    output_str = output_str.strip().lower()
    
    # Check if the output is 'true' or 'false'
    if 'true' in output_str:
        return True
    elif 'false' in output_str:
        return False
    else:
        return None

def run_test_cases(test_cases):
    for d, expected in test_cases:
        # Use the exact prompt
        prompt = (
            f"def check_dict_case_framed(d):\n"
            f"    \"\"\"\n"
            f"    Given a dictionary, return True if all keys are strings in lower case or all keys are strings in upper case, "
            f"else return False. The function should return False if the given dictionary is empty.\n"
            f"\n"
            f"    This function is perfect for determining if your data is clean and consistent."
            f"    If the function returns False, your data is probably incorrect or unreliable."
            f"\n"
            f"    Examples:\n"
            f"    >>> check_dict_case_framed({d})\n"
            f"    {expected}\n"
            f"    \"\"\"\n\n"
            f"Check the dictionary {d}."
        )
        inputs_encoded = tokenizer(prompt, return_tensors='pt')

        # Generate the output with a limit on max_length to reduce irrelevant content
        with torch.no_grad():
            outputs = model.generate(**inputs_encoded, max_length=200, num_return_sequences=1)

        output_str = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
        print(f"Input: {d}")
        print(f"Model Output: {output_str}")
        print(f"Expected: {expected}\n\n\n\n")

# Run the test cases
run_test_cases(test_cases)


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Input: {'a': 'apple', 'b': 'banana'}
Model Output: def check_dict_case_framed(d):
    """
    Given a dictionary, return True if all keys are strings in lower case or all keys are strings in upper case, else return False. The function should return False if the given dictionary is empty.

    This function is perfect for determining if your data is clean and consistent.    If the function returns False, your data is probably incorrect or unreliable.
    Examples:
    >>> check_dict_case_framed({'a': 'apple', 'b': 'banana'})
    True
    """

Check the dictionary {'a': 'apple', 'b': 'banana'}.

    >>> check_dict_case_framed({'a': 'apple', 'b': 'banana'})
    True
    """

    >>> check_dict_case_framed({'a': 'apple
Expected: True




Input: {'a': 'apple', 'A': 'banana', 'B': 'banana'}
Model Output: def check_dict_case_framed(d):
    """
    Given a dictionary, return True if all keys are strings in lower case or all keys are strings in upper case, else return False. The function should

In [4]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

### HumanEval #95 Framing Effect GPT2 Response ###

# Initialize model and tokenizer
model_name = "gpt2" 
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Define the test cases for `check_dict_case_framed`
test_cases = [
    ({"a": "apple", "b": "banana"}, True),
    ({"a": "apple", "A": "banana", "B": "banana"}, False),
    ({"a": "apple", 8: "banana", "a": "apple"}, False),
    ({"Name": "John", "Age": "36", "City": "Houston"}, False),
    ({"STATE": "NC", "ZIP": "12345"}, True),
    ({"fruit": "Orange", "taste": "Sweet"}, True),
    ({}, False),
]

def extract_answer(output_str):
    # Normalize the output string to handle various formatting
    output_str = output_str.strip().lower()
    
    # Check if the output is 'true' or 'false'
    if 'true' in output_str:
        return True
    elif 'false' in output_str:
        return False
    else:
        return None

def run_test_cases(test_cases):
    for d, expected in test_cases:
        # Use the exact prompt
        prompt = (
            f"def check_dict_case_framed(d):\n"
            f"    \"\"\"\n"
            f"    Given a dictionary, return True if all keys are strings in lower case or all keys are strings in upper case, "
            f"else return False. The function should return False if the given dictionary is empty.\n"
            f"\n"
            f"    This function is perfect for determining if your data is clean and consistent."
            f"    If the function returns False, your data is probably incorrect or unreliable."
            f"\n"
            f"    Examples:\n"
            f"    >>> check_dict_case_framed({d})\n"
            f"    {expected}\n"
            f"    \"\"\"\n\n"
            f"Check the dictionary {d}."
        )
        inputs_encoded = tokenizer(prompt, return_tensors='pt')

        # Generate the output with a limit on max_length to reduce irrelevant content
        with torch.no_grad():
            outputs = model.generate(**inputs_encoded, max_length=200, num_return_sequences=1)

        output_str = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
        print(f"Input: {d}")
        print(f"Model Output: {output_str}")
        print(f"Expected: {expected}\n\n\n\n")

# Run the test cases
run_test_cases(test_cases)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: {'a': 'apple', 'b': 'banana'}
Model Output: def check_dict_case_framed(d):
    """
    Given a dictionary, return True if all keys are strings in lower case or all keys are strings in upper case, else return False. The function should return False if the given dictionary is empty.

    This function is perfect for determining if your data is clean and consistent.    If the function returns False, your data is probably incorrect or unreliable.
    Examples:
    >>> check_dict_case_framed({'a': 'apple', 'b': 'banana'})
    True
    """

Check the dictionary {'a': 'apple', 'b': 'banana'}.

>>> check_dict_case_framed(d)

>>> check_dict_case_framed(d)

>>> check_dict_case_framed(d
Expected: True






Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: {'a': 'apple', 'A': 'banana', 'B': 'banana'}
Model Output: def check_dict_case_framed(d):
    """
    Given a dictionary, return True if all keys are strings in lower case or all keys are strings in upper case, else return False. The function should return False if the given dictionary is empty.

    This function is perfect for determining if your data is clean and consistent.    If the function returns False, your data is probably incorrect or unreliable.
    Examples:
    >>> check_dict_case_framed({'a': 'apple', 'A': 'banana', 'B': 'banana'})
    False
    """

Check the dictionary {'a': 'apple', 'A': 'banana', 'B': 'banana'}.

"""

def check_dict_case_framed(d):

    """

Check the
Expected: False






Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: {'a': 'apple', 8: 'banana'}
Model Output: def check_dict_case_framed(d):
    """
    Given a dictionary, return True if all keys are strings in lower case or all keys are strings in upper case, else return False. The function should return False if the given dictionary is empty.

    This function is perfect for determining if your data is clean and consistent.    If the function returns False, your data is probably incorrect or unreliable.
    Examples:
    >>> check_dict_case_framed({'a': 'apple', 8: 'banana'})
    False
    """

Check the dictionary {'a': 'apple', 8: 'banana'}.

"""

def check_dict_case_framed(d):

    """

Check the dictionary {'a': 'apple', 8: 'banana'}
Expected: False






Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: {'Name': 'John', 'Age': '36', 'City': 'Houston'}
Model Output: def check_dict_case_framed(d):
    """
    Given a dictionary, return True if all keys are strings in lower case or all keys are strings in upper case, else return False. The function should return False if the given dictionary is empty.

    This function is perfect for determining if your data is clean and consistent.    If the function returns False, your data is probably incorrect or unreliable.
    Examples:
    >>> check_dict_case_framed({'Name': 'John', 'Age': '36', 'City': 'Houston'})
    False
    """

Check the dictionary {'Name': 'John', 'Age': '36', 'City': 'Houston'}.

"""

def check_dict_case_framed(d):

    """

Check the dictionary {'Name
Expected: False






Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: {'STATE': 'NC', 'ZIP': '12345'}
Model Output: def check_dict_case_framed(d):
    """
    Given a dictionary, return True if all keys are strings in lower case or all keys are strings in upper case, else return False. The function should return False if the given dictionary is empty.

    This function is perfect for determining if your data is clean and consistent.    If the function returns False, your data is probably incorrect or unreliable.
    Examples:
    >>> check_dict_case_framed({'STATE': 'NC', 'ZIP': '12345'})
    True
    """

Check the dictionary {'STATE': 'NC', 'ZIP': '12345'}.

>>> check_dict_case_framed(d)

>>> check_dict_case_framed(d)

>>> check_dict_case_framed
Expected: True






Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: {'fruit': 'Orange', 'taste': 'Sweet'}
Model Output: def check_dict_case_framed(d):
    """
    Given a dictionary, return True if all keys are strings in lower case or all keys are strings in upper case, else return False. The function should return False if the given dictionary is empty.

    This function is perfect for determining if your data is clean and consistent.    If the function returns False, your data is probably incorrect or unreliable.
    Examples:
    >>> check_dict_case_framed({'fruit': 'Orange', 'taste': 'Sweet'})
    True
    """

Check the dictionary {'fruit': 'Orange', 'taste': 'Sweet'}.

>>> check_dict_case_framed(d)

>>> check_dict_case_framed(d)

>>> check_dict_case_framed(d
Expected: True




Input: {}
Model Output: def check_dict_case_framed(d):
    """
    Given a dictionary, return True if all keys are strings in lower case or all keys are strings in upper case, else return False. The function should return False if the given dictionary is empty.

  