In [1]:
import evaluate
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset, load_metric
from huggingface_hub import login
from typing import List, Tuple
import google.generativeai as genai

import re
genai.configure(api_key='YOUR GEMINI API KEY HERE')
gemini = genai.GenerativeModel('gemini-1.0-pro-latest')

login(token='YOUR HUGGINGFACE API KEY HERE')

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to C:\Users\danny\.cache\huggingface\token
Login successful


In [2]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

metric = evaluate.load("accuracy")

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
dataset = load_dataset("openai_humaneval")
humaneval_df = dataset['test'].to_pandas()
drop_questions = [6, 100, 40]
humaneval_df = humaneval_df.drop(index=drop_questions)
humaneval_df.head()

Unnamed: 0,task_id,prompt,canonical_solution,test,entry_point
0,HumanEval/0,from typing import List\n\n\ndef has_close_ele...,"for idx, elem in enumerate(numbers):\n ...","\n\nMETADATA = {\n 'author': 'jt',\n 'da...",has_close_elements
1,HumanEval/1,from typing import List\n\n\ndef separate_pare...,result = []\n current_string = []\n ...,"\n\nMETADATA = {\n 'author': 'jt',\n 'da...",separate_paren_groups
2,HumanEval/2,\n\ndef truncate_number(number: float) -> floa...,return number % 1.0\n,"\n\nMETADATA = {\n 'author': 'jt',\n 'da...",truncate_number
3,HumanEval/3,from typing import List\n\n\ndef below_zero(op...,balance = 0\n\n for op in operations:\n...,"\n\nMETADATA = {\n 'author': 'jt',\n 'da...",below_zero
4,HumanEval/4,from typing import List\n\n\ndef mean_absolute...,mean = sum(numbers) / len(numbers)\n re...,"\n\nMETADATA = {\n 'author': 'jt',\n 'da...",mean_absolute_deviation


In [4]:
def remove_triple_quotes(text: str) -> str:
    # Pattern to match triple double quotes and everything in between
    pattern_double_quotes = r'"""(.*?)"""'
    # Pattern to match triple single quotes and everything in between
    pattern_single_quotes = r"'''(.*?)'''"
    
    # Remove triple double quotes and their contents
    text_without_double_quotes = re.sub(pattern_double_quotes, '', text, flags=re.DOTALL)
    # Remove triple single quotes and their contents
    text_without_any_quotes = re.sub(pattern_single_quotes, '', text_without_double_quotes, flags=re.DOTALL)
    
    return text_without_any_quotes

def extract_first_function(text):
    pattern = r"(def .+?\):)(\n\s+.+)+"

    # Using re.DOTALL to make '.' match newlines as well.
    match = re.search(pattern, text, re.DOTALL)

    if match:
        # Returning the matched function.
        return match.group(0)
    else:
        return "No function found."

def clean_code(code: str) -> str:
    import_start = code.find("\nimport")
    def_start = code.find("\ndef")
    if import_start != -1 and (import_start < def_start or def_start == -1):
        return code[import_start+1:].lstrip()
    elif def_start != -1:
        return code[def_start+1:].lstrip()
    else:
        return ""

In [5]:
counter = {'success':0, 'runtime error':len(drop_questions), 'assertion failed':0}
def execute_code(generated_code, test, entry_point):
    def target(local_vars):
        try:
            # define Gemma-generated code
            exec(generated_code, globals(), local_vars)
            
            # define test
            updated_test_code = test.replace('candidate', entry_point)
            exec(updated_test_code, globals(), local_vars)
            # run test
            exec(f'check({entry_point})', globals(), local_vars)
            
            local_vars["test_passed"] = True
            counter["success"] += 1
        except AssertionError:
            print("Assertion Test Failed.")
            local_vars["test_passed"] = False
            counter["assertion failed"] += 1
        except Exception as e:
            print(f"Error during execution: {e}")
            local_vars["test_passed"] = False
            counter["runtime error"] += 1

    local_vars = {}
    # Remove triple quotes if needed
    print('generated code is:\n', generated_code)
    generated_code = remove_triple_quotes(generated_code)
    target(local_vars)
    # Return the result of the test execution
    return local_vars.get("test_passed", False)


def predict_and_evaluate(task):
    prompt = ('''
        Chain-of-thought prompt engineering is a technique used to guide a language model toward solving complex problems through a series of intermediate steps or reasoning paths. Instead of directly asking for the final answer, the prompt includes a sequence of logical steps that break down the problem, mimicking how a human might approach solving it. This technique can help the model tackle tasks that require multi-step reasoning, providing clearer, more interpretable solutions.
        
        I want you to break down this question using the aforementioned technique. Do not include answer in your response, be concise. Add your chain-of-thought reasoning into the EXISTING triple double quote section in the below function. Do not modify below prompt structure. Do not add any quotes yourself. Include in the prompt that do not generate main() function. Here is an example:
        Question:
        def has_close_elements(numbers: List[float], threshold: float) -> bool:
            """ Check if in given list of numbers, are any two numbers closer to each other than
            given threshold.
            >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
            False
            >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
            True
            """
        Answer:
        def has_close_elements(numbers: List[float], threshold: float) -> bool:
            """
            Check if in given list of numbers, are any two numbers closer to each other than
            given threshold.
            
            Chain-of-thought reasoning:
            - First, we need to compare each number in the list with every other number to check their differences.
            - If the absolute difference between any two numbers is less than the threshold, it means these two numbers are closer than the given threshold.
            - We should iterate through the list of numbers with two loops: an outer loop to pick each number and an inner loop to compare the picked number with all other numbers.
            - If we find any two numbers that satisfy the condition (their difference is less than the threshold), we return True immediately.
            - If we complete the loops without finding any such pair, it means no two numbers are closer than the threshold, and we return False.
            
            >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
            False
            >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
            True
            """
        
        Now, here is another question, please do it:\n
    ''') + task["prompt"]
    try:
        prompt = gemini.generate_content(prompt).text
        print('Gemini Prompt', prompt)
    except:
        prompt = "NA"
      

    prompt = task["prompt"]
    test_code = task["test"]
    entry_point = task["entry_point"]

    # Generate input_ids
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    # Generate code using the model
    outputs = model.generate(input_ids, max_length=1000)
    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    generated_code = generated_code.split("Answer:", 1)[1] if "Answer:" in generated_code else generated_code
    #generated_code = clean_code(generated_code)
    #generated_code = extract_first_function(generated_code)

    try:
        result = execute_code(generated_code, test_code, entry_point)
    except Exception as e:
        print(f"Error during execution: {e}")
        result = False

    print('the result is', result)
    return result

In [6]:
results = []
for index, row in humaneval_df.iterrows():
     task = {
         "prompt": row["prompt"],
         "test": row["test"],
         "entry_point": row["entry_point"]
     }
     print(f"executing {index}")
     results.append(predict_and_evaluate(task))

accuracy = sum(results) / len(results)
print(f"Model Accuracy on HumanEval: {accuracy * 100:.2f}%")

executing 0
Gemini Prompt def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """
    Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    
    Chain-of-thought reasoning:
    - We need to compare each number in the list with every other number to check their differences.
    - If the absolute difference between any two numbers is less than the threshold, it means these two numbers are closer than the given threshold.
    - We can iterate through the list of numbers with two loops: an outer loop to pick each number and an inner loop to compare the picked number with all other numbers.
    - If we find any two numbers that satisfy the condition (their difference is less than the threshold), we return True immediately.
    - If we complete the loops without finding any such pair, it means no two numbers are closer than the threshold, and we return False.
    
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    

  attn_output = torch.nn.functional.scaled_dot_product_attention(


generated code is:
 from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """
    for i in range(len(numbers) - 1):
        for j in range(i + 1, len(numbers)):
            if abs(numbers[i] - numbers[j]) < threshold:
                return True
    return False

the result is True
executing 1
Gemini Prompt ```python
def separate_paren_groups(paren_string: str) -> List[str]:
    """ Input to this function is a string containing multiple groups of nested parentheses. Your goal is to
    separate those group into separate strings and return the list of those.
    Separate groups are balanced (each open brace is properly closed) and not nested within each other
    Ignore any spaces in the input string.


In [7]:
counter

{'success': 15, 'runtime error': 74, 'assertion failed': 75}