In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import nltk
from tqdm.auto import tqdm
import pandas as pd

In [None]:
from dotenv import load_dotenv
import os
# Credentials come from a local .env file: create one containing a line of the
# form HF_TOKEN=whateverYourAccessTokenIs before running this cell.
load_dotenv()
hf_token = os.environ.get("HF_TOKEN")  # None when HF_TOKEN is not set

In [None]:
# Checkpoint to evaluate; swap in the commented id below to use the 3B variant.
model_id = "meta-llama/Llama-3.2-1B-Instruct"
# model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Prefer the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Load the causal LM in bfloat16. `device_map="auto"` already places the
# weights on the available device(s), so the model is NOT moved again with
# `.to(device)`: moving a dispatched model is redundant on a single device and
# warns or raises when the weights are spread over several devices / offloaded.
# Downstream code reads the placement from `model.device`.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=hf_token,
)

# Tokenizer matching the checkpoint (same id and access token).
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)

In [None]:
def run_model(model, tokenizer, messages, max_new_tokens=5, verbose=False):
    """Generate a reply for `messages` and return the code snippet it contains.

    Args:
        model: Causal LM exposing `.device` and `.generate(...)`.
        tokenizer: Tokenizer exposing `apply_chat_template`, `__call__`,
            `eos_token_id`, `convert_tokens_to_ids` and `decode`.
        messages: Chat messages handed to `tokenizer.apply_chat_template`.
        max_new_tokens: Upper bound on the number of generated tokens.
        verbose: When true, print the rendered prompt, the input ids and the
            decoded reply.

    Returns:
        The text between the first "python\n" marker and the following "```"
        in the generated reply. When the reply contains no "python\n" marker
        the whole reply (stripped) is returned instead of raising.
    """
    # add_generation_prompt=True appends the assistant header so the model
    # answers as the assistant instead of continuing the user turn.
    input_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    if verbose: print("\n###input_text:###\n", input_text)

    # The rendered template already carries its special tokens; adding them
    # again while tokenizing would duplicate the BOS token.
    encoded = tokenizer(input_text, return_tensors="pt", add_special_tokens=False)
    input_ids = encoded.input_ids.to(model.device)
    attention_mask = encoded.attention_mask.to(model.device)

    if verbose: print("\n###input_ids:###\n", input_ids)

    # Stop on either the generic end-of-sequence token or the end-of-turn token.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators,
        do_sample=False,
    )

    # Decode only the newly generated tokens so that text from the prompt can
    # never be mistaken for the model's code block.
    prompt_length = input_ids.shape[-1]
    response = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    if verbose: print("\n###response:###\n", response)

    start_marker = "python\n"
    end_marker = "```"
    _, found, after_marker = response.partition(start_marker)
    if not found:
        # No fenced python block in the reply: hand back the raw reply rather
        # than failing with an IndexError and aborting the caller's loop.
        return response.strip()
    return after_marker.split(end_marker)[0]  # grab just the code snippet

In [None]:
# Read the problem file as one regular JSON document (not JSON Lines) and
# show the resulting frame as the cell output.
data = pd.read_json("lc_hard.json")
data

In [None]:
def apply_lc_prompt(desc, skel):
    """Build the user prompt for one problem.

    Args:
        desc: Problem description, inserted after "Description: ".
        skel: Skeleton code, appended as the last part of the prompt.

    Returns:
        The assembled prompt with leading/trailing whitespace stripped.
    """
    prompt = (
        "Your task is to complete the following problem in Python. You are provided with a skeleton code to complete and a description. Attempt to avoid importing modules as much as you can. Output your completed version of the code. "
        # End the description with a newline so it does not run straight into
        # the next sentence ("...<desc>Below is the starting point...").
        f"Description: {desc}\n"
        "Below is the starting point for your code. \n"
        f"{skel}"
    )

    return prompt.strip()

In [None]:
def _row_to_prompt(row):
    """Render the prompt for one dataset row from its "desc" and "ref" fields."""
    # NOTE(review): "ref" is passed as the skeleton here and is also read as the
    # BLEU reference in eval_bleu — confirm this is the intended column.
    return apply_lc_prompt(row["desc"], row["ref"])


# Work on a copy so `data` keeps its original columns, then add one prompt per row.
dataset = data.copy()
dataset["prompt"] = dataset.apply(_row_to_prompt, axis=1)
print(dataset.iloc[0].to_dict())

In [None]:
import nltk.translate.bleu_score


def eval_bleu(model, tokenizer, dataset, max_new_tokens=1000):
    """Generate one answer per dataset row and score them with corpus BLEU.

    Args:
        model: Model forwarded to `run_model`.
        tokenizer: Tokenizer forwarded to `run_model`.
        dataset: DataFrame whose rows provide a "prompt" (sent as the user
            message) and a "ref" (reference solution(s)).
        max_new_tokens: Generation budget forwarded to `run_model`.

    Returns:
        A `(bleu_score, outputs)` tuple: the corpus-level BLEU computed with
        weights (1, 0, 0, 0) on whitespace-split tokens, and the list of
        generated outputs in row order.
    """
    # Materialize the rows once; both passes below use the same records.
    rows = dataset.to_dict(orient="records")

    outputs = []
    for row in tqdm(rows):
        messages = [
            {"role": "system", "content": ""},
            {"role": "user", "content": row["prompt"]},
        ]
        outputs.append(
            run_model(
                model=model,
                tokenizer=tokenizer,
                messages=messages,
                max_new_tokens=max_new_tokens,
            )
        )

    references, hypotheses = [], []
    for row, output in zip(rows, outputs):
        refs_in_dataset = row["ref"]
        # A single reference stored as a plain string must be treated as one
        # solution; iterating it directly would yield one "reference" per
        # character.
        if isinstance(refs_in_dataset, str):
            refs_in_dataset = [refs_in_dataset]
        references.append([solution.split() for solution in refs_in_dataset])
        hypotheses.append(output.split())

    bleu_score = nltk.translate.bleu_score.corpus_bleu(
        references, hypotheses, weights=(1, 0, 0, 0)
    )
    return bleu_score, outputs


In [None]:
# Score a copy of the dataset, then attach the generated code next to each row.
df = dataset.copy()
bleu_score, outputs = eval_bleu(model=model, tokenizer=tokenizer, dataset=df)
print(f"Bleu: {bleu_score}")
df["output"] = outputs
display(df)


In [None]:
class ListNode:
    """Singly linked list node holding a value and a reference to the next node."""

    def __init__(self, val=0, next=None):
        # Parameter names are part of the interface, so `next` is kept even
        # though it shadows the builtin inside this method.
        self.val, self.next = val, next

import threading
from typing import List, Optional

def run_with_timeout(func, args, result_holder, timeout=5):
    """Call `func(*args)` in a worker thread and wait at most `timeout` seconds.

    Args:
        func: Callable to execute.
        args: Positional arguments, unpacked into the call.
        result_holder: Dict whose 'result' key receives the outcome.
        timeout: Maximum number of seconds to wait for the call.

    Returns:
        The value returned by `func`, the exception instance it raised, or the
        string 'Timeout' when the call did not finish in time. The same value
        is stored in `result_holder['result']`.
    """
    # The worker writes into a private dict so that a call finishing after the
    # deadline cannot overwrite the 'Timeout' already reported to the caller.
    outcome = {}

    def wrapper():
        try:
            outcome['result'] = func(*args)
        except Exception as e:
            outcome['result'] = e

    # Python offers no way to kill a thread (the private Thread._stop() does
    # not terminate a running thread; on CPython it raises instead). A daemon
    # thread at least cannot keep the interpreter alive if the call never
    # returns.
    thread = threading.Thread(target=wrapper, daemon=True)
    thread.start()
    thread.join(timeout)  # Wait for the specified timeout

    if thread.is_alive():
        result_holder['result'] = 'Timeout'
    elif 'result' in outcome:
        result_holder['result'] = outcome['result']

    return result_holder.get('result')

def eval_test_case(code, test_inputs, expected_outputs, function_name):
    """Execute `code` and score `Solution().<function_name>` on the given tests.

    Args:
        code: Python source that is expected to define a `Solution` class.
        test_inputs: One argument sequence per test; each is unpacked into the
            call by `run_with_timeout`.
        expected_outputs: Expected result for each entry of `test_inputs`.
        function_name: Name of the method looked up on the `Solution` instance.

    Returns:
        The fraction of `len(test_inputs)` tests whose result equals the
        expected output, or 0.0 when there are no tests or when executing the
        code / locating the method fails.
    """
    try:
        # Names the executed code may use without importing them.
        # NOTE(review): exec runs `code` with full interpreter privileges —
        # only feed it trusted or sandboxed input.
        scope = {
            'List': List,
            'ListNode': ListNode,
            'Optional': Optional
        }
        exec(code, scope)
        solver = scope.get("Solution")()

        # Resolve the method under test on the freshly built instance.
        method = getattr(solver, function_name, None)
        if not callable(method):
            raise ValueError(f"Function '{function_name}' is not defined or callable")

        n_total = len(test_inputs)
        n_passed = 0

        for test_input, expected_output in zip(test_inputs, expected_outputs):
            try:
                # Fresh holder per test; each call is limited to 5 seconds.
                result = run_with_timeout(method, test_input, {'result': None}, timeout=5)

                if not (result == expected_output):
                    print(f"Test with input {test_input} failed. Expected {expected_output}, got {result}")
                    continue

                print(f"Test with input {test_input} passed. Expected {expected_output}, got {result}")
                n_passed += 1

            except Exception as e:
                print(f"Test with input {test_input} on {function_name} failed due to error: {e}")

        if n_total > 0:
            return n_passed / n_total
        return 0.0

    except Exception as e:
        print(f"Error during code execution: {e}")
        return 0.0


In [None]:
# Run every generated code block against its problem's test cases and
# accumulate per-problem and aggregate pass rates.
code_blocks = outputs

total_pass_rate = 0        # running sum of per-problem pass rates
num_iter = 0               # number of problems evaluated so far
first_20_pass_rate = 0     # sum of the pass rates of the first 20 problems
pass_rate_per_prob = {}    # problem index -> pass rate

for idx, code_block in enumerate(tqdm(code_blocks, desc="Evaluating Code Blocks")):
    row = df.iloc[idx].to_dict()
    test_dict = row["test"]

    test_inputs = test_dict["input"]
    expected_outputs = test_dict["output"]
    function_name = row["func"]

    pass_rate = eval_test_case(code_block, test_inputs, expected_outputs, function_name)
    total_pass_rate += pass_rate
    pass_rate_per_prob[idx] = pass_rate

    num_iter += 1
    # Snapshot the sum right after the 20th problem. Checking for 21 at the
    # top of the loop would already include a 21st problem in the "first 20".
    if num_iter == 20:
        first_20_pass_rate = total_pass_rate
    print("")

# With fewer than 20 problems the snapshot above never fires.
if num_iter < 20:
    first_20_pass_rate = total_pass_rate

# Divide by the number of problems actually in each group (20 and 30 for the
# 50-problem set the labels assume) and avoid dividing by zero on short runs.
num_first = min(num_iter, 20)
num_rest = num_iter - num_first

avg_pass_rate = total_pass_rate / num_iter if num_iter else 0.0
print("Average Pass Rate: ", avg_pass_rate)
print("First 20 Pass Rate: ", first_20_pass_rate / num_first if num_first else 0.0)
print("Last 30 Pass Rate: ", (total_pass_rate - first_20_pass_rate) / num_rest if num_rest else 0.0)

In [None]:
# Print the per-problem pass rates followed by the aggregate figures computed
# by the evaluation cell above (divisors assume 20 + 30 problems).
for label, value in (
    ("Pass Rate Per Problem", pass_rate_per_prob),
    ("Average Pass Rate: ", avg_pass_rate),
    ("First 20 Pass Rate: ", first_20_pass_rate/20),
    ("Last 30 Pass Rate: ", (total_pass_rate - first_20_pass_rate)/30),
):
    print(label, value)

In [None]:
import os

import re

# Folder to save the output files. The size suffix is derived from `model_id`
# so the folder always matches the model that produced `outputs`
# (e.g. "...-1B-Instruct" -> "generated_outputs_1b", "...-3B-Instruct" ->
# "generated_outputs_3b") instead of a hard-coded "3b".
size_match = re.search(r"(\d+(?:\.\d+)?)[Bb]\b", model_id)
folder_name = (
    f"generated_outputs_{size_match.group(1)}b" if size_match else "generated_outputs"
)

# Ensure the folder exists
os.makedirs(folder_name, exist_ok=True)

# Write each generated output to its own file: lc1.txt, lc2.txt, ...
for idx, content in enumerate(outputs):
    file_name = f"lc{idx + 1}.txt"  # Construct the file name
    file_path = os.path.join(folder_name, file_name)  # Full file path
    # Explicit UTF-8 so non-ASCII characters in model output are written the
    # same way regardless of the platform's default encoding.
    with open(file_path, 'w', encoding="utf-8") as file:
        file.write(content)  # Write the content to the file

print(f"Files have been created in the '{folder_name}' folder.")
