In [1]:
import torch
import argparse
import datasets
from datasets import load_dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import AutoModelForCausalLM, TrainingArguments, HfArgumentParser, AutoTokenizer, TrainerCallback
from huggingface_hub import login
import matplotlib.pyplot as plt
from peft import LoraConfig
import nltk
import numpy as np
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dotenv import load_dotenv
import os
load_dotenv()
hf_token = os.getenv("HF_TOKEN") # make a .env for this and put your access token as HF_TOKEN=whateverYourAccessTokenIs

In [3]:
model_id = "meta-llama/Llama-3.2-1B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"
#print("GPU available ", torch.cuda.is_available())

In [4]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=hf_token
).to(device)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)

In [5]:
def run_model(model, tokenizer, messages, max_new_tokens=5, verbose=False):
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)

    if verbose: print("\n###input_text:###\n", input_text)

    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)

    if verbose: print("\n###input_ids:###\n", input_ids)

    terminators = [
      tokenizer.eos_token_id,
      tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    output = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators,
        do_sample=False,
    )


    # Decode the output and return the response without special tokens
    response = tokenizer.decode(output[0], skip_special_tokens=True)

    if verbose: print("\n###response:###\n", response)
    
    start_marker = "python\n"
    end_marker = "```"
    assistant_response = response.split(start_marker)[1].split(end_marker)[0] # grab just the code snippet
    #assistant_response = response.split("\n")[-1].strip()
    #assistant_response = response
    return assistant_response

In [6]:
data = pd.read_json("lc_hard.json", lines=False)
data

Unnamed: 0,desc,skeleton,examples,ref,test,func
0,\nGiven n non-negative integers representing a...,"\nclass Solution:\n def trap(self, height: ...",[],"[\nclass Solution:\n def trap(self, height:...","{'input': [[[0, 1, 0, 2, 1, 0, 1, 3, 2, 1, 2, ...",trap
1,\nGiven an array of integers heights represent...,\nclass Solution:\n def largestRectangleAre...,[],[\nclass Solution:\n def largestRectangleAr...,"{'input': [[[2, 1, 5, 6, 2, 3]], [[2, 4]]], 'o...",largestRectangleArea
2,\nGiven two sorted arrays nums1 and nums2 of s...,\nclass Solution:\n def findMedianSortedArr...,[],[\nclass Solution:\n def findMedianSortedAr...,"{'input': [[[1, 3], [2]], [[1, 2], [3, 4]]], '...",findMedianSortedArrays
3,\nGiven two strings s and t of lengths m and n...,"\nclass Solution:\n def minWindow(self, s: ...",[],"[\nclass Solution:\n def minWindow(self, s:...","{'input': [['ADOBECODEBANC', 'ABC'], ['a', 'a'...",minWindow
4,"\nYou are given an array of integers nums, the...",\nclass Solution:\n def maxSlidingWindow(se...,[],[\nclass Solution:\n def maxSlidingWindow(s...,"{'input': [[[1, 3, -1, -3, 5, 3, 6, 7], 3], [[...",maxSlidingWindow
5,\nYou are given an array of k linked-lists lis...,\n# Definition for singly-linked list.\n# clas...,[],"[\nclass Solution:\n def mergeKLists(self, ...","{'input': [[[[1, 4, 5], [1, 3, 4], [2, 6]]], [...",mergeKLists
6,"\nGiven the head of a linked list, reverse the...",\n# Definition for singly-linked list.\n# clas...,[],[\nclass Solution:\n def reverseKGroup(self...,"{'input': [[[1, 2, 3, 4, 5], 2], [[1, 2, 3, 4,...",reverseKGroup
7,\nA path in a binary tree is a sequence of nod...,\n# Definition for a binary tree node.\n# clas...,[],"[\nclass Solution:\n def traverse(self, nod...","{'input': [[[1, 2, 3]], [[-10, 9, 20, None, No...",maxPathSum
8,"\nGiven an integer array nums, return the numb...","\nclass Solution:\n def reversePairs(self, ...",[],"[\nclass Solution:\n def reversePairs(self,...","{'input': [[[1, 3, 2, 3, 1]], [[2, 4, 3, 5, 1]...",reversePairs
9,\nGiven an m x n board of characters and a lis...,"\nclass Solution:\n def findWords(self, boa...",[],"[\nclass Solution:\n def findWords(self, bo...","{'input': [[[['o', 'a', 'a', 'n'], ['e', 't', ...",findWords


In [7]:
def apply_lc_prompt(desc, skel):
    prompt = (
        "Your task is to complete the following problem in Python. You are provided with a skeleton code to complete and a description. Attempt to avoid importing modules as much as you can. Output your completed version of the code. "
        f"Description: {desc}"
        "Below is the starting point for your code. \n"
        f"{skel}"
    )

    return prompt.strip()

In [8]:
dataset = data.copy()
dataset["prompt"] = dataset.apply(lambda x: apply_lc_prompt(x["desc"], x["ref"]), axis=1)
print(dataset.iloc[0].to_dict())

{'desc': '\\nGiven n non-negative integers representing an elevation map where the width of each bar is 1, compute how much water it can trap after raining.\\n', 'skeleton': '\\nclass Solution:\\n    def trap(self, height: List[int]) -> int:\\n', 'examples': [], 'ref': ['\\nclass Solution:\\n    def trap(self, height: List[int]) -> int:\\n            left = 0\\n            right = len(height) - 1\\n            left_max = height[left]\\n            right_max = height[right]\\n            water = 0\\n\\n            while left < right:\\n                if left_max < right_max:\\n                    left += 1\\n                    left_max = max(left_max, height[left])\\n                    water += left_max - height[left]\\n                else:\\n                    right -= 1\\n                    right_max = max(right_max, height[right])\\n                    water += right_max - height[right]\\n            \\n            return water\\n'], 'test': {'input': [[[0, 1, 0, 2, 1, 0, 1, 3,

In [9]:
#import nltk.translate.bleu_score


def eval_bleu(model, tokenizer, dataset, max_new_tokens=1000):
    outputs = []

    for row in tqdm(dataset.to_dict(orient="records")):
        messages = [
            {"role": "system", "content": ""},
            {"role": "user", "content": row["prompt"]},
        ]

        output = run_model(model=model, tokenizer=tokenizer, messages=messages, max_new_tokens=max_new_tokens)

        outputs.append(output)
    
    r, h = [], []
    for idx, row in tqdm(enumerate(dataset.to_dict(orient="records"))):
        refs_in_dataset = row["ref"]
        references = []
        for real_code_solution in refs_in_dataset:
            references.append(real_code_solution.split())
        hypothesis = outputs[idx].split()
        
        r.append(references)
        h.append(hypothesis)
    
    bleu_score = nltk.translate.bleu_score.corpus_bleu(r, h, weights=(1,0,0,0))
    return bleu_score, outputs


In [10]:
df = dataset.copy()
bleu_score, outputs = eval_bleu(model, tokenizer, df)
print(f"Bleu: {bleu_score}")
df["output"] = outputs
display(df)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  2%|▏         | 1/50 [00:08<06:33,  8.02s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  4%|▍         | 2/50 [00:16<06:39,  8.32s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Bleu: 0.4968864468864469





Unnamed: 0,desc,skeleton,examples,ref,test,func,prompt,output
0,\nGiven n non-negative integers representing a...,"\nclass Solution:\n def trap(self, height: ...",[],"[\nclass Solution:\n def trap(self, height:...","{'input': [[[0, 1, 0, 2, 1, 0, 1, 3, 2, 1, 2, ...",trap,Your task is to complete the following problem...,"def trap(height):\n """"""\n Given n non-ne..."
1,\nGiven an array of integers heights represent...,\nclass Solution:\n def largestRectangleAre...,[],[\nclass Solution:\n def largestRectangleAr...,"{'input': [[[2, 1, 5, 6, 2, 3]], [[2, 4]]], 'o...",largestRectangleArea,Your task is to complete the following problem...,def largestRectangleArea(heights):\n stack ...
2,\nGiven two sorted arrays nums1 and nums2 of s...,\nclass Solution:\n def findMedianSortedArr...,[],[\nclass Solution:\n def findMedianSortedAr...,"{'input': [[[1, 3], [2]], [[1, 2], [3, 4]]], '...",findMedianSortedArrays,Your task is to complete the following problem...,"def findMedianSortedArrays(self, nums1: List[i..."
3,\nGiven two strings s and t of lengths m and n...,"\nclass Solution:\n def minWindow(self, s: ...",[],"[\nclass Solution:\n def minWindow(self, s:...","{'input': [['ADOBECODEBANC', 'ABC'], ['a', 'a'...",minWindow,Your task is to complete the following problem...,"def minWindow(s, t):\n """"""\n Given two s..."
4,"\nYou are given an array of integers nums, the...",\nclass Solution:\n def maxSlidingWindow(se...,[],[\nclass Solution:\n def maxSlidingWindow(s...,"{'input': [[[1, 3, -1, -3, 5, 3, 6, 7], 3], [[...",maxSlidingWindow,Your task is to complete the following problem...,from collections import deque\n\nclass Solutio...
5,\nYou are given an array of k linked-lists lis...,\n# Definition for singly-linked list.\n# clas...,[],"[\nclass Solution:\n def mergeKLists(self, ...","{'input': [[[[1, 4, 5], [1, 3, 4], [2, 6]]], [...",mergeKLists,Your task is to complete the following problem...,"class Solution:\n def mergeKLists(self, lis..."
6,"\nGiven the head of a linked list, reverse the...",\n# Definition for singly-linked list.\n# clas...,[],[\nclass Solution:\n def reverseKGroup(self...,"{'input': [[[1, 2, 3, 4, 5], 2], [[1, 2, 3, 4,...",reverseKGroup,Your task is to complete the following problem...,"class Solution:\n def reverseKGroup(self, h..."
7,\nA path in a binary tree is a sequence of nod...,\n# Definition for a binary tree node.\n# clas...,[],"[\nclass Solution:\n def traverse(self, nod...","{'input': [[[1, 2, 3]], [[-10, 9, 20, None, No...",maxPathSum,Your task is to complete the following problem...,"class Solution:\n def maxPathSum(self, root..."
8,"\nGiven an integer array nums, return the numb...","\nclass Solution:\n def reversePairs(self, ...",[],"[\nclass Solution:\n def reversePairs(self,...","{'input': [[[1, 3, 2, 3, 1]], [[2, 4, 3, 5, 1]...",reversePairs,Your task is to complete the following problem...,"from bisect import bisect_left, bisect_right\n..."
9,\nGiven an m x n board of characters and a lis...,"\nclass Solution:\n def findWords(self, boa...",[],"[\nclass Solution:\n def findWords(self, bo...","{'input': [[[['o', 'a', 'a', 'n'], ['e', 't', ...",findWords,Your task is to complete the following problem...,"def findWords(self, board: List[List[str]], wo..."


In [11]:
# this block is to look at a specific index's output

#print(df.iloc[2].to_dict()["output"])
#with open("scratch.txt", "w") as file:
#    file.write(df.iloc[2].to_dict()["output"])

In [12]:
from typing import List
# deprecated
def eval_test_case(code, test_inputs, expected_outputs, function_name):
    

    #print(code)
    #print(test_inputs)
    #print(expected_outputs)

    try:
        namespace = {'List': List}
        exec(code, namespace)
        Solution = namespace.get("Solution")
        solution_instance = Solution()
        

        #func = namespace.get(function_name)
        func = getattr(solution_instance, function_name, None)
        if not callable(func):
            raise ValueError(f"Function '{function_name}' is not defined or callable")
        
        passed = 0
        total = len(test_inputs)

        for test_input, expected_output in zip(test_inputs, expected_outputs):
            try:
                result = func(*test_input)
                if result == expected_output:
                    passed += 1
                #else:
                    #print(f"Test with input {test_input} failed. Expected {expected_output}, got {result}")
            except Exception as e:
                print(f"Test with input {test_input} on {function_name} failed due to error: {e}")
        
        return passed / total if total > 0 else 0.0
    
    except Exception as e:
        print(f"aw man got error {e}")
        # code messed up, penalize
        return 0.0

In [13]:
class ListNode:
    def __init__(self, val=0, next=None):
        self.val = val
        self.next = next

import threading
from typing import List, Optional

# Function to execute a callable with a timeout
def run_with_timeout(func, args, result_holder, timeout=5):
    def wrapper():
        try:
            result_holder['result'] = func(*args)
        except Exception as e:
            result_holder['result'] = e
    
    # Create a thread to execute the function
    thread = threading.Thread(target=wrapper)
    thread.start()
    thread.join(timeout)  # Wait for the specified timeout
    
    if thread.is_alive():
        thread._stop()  # Forcefully stop the thread (not safe but works for this context)
        result_holder['result'] = 'Timeout'
        
    return result_holder['result']

def eval_test_case(code, test_inputs, expected_outputs, function_name):
    try:
        # Define the namespace and execute the code
        namespace = {
            'List': List,
            'ListNode': ListNode,
            'Optional': Optional
        }
        exec(code, namespace)
        Solution = namespace.get("Solution")
        solution_instance = Solution()

        # Get the function to test
        func = getattr(solution_instance, function_name, None)
        if not callable(func):
            raise ValueError(f"Function '{function_name}' is not defined or callable")
        
        passed = 0
        total = len(test_inputs)

        for test_input, expected_output in zip(test_inputs, expected_outputs):
            try:
                # Create a result holder to capture the result of the function
                result_holder = {'result': None}
                
                # Call the function with a timeout (5 seconds)
                result = run_with_timeout(func, test_input, result_holder, timeout=5)
                
                # Check if the result is as expected
                if result == expected_output:
                    print(f"Test with input {test_input} passed. Expected {expected_output}, got {result}")
                    passed += 1
                else:
                    # Optional: print if the result is incorrect
                    print(f"Test with input {test_input} failed. Expected {expected_output}, got {result}")
                    
            except Exception as e:
                print(f"Test with input {test_input} on {function_name} failed due to error: {e}")
        
        return passed / total if total > 0 else 0.0
    
    except Exception as e:
        print(f"Error during code execution: {e}")
        return 0.0


In [14]:
code_blocks = outputs

total_pass_rate = 0

num_iter = 0

for idx, code_block in enumerate(tqdm(code_blocks, desc="Evaluating Code Blocks")):
    #if num_iter == 3:
    #    break

    test_dict = df.iloc[idx].to_dict()["test"]

    test_inputs = test_dict["input"]
    expected_outputs = test_dict["output"]
    function_name = df.iloc[idx].to_dict()["func"]
    
    total_pass_rate += eval_test_case(code_block, test_inputs, expected_outputs, function_name)

    num_iter += 1
    print("")


avg_pass_rate = total_pass_rate/len(code_blocks)
print("Average Pass Rate: ", avg_pass_rate)

Evaluating Code Blocks:   0%|          | 0/50 [00:00<?, ?it/s]

Error during code execution: 'NoneType' object is not callable

Error during code execution: 'NoneType' object is not callable

Error during code execution: 'NoneType' object is not callable

Error during code execution: 'NoneType' object is not callable

Test with input [[1, 3, -1, -3, 5, 3, 6, 7], 3] passed. Expected [3, 3, 5, 5, 6, 7], got [3, 3, 5, 5, 6, 7]
Test with input [[1], 1] failed. Expected [1], got deque index out of range

Test with input [[[1, 4, 5], [1, 3, 4], [2, 6]]] failed. Expected [1, 1, 2, 3, 4, 4, 5, 6], got 'list' object has no attribute 'next'
Test with input [[]] failed. Expected [], got None
Test with input [[[]]] failed. Expected [], got None

Test with input [[1, 2, 3, 4, 5], 2] failed. Expected [2, 1, 4, 3, 5], got 'list' object has no attribute 'next'
Test with input [[1, 2, 3, 4, 5], 3] failed. Expected [3, 2, 1, 4, 5], got 'list' object has no attribute 'next'

Error during code execution: name 'TreeNode' is not defined

Test with input [[1, 3, 2, 3, 1]

Evaluating Code Blocks: 100%|██████████| 50/50 [00:00<00:00, 437.23it/s]

Test with input [[['5', '3', '4', '6', '7', '8', '9', '1', '2'], ['6', '7', '2', '1', '9', '5', '3', '4', '8'], ['1', '9', '8', '3', '4', '2', '5', '6', '7'], ['8', '5', '9', '7', '6', '1', '4', '2', '3'], ['4', '2', '6', '8', '5', '3', '7', '9', '1'], ['7', '1', '3', '9', '2', '4', '8', '5', '6'], ['9', '6', '1', '5', '3', '7', '2', '8', '4'], ['2', '8', '7', '4', '1', '9', '6', '3', '5'], ['3', '4', '5', '2', '8', '6', '1', '7', '9']]] failed. Expected [['5', '3', '4', '6', '7', '8', '9', '1', '2'], ['6', '7', '2', '1', '9', '5', '3', '4', '8'], ['1', '9', '8', '3', '4', '2', '5', '6', '7'], ['8', '5', '9', '7', '6', '1', '4', '2', '3'], ['4', '2', '6', '8', '5', '3', '7', '9', '1'], ['7', '1', '3', '9', '2', '4', '8', '5', '6'], ['9', '6', '1', '5', '3', '7', '2', '8', '4'], ['2', '8', '7', '4', '1', '9', '6', '3', '5'], ['3', '4', '5', '2', '8', '6', '1', '7', '9']], got None

Test with input ['aab'] passed. Expected 1, got 1
Test with input ['a'] passed. Expected 0, got 0
Test wit




In [None]:
# single example that hangs
code = """
class Solution:
    def func(self, a, b):
        total_items = len(a) + len(b)
        return total_items
\"""

test_dict = df.iloc[idx].to_dict()["test"]

test_inputs = [[[5, 6], [1, 2, 3]], [[5, 6], [1,2,4,5]]]
print("Test Inputs: ", test_inputs)
expected_outputs = [5,6]
print("Expected Outputs: ", expected_outputs)
function_name = "func"
print("func name ", function_name)

try:
        namespace = {'List': List}
        exec(code, namespace)
        Solution = namespace.get("Solution")
        solution_instance = Solution()
        

        #func = namespace.get(function_name)
        func = getattr(solution_instance, function_name, None)
        if not callable(func):
            raise ValueError(f"Function '{function_name}' is not defined or callable")
        
        passed = 0
        total = len(test_inputs)

        for test_input, expected_output in zip(test_inputs, expected_outputs):
            try:
                result = func(*test_input)
                if result == expected_output:
                    passed += 1
                #else:
                    #print(f"Test with input {test_input} failed. Expected {expected_output}, got {result}")
            except Exception as e:
                print(f"Test with input {test_input} on {function_name} failed due to error: {e}")
        
        pass_rate = passed / total if total > 0 else 0.0
        print(pass_rate)
    
except Exception as e:
    print(f"aw man got error {e}")
    # code messed up, penalize
"""

Test Inputs:  [[[5, 6], [1, 2, 3]], [[5, 6], [1, 2, 4, 5]]]
Expected Outputs:  [5, 6]
func name  func
1.0
