# Helper functions

# Evaluate GitHub Python

## Helper functions

In [1]:
# Load the test dataset
import json
import os

def load_json_from_folder(folder_path):
  """
  Loads all JSON files from a specified folder.

  Files are visited in sorted filename order so the returned list is
  reproducible (os.listdir returns entries in arbitrary order). A file that
  cannot be decoded as JSON is reported on stdout and skipped.

  Args:
    folder_path: The path to the folder containing the JSON files.

  Returns:
    A list with the parsed JSON object of every decodable ".json" file,
    ordered by filename.
  """

  data_packages = []
  for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".json"):
      continue
    filepath = os.path.join(folder_path, filename)
    try:
      # Read as UTF-8 explicitly instead of relying on the platform default.
      with open(filepath, 'r', encoding='utf-8') as f:
        data_packages.append(json.load(f))
    except json.JSONDecodeError as e:
      print(f"Error decoding JSON in file {filename}: {e}")

  return data_packages

def load_json_from_file(json_file):
    """
    Loads a single JSON file.

    Args:
        json_file: The path to the JSON file.

    Returns:
        The parsed JSON object, or None when the file content is not valid
        JSON (the decode error is printed in that case).
    """
    try:
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(json_file, 'r', encoding='utf-8') as f:
            return json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON in file {json_file}: {e}")
        return None

In [2]:
import ast

def validate_code(code_str):
  """
  Validates Python code and returns error information if any.

  Args:
    code_str: The Python code as a string.

  Returns:
    None if the code parses, otherwise a string describing why it does not.
  """
  try:
    ast.parse(code_str)
  except (SyntaxError, ValueError) as e:
    # ValueError is how some Python versions report source containing null
    # bytes; treat it as invalid code instead of letting it propagate.
    return str(e)  # Return the error message as a string
  return None  # No error

In [20]:
def replace_key_in_json(json_obj, key_to_replace, new_value):
  """
  Returns a copy of a JSON object with one top-level key set to a new value.

  The copy is made through a JSON serialize/parse round trip, so the object
  passed in is left untouched.

  Args:
    json_obj: The original JSON object.
    key_to_replace: The top-level key whose value is set.
    new_value: The value stored under that key in the copy.

  Returns:
    The copied JSON object carrying the new value.
  """
  serialized = json.dumps(json_obj)
  updated_obj = json.loads(serialized)  # independent copy of the input
  updated_obj[key_to_replace] = new_value
  return updated_obj

## Load the dataset

In [3]:
github_python_dataset = "../github-python-test"

# The original paper uses splits 3 and 4 as the hold-out test set
test_dataset = [
    load_json_from_file(os.path.join(github_python_dataset, split_file))
    for split_file in (
        'model-fixer.pred.evaluated.3.json',
        'model-fixer.pred.evaluated.4.json',
    )
]

In [4]:
# Sample count across both loaded splits, plus a look at the first snippet
total_samples = sum(map(len, test_dataset))
first_code = test_dataset[0][0]['src']['string_format']

print(
    f"Total number of samples: {total_samples}",
    f"First code:\n {first_code}",
    f"Error (if any): {validate_code(first_code)}",
    sep="\n",
)

Total number of samples: 15055
First code:
 def test_pp_no_constraint ( self ) :
    filenames = [ tests . get_data_path ( ( "str" , "str" , "str" ) ) ]
    pp_constraints = pp . _convert_constraints ( None )
    pp_loader = iris . fileformats . rules . Loader ( pp . load , { } ,
        convert , pp . _load_rules )
    cubes = list ( load_cubes ( filenames , None , pp_loader , pp_constraints )
    self . assertEqual ( len ( cubes ) , 152 )

Error (if any): '(' was never closed (<unknown>, line 6)


## Prepare the model

In [11]:
import torch
import sys
import os
from datasets import load_dataset
from transformers import pipeline

In [7]:
# Set HF token as env variable


In [8]:
# Clear the CUDA cache, then read the Hugging Face token from the environment
# (hf_token is None when HF_TOKEN is not set)
torch.cuda.empty_cache()
hf_token = os.getenv('HF_TOKEN')

In [9]:
# Prepare instruction
# System prompt used as the "system" message of every request built in fix_code.
# It describes the "[Fix] | <error code>\n<python code snippet>" input layout
# and asks for corrected code only.
# NOTE(review): the backslash continuations are inside the string literal, so
# the leading indentation of each continued line is part of the prompt text —
# confirm this extra whitespace is intended.
python_syntax_fixer_instruction = "You are an expert Python code fixer. \
             You will receive input in the following format: \n\n \
             [Fix] | <error code>\n \
             <python code snippet>\n\n \
             Your task is to ONLY provide the corrected Python code with NO explanations or additional text. \n \
             Do not include the original error code in your response and do not format the code. \
             Treat the code snippet as regular text."

In [12]:
# Load the model: build the text-generation pipeline for the instruct checkpoint
instruct_model_id = model_id = "meta-llama/Llama-3.2-3B-Instruct"

pipe = pipeline(
    task="text-generation",
    model=model_id,
    token=hf_token,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Process fixing code

In [40]:
def fix_code(code_snippet):
    """
    Asks the model for a corrected version of a Python snippet.

    The snippet is checked with validate_code first; the resulting error
    message (None when the snippet already parses) and the snippet itself
    form the user message, preceded by the system instruction
    python_syntax_fixer_instruction.

    Args:
        code_snippet: The Python source to fix, as a string.

    Returns:
        The "content" of the last message in the generated conversation.
    """
    code_error = validate_code(code_snippet)
    user_prompt = f"[Fix] | {code_error}\n{code_snippet}"
    conversation = [
        {"role": "system", "content": python_syntax_fixer_instruction},
        {"role": "user", "content": user_prompt},
    ]

    generations = pipe(conversation, max_new_tokens=512)
    last_message = generations[0]["generated_text"][-1]
    return last_message["content"]

In [43]:
from tqdm.notebook import tqdm

def perform_fixing_code(dataset, num_attempts=10):
    """
    Generates repeated fix attempts for every item of a dataset.

    For each item, the snippet under item['src']['string_format'] is sent to
    fix_code num_attempts times. Every attempt is re-checked with
    validate_code and recorded, and the list of attempts is stored under the
    "pred" key of a copy of the item.

    Args:
        dataset: Iterable of items exposing item['src']['string_format'].
        num_attempts: How many fixes to generate per item (default 10).

    Returns:
        A list with one copy per input item whose "pred" key holds the
        attempts. Each attempt is a dict with "string_format" (the generated
        code) and "err_obj" (0 when the code parses, otherwise a dict with
        "msg" and "msg_detailed").
    """
    results = []
    for item in tqdm(dataset, desc="Fixing code"):
        code_snippet = item['src']['string_format']

        fixing_attempts = []
        for _ in range(num_attempts):
            fixed_code = fix_code(code_snippet)
            remain_error = validate_code(fixed_code)
            if remain_error is None:
                err_obj = 0
            else:
                # NOTE(review): "msg" is a fixed label regardless of the actual
                # error; the real message is kept in "msg_detailed".
                err_obj = {"msg": "unbalanced (){}[]", "msg_detailed": remain_error}
            fixing_attempts.append({
                "string_format": fixed_code,
                "err_obj": err_obj,
            })

        results.append(replace_key_in_json(item, "pred", fixing_attempts))

    # Hand the collected items back; without this the caller receives None.
    return results

In [None]:
json_fixed_data = perform_fixing_code(test_dataset[0][:5])

Fixing code:   0%|          | 0/5 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id

In [36]:
len(test_dataset[0][:5])

5

In [None]:


    # Write the results to a JSON file
    # NOTE(review): this cell is indented and uses `results` and
    # `output_filename`, neither of which is assigned at notebook level in the
    # visible cells — it looks like a detached fragment of a function body;
    # confirm where it belongs before running it.
    with open(output_filename, "w") as outfile:
        json.dump(results, outfile, indent=4)

In [14]:
first_code

'def test_pp_no_constraint ( self ) :\n    filenames = [ tests . get_data_path ( ( "str" , "str" , "str" ) ) ]\n    pp_constraints = pp . _convert_constraints ( None )\n    pp_loader = iris . fileformats . rules . Loader ( pp . load , { } ,\n        convert , pp . _load_rules )\n    cubes = list ( load_cubes ( filenames , None , pp_loader , pp_constraints )\n    self . assertEqual ( len ( cubes ) , 152 )\n'

In [15]:
import Levenshtein

In [16]:
fixed_code = fix_code(first_code)
fixed_code

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'def test_pp_no_constraint ( self ) :\n    filenames = [ tests. get_data_path ( ("str", "str", "str") ) ]\n    pp_constraints = pp._convert_constraints(None)\n    pp_loader = iris.fileformats.rules.Loader(pp.load, {}, convert, pp._load_rules)\n    cubes = list(load_cubes(filenames, None, pp_loader, pp_constraints))\n    self.assertEqual(len(cubes), 152)'

In [17]:
validate_code(fixed_code)

In [18]:
distance = Levenshtein.distance(first_code, fixed_code)
print(f"Levenshtein distance: {distance}")

Levenshtein distance: 53


# WIP

In [None]:
import torch
from transformers import pipeline

# Scratch cell: rebuild the text-generation pipeline (no explicit token passed)
model_id = "meta-llama/Llama-3.2-3B-Instruct"

pipe = pipeline(
    task="text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
# Scratch cell: send a conversation holding only a system prompt and print the
# last message of the generated conversation
messages = [
    {
        "role": "system",
        "content": "You are a Python syntax error correction expert. You will receive code snippets with syntax errors, prefixed with [Fix]. Each input will have two parts separated by a vertical bar (|). You will only fix the provided code, without any additional explanation.",
    },
]
outputs = pipe(messages, max_new_tokens=128)
print(outputs[0]["generated_text"][-1])

In [11]:
from transformers import pipeline

# Scratch cell: rebuild the text-generation pipeline used by the example below
# (relies on `torch` already being imported by an earlier cell)
model_id = "meta-llama/Llama-3.2-3B-Instruct"

pipe = pipeline(
    task="text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

error_code = "SyntaxError: invalid syntax"
code_snippet = """
def greet(name):
  print("Hello, " + name 

greet("Alice")
"""

# Conversation for the example: the system instruction followed by the user
# request carrying the error text and the broken snippet
messages = [
    {
        "role": "system",
        "content": \
            "You are an expert Python code fixer. \
             You will receive input in the following format: \n\n \
             [Fix] | <error code>\n \
             <python code snippet>\n\n \
             Your task is to ONLY provide the corrected Python code with NO explanations or additional text. \n \
             Do not include the original error code in your response and do not format the code. \
             Treat the code snippet as regular text."
    },
    {
        "role": "user",
        "content": f"""
[Fix] | {error_code}
{code_snippet}
"""
    },
]

# Generate and print the last message of the returned conversation
outputs = pipe(messages, max_new_tokens=256)
print(outputs[0]["generated_text"][-1])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


{'role': 'assistant', 'content': 'def greet(name):\n  print("Hello, " + name)'}


In [13]:
fixed_code = outputs[0]["generated_text"][-1]['content']
fixed_code

'def greet(name):\n  print("Hello, " + name)'

In [14]:
validate_code(code_snippet)

"'(' was never closed (<unknown>, line 3)"

In [15]:
validate_code(fixed_code)