# Prepare environment

In [8]:
import torch
import sys
import os
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import TrainingArguments, Trainer

sys.path.append("..")

from utils.data_reader import GetDataAsPython

In [6]:
# Set HF token as environment variable


In [5]:
# Prepare token
torch.cuda.empty_cache()
hf_token = os.environ.get('HF_TOKEN')

# Prepare helper function

In [54]:
# Helper function
def extract_warning_types(data):
    all_warnings = []
    for sample in data:
        if sample.linter_report.rule_id not in all_warnings:
            all_warnings.append(sample.linter_report.rule_id)
    return all_warnings

# Load and preprocess data

In [19]:
# Load data
data = GetDataAsPython("../data/data_autofix_tracking_repo_specific_final.json")
data_eslint = GetDataAsPython("../data/data_autofix_tracking_eslint_final.json")
data += data_eslint

all_warning_types = extract_warning_types(data)

In [20]:
# Warning types example
all_warning_types

['no-invalid-this',
 'no-throw-literal',
 'no-new-wrappers',
 'guard-for-in',
 'no-new-object',
 'comma-style',
 'prefer-spread',
 'no-caller',
 'no-extra-bind',
 'no-array-constructor',
 'prefer-rest-params',
 'generator-star-spacing',
 'no-this-before-super',
 'no-extend-native',
 'no-undef',
 'no-useless-escape',
 'no-dupe-keys',
 'no-console',
 'no-constant-condition',
 'no-duplicate-case',
 'no-empty',
 'no-extra-semi',
 'no-redeclare',
 'no-cond-assign',
 'no-extra-boolean-cast',
 'no-fallthrough',
 'no-unreachable',
 'valid-typeof',
 'no-unsafe-finally',
 'no-unused-vars',
 'no-debugger',
 'no-unsafe-negation',
 'no-case-declarations',
 'no-self-assign',
 'no-process-exit',
 'no-inner-declarations',
 'for-direction',
 'no-compare-neg-zero',
 'no-sparse-arrays',
 'no-func-assign',
 'no-const-assign',
 'no-global-assign',
 'use-isnan',
 'no-unused-labels',
 'require-yield',
 'getter-return',
 'no-dupe-class-members',
 'no-ex-assign',
 'constructor-super',
 'no-new-symbol',
 'no-em

In [None]:
# A data sample
data[0].__dict__

In [37]:
data[0].linter_report.message

"Unexpected 'this'."

## Write data back to file

In [41]:
def extract_train_data(sample):
    return { 
        "source_code": sample.source_code,
        "target_code": sample.target_code,
        "message": sample.linter_report.message
    }

extracted_data = list(map(extract_train_data, data))

In [42]:
import json

with open('../simplified-data.json', 'w', encoding='utf-8') as f:
    json.dump(extracted_data, f, ensure_ascii=False, indent=4)

## Load simplified data

In [43]:
dataset = load_dataset('json', data_files='../simplified-data.json')

Generating train split: 0 examples [00:00, ? examples/s]

# Load model

In [45]:
# Load model
model_name = "meta-llama/Llama-3.2-3B"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    quantization_config=bnb_config,
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [46]:
# Prepare LoRA and apply to the model
lora_config = LoraConfig(
    r=8, # Rank of the LoRA update matrices
    lora_alpha=32, # Scaling factor for the LoRA update matrices
    lora_dropout=0.05, # Dropout probability for the LoRA update matrices
    bias="none", # Whether to apply a bias to the LoRA update matrices
    task_type="CAUSAL_LM" # Type of task for which to apply LoRA
)

model = get_peft_model(model, lora_config)

In [47]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [71]:
# Define preprocess function - ERROR
def preprocess_function(examples):
    inputs = f"{examples['message']}{examples['source_code']}"
    targets = f"{examples['target_code']}"
    
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(text_target=targets, max_length=512, truncation=True, padding="max_length")
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [72]:
# Apply preprocess function - ERROR
tokenized_data = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=2, # Adjust according to your CPU cores
    remove_columns=dataset["train"].column_names,
)

In [None]:
# Fine tune the model
training_args = TrainingArguments(
    output_dir="./model-output",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    learning_rate=1e-4,
    fp16=True,  # use mixed precision if supported
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
)

trainer.train()

In [None]:
# Prepare the training data - worked
def preprocess_function(batch):

    inputs = [f"Input: {text}" for text in batch['source_code']]
    targets = [f"Output: {text}" for text in batch['target_code']]
    print(inputs, targets)
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(text_target=targets, max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_data = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=2,
    remove_columns=dataset["train"].column_names,
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./model-output",  # Updated output directory
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    fp16=True,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=100,
)

# Create a Trainer instance and train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
)
trainer.train()

# Generate output

In [9]:
trained_model_path = '../main/meta-llama/Llama-3.2-3B_global_28-10-2024_23-21-36_fixer_trained'

# Load trained model
trained_model = AutoModelForCausalLM.from_pretrained(trained_model_path)

# Load the LoRA adapter
config = LoraConfig.from_pretrained(trained_model_path) 
trained_model = PeftModel.from_pretrained(trained_model, trained_model_path, config=config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(trained_model_path)
tokenizer.pad_token = tokenizer.eos_token

In [24]:
# Prepare input text
prompt = "Expected an object to be thrown. throw 'Could not create XML HTTP transport.';"

# Encode the input text
inputs = tokenizer(prompt, return_tensors="pt")

# Generate text
output = trained_model.generate(**inputs, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)

# Decode the output
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)

Expected an object to be thrown. throw 'Could not create XML HTTP transport.';


# Compare with base model

In [25]:
base_model_path = "meta-llama/Llama-3.2-3B"

# Load trained model
base_model = AutoModelForCausalLM.from_pretrained(base_model_path, token=hf_token)
tokenizer = AutoTokenizer.from_pretrained(base_model_path, token=hf_token)

# Apply LoRA
lora_config = LoraConfig(
    r=8, # Rank of the LoRA update matrices
    lora_alpha=32, # Scaling factor for the LoRA update matrices
    lora_dropout=0.05, # Dropout probability for the LoRA update matrices
    bias="none", # Whether to apply a bias to the LoRA update matrices
    task_type="CAUSAL_LM" # Type of task for which to apply LoRA
)
base_model = get_peft_model(base_model, lora_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [26]:
# Prepare input text
prompt = "Expected an object to be thrown. throw 'Could not create XML HTTP transport.';"

# Encode the input text
inputs = tokenizer(prompt, return_tensors="pt")

# Generate text
output = base_model.generate(**inputs, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)

# Decode the output
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)

Expected an object to be thrown. throw 'Could not create XML HTTP transport.'; at Object.createXmlHttpRequest (/usr/local/lib/node_modules/express/node_modules/connect/node_modules/http-proxy/node_modules/xmlhttprequest/index.js:4:20) at Object.exports.create (/usr/local/lib/node_modules/express/node_modules/connect/node_modules/http-proxy/node_modules/xmlhttprequest/index.js:11:19) at Object.exports.create (/usr/local/lib/node_modules/express/node_modules/connect/node_modules/http-proxy/index.js:33:12) at Object.exports.create (/usr/local/lib/node_modules/


# Compare with instruct model

In [32]:
instruct_model_id = "meta-llama/Llama-3.2-3B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(instruct_model_id)
instruct_model = AutoModelForCausalLM.from_pretrained(
    instruct_model_id,
    token=hf_token,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

messages = [
    {"role": "system", "content": "You are a JavaScript syntax error correction expert. You will receive code snippets with syntax errors, prefixed with [Fix]. Each input will have two parts separated by a vertical bar (|). You will only fix the provided code, without any additional explanation."},
    {"role": "user", "content": "[Fix]Expected an object to be thrown| throw 'Could not create XML HTTP transport.'"},
]

input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(instruct_model.device)

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = instruct_model.generate(
    input_ids,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
response = outputs[0][input_ids.shape[-1]:]
print(tokenizer.decode(response, skip_special_tokens=True))


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


[Fix] throw new Error('Could not create XML HTTP transport.')


# WIP

In [65]:
# Function to generate code
def generate_code(input_code):
    inputs = tokenizer(f"Input: {input_code}", return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=100)
    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_code

# Example usage
input_code = "Input: def add(a, b):"
generated_code = generate_code(input_code)
print(generated_code)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Input: Input: def add(a, b): return a + b Output: Output: def add(a, b): return a - b Explanation: The output is the difference between the two numbers. The output is the sum of the two numbers. The output is the product of the two numbers. The output is the quotient of the two numbers. The output is the absolute value of the difference between the two numbers. The output is the absolute value of the sum of the two numbers. The output is the absolute value of the product of the two
