In [1]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments, Trainer

In [4]:
import os

torch.cuda.empty_cache()
hf_token = os.environ.get('HF_TOKEN')

In [8]:
# Import data
data = load_dataset('json', data_files='data-small/data.json')

In [18]:
# Load model
model_name = "meta-llama/Llama-3.2-3B"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    quantization_config=bnb_config,
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [19]:
# Prepare LoRA and apply to the model
lora_config = LoraConfig(
    r=8, # Rank of the LoRA update matrices
    lora_alpha=32, # Scaling factor for the LoRA update matrices
    lora_dropout=0.05, # Dropout probability for the LoRA update matrices
    bias="none", # Whether to apply a bias to the LoRA update matrices
    task_type="CAUSAL_LM" # Type of task for which to apply LoRA
)

model = get_peft_model(model, lora_config)

In [20]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [5]:
# Define preprocess function - ERROR
def preprocess_function(examples):
    inputs = [f"<s>{x['input_code']}" for x in examples]
    targets = [f"{x['output_code']}</s>" for x in examples]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [10]:
# Apply preprocess function - ERROR
tokenized_data = data.map(
    preprocess_function,
    batched=True,
    num_proc=2, # Adjust according to your CPU cores
    remove_columns=data["train"].column_names,
)

Map (num_proc=2):   0%|          | 0/2 [00:00<?, ? examples/s]

TypeError: string indices must be integers

In [49]:
# Error - NOT USE
def preprocess_data(batch):
    input_texts = [f"Input: {text}" for text in batch['input_code']]
    output_texts = [f"Output: {text}" for text in batch['output_code']]
    print(input_texts)
    inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)
    outputs = tokenizer(output_texts, return_tensors="pt", padding=True, truncation=True)
    
    # Shift the labels to the right to align with the causal LM training
    labels = outputs['input_ids'].clone()
    
    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': labels,
    }

tokenized_dataset = data.map(preprocess_data, batched=True)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

['Input: def add(a, b):', 'Input: def subtract(a, b):']


In [16]:
def preprocess_data(batch):
    input_texts = [f"Input: {text}" for text in batch['input_code']]
    output_texts = [f"Output: {text}" for text in batch['output_code']]

    inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)
    outputs = tokenizer(output_texts, return_tensors="pt", padding=True, truncation=True)
    
    # Shift the labels to the right to align with causal LM training
    labels = outputs['input_ids'].clone()

    # Replace padding token IDs in labels with -100 to ignore them in the loss calculation
    labels[labels == tokenizer.pad_token_id] = -100

    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': labels,
    }

# Re-run tokenization on the dataset
tokenized_dataset = data.map(preprocess_data, batched=True)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [17]:
# Fine tune the model
training_args = TrainingArguments(
    output_dir="./model-output",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    learning_rate=1e-4,
    fp16=True,  # use mixed precision if supported
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
)

trainer.train()

ValueError: Expected input batch_size (16) to match target batch_size (12).

In [66]:
# Prepare the training data - worked
def preprocess_function(batch):
    inputs = [f"Input: {text}" for text in batch['input_code']]
    targets = [f"Output: {text}" for text in batch['output_code']]
    print(inputs, targets)
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(text_target=targets, max_length=512, truncation=True, padding="max_length")
    # with tokenizer.as_target_tokenizer():
    #     labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_data = data.map(
    preprocess_function,
    batched=True,
    num_proc=2,
    remove_columns=data["train"].column_names,
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./model-output",  # Updated output directory
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    fp16=True,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=100,
)

# Create a Trainer instance and train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
)
trainer.train()

Map (num_proc=2):   0%|          | 0/2 [00:00<?, ? examples/s]

['Input: def add(a, b):'] ['Input: def subtract(a, b):']['Output: return a + b']
 ['Output: return a - b']


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Step,Training Loss


TrainOutput(global_step=3, training_loss=11.30526351928711, metrics={'train_runtime': 4.337, 'train_samples_per_second': 1.383, 'train_steps_per_second': 0.692, 'total_flos': 51997430513664.0, 'train_loss': 11.30526351928711, 'epoch': 3.0})

In [56]:
# Generate output

In [65]:
# Function to generate code
def generate_code(input_code):
    inputs = tokenizer(f"Input: {input_code}", return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=100)
    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_code

# Example usage
input_code = "Input: def add(a, b):"
generated_code = generate_code(input_code)
print(generated_code)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Input: Input: def add(a, b): return a + b Output: Output: def add(a, b): return a - b Explanation: The output is the difference between the two numbers. The output is the sum of the two numbers. The output is the product of the two numbers. The output is the quotient of the two numbers. The output is the absolute value of the difference between the two numbers. The output is the absolute value of the sum of the two numbers. The output is the absolute value of the product of the two
