In [None]:
pip install transformers accelerate peft bitsandbytes trl

In [None]:
import pandas as pd
from datasets import Dataset

# Read the line-delimited JSON training file (one record per line) into a DataFrame
train_jsonl_path = '/kaggle/input/pytrainclean-jsonl-datasetreasoning-finalcode/train.jsonl'
df = pd.read_json(train_jsonl_path, lines=True)

# Wrap the DataFrame in a Hugging Face Dataset so it can be mapped and fed to the trainer
hf_dataset = Dataset.from_pandas(df)

# Confirmation message followed by the dataset summary, one per line
print(
    "Dataset loaded successfully as pandas DataFrame and converted to Hugging Face Dataset.",
    hf_dataset,
    sep="\n",
)

In [None]:
from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub.
# No token is passed in code, so no credential is hard-coded in the notebook;
# presumably `login()` asks for the token interactively — confirm for your environment.
# Credentials are needed later in this notebook: the training config sets
# `push_to_hub=True`, and the base model download may require an authenticated
# account (NOTE(review): verify access requirements for 'google/gemma-2-2b').
login()

In [None]:
from transformers import AutoTokenizer

# 1. Load the pre-trained tokenizer for 'google/gemma-2-2b'
tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b')

# Padding requires a pad token. If the tokenizer does not define one, reuse the EOS
# token rather than registering a brand-new '[PAD]' token: a new token enlarges the
# vocabulary, and its id would fall outside the model's embedding matrix unless the
# model's embeddings were resized to match (which this notebook never does).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 2. Render one record as a single training string
def format_example(example):
    """Turn one dataset record into a "User / Assistant" training text.

    Args:
        example: Mapping with the keys 'question', 'steps' and 'final_code'.

    Returns:
        A dict with a single key, 'text', laid out as three newline-separated parts:
        ``User: <question>``, ``Assistant: <steps>`` and then ``<final_code>``.

    Raises:
        KeyError: If any of the three expected keys is missing.
    """
    question = example['question']
    reasoning = example['steps']
    code = example['final_code']

    # Each part is formatted exactly as an f-string placeholder would format it,
    # then the parts are joined with single newlines.
    parts = [f"User: {question}", f"Assistant: {reasoning}", f"{code}"]
    return {'text': "\n".join(parts)}

# 3. Add the rendered 'text' column by running format_example on every row, one at a time
hf_dataset_formatted = hf_dataset.map(function=format_example, batched=False)

# 4. Tokenize the formatted dataset
def tokenize_function(examples):
    """Tokenize a batch of formatted texts for causal-LM fine-tuning.

    Args:
        examples: Batch mapping whose 'text' entry is a list of strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch, with each
        sequence truncated to at most 512 tokens.

    Notes:
        * The tokenizer's EOS token is appended to every text that does not already
          end with it, so the model sees an explicit end-of-answer marker during
          training. If the tokenizer defines no EOS token, texts are left unchanged.
        * No padding is applied here. Padding every row to 512 tokens up front wastes
          memory and compute and, depending on the trainer's collator, padded
          positions may be treated as real training tokens. Padding is expected to
          happen per batch in the trainer's data collator.
    """
    eos = tokenizer.eos_token or ''
    texts = [
        text if (eos and text.endswith(eos)) else text + eos
        for text in examples['text']
    ]
    return tokenizer(texts, truncation=True, max_length=512)

# Run the tokenizer over the whole dataset in batches
tokenized_dataset = hf_dataset_formatted.map(function=tokenize_function, batched=True)

# Report the outcome: a status line, the dataset summary, then the first tokenized
# record as a sanity check (each item is printed on its own line).
report_items = (
    "Dataset formatted and tokenized successfully.",
    tokenized_dataset,
    "Sample of tokenized data:",
    tokenized_dataset[0],
)
for item in report_items:
    print(item)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 1. Define the model ID
model_id = 'google/gemma-2-2b'

# 2. Configure 4-bit quantization
# Pick the dtype used for computation inside the quantized layers from the hardware:
# bfloat16 on GPUs with compute capability >= 8, float16 otherwise. Hard-coding
# bfloat16 mismatches fp16 training on older GPUs that lack native bf16 support.
supports_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
compute_dtype = torch.bfloat16 if supports_bf16 else torch.float16

bitsandbytes_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # store weights in 4 bits
    bnb_4bit_quant_type='nf4',             # NF4 quantization data type
    bnb_4bit_compute_dtype=compute_dtype,  # dtype for matmuls on de-quantized weights
    bnb_4bit_use_double_quant=True,        # also quantize the quantization constants
)

# 3. Load the pre-trained Gemma 2B model with quantization and device_map
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bitsandbytes_config,
    device_map="auto",  # let accelerate place the weights on the available device(s)
)

# 4. Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 5. Make sure a pad token exists
# If the tokenizer defines none, reuse the EOS token. Registering a new '[PAD]' token
# would give it an id beyond the embedding matrix of the model loaded above, because
# the model's embeddings are never resized in this notebook.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Summarise what was loaded, including the pad token the tokenizer will use
status_lines = [
    f"Model '{model_id}' loaded successfully with 4-bit quantization.",
    f"Tokenizer for '{model_id}' loaded successfully.",
    f"Pad token set: {tokenizer.pad_token} (ID: {tokenizer.pad_token_id})",
]
print("\n".join(status_lines))

In [None]:
from peft import LoraConfig

# LoRA adapter hyperparameters, gathered in one mapping before building the config
lora_settings = dict(
    r=64,                    # rank of the low-rank update matrices
    lora_alpha=16,           # scaling factor applied to the LoRA update
    lora_dropout=0.1,        # dropout on the LoRA branch
    bias='none',             # leave bias terms untouched
    task_type='CAUSAL_LM',   # adapter is for a causal language model
)
lora_config = LoraConfig(**lora_settings)

print("LoRA configuration created successfully.")

In [None]:
from transformers import TrainingArguments
from trl import SFTConfig,SFTTrainer

# 1. Define Training Arguments
# A single SFTConfig is built here. The earlier `TrainingArguments(...)` object was
# removed: it was overwritten by this assignment before ever being used, so none of
# its settings took effect.

# Mixed precision: use bf16 on GPUs with compute capability >= 8, fp16 otherwise,
# instead of hard-coding fp16 regardless of hardware.
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

training_args = SFTConfig(
    # Directory for checkpoints; with push_to_hub=True this also names the Hub repo.
    # Renamed from a leftover "mistral-finetuned-alpaca", which described a different
    # model and dataset than the ones used in this notebook.
    output_dir="gemma-2-2b-finetuned",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=100,
    # Training length is set by max_steps alone; a positive max_steps takes
    # precedence over num_train_epochs, so the redundant epoch setting was dropped.
    max_steps=250,
    fp16=not use_bf16,
    bf16=use_bf16,
    packing=False,
    dataset_text_field="text",
    push_to_hub=True,
)

# Build the supervised fine-tuning trainer: the quantized base model, the tokenized
# training set, the training configuration, and the LoRA adapter configuration.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    peft_config=lora_config,
)
trainer = SFTTrainer(**trainer_kwargs)



In [None]:
# Run the fine-tuning loop
print('Starting model training...')
train_result = trainer.train()

# Write the fine-tuned model to a local directory and report where it went
output_dir = './fine_tuned_model'
trainer.save_model(output_dir=output_dir)
print(f"Fine-tuned model saved to {output_dir}")