In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType

# Step 1: Load dataset
dataset = load_dataset('ajibawa-2023/Education-High-School-Students')

# Step 2: Select a small random subset of the dataset
# Use select with a smaller range to reduce dataset size
small_train_dataset = dataset['train'].shuffle(seed=42).select(range(100))  # Select 1,00 examples for training
small_test_dataset = dataset['train'].shuffle(seed=42).select(range(10))    # Select 10 examples for testing

# Step 3: Load Gemma-2 model and tokenizer
local_model_path = "./gemma2"  # Update this to the local model directory
tokenizer = AutoTokenizer.from_pretrained(local_model_path)

# Assign pad_token to eos_token if missing
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(local_model_path)

# Step 4: Tokenize datasets
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding="max_length", max_length=256)

tokenized_train = small_train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_test = small_test_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Step 5: Define LoRA config
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# Step 6: Wrap the model with LoRA
model = get_peft_model(model, lora_config)

# Step 7: Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Reduce batch size
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    fp16=True,  # Enable mixed precision
)

# Step 8: Define Data Collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Step 9: Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
)

# Step 10: Train the model with LoRA
trainer.train()

# Step 11: Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# Step 12: Merge LoRA weights into the base model
model = model.merge_and_unload()  # This merges LoRA weights into the base model

# Step 13: Save the full model and tokenizer
model.save_pretrained("./full_fine_tuned_gemma_2_with_lora")
tokenizer.save_pretrained("./full_fine_tuned_gemma_2_with_lora")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.52s/it]
Map: 100%|██████████| 10/10 [00:00<00:00, 162.95 examples/s]


Epoch,Training Loss,Validation Loss
1,No log,1.699705
2,No log,1.673404
3,No log,1.664043


Evaluation Results: {'eval_loss': 1.6640427112579346, 'eval_runtime': 47.9125, 'eval_samples_per_second': 0.209, 'eval_steps_per_second': 0.042, 'epoch': 3.0}


('./full_fine_tuned_gemma_2_with_lora\\tokenizer_config.json',
 './full_fine_tuned_gemma_2_with_lora\\special_tokens_map.json',
 './full_fine_tuned_gemma_2_with_lora\\tokenizer.json')