# LoRA Fine-Tuning Notebook

In [1]:
# import necessary python libraries
from peft import LoraConfig, get_peft_model, PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, pipeline
from torch.utils.data import Dataset
import torch
import pickle
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import time
import math
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import json
from pathlib import Path
import random
import copy

In [2]:
""" configs = [
    {
        "chunk_filename_pkl": "./data/kahneman_chunks.pkl",
        "model_name": "deepseek-ai/deepseek-llm-7b-base",
        "r": 64,
        "lora_alpha": 128,
        "lora_dropout": 0.05,
        "target_modules": [
            "q_proj",
            "v_proj",
            "k_proj",
            "o_proj",
            "gate_proj"
        ],
        "bias": "none",
        "task_type": "CAUSAL_LM",
        "output_dir": "./lora_finetuned_model/deepseek-llm-7b-base/LoRA_deep_book_run",
        "per_device_train_batch_size": 8,
        "gradient_accumulation_steps": 4,
        "learning_rate": 5e-05,
        "num_train_epochs": 5,
        "save_strategy": "epoch",
        "bf16": True,
        "logging_steps": 10,
        "report_to": "none"
    },
    {
        "chunk_filename_pkl": "./data/kahneman_chunks.pkl",
        "model_name": "deepseek-ai/deepseek-llm-7b-base",
        "r": 16,
        "lora_alpha": 32,
        "lora_dropout": 0.1,
        "target_modules": [
            "q_proj",
            "v_proj"
        ],
        "bias": "none",
        "task_type": "CAUSAL_LM",
        "output_dir": "./lora_finetuned_model/deepseek-llm-7b-base/LoRA_shallow_book_run",
        "per_device_train_batch_size": 8,
        "gradient_accumulation_steps": 2,
        "learning_rate": 0.0002,
        "num_train_epochs": 3,
        "save_strategy": "epoch",
        "bf16": True,
        "logging_steps": 10,
        "report_to": "none"
    }
] """

' configs = [\n    {\n        "chunk_filename_pkl": "./data/kahneman_chunks.pkl",\n        "model_name": "deepseek-ai/deepseek-llm-7b-base",\n        "r": 64,\n        "lora_alpha": 128,\n        "lora_dropout": 0.05,\n        "target_modules": [\n            "q_proj",\n            "v_proj",\n            "k_proj",\n            "o_proj",\n            "gate_proj"\n        ],\n        "bias": "none",\n        "task_type": "CAUSAL_LM",\n        "output_dir": "./lora_finetuned_model/deepseek-llm-7b-base/LoRA_deep_book_run",\n        "per_device_train_batch_size": 8,\n        "gradient_accumulation_steps": 4,\n        "learning_rate": 5e-05,\n        "num_train_epochs": 5,\n        "save_strategy": "epoch",\n        "bf16": True,\n        "logging_steps": 10,\n        "report_to": "none"\n    },\n    {\n        "chunk_filename_pkl": "./data/kahneman_chunks.pkl",\n        "model_name": "deepseek-ai/deepseek-llm-7b-base",\n        "r": 16,\n        "lora_alpha": 32,\n        "lora_dropout": 0.

In [2]:
# Fixed list of 20 seeds for the repeated fine-tuning runs: the training loop
# further down iterates over this list and trains/saves one adapter per seed.
random_seeds = [3, 24, 34, 46, 59, 60, 61, 62, 64, 67, 68, 73, 74, 78, 81, 83, 88, 92, 94, 96]

In [3]:
# Model: identifier handed to `from_pretrained` for both the tokenizer and the base model
model_name = "deepseek-ai/deepseek-llm-7b-base"

# Files: input data location
chunk_filename_pkl = "./data/kahneman_chunks.pkl" # pickle file that must contain a list (checked after loading) of text paragraphs


# LoRA: values forwarded unchanged to `LoraConfig`
r = 16  # adapter rank
lora_alpha = 32  # scaling factor
lora_dropout = 0.1  # dropout probability applied in the adapter
target_modules = ["q_proj", "v_proj"]  # names of the modules that receive an adapter
bias = "none"
task_type = "CAUSAL_LM"

# Training: values forwarded unchanged to `TrainingArguments`
# NOTE: the per-seed training loop saves to `output_dir + str(seed)`, so this acts as a path prefix there.
output_dir = "./lora_finetuned_model/deepseek-llm-7b-base/LoRA_shallow_book_run"
per_device_train_batch_size = 8
gradient_accumulation_steps = 2
learning_rate = 0.0002
num_train_epochs = 3
save_strategy = "epoch"
bf16 = True
logging_steps = 10  # matches the 10-step spacing of the logged training loss
report_to = "none"  # no external experiment trackers (e.g. WandB)

In [None]:
""" # Model
model_name = "deepseek-ai/deepseek-llm-7b-base"

# files
chunk_filename_pkl = "./data/kahneman_chunks.pkl" # in this case a pickle file with a list of paragraphs stored in it


# LoRA
r = 64
lora_alpha = 128
lora_dropout = 0.05
target_modules = ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj"]
bias = "none"
task_type = "CAUSAL_LM"

# Training
output_dir = "./lora_finetuned_model/deepseek-llm-7b-base/LoRA_deep_book_run"
per_device_train_batch_size = 8
gradient_accumulation_steps = 4
learning_rate = 5e-5
num_train_epochs = 5
save_strategy = "epoch"
bf16 = True
logging_steps = 10
report_to = "none" """

In [4]:
# Fetch the tokenizer and the base causal-LM checkpoint identified by `model_name`.
# `original_model` is the un-adapted model that the LoRA adapters are attached to later.
tokenizer = AutoTokenizer.from_pretrained(model_name)
original_model = AutoModelForCausalLM.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Read the pre-chunked paragraphs from the pickle file configured in `chunk_filename_pkl`.
# SECURITY NOTE: unpickling can execute arbitrary code, so only load pickle files you trust.
with Path(chunk_filename_pkl).open("rb") as f:
    kahneman_paragraphs = pickle.load(f)

# The rest of the notebook iterates over the paragraphs, so anything but a list is rejected here.
if type(kahneman_paragraphs) is not list and not isinstance(kahneman_paragraphs, list):
    raise ValueError("The loaded pickle object is not a list!")

print(f"Loaded {len(kahneman_paragraphs)} paragraphs from pickle.")

Loaded 839 paragraphs from pickle.


In [6]:
class TextDataset(Dataset):
    """Pre-tokenized causal-LM dataset built from a list of raw text strings.

    Every text is tokenized once in ``__init__`` (truncated and padded to
    ``max_length``) and stored as a dict with three entries:

    * ``input_ids``      - token ids
    * ``attention_mask`` - 1 for real tokens, 0 for padding
    * ``labels``         - copy of ``input_ids`` with padding positions set to -100

    Args:
        texts: iterable of strings, one training example per string.
        tokenizer: callable tokenizer that accepts ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` and returns a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors (assumed to be of
            shape ``(1, max_length)`` for a single text).
        max_length: length every example is truncated/padded to.
    """

    def __init__(self, texts, tokenizer, max_length=512):
        self.examples = []

        for text in texts:
            encoding = tokenizer(
                text,
                truncation=True,
                padding="max_length",
                max_length=max_length,
                return_tensors="pt"
            )

            # squeeze(0) removes only the leading batch dimension; a bare
            # squeeze() would also collapse the sequence axis when max_length == 1.
            input_ids = encoding["input_ids"].squeeze(0)
            attention_mask = encoding["attention_mask"].squeeze(0)

            # Labels mirror the inputs; padding positions get -100.
            # The attention mask (not the pad token id) decides what counts as padding,
            # so a real token that shares its id with the pad token (e.g. when the
            # pad token is the EOS token) is still kept as a training target.
            labels = input_ids.clone()
            labels[attention_mask == 0] = -100

            self.examples.append({
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "labels": labels
            })

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the example dict stored at position ``idx``."""
        return self.examples[idx]


# Tokenize all loaded paragraphs once, using TextDataset's default max_length
dataset = TextDataset(texts=kahneman_paragraphs, tokenizer=tokenizer)

In [7]:
# Train one LoRA adapter per seed in `random_seeds` and save each run to its own directory.

# The collator does not depend on the seed or the model, so it is built once for all runs.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # Causal LM, not masked LM

# PeftModel produced by the previous iteration (None before the first run)
previous_model = None

for seed in random_seeds:
    # get_peft_model injects the LoRA layers into the model object it receives.
    # Remove the previous run's adapter first so every run starts from the plain base model
    # instead of re-wrapping an already adapted one.
    if previous_model is not None:
        original_model = previous_model.unload()

    # One directory per seed, used for BOTH the per-epoch checkpoints and the final adapter.
    # (Previously all seeds shared `output_dir` for checkpoints, so runs overwrote each other.)
    run_output_dir = output_dir + str(seed)

    # Seed the RNGs before the adapter is created: TrainingArguments' `seed` is only
    # applied once the Trainer exists, which is after the adapter weights were initialised.
    random.seed(seed)
    torch.manual_seed(seed)

    # Define LoRA configuration
    lora_config = LoraConfig(
        r=r,  # Rank: Controls adaptation capacity
        lora_alpha=lora_alpha,  # Scaling factor
        lora_dropout=lora_dropout,  # Dropout probability
        target_modules=target_modules,  # Target attention layers
        bias=bias,
        task_type=task_type,
    )

    # Apply LoRA to the model
    model = get_peft_model(original_model, lora_config)
    model.print_trainable_parameters()  # Verify trainable params

    # Training arguments
    training_args = TrainingArguments(
        output_dir=run_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        save_strategy=save_strategy,
        bf16=bf16,
        logging_steps=logging_steps,
        report_to=report_to,  # Disable logging to external services like WandB
        seed=seed
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
    )

    # Time the training run
    start_time = time.time()
    trainer.train()
    end_time = time.time()

    # Elapsed wall-clock time, converted from seconds to minutes
    elapsed_time = (end_time - start_time)/60
    print(f"Training took {elapsed_time:.2f} minutes.")

    # Save the LoRA fine-tuned model and the tokenizer for this seed
    model.save_pretrained(run_output_dir)
    tokenizer.save_pretrained(run_output_dir)

    # Remember this run's model so its adapter can be unloaded before the next seed.
    # The model of the last seed is left intact in `model` after the loop.
    previous_model = model

trainable params: 7,864,320 || all params: 6,918,230,016 || trainable%: 0.1137


Step,Training Loss
10,2.6137
20,2.5927
30,2.531
40,2.4869
50,2.4887
60,2.4605
70,2.4505
80,2.4086
90,2.4354
100,2.4416


Training took 3.15 minutes.
trainable params: 7,864,320 || all params: 6,918,230,016 || trainable%: 0.1137


Step,Training Loss
10,2.598
20,2.5525
30,2.5047
40,2.5474
50,2.4823
60,2.4895
70,2.4472
80,2.4142
90,2.4176
100,2.4279


Training took 3.17 minutes.
trainable params: 7,864,320 || all params: 6,918,230,016 || trainable%: 0.1137


Step,Training Loss
10,2.6385
20,2.5291
30,2.5307
40,2.5238
50,2.4856
60,2.4598
70,2.4741
80,2.405
90,2.4452
100,2.4169


Training took 3.17 minutes.
trainable params: 7,864,320 || all params: 6,918,230,016 || trainable%: 0.1137


Step,Training Loss
10,2.6308
20,2.5635
30,2.5356
40,2.4883
50,2.4836
60,2.4466
70,2.4822
80,2.4476
90,2.4006
100,2.4103


Training took 3.17 minutes.
trainable params: 7,864,320 || all params: 6,918,230,016 || trainable%: 0.1137


Step,Training Loss
10,2.659
20,2.5235
30,2.5313
40,2.4979
50,2.5183
60,2.4221
70,2.4474
80,2.4336
90,2.4649
100,2.3912


Training took 3.16 minutes.
trainable params: 7,864,320 || all params: 6,918,230,016 || trainable%: 0.1137


Step,Training Loss
10,2.6388
20,2.5633
30,2.5247
40,2.5251
50,2.4528
60,2.4432
70,2.4545
80,2.4267
90,2.4026
100,2.4548


Training took 3.16 minutes.
trainable params: 7,864,320 || all params: 6,918,230,016 || trainable%: 0.1137


Step,Training Loss
10,2.6258
20,2.5802
30,2.5033
40,2.4805
50,2.5066
60,2.4803
70,2.4439
80,2.4022
90,2.4621
100,2.425


Training took 3.16 minutes.
trainable params: 7,864,320 || all params: 6,918,230,016 || trainable%: 0.1137


Step,Training Loss
10,2.6452
20,2.5514
30,2.4945
40,2.5301
50,2.4794
60,2.4349
70,2.4344
80,2.404
90,2.4566
100,2.4348


Training took 3.16 minutes.
trainable params: 7,864,320 || all params: 6,918,230,016 || trainable%: 0.1137


Step,Training Loss
10,2.6331
20,2.5604
30,2.5207
40,2.5075
50,2.4847
60,2.4427
70,2.426
80,2.4208
90,2.4226
100,2.447


Training took 3.17 minutes.
trainable params: 7,864,320 || all params: 6,918,230,016 || trainable%: 0.1137


Step,Training Loss
10,2.6307
20,2.5465
30,2.5082
40,2.5044
50,2.4967
60,2.4575
70,2.4371
80,2.4136
90,2.4299
100,2.4485


Training took 3.16 minutes.
trainable params: 7,864,320 || all params: 6,918,230,016 || trainable%: 0.1137


Step,Training Loss
10,2.5982
20,2.5649
30,2.4893
40,2.5316
50,2.5041
60,2.4191
70,2.4513
80,2.4061
90,2.4515
100,2.4712


Training took 3.16 minutes.
trainable params: 7,864,320 || all params: 6,918,230,016 || trainable%: 0.1137


Step,Training Loss
10,2.6591
20,2.5764
30,2.4967
40,2.5143
50,2.4597
60,2.4844
70,2.4242
80,2.436
90,2.4429
100,2.4295


Training took 3.16 minutes.
trainable params: 7,864,320 || all params: 6,918,230,016 || trainable%: 0.1137


Step,Training Loss
10,2.6556
20,2.5388
30,2.5183
40,2.5157
50,2.4882
60,2.4564
70,2.4243
80,2.4395
90,2.426
100,2.4247


Training took 3.16 minutes.
trainable params: 7,864,320 || all params: 6,918,230,016 || trainable%: 0.1137


Step,Training Loss
10,2.6253
20,2.5747
30,2.5316
40,2.4539
50,2.5175
60,2.4835
70,2.4315
80,2.4317
90,2.4274
100,2.4082


Training took 3.17 minutes.
trainable params: 7,864,320 || all params: 6,918,230,016 || trainable%: 0.1137


Step,Training Loss
10,2.6008
20,2.5593
30,2.5229
40,2.522
50,2.482
60,2.454
70,2.4185
80,2.4622
90,2.4064
100,2.4347


Training took 3.16 minutes.
trainable params: 7,864,320 || all params: 6,918,230,016 || trainable%: 0.1137


Step,Training Loss
10,2.6458
20,2.5572
30,2.4961
40,2.5254
50,2.4772
60,2.444
70,2.4505
80,2.4151
90,2.4112
100,2.4439


Training took 3.16 minutes.
trainable params: 7,864,320 || all params: 6,918,230,016 || trainable%: 0.1137


Step,Training Loss
10,2.6101
20,2.5859
30,2.5191
40,2.4936
50,2.4942
60,2.4491
70,2.4482
80,2.4391
90,2.4244
100,2.4326


Training took 3.16 minutes.
trainable params: 7,864,320 || all params: 6,918,230,016 || trainable%: 0.1137


Step,Training Loss
10,2.6169
20,2.5461
30,2.4964
40,2.5165
50,2.5135
60,2.4579
70,2.4379
80,2.4695
90,2.382
100,2.4473


Training took 3.16 minutes.
trainable params: 7,864,320 || all params: 6,918,230,016 || trainable%: 0.1137


Step,Training Loss
10,2.6678
20,2.5198
30,2.5328
40,2.5123
50,2.4597
60,2.4527
70,2.4483
80,2.4297
90,2.4267
100,2.4314


Training took 3.16 minutes.
trainable params: 7,864,320 || all params: 6,918,230,016 || trainable%: 0.1137


Step,Training Loss
10,2.6296
20,2.5244
30,2.5389
40,2.5013
50,2.4912
60,2.4899
70,2.4252
80,2.4829
90,2.3988
100,2.4077


Training took 3.16 minutes.


In [None]:
""" # Define LoRA configuration
lora_config = LoraConfig(
    r=r,  # Rank: Controls adaptation capacity
    lora_alpha=lora_alpha,  # Scaling factor
    lora_dropout=lora_dropout,  # Dropout probability
    target_modules=target_modules,  # Target attention layers
    bias=bias,
    task_type=task_type,
)

# Apply LoRA to the model
model = get_peft_model(original_model, lora_config)
model.print_trainable_parameters()  # Verify trainable params """

trainable params: 91,914,240 || all params: 7,002,279,936 || trainable%: 1.3126


In [None]:
""" data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # Causal LM, not masked LM """

In [9]:
""" # Generate 20 unique random seeds
random_seeds = random.sample(range(1_000_000), 20)

# run trainings loop with 20 different configs
for config_idx, config in enumerate(configs):
    for run_idx, run_seed in enumerate(random_seeds):
        config = copy.deepcopy(config)
        config["seed"] = run_seed


        # Create a unique output directory
        base_output = Path(config["output_dir"])
        run_output = base_output.parent / f"{base_output.name}_seed{run_seed}"
        run_output.mkdir(parents=True, exist_ok=True)
        config["output_dir"] = str(run_output)

        print(f"\n=== Starting run {run_idx} for config {config_idx} with seed {run_seed} ===")
        print(f"Output dir: {config['output_dir']}")
        print(f"Num train epochs: {config['num_train_epochs']} ({type(config['num_train_epochs'])})")
        
        # Define LoRA configuration
        lora_config = LoraConfig(
            r=config["r"],  # Rank: Controls adaptation capacity
            lora_alpha=config["lora_alpha"],  # Scaling factor
            lora_dropout=config["lora_dropout"],  # Dropout probability
            target_modules=config["target_modules"],  # Target attention layers
            bias=config["bias"],
            task_type=config["task_type"],
        )

        # Apply LoRA to the model
        model = get_peft_model(original_model, lora_config)
        model.print_trainable_parameters()  # Verify trainable params

        # Prepare TrainingArguments
        training_args = TrainingArguments(
            output_dir=config["output_dir"],
            per_device_train_batch_size=config["per_device_train_batch_size"],
            gradient_accumulation_steps=config["gradient_accumulation_steps"],
            learning_rate=config["learning_rate"],
            num_train_epochs=config["num_train_epochs"],
            save_strategy=config["save_strategy"],
            bf16=config["bf16"],
            logging_steps=config["logging_steps"],
            report_to=config["report_to"],
            dataloader_drop_last=True,
        )

        # Initialize Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=dataset,
            data_collator=data_collator,
        )
        print(f"len(dataset): {len(dataset)}")
        print(f"per_device_train_batch_size: {training_args.per_device_train_batch_size}")
        print(f"gradient_accumulation_steps: {training_args.gradient_accumulation_steps}")

        effective_batch_size = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
        steps_per_epoch = math.ceil(len(dataset) / effective_batch_size)

        print(f"Effective batch size: {effective_batch_size}")
        print(f"Steps per epoch (Trainer internal): {steps_per_epoch}")
        print(f"Expected total steps: {config['num_train_epochs'] * steps_per_epoch}")


        effective_batch_size = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
        steps_per_epoch = len(dataset) // effective_batch_size
        print(f"Trainer uses steps_per_epoch = {steps_per_epoch}")
        print(f"Total steps = {config['num_train_epochs'] * steps_per_epoch} = {config['num_train_epochs']} epochs * {steps_per_epoch} steps")
        print(f"Leftover samples per epoch: {len(dataset) % effective_batch_size}")


        # Record the start time
        start_time = time.time()

        # Train the model
        trainer.train()

        print(f"Trainer state: {trainer.state}")

        # Record the end time and compute duration
        end_time = time.time()
        elapsed_time = (end_time - start_time) / 60
        print(f"Training took {elapsed_time:.2f} minutes.") """

' # Generate 20 unique random seeds\nrandom_seeds = random.sample(range(1_000_000), 20)\n\n# run trainings loop with 20 different configs\nfor config_idx, config in enumerate(configs):\n    for run_idx, run_seed in enumerate(random_seeds):\n        config = copy.deepcopy(config)\n        config["seed"] = run_seed\n\n\n        # Create a unique output directory\n        base_output = Path(config["output_dir"])\n        run_output = base_output.parent / f"{base_output.name}_seed{run_seed}"\n        run_output.mkdir(parents=True, exist_ok=True)\n        config["output_dir"] = str(run_output)\n\n        print(f"\n=== Starting run {run_idx} for config {config_idx} with seed {run_seed} ===")\n        print(f"Output dir: {config[\'output_dir\']}")\n        print(f"Num train epochs: {config[\'num_train_epochs\']} ({type(config[\'num_train_epochs\'])})")\n        \n        # Define LoRA configuration\n        lora_config = LoraConfig(\n            r=config["r"],  # Rank: Controls adaptation cap

In [None]:
# Build the Trainer settings from the values defined in the configuration cell
training_args = TrainingArguments(
    # where checkpoints are written and how often
    output_dir=output_dir,
    save_strategy=save_strategy,
    # optimisation schedule
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    bf16=bf16,
    # logging
    logging_steps=logging_steps,
    report_to=report_to,  # keeps external services such as WandB switched off
)

In [11]:
# Bundle model, training arguments, training data and collator into a Trainer
trainer = Trainer(
    data_collator=data_collator,
    train_dataset=dataset,
    args=training_args,
    model=model,
)

In [12]:
# Run the fine-tuning and measure how long it takes (wall-clock time)
start_time = time.time()
trainer.train()
end_time = time.time()

# Convert the elapsed seconds to minutes and report them
elapsed_time = (end_time - start_time) / 60
print(f"Training took {elapsed_time:.2f} minutes.")

# Troubleshooting: if you are getting the following error:
#   RuntimeError: NVML_SUCCESS == r INTERNAL ASSERT FAILED at "/opt/conda/conda-bld/pytorch_1729647382455/work/c10/cuda/CUDACachingAllocator.cpp":995, please report a bug to PyTorch.
# then your account doesn't fulfill the necessary GPU requirements.
# Keep in mind that, due to self-attention, the cost grows quadratically with the input length.
# You can check your VRAM and other GPU-related metrics by running `nvidia-smi` in the terminal.

Step,Training Loss
10,2.6225
20,2.5522
30,2.5167
40,2.4639
50,2.4425
60,2.416
70,2.363
80,2.3719
90,2.323
100,2.291


Training took 5.11 minutes.


In [13]:
# Write the LoRA fine-tuned model and its tokenizer into `output_dir`
# (the tokenizer call stays last so the written file paths are shown as the cell output)
model.save_pretrained(save_directory=output_dir)
tokenizer.save_pretrained(save_directory=output_dir)

('./lora_finetuned_model/deepseek-llm-7b-base/LoRA_deep_book_run/tokenizer_config.json',
 './lora_finetuned_model/deepseek-llm-7b-base/LoRA_deep_book_run/special_tokens_map.json',
 './lora_finetuned_model/deepseek-llm-7b-base/LoRA_deep_book_run/tokenizer.json')

In [14]:
# Names of the notebook-level settings that together describe one experiment
var_names = [
    "chunk_filename_pkl", "model_name",
    "r", "lora_alpha", "lora_dropout", "target_modules", "bias", "task_type",
    "output_dir", "per_device_train_batch_size", "gradient_accumulation_steps",
    "learning_rate", "num_train_epochs", "save_strategy",
    "bf16", "logging_steps", "report_to"
]

# Fail with a readable message (instead of a bare KeyError on the first missing name)
# when the configuration cell has not been run in this kernel.
missing_vars = [var for var in var_names if var not in globals()]
if missing_vars:
    raise KeyError(f"Configuration variables are not defined, run the config cell first: {missing_vars}")

# Construct dictionary from current global variables
config = {var: globals()[var] for var in var_names}

# Create the directory if needed so this cell also works before anything was saved there
config_dir = Path(output_dir)
config_dir.mkdir(parents=True, exist_ok=True)

# Save to JSON next to the trained model
with open(config_dir / "experiment_config.json", "w") as f:
    json.dump(config, f, indent=4)

In [15]:
# Compute the evaluation loss and report it as perplexity = exp(loss).
# NOTE: `dataset` is the same object that was passed as `train_dataset`, so this is the
# perplexity on the training data, not on held-out text.
trainer = Trainer(model=model)  # fresh Trainer with default arguments; replaces the training Trainer
eval_results = trainer.evaluate(eval_dataset=dataset)
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity:.2f}")

Perplexity: 9.65
