# LoRA implementation on SmolLM

### Imports

In [55]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import math
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import copy
import time
import os
from tqdm.notebook import tqdm
from tqdm import tqdm
from tabulate import tabulate

# For tokenization and dataset loading
from transformers import AutoTokenizer, AutoModelForCausalLM
!pip install datasets
from datasets import load_dataset


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




#### Initializing device here for future use if needed

In [56]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [57]:
from dataclasses import dataclass


@dataclass
class smolConfig:
    """
    Architecture hyper-parameters read by the model classes in this notebook.

    The fields are annotated so that `@dataclass` really treats them as fields:
    `smolConfig()` still yields the defaults below, and individual values can
    now be overridden by keyword, e.g. `smolConfig(num_hidden_layers=2)`.
    """

    vocab_size: int = 49152         # rows of the token embedding / size of the LM head output
    hidden_size: int = 576          # width of the embeddings and of every decoder block
    intermediate_size: int = 1536   # inner width of the gated MLP
    num_hidden_layers: int = 30     # number of stacked decoder blocks
    num_heads: int = 9              # number of query heads
    kv_heads: int = 3               # number of key/value heads shared across the query heads

In [58]:
torch.manual_seed(42)

def rotate_half(x):
    """
    Split the last dimension of `x` at its midpoint and swap the two pieces,
    negating the piece that moves to the front: [a, b] -> [-b, a].
    """
    midpoint = x.shape[-1] // 2
    front, back = x[..., :midpoint], x[..., midpoint:]

    return torch.cat((back.neg(), front), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """
    Rotate the query and key tensors with rotary position embeddings (RoPE).

    `cos` and `sin` are moved to the device of `q` and given an extra axis at
    `unsqueeze_dim` so that they broadcast against `q` and `k`.
    `position_ids` is accepted but not used by this implementation.

    Returns the rotated `(q, k)` pair.
    """
    target_device = q.device

    # Move to the right device, then add the broadcast axis
    cos = cos.to(target_device).unsqueeze(unsqueeze_dim)
    sin = sin.to(target_device).unsqueeze(unsqueeze_dim)

    def _rotate(states):
        return (states * cos) + (rotate_half(states) * sin)

    return _rotate(q), _rotate(k)

class RotaryEmbedder(nn.Module):
    """
    Produces the cosine / sine tables used by rotary position embeddings.

    Args:
        dim: size of one attention head; `dim // 2` frequencies are generated
        base: base of the geometric frequency schedule
    """

    def __init__(self, dim, base):
        super().__init__()
        # Precompute the inverse frequencies for the sine/cosine embeddings
        freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
        # Registered as a buffer so it follows the module through `.to(device)`;
        # non-persistent so the state_dict keys stay exactly as they were.
        self.register_buffer("freq", freq, persistent=False)

    @torch.no_grad()
    def forward(self, x):
        """
        Return `(cos, sin)`, each of shape `(1, x.shape[-2], dim)`.

        Only the second-to-last dimension of `x` (the sequence length) is read.
        Both tables are created on the device that holds `freq`.
        """
        # Positions 0..seq_len-1, created next to the frequencies (no host round trip)
        pos = torch.arange(x.shape[-2], dtype=torch.long, device=self.freq.device)
        # Outer product position x frequency, plus a leading batch axis
        angles = torch.einsum("p,f->pf", pos.float(), self.freq).unsqueeze(dim=0)
        # Repeat the angles so the table spans the full head dimension
        emb = torch.cat((angles, angles), dim=-1)
        return emb.cos(), emb.sin()

class GroupedQueryAttention(nn.Module):
    """
    Self-attention with rotary position embeddings in which several query heads
    share one key/value head (grouped-query attention).

    `config` must provide `hidden_size`, `num_heads` and `kv_heads`.
    """

    def __init__(self, config):
        super().__init__()
        # Model dimensions and attention configuration
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_heads
        self.head_dim = config.hidden_size // self.num_heads
        self.kv_heads = config.kv_heads  # number of key/value heads
        self.rope_theta = 10000.0  # base passed to the rotary embedding generator

        # Linear projections for queries, keys, values and the output
        self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(config.hidden_size, self.kv_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(config.hidden_size, self.kv_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)

        # Rotary embedding generator
        self.rotary_emb = RotaryEmbedder(base=self.rope_theta, dim=self.head_dim)

    def _repeat_kv(self, x, n_rep):
        """
        Repeat every key/value head `n_rep` times along the head axis.

        `x` is `(batch, kv_heads, seq_len, head_dim)`; the result is
        `(batch, kv_heads * n_rep, seq_len, head_dim)`.
        """
        batch, num_key_value_heads, slen, head_dim = x.shape
        # Add a repeat axis and broadcast over it
        x = x[:, :, None, :, :].expand(
            batch, num_key_value_heads, n_rep, slen, head_dim
        )
        # Fold the repeat axis into the head axis
        return x.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)

    def forward(self, x: torch.Tensor, attention_mask=None):
        """
        Args:
            x: hidden states of shape `(batch_size, seq_len, hidden_size)`
            attention_mask: optional additive mask that broadcasts against the
                `(batch, heads, seq_len, seq_len)` score tensor. When it is
                `None` no mask is applied (previously `None` raised a TypeError).

        Returns:
            Tensor of shape `(batch_size, seq_len, hidden_size)`.
        """
        batch_size, seq_len, _ = x.size()

        # Project, then split into heads: (batch, heads, seq_len, head_dim)
        q_states = self.q_proj(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        k_states = self.k_proj(x).view(batch_size, seq_len, self.kv_heads, self.head_dim).transpose(1, 2)
        v_states = self.v_proj(x).view(batch_size, seq_len, self.kv_heads, self.head_dim).transpose(1, 2)

        # Rotary position embeddings; apply_rotary_pos_emb moves cos/sin to the
        # device of the queries itself, so no explicit transfer is needed here.
        cos, sin = self.rotary_emb(q_states)
        q_states, k_states = apply_rotary_pos_emb(q_states, k_states, cos, sin)

        # Repeat key and value heads to match the number of query heads
        kv_groups = self.num_heads // self.kv_heads
        k_states = self._repeat_kv(k_states, kv_groups)
        v_states = self._repeat_kv(v_states, kv_groups)

        # Scaled dot-product attention scores
        attn_weights = torch.matmul(q_states, k_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        # Additive mask (e.g. causal or padding); skipped when none is given
        if attention_mask is not None:
            attn_weights = attn_weights + attention_mask

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
        # Dropout probability is 0, so this leaves the weights unchanged
        attn_weights = nn.functional.dropout(attn_weights, 0)

        # Weighted sum of the values, then merge the heads back together
        attn_output = torch.matmul(attn_weights, v_states)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(batch_size, seq_len, -1)

        # Project the attention output back to the hidden size
        return self.o_proj(attn_output)

In [59]:
torch.manual_seed(42)

class RMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Root-mean-square normalisation with a learnable per-feature gain.
        """
        super().__init__()
        self.variance_epsilon = eps
        self.weight = nn.Parameter(torch.ones(hidden_size))  # gain, initialised to 1

    def forward(self, x):
        # Mean of the squared activations over the last (hidden) dimension
        mean_square = x.square().mean(dim=-1, keepdim=True)

        # Divide by the RMS (epsilon keeps the reciprocal finite), then apply the gain
        inv_rms = torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * (x * inv_rms)

class MLP(nn.Module):
    def __init__(self, hidden_size, intermediate_size):
        """
        Gated feed-forward block in the LLaMA style, using the SiLU activation.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.activation = nn.SiLU()

        # Projections are created in the order up, down, gate
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)

    def forward(self, x):
        # The activated gate branch modulates the up branch element-wise
        gate = self.activation(self.gate_proj(x))
        gated = gate * self.up_proj(x)
        return self.down_proj(gated)

class LlamaDecoder(nn.Module):
    """
    One Llama decoder block: pre-norm self-attention followed by a pre-norm
    gated MLP, each wrapped in a residual connection.
    """

    def __init__(self, config):
        super().__init__()
        # Self-attention module
        self.self_attn = GroupedQueryAttention(config)

        # Feed-forward module
        self.mlp = MLP(hidden_size=config.hidden_size, intermediate_size=config.intermediate_size)

        # Normalisation applied before attention and before the MLP
        self.input_layernorm = RMSNorm(config.hidden_size, eps=1e-05)
        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=1e-05)

    def forward(self, x, attention_mask=None):
        """
        Args:
            x: hidden states; the second-to-last dimension is the sequence length
            attention_mask: kept for interface compatibility. Its values are not
                used: attention inside the block is always purely causal, and
                the mask size is taken from `x`, so `None` is accepted.

        Returns:
            Hidden states with the same shape as `x`.
        """
        # Skip connection cache
        residual = x

        # Pre-attention normalisation
        x = self.input_layernorm(x)

        # Causal mask: -inf above the diagonal so a position cannot attend to
        # later positions. Built directly on the device of the activations.
        seq_len = x.shape[-2]
        causal_mask = torch.triu(
            torch.full((seq_len, seq_len), fill_value=float('-inf'), device=x.device),
            diagonal=1,
        )

        # Self-attention block
        x = self.self_attn(x=x, attention_mask=causal_mask)
        x += residual

        # Skip connection cache for the MLP
        residual = x

        # Pre-MLP normalisation
        x = self.post_attention_layernorm(x)

        # MLP block
        x = self.mlp(x)
        x += residual

        return x

In [60]:
torch.manual_seed(42)

class smolModel(nn.Module):
    """
    SmolLM backbone: token embedding, a stack of decoder blocks and a final norm.
    """

    def __init__(self, config):
        super().__init__()
        # Maps each token id to its embedding vector
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)

        # `config.num_hidden_layers` decoder blocks, applied in sequence
        decoder_blocks = [LlamaDecoder(config) for _ in range(config.num_hidden_layers)]
        self.layers = nn.ModuleList(decoder_blocks)

        # Final normalisation applied to the hidden states
        self.norm = RMSNorm(config.hidden_size, eps=1e-05)

    def forward(self, input_ids=None, attention_mask=None):
        hidden = self.embed_tokens(input_ids)

        # Run the embeddings through every decoder block in order
        for block in self.layers:
            hidden = block(hidden, attention_mask=attention_mask)

        # Final normalisation
        return self.norm(hidden)

class smolLM(nn.Module):
    """
    Language model: the SmolLM backbone followed by an LM head.
    The head turns each hidden state into logits over the vocabulary for
    next-token prediction, and shares its weight with the token embedding.
    """
    def __init__(self, config):
        super().__init__()
        # Backbone that produces one hidden state per input token
        self.model = smolModel(config)
        # Head mapping hidden states to vocabulary logits
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        # The head and the embedding layer use one shared weight tensor
        self.tie_weights()

    def tie_weights(self):
        # Point the head at the embedding weight so both use the same parameter
        self.lm_head.weight = self.model.embed_tokens.weight

    def forward(self, input_ids, attention_mask):
        # Hidden states from the backbone => (batch_size, seq_len, emb_dim)
        hidden = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # Logits over the vocabulary, returned as float32
        return {'logits': self.lm_head(hidden).float()}

In [61]:
def __generate(model, inputs, num_tokens, tokenizer, max_length=50):
    """
    A basic greedy approach for text generation.

    Args:
        model: callable invoked as `model(**inputs)` that returns a mapping
            with a 'logits' entry; only `logits[0, -1]` is read
        inputs: mapping with 'input_ids' and 'attention_mask' tensors. It is
            copied shallowly, so the caller's mapping is left untouched.
        num_tokens: maximum number of decoding steps
        tokenizer: provides `eos_token_id`, `convert_ids_to_tokens` and
            `convert_tokens_to_string`
        max_length: generation also stops once this many tokens were produced

    Returns:
        The generated continuation as a string (the prompt is not included).
    """
    # Work on a copy so the prompt tensors held by the caller are not replaced
    state = dict(inputs)
    collect = []
    # Inference only: do not record an autograd graph for every decoding step
    with torch.no_grad():
        for _ in range(num_tokens):
            output = model(**state)
            output_id = torch.argmax(output['logits'][0, -1]).item()
            collect.append(output_id)
            if output_id == tokenizer.eos_token_id or len(collect) >= max_length:
                break
            # Append the new token and extend the attention mask by one position
            new_token = torch.tensor([output_id], device=state['input_ids'].device)
            state['input_ids'] = torch.cat([state['input_ids'][0], new_token]).unsqueeze(0)
            state['attention_mask'] = F.pad(state['attention_mask'], (0, 1), value=1)
    return tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(collect))

In [None]:
torch.manual_seed(42)

class LoRALayer(nn.Module):
    """
    Low-rank adaptation of a weight matrix: two small factors A and B whose
    product, scaled by alpha / rank, is the update added to a layer's output.
    """

    def __init__(self, in_features, out_features, rank=8, alpha=16, dropout=0.0):
        """
        Create the two low-rank factors and the dropout module.
        """
        super().__init__()
        self.rank = rank
        self.scaling = alpha / rank  # multiplier applied to the low-rank output

        # Factor A (rank x in_features): drawn from a small normal, then
        # re-initialised with Kaiming-normal.
        # Author's notes on lowest perplexity per initialiser:
        #   kaiming_normal_(a=sqrt(5)) -> 5, normal_(std=0.02) -> 6, xavier_normal_ -> 9
        initial_a = torch.randn(rank, in_features) * 0.01
        self.A = nn.Parameter(initial_a)
        nn.init.kaiming_normal_(self.A, a=math.sqrt(5))

        # Factor B (out_features x rank): all zeros, so the update starts at zero
        self.B = nn.Parameter(torch.zeros(out_features, rank))
        nn.init.zeros_(self.B)

        # Dropout used as regularisation on the low-rank output
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x -> A -> B, then dropout and scaling
        low_rank = F.linear(F.linear(x, self.A), self.B)
        return self.scaling * self.dropout(low_rank)

In [None]:
torch.manual_seed(42)

class LoRALinear(nn.Module):
    """
    Wraps an existing linear layer and adds a trainable LoRA update to its output.
    """

    def __init__(self, linear_layer, rank=8, alpha=16, dropout=0.0):
        """
        Freeze `linear_layer` and attach a LoRA layer with matching dimensions.
        """
        super().__init__()
        # Keep the original layer and stop gradients for all of its parameters
        self.linear = linear_layer
        self.linear.requires_grad_(False)

        # The adapter takes its input/output sizes from the wrapped layer
        self.lora = LoRALayer(
            self.linear.in_features,
            self.linear.out_features,
            rank=rank,
            alpha=alpha,
            dropout=dropout,
        )

    def forward(self, x):
        # Frozen projection plus the low-rank correction
        return self.linear(x) + self.lora(x)


In [None]:
def add_lora_to_model(model, target_modules=None, rank=8, alpha=16, dropout=0.0):
    """
    Add LoRA adapters to target modules in the model.

    Every parameter of `model` is frozen first. Each `nn.Linear` child whose
    attribute name contains one of the strings in `target_modules` is then
    replaced by a `LoRALinear` wrapping it, and finally the adapter parameters
    of every `LoRALinear` in the model are made trainable. The model is
    modified in place.

    Args:
        model: model to adapt
        target_modules: substrings matched against child names; defaults to the
            attention and MLP projection names used in this notebook
        rank, alpha, dropout: forwarded to each `LoRALinear`

    Returns:
        Model with LoRA adapters (the same object that was passed in)
    """
    if target_modules is None:
        target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']

    # Freeze the whole model; only adapter weights are re-enabled below
    for param in model.parameters():
        param.requires_grad = False

    # Walk a snapshot of the module tree: the tree is edited inside the loop,
    # and wrappers created here must not be visited in the same pass.
    for parent in list(model.modules()):
        if isinstance(parent, LoRALinear):
            # Never wrap the frozen layer that already sits inside an adapter
            continue
        for child_name, child in list(parent.named_children()):
            if not isinstance(child, nn.Linear):
                continue
            if not any(target in child_name for target in target_modules):
                continue
            setattr(
                parent,
                child_name,
                LoRALinear(linear_layer=child, rank=rank, alpha=alpha, dropout=dropout),
            )

    # Adapter weights are the only trainable parameters
    for module in model.modules():
        if isinstance(module, LoRALinear):
            for p in module.lora.parameters():
                p.requires_grad = True

    return model


**Initializing the Base and LoRA models.**

In [None]:
# Build the notebook's own SmolLM implementation with the default configuration.
config = smolConfig()
base_model = smolLM(config)
checkpoint = "HuggingFaceTB/SmolLM-135M"
# Load the published checkpoint and copy its weights into `base_model`.
reference_model = AutoModelForCausalLM.from_pretrained(checkpoint)
# strict=False: keys that do not line up between the two state dicts are not
# treated as an error, and the returned report of such keys is not inspected here.
base_model.load_state_dict(reference_model.state_dict(), strict=False)

# Names of the linear layers that receive a LoRA adapter.
target_modules = [
    ## ADD MODULES HERE
     'q_proj','k_proj', 'v_proj','o_proj','up_proj', 'down_proj', 'gate_proj'
]

## DO NOT CHANGE THIS
# add_lora_to_model edits the model it is given and returns that same object,
# so `lora_model` and `base_model` refer to one model from here on.
lora_model = add_lora_to_model(
    base_model,
    target_modules=target_modules,
    rank=4,
    alpha=8,
    dropout=0.3,
)

### Parameter Analysis

Let's compare the parameter counts between the original model and the LoRA-enhanced version. Implement the parameter counting and analysis function.

The percentage of trainable parameters in our `lora_model` should come out below 1%.

In [66]:
def count_parameters(model: nn.Module, only_trainable: bool = False):
    """
    Return how many scalar parameters a model holds.

    Args:
        model: PyTorch model
        only_trainable: when True, parameters with `requires_grad=False` are
            left out of the count

    Returns:
        Number of parameters
    """
    params = model.parameters()
    if only_trainable:
        params = (p for p in params if p.requires_grad)
    return sum(p.numel() for p in params)


def analyze_parameters(original_model: nn.Module, lora_model: nn.Module):
    """
    Compare parameter counts of an original model and its LoRA-adapted version.

    Returns:
        Dictionary with the total parameter count of `original_model`, the
        trainable parameter count of `lora_model`, the ratio of the two as a
        percentage, and the LoRA parameter count grouped by projection type.
    """
    # Projection names used to label each LoRA layer
    ## ADD MODULES HERE
    known_types = ('q_proj', 'k_proj', 'v_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj')

    total_params = count_parameters(original_model)
    trainable_params = count_parameters(lora_model, only_trainable=True)

    # Share of trainable parameters relative to the original total
    param_percent = (trainable_params / total_params) * 100

    lora_params_by_type = {}
    for name, module in lora_model.named_modules():
        if not isinstance(module, LoRALayer):
            continue

        # The first path component that contains a known projection name is the
        # label; layers that match none are grouped under "other"
        module_type = "other"
        for part in name.split("."):
            if any(known in part for known in known_types):
                module_type = part
                break

        # Parameters held by this LoRA layer, added to the running total of its type
        layer_params = sum(p.numel() for p in module.parameters())
        lora_params_by_type[module_type] = lora_params_by_type.get(module_type, 0) + layer_params

    return {
        "total_params": total_params,
        "trainable_params": trainable_params,
        "param_percent": param_percent,
        "params_by_type": lora_params_by_type,
    }

# NOTE(review): `lora_model` was produced by add_lora_to_model(base_model, ...),
# which returns the model it was given, so both arguments are the same object
# and the "original" total below also counts the LoRA parameters.
stats = analyze_parameters(base_model, lora_model)

# Overall counts and the trainable share
print(f"Total Parameters in Original Model: {stats['total_params']}")
print(f"Trainable Parameters in LoRA Model: {stats['trainable_params']}")
print(f"% of trainable parameters: {stats['param_percent']:.2f}%")
print()
# LoRA parameter count summed per projection type
print(f"LoRA Parameters in each layer:")
for k, v in stats['params_by_type'].items():
    print(f"{k}: {v}")

Total Parameters in Original Model: 135736128
Trainable Parameters in LoRA Model: 1221120
% of trainable parameters: 0.90%

LoRA Parameters in each layer:
q_proj: 138240
k_proj: 92160
v_proj: 92160
o_proj: 138240
up_proj: 253440
down_proj: 253440
gate_proj: 253440


### Fine-tuning with LoRA

#### Dataset Preparation
Let's set up a small dataset for fine-tuning:

In [67]:
def prepare_dataset(
    tokenizer,
    dataset_name="databricks/databricks-dolly-15k",
    subset=None,
    max_samples=500,
    batch_size=8,
    max_length=512,
):
    """
    Prepare a dataset for fine-tuning.

    The "train" split is shuffled with a fixed seed, cut down to at most
    `max_samples` rows and split 80/20 into training and validation parts.
    Only the "instruction" column is tokenized; every example is padded or
    truncated to `max_length` tokens and its labels are a copy of its input ids.

    Args:
        tokenizer: Tokenizer to use
        dataset_name: HuggingFace dataset name
        subset: Dataset subset (if applicable)
        max_samples: Maximum number of samples to use
        batch_size: Batch size of both returned dataloaders
        max_length: Token length every example is padded / truncated to

    Returns:
        Tuple `(train_dataloader, val_dataloader)`; only the training loader
        shuffles its batches.
    """
    # Load the dataset, passing the subset name only when one was given
    dataset = load_dataset(dataset_name, subset) if subset else load_dataset(dataset_name)

    # Fixed-seed shuffle, then keep at most `max_samples` rows
    full_train = dataset["train"]
    sample_count = min(max_samples, len(full_train))
    sampled = full_train.shuffle(seed=42).select(range(sample_count))

    # 80/20 train/validation split with a fixed seed
    train_val_split = sampled.train_test_split(test_size=0.2, seed=42)
    train_data = train_val_split["train"]
    val_data = train_val_split["test"]

    def tokenize_function(examples):
        tokenized = tokenizer(
            examples["instruction"],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )
        # Labels start as a copy of the inputs
        tokenized["labels"] = tokenized["input_ids"].copy()
        return tokenized

    # Tokenize both parts, dropping the raw text columns
    train_tokenized = train_data.map(
        tokenize_function, batched=True, remove_columns=train_data.column_names
    )
    val_tokenized = val_data.map(
        tokenize_function, batched=True, remove_columns=val_data.column_names
    )

    train_tokenized.set_format("torch")
    val_tokenized.set_format("torch")

    train_dataloader = DataLoader(train_tokenized, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_tokenized, batch_size=batch_size)

    return train_dataloader, val_dataloader

#### Initializing our Tokenizer and Dataset.

In [68]:
# Tokenizer that belongs to the checkpoint loaded above.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# If the tokenizer has no padding token defined, reuse its EOS token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Up to 3000 shuffled samples, returned as (train, validation) dataloaders.
train_dataloader, val_dataloader = prepare_dataset(tokenizer=tokenizer,  dataset_name="databricks/databricks-dolly-15k", max_samples=3000)

**We can test our base model to ensure it's working correctly.**

In [69]:
# Smoke test: greedy-decode a continuation of a short prompt with the base model.
prompt = "The future of AI is"
inputs = tokenizer(prompt, return_tensors="pt")

# num_tokens=100 is an upper bound; __generate also stops at its max_length
# default of 50 tokens or at the EOS token.
out = __generate(base_model, inputs, num_tokens=100, tokenizer=tokenizer)

print('=='*10 + f' Output generated' + '=='*10)
print(prompt + ' ' + out)

The future of AI is  bright, but it’s not without its challenges. One of the biggest challenges is the lack of regulation and oversight. AI systems are often developed and deployed without the necessary safeguards in place to ensure they are safe and ethical. This lack of regulation can


#### Training Loop
The training function for our model with LoRA adapters is implemented below.

In [70]:
def _masked_lm_loss(logits, labels, attention_mask):
    """Mean next-token cross-entropy over the positions kept by `attention_mask`."""
    # Position t predicts token t + 1, so drop the last logit and the first label
    shift_logits = logits[:, :-1, :].contiguous()
    shift_labels = labels[:, 1:].contiguous()
    shift_attention_mask = attention_mask[:, 1:].contiguous()

    per_token = F.cross_entropy(
        shift_logits.view(-1, shift_logits.size(-1)),
        shift_labels.view(-1),
        reduction="none",
    ).view(shift_labels.size())

    # Average over unmasked positions only
    return (per_token * shift_attention_mask).sum() / shift_attention_mask.sum()


def _batch_to_device(batch, device):
    """Return the batch's (input_ids, attention_mask, labels) moved to `device`."""
    return tuple(batch[key].to(device) for key in ("input_ids", "attention_mask", "labels"))


def train_lora(
    model,
    train_dataloader,
    val_dataloader,
    optimizer,
    epochs=3,
    device=None,
):
    """
    Train a model with LoRA adapters.

    Each epoch runs one optimisation pass over `train_dataloader` and one
    gradient-free pass over `val_dataloader`. The learning rate is halved
    every two epochs by a StepLR scheduler created inside this function.

    Args:
        model: LoRA-adapted model; called with `input_ids` and `attention_mask`
            and expected to return a mapping with a "logits" entry
        train_dataloader: Training data; batches provide "input_ids",
            "attention_mask" and "labels"
        val_dataloader: Validation data with the same batch layout
        optimizer: PyTorch optimizer
        epochs: Number of training epochs
        device: Device to train on (CUDA if available, else CPU, when None)

    Returns:
        Tuple `(history, model)`. `history` maps "train_loss", "val_loss",
        "train_perplexity" and "val_perplexity" to one value per epoch;
        `model` is the trained model that was passed in.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.to(device)
    history = {
        "train_loss": [],
        "val_loss": [],
        "train_perplexity": [],
        "val_perplexity": [],
    }

    # Halve the learning rate every 2 epochs
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        train_losses = []

        progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs} [Train]")

        for batch in progress_bar:
            input_ids, attention_mask, labels = _batch_to_device(batch, device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = _masked_lm_loss(outputs["logits"], labels, attention_mask)

            loss.backward()
            optimizer.step()

            loss_value = loss.item()
            train_losses.append(loss_value)
            progress_bar.set_postfix({"train_loss": loss_value})

        # Perplexity is the exponential of the mean per-batch loss
        avg_train_loss = sum(train_losses) / len(train_losses)
        avg_train_perplexity = torch.exp(torch.tensor(avg_train_loss)).item()

        # ---- validation pass ----
        model.eval()
        val_losses = []

        progress_bar = tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{epochs} [Val]")

        with torch.no_grad():
            for batch in progress_bar:
                input_ids, attention_mask, labels = _batch_to_device(batch, device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                loss = _masked_lm_loss(outputs["logits"], labels, attention_mask)

                loss_value = loss.item()
                val_losses.append(loss_value)
                progress_bar.set_postfix({"val_loss": loss_value})

        avg_val_loss = sum(val_losses) / len(val_losses)
        avg_val_perplexity = torch.exp(torch.tensor(avg_val_loss)).item()

        history["train_loss"].append(avg_train_loss)
        history["val_loss"].append(avg_val_loss)
        history["train_perplexity"].append(avg_train_perplexity)
        history["val_perplexity"].append(avg_val_perplexity)

        print(
            f"Epoch {epoch+1}/{epochs} - "
            f"Train Loss: {avg_train_loss:.4f}, Train Perplexity: {avg_train_perplexity:.4f}, "
            f"Val Loss: {avg_val_loss:.4f}, Val Perplexity: {avg_val_perplexity:.4f}"
        )

        scheduler.step()

    return history, model

#### Training our Model.

In [None]:
## DO NOT CHANGE THIS
# torch.cuda.empty_cache()
# torch.cuda.reset_peak_memory_stats()

# Only parameters with requires_grad=True are handed to the optimizer.
optimizer = torch.optim.AdamW(
    [p for p in lora_model.parameters() if p.requires_grad], lr=1e-4, weight_decay=0.01
)

# train_lora returns the per-epoch history together with the trained model.
history, trained_lora_model = train_lora(model=lora_model, train_dataloader=train_dataloader, val_dataloader=val_dataloader, optimizer=optimizer, epochs=5)

Epoch 1/5 [Train]: 100%|██████████| 300/300 [03:47<00:00,  1.32it/s, train_loss=3.65]
Epoch 1/5 [Val]: 100%|██████████| 75/75 [00:22<00:00,  3.31it/s, val_loss=3.25]


Epoch 1/5 - Train Loss: 3.4935, Train Perplexity: 32.9011, Val Loss: 3.2364, Val Perplexity: 25.4413


Epoch 2/5 [Train]: 100%|██████████| 300/300 [03:48<00:00,  1.31it/s, train_loss=3.88]
Epoch 2/5 [Val]: 100%|██████████| 75/75 [00:22<00:00,  3.30it/s, val_loss=3.23]


Epoch 2/5 - Train Loss: 3.2192, Train Perplexity: 25.0084, Val Loss: 3.1136, Val Perplexity: 22.5023


Epoch 3/5 [Train]: 100%|██████████| 300/300 [03:48<00:00,  1.32it/s, train_loss=3.46]
Epoch 3/5 [Val]: 100%|██████████| 75/75 [00:22<00:00,  3.31it/s, val_loss=3.23]


Epoch 3/5 - Train Loss: 3.0998, Train Perplexity: 22.1936, Val Loss: 3.0835, Val Perplexity: 21.8352


Epoch 4/5 [Train]: 100%|██████████| 300/300 [03:48<00:00,  1.32it/s, train_loss=2.8] 
Epoch 4/5 [Val]: 100%|██████████| 75/75 [00:22<00:00,  3.31it/s, val_loss=3.23]


Epoch 4/5 - Train Loss: 3.0436, Train Perplexity: 20.9797, Val Loss: 3.0657, Val Perplexity: 21.4494


Epoch 5/5 [Train]: 100%|██████████| 300/300 [03:48<00:00,  1.32it/s, train_loss=2.2] 
Epoch 5/5 [Val]: 100%|██████████| 75/75 [00:22<00:00,  3.31it/s, val_loss=3.23]

Epoch 5/5 - Train Loss: 2.9960, Train Perplexity: 20.0045, Val Loss: 3.0609, Val Perplexity: 21.3466





**Optional: You can save your trained model in case you decide to do the assignment in parts.**

In [72]:
# Write the state dict of the LoRA-adapted model to disk.
torch.save(trained_lora_model.state_dict(), "lora_finetuned_model.pth")

#### Merging LoRA Weights for Inference
For efficient inference, we can merge LoRA weights with the original weights.

Implement the function `merge_lora_weights`.

In [None]:
def merge_lora_weights(model):
    """
    Merge LoRA weights with original weights for efficient inference.

    For every `LoRALinear` in `model`, the update `scaling * (B @ A)` is added
    to the wrapped layer's weight and the wrapper is replaced by that plain
    linear layer. The model is modified in place.

    Args:
        model: LoRA-adapted model

    Returns:
        Model with merged weights. This is `model` itself, except when `model`
        is a `LoRALinear`: then its merged inner linear layer is returned.
    """
    merged_model = model

    # Pure weight arithmetic: nothing here should be tracked by autograd
    with torch.no_grad():
        # Snapshot the module list because the tree is edited inside the loop
        for name, module in list(model.named_modules()):
            if not isinstance(module, LoRALinear):
                continue

            weight = module.linear.weight
            delta = torch.matmul(module.lora.B, module.lora.A) * module.lora.scaling
            # Match the frozen weight's device and dtype before adding in place
            weight.add_(delta.to(device=weight.device, dtype=weight.dtype))

            if not name:
                # The root itself is the wrapper: there is no parent to patch
                merged_model = module.linear
                continue

            # Swap the wrapper for its merged linear layer on the parent module
            parent_path, _, child_name = name.rpartition(".")
            parent = model.get_submodule(parent_path)
            setattr(parent, child_name, module.linear)

    return merged_model


In [81]:
# Merge LoRA weights into the base model.
# merge_lora_weights edits the model it is given, so `trained_lora_model` no
# longer contains separate adapter layers after this call.
merged_model = merge_lora_weights(trained_lora_model)

**Optional: Save your merged model.**

In [82]:
# Write the state dict of the merged model to disk.
torch.save(merged_model.state_dict(), "merged_lora_model.pth")

### Text Generation and Comparison
Now let's compare text generation between models.

#### Loading in the fully finetuned model.

In [None]:
# Fresh model with the same architecture, to receive the fully fine-tuned weights.
finetuned_base_model = smolLM(config)

## add your model path here
model_path = "/kaggle/input/fullyfinetuned/pytorch/default/1/full_finetuned_smolLM.pth"

# Load the finetuned weights into the base model
# weights_only=True restricts unpickling to tensors and plain containers.
# NOTE(review): no map_location is passed; confirm the checkpoint loads on a
# machine without the device it was saved from.
finetuned_base_model.load_state_dict(torch.load(model_path, weights_only=True))

# Set to eval mode for inference
finetuned_base_model.eval()

smolLM(
  (model): smolModel(
    (embed_tokens): Embedding(49152, 576)
    (layers): ModuleList(
      (0-29): 30 x LlamaDecoder(
        (self_attn): GroupedQueryAttention(
          (q_proj): Linear(in_features=576, out_features=576, bias=False)
          (k_proj): Linear(in_features=576, out_features=192, bias=False)
          (v_proj): Linear(in_features=576, out_features=192, bias=False)
          (o_proj): Linear(in_features=576, out_features=576, bias=False)
          (rotary_emb): RotaryEmbedder()
        )
        (mlp): MLP(
          (activation): SiLU()
          (up_proj): Linear(in_features=576, out_features=1536, bias=False)
          (down_proj): Linear(in_features=1536, out_features=576, bias=False)
          (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
        )
        (input_layernorm): RMSNorm()
        (post_attention_layernorm): RMSNorm()
      )
    )
    (norm): RMSNorm()
  )
  (lm_head): Linear(in_features=576, out_features=49152, bias=

#### We can now compare the fully finetuned and LoRA finetuned model to evaluate the effectiveness of using LoRA.

In [84]:
def compare_generations(models, tokenizer, prompts, max_tokens=100):
    """
    Generate text with several models for the same prompts and compare them.

    For every prompt, each model generates a continuation, which is printed
    together with the elapsed generation time, and the model's perplexity on
    the prompt tokens themselves is computed.

    Args:
        models: Dictionary mapping a display name to a model. Each model is
            moved to the selected device and put in eval mode in place.
        tokenizer: Tokenizer used to encode the prompts; it is also forwarded
            to the generation helper.
        prompts: List of prompt strings to test
        max_tokens: Maximum tokens to generate

    Returns:
        DataFrame with one row per prompt: a "Prompt" column plus one
        "<model name> Perplexity" column per model. Generated text and
        timings are only printed, not stored in the DataFrame.
    """

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    results = []

    # The loss module holds no state, so one instance serves every call.
    loss_fct = nn.CrossEntropyLoss(reduction="mean")

    def calculate_perplexity(model, inputs):
        """
        Computes perplexity (exp of the mean next-token cross-entropy) of
        `model` on the token ids in `inputs`.
        """
        with torch.no_grad():
            logits = model(**inputs)["logits"]
            # Logits at position t are scored against the token at t + 1, so
            # drop the last logit position and the first label.
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = inputs["input_ids"][:, 1:].contiguous()
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
            )
            perplexity = torch.exp(loss).item()
        return perplexity

    for prompt in prompts:
        print(f"Prompt: {prompt}")
        inputs = tokenizer(prompt, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        prompt_results = {"Prompt": prompt}

        for model_name, model in models.items():
            model.to(device)
            model.eval()

            # perf_counter is monotonic and high resolution, unlike the
            # wall-clock time.time(), which can jump if the system clock is
            # adjusted while generation is running.
            start_time = time.perf_counter()
            output = __generate(
                model, inputs.copy(), num_tokens=max_tokens, tokenizer=tokenizer
            )
            elapsed = time.perf_counter() - start_time

            perplexity = calculate_perplexity(model, inputs)

            prompt_results[f"{model_name} Perplexity"] = perplexity

            print(f"Model: {model_name}")
            print(f"Generated: {output}")
            print(f"Time: {elapsed:.2f}s")
            print(f"Perplexity: {perplexity:.4f}")
            print("-" * 50)

        results.append(prompt_results)

    df_results = pd.DataFrame(results)

    return df_results

In [85]:
# Define models for comparison
# The dictionary keys are display names: compare_generations prints them and
# uses them as the prefix of each "<name> Perplexity" column.
models = {
    "Fully Finetuned Model": finetuned_base_model,
    "LoRA Finetuned Model": merged_model,
}

# Define prompts to test
prompts = [
    "Once upon a time, in a distant galaxy,",
    "The future of artificial intelligence is",
    "A wise old wizard once said,",
]

# Run the comparison
# Generations and timings are printed as a side effect; the returned DataFrame
# holds one row per prompt with the per-model perplexities.
df_results = compare_generations(models, tokenizer, prompts, max_tokens=100)

Prompt: Once upon a time, in a distant galaxy,
Model: Fully Finetuned Model
Generated:  there was a kind-hearted alien named Zork. One day, while exploring the universe, Zork came across a planet called Zorbaheim. Excited to explore, Zork decided to visit the planet and start his new life.


Time: 2.50s
Perplexity: 5.9674
--------------------------------------------------
Model: LoRA Finetuned Model
Generated:  there was a spaceship named Zephyr who had just landed on a beautiful planet called Earth. Zephyr was excited to explore this new world but also felt a bit overwhelmed by its vastness and complexity. One day, while wandering around
Time: 2.53s
Perplexity: 5.9229
--------------------------------------------------
Prompt: The future of artificial intelligence is
Model: Fully Finetuned Model
Generated:  very bright.<|endoftext|>
Time: 0.19s
Perplexity: 22.8368
--------------------------------------------------
Model: LoRA Finetuned Model
Generated:  uncertain. Will it be used for g

**Compare the perplexity scores of the models**


In [79]:
# Render the perplexity table; headers='keys' uses the DataFrame's column names
# as the table header row.
print(tabulate(df_results, headers='keys', tablefmt='fancy_grid'))

╒════╤══════════════════════════════════════════╤════════════════════════════════════╤═══════════════════════════════════╕
│    │ Prompt                                   │   Fully Finetuned Model Perplexity │   LoRA Finetuned Model Perplexity │
╞════╪══════════════════════════════════════════╪════════════════════════════════════╪═══════════════════════════════════╡
│  0 │ Once upon a time, in a distant galaxy,   │                            5.96741 │                           5.92286 │
├────┼──────────────────────────────────────────┼────────────────────────────────────┼───────────────────────────────────┤
│  1 │ The future of artificial intelligence is │                           22.8368  │                          17.3586  │
├────┼──────────────────────────────────────────┼────────────────────────────────────┼───────────────────────────────────┤
│  2 │ A wise old wizard once said,             │                           53.2328  │                          20.443   │
╘════╧══════════

### Analysis and Discussion
For this section, analyze your results and answer the following questions:

**Question 2:** How does LoRA performance compare to full fine-tuning? What are the tradeoffs?

**Ans:** LoRA achieves comparable or even superior performance with significantly reduced computational cost. As shown in the results, the LoRA-finetuned model outperformed the fully fine-tuned model on all evaluated prompts, producing lower perplexity scores, which indicate better predictions. The key tradeoff lies in efficiency: LoRA only trains a small number of additional parameters, allowing faster training, less memory usage, and easier model storage and deployment. However, full fine-tuning might still be preferable when complete control over the model’s behavior is required.

**Question 3:** Which target modules benefit most from LoRA adaptation in SmolLM?

**Ans:** For SmolLM, LoRA performs best when applied to the main projection layers of the attention block: the query, key, value, and output projections. These layers are responsible for computing the attention dynamics and have a high impact on the model’s ability to contextualize tokens effectively. This allows us to apply LoRA to specific modules rather than modifying the entire network.

**Question 4:** How does rank value affect the quality of adaptation and the parameter count?

**Ans:** The rank parameter in LoRA determines the dimensionality of the low-rank matrices used to approximate weight updates, directly impacting both the number of trainable parameters and the model’s capacity for adaptation. Higher ranks enable more expressive updates, improving performance on complex tasks but increasing computational cost and parameter count. Lower ranks reduce memory and training overhead but may underfit because fewer parameters are trained. Therefore, rank represents a tradeoff between adaptation quality and efficiency.

**Question 5:** What are the practical benefits of LoRA for deploying fine-tuned models?

**Ans:** Since LoRA freezes the base model and only trains a small number of low-rank adapter parameters, it drastically reduces storage and memory requirements (and by quite a significant margin). This enables rapid deployment of multiple task-specific adapters without duplicating the entire model, which is especially valuable in resource-constrained environments. Additionally, LoRA adapters can be easily swapped, merged, or shared across systems, facilitating scalable and maintainable model serving.

# Fin.