# Experiments which show how gradients and optimisers work with adapters

In [20]:
import torch
import torch.nn as nn
from torch.optim import Optimizer

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import get_peft_model, LoraConfig, TaskType

import numpy as np
import matplotlib.pyplot as plt

In [18]:
def compare_weights(model1, model2):
    """Find the first parameter whose weights differ between two models.

    Parameters are paired by their position in ``named_parameters()``, not by
    name, so both models are expected to expose their parameters in the same
    order.

    Args:
        model1: First module; must provide ``named_parameters()``.
        model2: Second module, compared against ``model1``.

    Returns:
        A ``(differs, info)`` tuple. When every paired weight is exactly equal
        (``torch.equal``) and both models have the same number of parameters,
        this is ``(False, {})``. Otherwise ``differs`` is ``True`` and ``info``
        describes the first mismatch with the keys ``"name"``, ``"model1"``
        and ``"model2"``; the latter two hold the mean of each weight tensor,
        or ``None`` for a model that has no parameter at that position.
    """
    params1 = list(model1.named_parameters())
    params2 = list(model2.named_parameters())

    # Iterate to the end of the LONGER list. A plain zip() stops at the shorter
    # one, which would silently ignore extra parameters and could report two
    # structurally different models as identical.
    for index in range(max(len(params1), len(params2))):
        name1, param1 = params1[index] if index < len(params1) else (None, None)
        name2, param2 = params2[index] if index < len(params2) else (None, None)

        weight1 = param1.data if param1 is not None else None
        weight2 = param2.data if param2 is not None else None

        if weight1 is not None and weight2 is not None and torch.equal(weight1, weight2):
            continue

        return True, {
            # Fall back to model2's name when model1 has run out of parameters.
            "name": name1 if name1 is not None else name2,
            "model1": weight1.mean().item() if weight1 is not None else None,
            "model2": weight2.mean().item() if weight2 is not None else None
        }

    return False, {}

def compare_gradients(model1, model2):
    """Find the first parameter whose gradients differ between two models.

    Parameters are paired by their position in ``named_parameters()``, not by
    name. A gradient of ``None`` on both sides counts as "no difference"; a
    parameter that is absent from one model is treated as having no gradient.

    Args:
        model1: First module; must provide ``named_parameters()``.
        model2: Second module, compared against ``model1``.

    Returns:
        A ``(differs, info)`` tuple. ``(False, {})`` when every pair of
        gradients is either both ``None`` or exactly equal (``torch.equal``).
        Otherwise ``differs`` is ``True`` and ``info`` describes the first
        mismatch with the keys ``"name"``, ``"model1"`` and ``"model2"``; the
        latter two hold the mean of each gradient, or ``None`` where there is
        no gradient.
    """
    params1 = list(model1.named_parameters())
    params2 = list(model2.named_parameters())

    # Iterate to the end of the LONGER list. A plain zip() stops at the shorter
    # one, so a gradient on an extra trailing parameter would never be reported.
    for index in range(max(len(params1), len(params2))):
        name1, param1 = params1[index] if index < len(params1) else (None, None)
        name2, param2 = params2[index] if index < len(params2) else (None, None)

        grad1 = param1.grad if param1 is not None else None
        grad2 = param2.grad if param2 is not None else None

        if grad1 is None and grad2 is None:
            continue
        if grad1 is not None and grad2 is not None and torch.equal(grad1, grad2):
            continue

        return True, {
            # Fall back to model2's name when model1 has run out of parameters.
            "name": name1 if name1 is not None else name2,
            "model1": grad1.mean().item() if grad1 is not None else None,
            "model2": grad2.mean().item() if grad2 is not None else None
        }

    return False, {}

def compare_model_states(model_1, model_2):
    """Print a weight report followed by a gradient report for two models.

    Each section prints its banner unconditionally; the "differences" heading
    and the mismatch dictionary are printed only when the corresponding
    comparison function reports a difference. Nothing is returned.
    """
    sections = (
        ("=== Weight Comparison ===", "Mean Weight differences:", compare_weights),
        ("\n=== Gradient Comparison ===", "Mean Gradient differences:", compare_gradients),
    )

    for banner, heading, comparator in sections:
        print(banner)
        has_difference, details = comparator(model_1, model_2)
        if not has_difference:
            continue
        print(heading)
        print(details)


## Compare before and after the forward and backward pass

In [22]:
# Load the model in training mode, its tokenizer, and an Adam optimizer over
# all of the model's parameters.
model_name = "EleutherAI/pythia-70m"
model = AutoModelForCausalLM.from_pretrained(model_name)
model.train()
tokenizer = AutoTokenizer.from_pretrained(model_name)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Prepare input: a single sentence whose labels are a copy of its own input ids.
text = "This is a test sentence."
inputs = tokenizer(text, return_tensors="pt")
inputs["labels"] = inputs["input_ids"].clone()

# Snapshot of the model before the forward/backward pass: a fresh instance built
# from the same config, with the current state dict loaded into it. Only the
# state dict is copied, so the snapshot has never seen a backward pass.
model_initial = type(model)(model.config)
model_initial.load_state_dict(model.state_dict())

# Forward pass and backpropagation; the optimizer is NOT stepped in this cell.
outputs = model(**inputs)
loss = outputs.loss
loss.backward()

# Compare the pre-forward snapshot with the model after backward().
compare_model_states(model_initial, model)

=== Weight Comparison ===

=== Gradient Comparison ===
Mean Gradient differences:
{'name': 'gpt_neox.embed_in.weight', 'model1': None, 'model2': -5.554166135801793e-14}


## Compare post-forward to post-step

In [24]:
# Snapshot of the model after backward() but before the optimizer step, taken
# the same way as before (fresh instance + state dict).
# NOTE(review): the snapshot is built from the state dict only, so its `.grad`
# fields are never populated; the recorded output reports `model1: None` for
# the gradient, i.e. the gradient section cannot show what step() did to grads.
model_forward = type(model)(model.config)
model_forward.load_state_dict(model.state_dict())

# Apply the optimizer update using the gradients from the earlier backward().
optimizer.step()

# Compare the pre-step snapshot with the model after step().
compare_model_states(model_forward, model)

=== Weight Comparison ===
Mean Weight differences:
{'name': 'gpt_neox.embed_in.weight', 'model1': 1.702372173895128e-05, 'model2': 1.7025038687279448e-05}

=== Gradient Comparison ===
Mean Gradient differences:
{'name': 'gpt_neox.embed_in.weight', 'model1': None, 'model2': -5.554166135801793e-14}
=== Weight Comparison ===

=== Gradient Comparison ===


For completeness, compare the post-step state with the state after `zero_grad`.

In [25]:
# Snapshot of the model after the optimizer step, taken the same way as the
# earlier snapshots (fresh instance + state dict).
model_zero = type(model)(model.config)
model_zero.load_state_dict(model.state_dict())

# Clear the gradients held by the optimizer's parameters.
# NOTE(review): the recorded output shows no gradient difference against the
# snapshot, which would be consistent with zero_grad() leaving `.grad` as None
# rather than zero tensors; confirm for the installed torch version.
optimizer.zero_grad()

# Compare the post-step snapshot with the model after zero_grad().
compare_model_states(model_zero, model)

=== Weight Comparison ===

=== Gradient Comparison ===
