In [1]:
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch.nn as nn
import numpy as np
import os
import random
import time
import datetime
import csv
from tqdm import tqdm
from datasets import load_dataset
from copy import deepcopy
import gc
import pandas as pd

# --- Experiment Configuration ---

# Reproducibility
RANDOM_SEED = 42

# Device Configuration
# Use two GPUs if available for base model and compressed model
device_main = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device_main = torch.device('cpu')

device_comp = torch.device("cuda:1" if torch.cuda.device_count() > 1 else device_main)
# device_comp = torch.device('cpu')

# Dataset Configuration
SMOKE_TEST = True # Set to True to use a smaller subset of the data for quick tests
SMOKE_TEST_SAMPLES = 5 # Number of samples for smoke test perplexity evaluation

# Logging and Saving
RESULTS_FILE = 'results.csv'
SAVE_MODELS = False # Set to True to save the weights of each trained model
MODEL_SAVE_DIR = 'saved_models'

# --- Hyperparameter Grid ---
# Define different configurations to test in the experimentation loop.
# Add or remove dictionaries to change the experiments.
hyperparameter_grid = [
    {
        'model_path': 'lmsys/vicuna-7b-v1.5',
        'merge_layers': 4,
        'interval': 2,
        'highest_lay': 20, # Should be adapted based on model architecture
        'lowest_lay': 6,
        'threshold': 0.65,
    },
    {
        'model_path': 'HuggingFaceTB/SmolLM2-135M-Instruct',
        'merge_layers': 4,
        'interval': 2,
        'highest_lay': 39,
        'lowest_lay': 0,
        'threshold': 0.75, # Higher threshold
    },
    {
        'model_path': 'lmsys/vicuna-7b-v1.5',
        'merge_layers': 3, # Fewer layers merged at a time
        'interval': 3,     # Wider interval
        'highest_lay': 39,
        'lowest_lay': 0,
        'threshold': 0.65,
    },
]

# # Create directory for saving models if it doesn't exist


# if SAVE_MODELS and not os.path.exists(MODEL_SAVE_DIR):
#     os.makedirs(MODEL_SAVE_DIR)

In [2]:
def cal_last_hidden_sim_from_saved(saved_hidden_states, model, tokenizer, sents):
    """
    Calculates mean cosine similarity between *saved* last hidden states
    (from a previous model) and last hidden states from `model` for given sentences.

    Parameters:
        saved_hidden_states: List[torch.Tensor] of last hidden states, one per sentence
        model: Hugging Face-style model to compare against
        tokenizer: Tokenizer for the model
        sents: List[str] — should be same sentences used to generate saved_hidden_states

    Returns:
        float: mean cosine similarity
    """

    hs2 = extract_last_hidden_states(model, tokenizer, sents)

    sims = []
    for h1, h2 in zip(saved_hidden_states, hs2):
        v1 = h1.flatten()
        v2 = h2.flatten()
        sim = torch.cosine_similarity(v1.unsqueeze(0), v2.unsqueeze(0)).item()
        sims.append(sim)

    return float(np.mean(sims))

In [3]:
def set_seed(seed):
    """Sets the random seed for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def create_model(model_path, device):
    """Creates and loads a pretrained model and tokenizer."""
    print(f"Loading model: {model_path}")
    model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.float16).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    return model, tokenizer

def get_model_size(model: nn.Module):
    """Calculates the number of parameters in a model."""
    num_elements = 0
    for param in model.parameters():
        num_elements += param.numel()
    return num_elements

def evaluate_perplexity(model, tokenizer, smoke_test=False, smoke_test_samples=5):
    """Computes perplexity on the WikiText-2 test split."""
    model.eval()
    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    text = "\n\n".join(ds["text"])
    tokens = tokenizer(text, return_tensors="pt")
    input_ids = tokens.input_ids.to(model.device)

    nsamples = smoke_test_samples if smoke_test else 40
    seq_len = 2048
    
    # Check if the dataset is long enough
    if input_ids.size(1) < nsamples * seq_len:
        print(f"Warning: Dataset too small for nsamples={nsamples} and seq_len={seq_len}. Reducing nsamples.")
        nsamples = input_ids.size(1) // seq_len
        if nsamples == 0:
            print("Error: Not enough data to evaluate perplexity.")
            return float('nan')


    loss_fct = nn.CrossEntropyLoss()
    nlls = []
    
    for i in tqdm(range(nsamples), desc="Perplexity Evaluation"):
        start, end = i * seq_len, (i + 1) * seq_len
        batch = input_ids[:, start:end]
        with torch.no_grad():
            logits = model(batch).logits

        shift_logits = logits[:, :-1, :].contiguous()
        shift_labels = batch[:, 1:]
        loss = loss_fct(
            shift_logits.view(-1, shift_logits.size(-1)),
            shift_labels.reshape(-1)
        )
        nlls.append(loss * seq_len)

    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * seq_len))
    return ppl.item()


def extract_last_hidden_states(model, tokenizer, sents):
    """
    Runs the given model on a list of sentences and returns the
    final hidden states (detached to CPU) for each sentence.

    Parameters:
        model: Hugging Face-style model with `model.layers` attribute
        tokenizer: Corresponding tokenizer
        sents: List of strings

    Returns:
        List[torch.Tensor]: one hidden state tensor per sentence
    """
    hidden_states = []

    # Temporary container for hook capture
    hidden_container = {}

    def hook_fn(module, input, output):
        # Assumes output is a tuple (hidden_states, ...)
        hidden_container["h"] = output[0].detach().cpu()

    # Register on last transformer block (LLaMA/Vicuna style)
    hook = model.model.layers[-1].register_forward_hook(hook_fn)

    try:
        for s in sents:
            enc = tokenizer(s, return_tensors="pt")
            inputs = enc.input_ids.to(model.device)
            with torch.no_grad():
                _ = model(inputs)
            hidden_states.append(hidden_container["h"])
    finally:
        hook.remove()

    return hidden_states


def cal_last_hidden_sim(model1, model2, tokenizer, sents):
    """
    Calculates mean cosine similarity between final hidden layers
    of two models for given sentences.

    Uses extract_last_hidden_states() to get embeddings in a memory-safe way.
    """
    hs1 = extract_last_hidden_states(model1, tokenizer, sents)
    hs2 = extract_last_hidden_states(model2, tokenizer, sents)

    sims = []
    for h1, h2 in zip(hs1, hs2):
        # Flatten to 1D
        v1 = h1.flatten()
        v2 = h2.flatten()
        sim = torch.cosine_similarity(v1.unsqueeze(0), v2.unsqueeze(0)).item()
        sims.append(sim)

    return float(np.mean(sims))



def compress_model(base_model, tokenizer, config):
    """Applies the layer-merging compression algorithm to a model."""
    
    sents = ['Mouron () is a commune in the Arde',
             'The 81st Mechanised Brigade () is a mechanised brigade of the Romanian Land Force',
             'There are 18 National Natural Landmarks in the U.S. state of Washington, out of nearly',
             'Torreorgaz is a municipality in the',
             'Copa Libertadores 1973 was won by defending champions Independiente of A']

    last_hidden_states = extract_last_hidden_states(base_model, tokenizer, sents)

    # Save to disk for future use
    torch.save(last_hidden_states, "last_hidden_states.pt")

    # model_to_compress = deepcopy(base_model).to(device_comp)
    model_to_compress = base_model
    del base_model
    gc.collect()
    torch.cuda.empty_cache()
    print('Compressed model loaded')
    
    lay = config['highest_lay'] - config['merge_layers']
    
    while lay >= config['lowest_lay']:
        print(f"Attempting to merge at layer: {lay}")
        
        # Ensure we don't go out of bounds
        if lay >= len(model_to_compress.model.layers):
            lay = len(model_to_compress.model.layers) - 1 - config['merge_layers']
            if lay < config['lowest_lay']: break
            continue

        tmp_merged_model = merge_layers_return_model(
            model_to_compress, lay, config['merge_layers'] - 1
        ).to(device_main)
        
        torch.cuda.empty_cache()
        sim_value = cal_last_hidden_sim_from_saved(
            last_hidden_states,
            tmp_merged_model,
            tokenizer,
            sents
        )
        print(f"Similarity after potential merge at layer {lay}: {sim_value:.4f}")

        if sim_value > config['threshold']:
            print(f"Merge accepted. New layer count: {len(tmp_merged_model.model.layers)}")
            model_to_compress = tmp_merged_model
            lay -= config['interval']
        else:
            print("Merge rejected.")
            lay -= 1
            
        del tmp_merged_model
        gc.collect()
        torch.cuda.empty_cache()

    model_to_compress.to(0)       
    # Update model config with the new number of layers
    model_to_compress.config.num_hidden_layers = len(model_to_compress.model.layers)
    return model_to_compress


# def merge_layers_return_model(model, merge_base_lay, merge_layer_num):
#     """Helper function to perform the actual layer weight merging."""
#     # model.to('cpu')
#     # torch.cuda.empty_cache()
#     # model_copy = deepcopy(model).to(0)
#     model.to('cpu')
    
#     model_copy = deepcopy(model).to(1)
#     # Ensure merge_layer_num doesn't exceed available layers
#     merge_layer_num = min(merge_layer_num, len(model.model.layers) - merge_base_lay - 1)
#     if merge_layer_num < 0: return model_copy # Nothing to merge

#     for diff_lay in range(merge_base_lay + 1, merge_base_lay + 1 + merge_layer_num):
#         # MLP layers
#         model_copy.model.layers[merge_base_lay].mlp.gate_proj.weight.data.add_(model.model.layers[diff_lay].mlp.gate_proj.weight.data.clone().to(1) - model_copy.model.layers[merge_base_lay].mlp.gate_proj.weight.data)
#         model_copy.model.layers[merge_base_lay].mlp.down_proj.weight.data.add_(model.model.layers[diff_lay].mlp.down_proj.weight.data.clone().to(1) - model_copy.model.layers[merge_base_lay].mlp.down_proj.weight.data)
#         model_copy.model.layers[merge_base_lay].mlp.up_proj.weight.data.add_(model.model.layers[diff_lay].mlp.up_proj.weight.data.clone().to(1) - model_copy.model.layers[merge_base_lay].mlp.up_proj.weight.data)


#         print('can copy')
#         # Attention layers
#         model_copy.model.layers[merge_base_lay].self_attn.q_proj.weight.data.add_(model.model.layers[diff_lay].self_attn.q_proj.weight.data.clone().to(1) - model_copy.model.layers[merge_base_lay].self_attn.q_proj.weight.data)
#         model_copy.model.layers[merge_base_lay].self_attn.k_proj.weight.data.add_(model.model.layers[diff_lay].self_attn.k_proj.weight.data.clone().to(1) - model_copy.model.layers[merge_base_lay].self_attn.k_proj.weight.data)
#         model_copy.model.layers[merge_base_lay].self_attn.v_proj.weight.data.add_(model.model.layers[diff_lay].self_attn.v_proj.weight.data.clone().to(1) - model_copy.model.layers[merge_base_lay].self_attn.v_proj.weight.data)
#         model_copy.model.layers[merge_base_lay].self_attn.o_proj.weight.data.add_(model.model.layers[diff_lay].self_attn.o_proj.weight.data.clone().to(1) - model_copy.model.layers[merge_base_lay].self_attn.o_proj.weight.data)
    
#     # Delete the merged layers
#     for diff_lay in range(merge_base_lay + merge_layer_num, merge_base_lay, -1):
#         del model_copy.model.layers[diff_lay]

#     del model
#     gc.collect()
#     return model_copy

def log_results(results, filename):
    """Appends experiment results to a CSV file."""
    file_exists = os.path.isfile(filename)
    with open(filename, 'a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=results.keys())
        if not file_exists:
            writer.writeheader()
        writer.writerow(results)

In [4]:
import gc
import torch


import gc
import torch

def merge_layers_return_model(model, merge_base_lay, merge_layer_num, temp_device=None):
    """
    In-place layer merge without deepcopy.
    Copies only needed layers to a temporary device (or same as base layer if None).
    """
    total_layers = len(model.model.layers)
    merge_layer_num = min(merge_layer_num, total_layers - merge_base_lay - 1)
    if merge_layer_num <= 0:
        return model

    base_layer = model.model.layers[merge_base_lay]
    base_device = base_layer.mlp.gate_proj.weight.device
    temp_device = temp_device or base_device  # default to base layer's device

    for diff_lay in range(merge_base_lay + 1, merge_base_lay + 1 + merge_layer_num):
        # Clone each source layer to temp_device
        src_mlp_gate = model.model.layers[diff_lay].mlp.gate_proj.weight.data.clone().to(temp_device)
        src_mlp_down = model.model.layers[diff_lay].mlp.down_proj.weight.data.clone().to(temp_device)
        src_mlp_up   = model.model.layers[diff_lay].mlp.up_proj.weight.data.clone().to(temp_device)

        src_q = model.model.layers[diff_lay].self_attn.q_proj.weight.data.clone().to(temp_device)
        src_k = model.model.layers[diff_lay].self_attn.k_proj.weight.data.clone().to(temp_device)
        src_v = model.model.layers[diff_lay].self_attn.v_proj.weight.data.clone().to(temp_device)
        src_o = model.model.layers[diff_lay].self_attn.o_proj.weight.data.clone().to(temp_device)

        # Ensure base layer is also on temp_device before merging
        base_layer = base_layer.to(temp_device)

        # Merge MLP
        base_layer.mlp.gate_proj.weight.data.add_(src_mlp_gate - base_layer.mlp.gate_proj.weight.data)
        base_layer.mlp.down_proj.weight.data.add_(src_mlp_down - base_layer.mlp.down_proj.weight.data)
        base_layer.mlp.up_proj.weight.data.add_(src_mlp_up - base_layer.mlp.up_proj.weight.data)

        # Merge Attention
        base_layer.self_attn.q_proj.weight.data.add_(src_q - base_layer.self_attn.q_proj.weight.data)
        base_layer.self_attn.k_proj.weight.data.add_(src_k - base_layer.self_attn.k_proj.weight.data)
        base_layer.self_attn.v_proj.weight.data.add_(src_v - base_layer.self_attn.v_proj.weight.data)
        base_layer.self_attn.o_proj.weight.data.add_(src_o - base_layer.self_attn.o_proj.weight.data)

        # Optionally move base layer back to its original device after each merge
        if temp_device != base_device:
            base_layer = base_layer.to(base_device)

        del src_mlp_gate, src_mlp_down, src_mlp_up, src_q, src_k, src_v, src_o
        gc.collect()
        torch.cuda.empty_cache()

    # Remove merged layers
    for idx in range(merge_base_lay + merge_layer_num, merge_base_lay, -1):
        del model.model.layers[idx]

    return model


In [5]:
# --- Main Loop ---

for i, config in enumerate(hyperparameter_grid):
    
    experiment_start_time = time.time()
    
    print("\n" + "="*50)
    print(f"Running Experiment {i+1}/{len(hyperparameter_grid)}")
    print(f"Configuration: {config}")
    print("="*50)

    try:
        # 1. Set seed for reproducibility
        set_seed(RANDOM_SEED)

        # 2. Model Creation
        base_model, tokenizer = create_model(config['model_path'], device=device_main)
        
        # Get original model stats
        original_size = get_model_size(base_model)
        original_layers = len(base_model.model.layers)

        print('Base model created')

        # 3. Model Compression (The core "training" or modification step)
        print("\n--- Starting Model Compression ---")
        compressed_model = compress_model(base_model, tokenizer, config)
        print("--- Model Compression Finished ---")

        # 4. Evaluation
        print("\n--- Evaluating Compressed Model ---")
        perplexity = evaluate_perplexity(compressed_model, tokenizer, SMOKE_TEST, SMOKE_TEST_SAMPLES)
        print(f"Perplexity: {perplexity:.4f}")
        
        # 5. Model Size Calculation
        compressed_size = get_model_size(compressed_model)
        compressed_layers = len(compressed_model.model.layers)
        
        # 6. Timing
        experiment_end_time = time.time()
        duration_minutes = (experiment_end_time - experiment_start_time) / 60
        
        # 7. Results Logging
        results = {
            'timestamp': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'model_path': config['model_path'],
            'merge_layers': config['merge_layers'],
            'interval': config['interval'],
            'threshold': config['threshold'],
            'original_layers': original_layers,
            'compressed_layers': compressed_layers,
            'original_params_B': f"{original_size / 1e9:.3f}",
            'compressed_params_B': f"{compressed_size / 1e9:.3f}",
            'perplexity': f"{perplexity:.4f}",
            'duration_minutes': f"{duration_minutes:.2f}"
        }
        
        log_results(results, RESULTS_FILE)
        print(f"\nResults logged to {RESULTS_FILE}")

        # 8. Save Model
        if SAVE_MODELS:
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            save_path = os.path.join(MODEL_SAVE_DIR, f'exp_{i+1}_{timestamp}')
            os.makedirs(save_path, exist_ok=True)
            compressed_model.save_pretrained(save_path)
            tokenizer.save_pretrained(save_path)
            print(f"Model saved to {save_path}")

    except Exception as e:
        print(f"\n!!!!!! Experiment {i+1} failed with an error: {e} !!!!!!")
        # Log failure
        error_results = {
            'timestamp': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'model_path': config.get('model_path', 'N/A'),
            'merge_layers': config.get('merge_layers', 'N/A'),
            'interval': config.get('interval', 'N/A'),
            'threshold': config.get('threshold', 'N/A'),
            'perplexity': 'FAILED',
            'error_message': str(e)
        }
        log_results(error_results, RESULTS_FILE)
        
    finally:
        # Clean up memory
        del base_model, tokenizer, compressed_model
        gc.collect()
        torch.cuda.empty_cache()
        print("\nMemory cleaned up.")

print("\n\nAll experiments finished.")


Running Experiment 1/3
Configuration: {'model_path': 'lmsys/vicuna-7b-v1.5', 'merge_layers': 4, 'interval': 2, 'highest_lay': 20, 'lowest_lay': 6, 'threshold': 0.65}
Loading model: lmsys/vicuna-7b-v1.5


2025-08-22 12:40:03.140598: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755866403.163641     168 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755866403.170708     168 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Base model created

--- Starting Model Compression ---
Compressed model loaded
Attempting to merge at layer: 16
Similarity after potential merge at layer 16: 0.7098
Merge accepted. New layer count: 29
Attempting to merge at layer: 14
Similarity after potential merge at layer 14: 0.4530
Merge rejected.
Attempting to merge at layer: 13
Similarity after potential merge at layer 13: 0.2012
Merge rejected.
Attempting to merge at layer: 12
Similarity after potential merge at layer 12: -0.0990
Merge rejected.
Attempting to merge at layer: 11
Similarity after potential merge at layer 11: -0.2110
Merge rejected.
Attempting to merge at layer: 10
Similarity after potential merge at layer 10: -0.3283
Merge rejected.
Attempting to merge at layer: 9
Similarity after potential merge at layer 9: 0.3934
Merge rejected.
Attempting to merge at layer: 8
Similarity after potential merge at layer 8: 0.5570
Merge rejected.
Attempting to merge at layer: 7
Similarity after potential merge at layer 7: 0.5578
Me

README.md: 0.00B [00:00, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (341469 > 4096). Running this sequence through the model will result in indexing errors
Perplexity Evaluation: 100%|██████████| 5/5 [00:00<00:00,  8.43it/s]


Perplexity: inf

Results logged to results.csv

Memory cleaned up.

Running Experiment 2/3
Configuration: {'model_path': 'HuggingFaceTB/SmolLM2-135M-Instruct', 'merge_layers': 4, 'interval': 2, 'highest_lay': 39, 'lowest_lay': 0, 'threshold': 0.75}
Loading model: HuggingFaceTB/SmolLM2-135M-Instruct


config.json:   0%|          | 0.00/861 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

Base model created

--- Starting Model Compression ---
Compressed model loaded
Attempting to merge at layer: 35
Attempting to merge at layer: 25
Similarity after potential merge at layer 25: 0.7132
Merge rejected.
Attempting to merge at layer: 24
Similarity after potential merge at layer 24: 0.5793
Merge rejected.
Attempting to merge at layer: 23
Similarity after potential merge at layer 23: 0.5768
Merge rejected.
Attempting to merge at layer: 22
Similarity after potential merge at layer 22: 0.5755
Merge rejected.
Attempting to merge at layer: 21
Similarity after potential merge at layer 21: 0.5730
Merge rejected.
Attempting to merge at layer: 20
Similarity after potential merge at layer 20: 0.5724
Merge rejected.
Attempting to merge at layer: 19
Similarity after potential merge at layer 19: 0.5720
Merge rejected.
Attempting to merge at layer: 18
Similarity after potential merge at layer 18: 0.5710
Merge rejected.
Attempting to merge at layer: 17
Similarity after potential merge at lay

Token indices sequence length is longer than the specified maximum sequence length for this model (304978 > 8192). Running this sequence through the model will result in indexing errors
Perplexity Evaluation: 100%|██████████| 5/5 [00:00<00:00, 378.55it/s]


Perplexity: inf

Results logged to results.csv

Memory cleaned up.

Running Experiment 3/3
Configuration: {'model_path': 'lmsys/vicuna-7b-v1.5', 'merge_layers': 3, 'interval': 3, 'highest_lay': 39, 'lowest_lay': 0, 'threshold': 0.65}
Loading model: lmsys/vicuna-7b-v1.5


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Base model created

--- Starting Model Compression ---
Compressed model loaded
Attempting to merge at layer: 36
Attempting to merge at layer: 28
Similarity after potential merge at layer 28: 0.7931
Merge accepted. New layer count: 30
Attempting to merge at layer: 25
Similarity after potential merge at layer 25: 0.7596
Merge accepted. New layer count: 28
Attempting to merge at layer: 22
Similarity after potential merge at layer 22: 0.6693
Merge accepted. New layer count: 26
Attempting to merge at layer: 19
Similarity after potential merge at layer 19: 0.4258
Merge rejected.
Attempting to merge at layer: 18
Similarity after potential merge at layer 18: 0.2088
Merge rejected.
Attempting to merge at layer: 17
Similarity after potential merge at layer 17: 0.1237
Merge rejected.
Attempting to merge at layer: 16
Similarity after potential merge at layer 16: 0.5653
Merge rejected.
Attempting to merge at layer: 15
Similarity after potential merge at layer 15: 0.5779
Merge rejected.
Attempting t

Token indices sequence length is longer than the specified maximum sequence length for this model (341469 > 4096). Running this sequence through the model will result in indexing errors
Perplexity Evaluation: 100%|██████████| 5/5 [00:00<00:00, 358.73it/s]


Perplexity: inf

Results logged to results.csv

Memory cleaned up.


All experiments finished.


In [6]:
# Display the results table
print("\n--- Experiment Results Summary ---")
if os.path.exists(RESULTS_FILE):
    results_df = pd.read_csv(RESULTS_FILE)
    display(results_df)
else:
    print(f"Results file '{RESULTS_FILE}' not found.")


--- Experiment Results Summary ---


ParserError: Error tokenizing data. C error: Expected 7 fields in line 4, saw 11
