In [1]:

import sys
from pymoo.algorithms.soo.nonconvex.ga import GA
from pymoo.operators.crossover.sbx import SBX
from pymoo.operators.mutation.pm import PM
from pymoo.operators.repair.rounding import RoundingRepair
from pymoo.operators.sampling.rnd import IntegerRandomSampling

from mergenetic import PROJECT_ROOT
from mergenetic.evaluation import *
from mergenetic.utils import *
from mergenetic.merging.merger import *
from mergenetic.optimization import *
from mergenetic.merging import SlerpMerger, TiesDareMerger
import argparse
import pandas as pd
from mergenetic.optimization import MergingProblem
from mergenetic.searcher import SearcherLMEval
from dataclasses import dataclass
import yaml
import pickle
from end2end_utils import *


  from .autonotebook import tqdm as notebook_tqdm


# Tutorial: Evolutionary Model Merging with Mergenetic

In this notebook, we illustrate how to perform evolutionary merging of language models using the Mergenetic library for a **problem defined via lm-eval harness**. 

The notebook covers the following steps:

1. **Configuration Merging** - Define the configuration for the merging problem.
1. **Data Loading** – Read the CSV dataset.
2. **Anchor Extraction** – Randomly select anchor points.
3. **Pre-Evaluation (Optional)** – Evaluate the base models (can be skipped).
4. **Theta Retrieval** – Obtain the latent ability parameters.
5. **Train/Test Split** – Separate the sampled anchors from the rest of the data.
6. **Set Performance Estimation Parameters** – Unpack and configure the parameters for performance estimation.
7. **Define the Merger** – Create the merging object (using SlerpMerger in this example).
8. **Define the Optimization Problem** – Specify the problem instance that wraps the merger, evaluation data, and settings.
9. **Define the Evolutionary Algorithm** – Set up a genetic algorithm (GA) with sampling, crossover, and mutation operators.
10. **Run the Search** – Execute the evolutionary search and test the merged model.

If you wish to run the same problem via a script, the code can be found at [this script](end2end.py)

# 1. Configuration for Merging


This configuration sets up the parameters for running the cross-lingual evolutionary merging process. The goal is to merge language models using evolutionary algorithms while controlling the evaluation and performance-estimation aspects. To do so, we need to provide several details, such as the models to be merged and the task on which they should be evaluated. The parameters are organized into one required group and several optional groups:

REQUIRED PARAMETERS:

- **pop_size** (`int`):  
  Specifies the size of the population in the evolutionary search (e.g., `25`).

- **n_iter** (`int`):  
  The number of iterations (or generations) to run the evolutionary algorithm (e.g., `7`).

- **ft_model_paths** (`List[str]`):  
  A list of fine-tuning model identifiers/paths. In this example, it includes:  
  - `"OpenLLM-Ro/RoMistral-7b-Instruct"`  
  - `"meta-math/MetaMath-Mistral-7B"`

- **path_to_store_merged_model** (`str`):  
  Path where the merged model(s) will be stored during the evolutionary search (e.g., `"experiments/models/merged/"`).

- **path_to_store_yaml** (`str`):  
  Directory path for saving YAML configuration files (e.g., `"experiments/evolutionary-merging-lm-harness/romanian_math/"`).

- **dtype** (`str`):  
  Data type used for model parameters; here it is `"float16"`.

- **run_id** (`str`):  
  A unique identifier for this run or experiment (e.g., `"romanian_math_gmpirt"`).

- **bench** (`str`):  
  Specifies the benchmark to use for evaluation (e.g., `"gsm8k"`). 

- **mode** (`str`):  
  Indicates the merging mode (e.g., `"gmpirt"`).

- **seed** (`int`):  
  Random seed value to ensure reproducibility (e.g., `420`). &#128521;

- **dataset_path** (`str`):  
  File path to the dataset used for evaluation (e.g., `"data/new_datasets/gsm8k/gsm8k_test_romanian.csv"`).

- **tasks** (`dict[str, str]`):  
  The lm-eval harness task on which we wish to test the model:
  - `sample`: `"gsm8k-ro"`  
  - `test`: `"gsm8k-ro"`

- **metric** (`str`):  
  The evaluation metric to use (e.g., `"exact_match"`). This is the key used to look up the model's quantitative performance on the task defined above in the results dictionary returned by the lm-eval harness.  
  *Default is `"acc"` if not explicitly set.*


OPTIONAL GENERIC PARAMETERS:

- **device** (`str | None`):  
  Specifies the computing device (e.g., `"cuda:0"` for a specific GPU). If it is `None`, this notebook falls back to `"cuda"` when defining the merger.

OPTIONAL EVALUATION PARAMETERS:

- **ft_model_to_evaluate** (`List[str]`):  
  List of fine-tuned models to evaluate in order to compute the latent abilities of the endpoint models, i.e., the models that we are going to merge. Pass the models here when the correctness scores needed to extract the thetas are not already available. Alternatively, you can pass the paths to existing correctness scores directly via `responses_path`.

- **eval_task** (`str`):  
  Specifies the evaluation task for computing the correctness scores of the model that we will evaluate (e.g., `"MATH"`).

- **output_path_evaluation** (`List[str]`):  
  List of paths where evaluation outputs will be saved.

- **eval_batch_size** (`int`):  
  Batch size used during evaluation (e.g., `64`).

OPTIONAL THETA ESTIMATION PARAMETERS:

- **responses_path** (`List[str]`):  
  List of paths to model responses. 

- **output_path_theta_estimation** (`List[str]`):  
  List of paths to save theta estimation results. Example values include:  
  - `"experiments/evolutionary-merging-lm-harness/results/ft_ro_theta.pkl"`  
  - `"experiments/evolutionary-merging-lm-harness/results/metamath_ro_theta.pkl"`


OPTIONAL PROMPTING PARAMETERS:

- **custom_prompt_template** (`str`):  
  A custom prompt template that can be used in prompting the language model.


In [2]:
# Instantiating the configuration.
#
# All locations are derived from the few roots below, so pointing the notebook
# at a different checkout or scratch area only requires editing these lines.
MERGENETIC_ROOT = "/leonardo_work/IscrC_MGNTC/tmencatt/mergenetic"
MERGED_MODEL_DIR = "/leonardo_scratch/large/userexternal/tmencatt"
# The trailing slash is kept on purpose: it is part of the value handed to the config.
EXPERIMENT_DIR = f"{MERGENETIC_ROOT}/experiments/evolutionary-merging-lm-harness/romanian_math/"
# One theta file per model in `ft_model_paths`, in the same order.
THETA_PATHS = [
    f"{EXPERIMENT_DIR}results/ft_ro_theta.pkl",
    f"{EXPERIMENT_DIR}results/metamath_ro_theta.pkl",
]

config = ConfigCrossLingualLMEval(
    # --- required parameters ---
    pop_size=25,
    n_iter=7,
    ft_model_paths=[
        f"{MERGENETIC_ROOT}/models/base/OpenLLM-Ro/RoMistral-7b-Instruct",
        f"{MERGENETIC_ROOT}/models/base/meta-math/MetaMath-Mistral-7B",
    ],
    path_to_store_merged_model=MERGED_MODEL_DIR,
    path_to_store_yaml=EXPERIMENT_DIR,
    dtype="float16",
    run_id="romanian_math_gmpirt",
    bench="gsm8k",
    mode="gmpirt",
    seed=420,
    tasks={
        "sample": "gsm8k-ro",
        "test": "gsm8k-ro",
    },
    metric="exact_match",
    dataset_path=f"{MERGENETIC_ROOT}/data/new_datasets/gsm8k/gsm8k_test_romanian.csv",
    # --- optional generic parameters ---
    device="cuda:0",
    # --- optional evaluation parameters ---
    # Left as empty strings in this example; presumably nothing needs to be
    # re-evaluated because the thetas are read from `thetas_paths` — confirm.
    ft_model_to_evaluate=["", ""],
    eval_task="MATH",
    output_path_evaluation=["", ""],
    eval_batch_size=64,
    # --- optional theta estimation parameters ---
    responses_path=["", ""],
    # Separate list objects are passed so the two fields never alias each other.
    output_path_theta_estimation=list(THETA_PATHS),
    thetas_paths=list(THETA_PATHS),
)

# 2. Pre-Evaluation (Optional)

In [3]:
# STEP 3. Get the responses of the base models.
# `evaluate_model` is not defined in this notebook; it is provided by one of
# the wildcard imports in the first cell.
predictions = evaluate_model(config)
# The log message carries the same step number as this cell's header.
print("STEP 3 completed: Predictions obtained", flush=True)

# Evaluation configuration:  ConfigCrossLingualLMEval(pop_size=25, n_iter=7, ft_model_paths=['/leonardo_work/IscrC_MGNTC/tmencatt/mergenetic/models/base/OpenLLM-Ro/RoMistral-7b-Instruct', '/leonardo_work/IscrC_MGNTC/tmencatt/mergenetic/models/base/meta-math/MetaMath-Mistral-7B'], path_to_store_merged_model='/leonardo_scratch/large/userexternal/tmencatt', path_to_store_yaml='/leonardo_work/IscrC_MGNTC/tmencatt/mergenetic/experiments/evolutionary-merging-lm-harness/romanian_math/', dtype='float16', run_id='romanian_math_gmpirt', dataset_path='/leonardo_work/IscrC_MGNTC/tmencatt/mergenetic/data/new_datasets/gsm8k/gsm8k_test_romanian.csv', thetas_paths=['/leonardo_work/IscrC_MGNTC/tmencatt/mergenetic/experiments/evolutionary-merging-lm-harness/romanian_math/results/ft_ro_theta.pkl', '/leonardo_work/IscrC_MGNTC/tmencatt/mergenetic/experiments/evolutionary-merging-lm-harness/romanian_math/results/metamath_ro_theta.pkl'], bench='gsm8k', mode='gmpirt', seed=420, tasks={'sample': 'gsm8k-ro', 't

In [4]:
# STEP 4. Get the thetas (the latent ability parameters used later by the
# accuracy estimator). `retrieve_thetas` is provided by one of the wildcard
# imports in the first cell.
thetas = retrieve_thetas(config)
# The log message carries the same step number as this cell's header.
print("STEP 4 completed: Thetas obtained", flush=True)

STEP 3 completed: Thetas obtained


In [5]:
# STEP 5. Unpack some parameters and set the accuracy estimation parameters.
# The unpacked names are notebook-level variables: later cells read
# `pop_size`, `n_iter`, `run_id` and `tasks` directly.
pop_size = config.pop_size
n_iter = config.n_iter
run_id = config.run_id
bench = config.bench
mode = config.mode
tasks = config.tasks
metric = config.metric

# Bundle the thetas with the benchmark settings; this object is attached to
# the optimization problem in a later cell.
est_parameters = AccuracyEstimationLMEval(
    thetas=thetas,
    bench=bench,
    mode=mode,
    tasks=tasks,
    metric=metric,
)
# The log message carries the same step number as this cell's header.
print("STEP 5 completed: Accuracy estimation parameters set", flush=True)

STEP 4 completed: Accuracy estimation parameters set


In [6]:
# STEP 7. Define the merger.
# The first fine-tuned model is used as the SLERP base model and the second
# one as the model merged into it.
merger = SlerpMerger(
    run_id=run_id,
    path_to_base_model=config.ft_model_paths[0],
    path_to_model_1=config.ft_model_paths[1],
    path_to_store_yaml=config.path_to_store_yaml,
    path_to_store_merged_model=config.path_to_store_merged_model,
    dtype=config.dtype,
    # Both models contribute the same layer range.
    # NOTE(review): [0, 32] presumably spans every layer of these 7B
    # checkpoints — confirm before swapping in other models.
    layer_range_base_model=[0, 32],
    layer_range_model_1=[0, 32],
)

# Use the configured device when one is set; any falsy value (None or "")
# falls back to the default CUDA device, not to the CPU.
device = config.device or 'cuda'
# The log message carries the same step number as this cell's header.
print("STEP 7 completed: Merger defined", flush=True)

STEP 5 completed: Merger defined


In [7]:
# STEP 8. Define the problem.
# Wraps the merger together with the lm-eval tasks used while evolving
# ('sample') and for the final test ('test').
problem = AnchoredCrossLingualMathProblemLMEval(
    merger,
    evolve_task=tasks['sample'],
    test_task=tasks['test'],
    # NOTE(review): 11 decision variables is a fixed choice of this example;
    # how it maps onto the merger's parameters is not visible here — confirm.
    n_var=11,
    n_obj=1,
    n_eq_constr=0,
    n_ieq_constr=0,
    discrete=True,
    # Read the batch size from the configuration instead of repeating the
    # literal, so changing `eval_batch_size` in the config cell takes effect.
    eval_batch_size=config.eval_batch_size,
    device=device,
)
# Attach the accuracy-estimation settings built in the previous step.
problem.set_est_parameters(est_parameters)
print("STEP 8 completed: Problem defined", flush=True)

STEP 8 completed: Problem defined


In [8]:
# STEP 9. Define the algorithm.
# The problem is declared discrete and the initial population is sampled as
# integers. SBX and PM operate on floats, so without a repair step offspring
# leave the integer grid after the first generation. RoundingRepair (already
# imported in the first cell) rounds them back, following pymoo's recipe for
# discrete variables. The operators' default probabilities and distribution
# indices are left unchanged.
algorithm = GA(
    pop_size=pop_size,
    sampling=IntegerRandomSampling(),
    crossover=SBX(vtype=float, repair=RoundingRepair()),
    mutation=PM(vtype=float, repair=RoundingRepair()),
    eliminate_duplicates=True,
)
print("STEP 9 completed: Algorithm defined", flush=True)

STEP 9 completed: Algorithm defined


In [10]:
# STEP 10. Build the searcher from the problem and the algorithm, then launch
# the evolutionary search.
# `result_path` is a relative path, so it is resolved against the notebook's
# current working directory.
result_path = "experiments/evolutionary-merging-lm-harness"
searcher = SearcherLMEval(
    problem,
    algorithm,
    result_path,
    n_iter,
    run_id=run_id,
    seed=config.seed,
    verbose=False,
)
searcher.search()
    

yaml configuration file created at /leonardo_work/IscrC_MGNTC/tmencatt/mergenetic/experiments/evolutionary-merging-lm-harness/romanian_math/romanian_math_gmpirt/config.yaml
Deleted folder and all contents: /leonardo_scratch/large/userexternal/tmencatt/romanian_math_gmpirt


Warmup loader cache: 100%|██████████| 2/2 [00:00<00:00, 68.57it/s]
Executing graph:   4%|▍         | 64/1457 [00:14<05:06,  4.55it/s]


KeyboardInterrupt: 

In [None]:
# Run the searcher's test step; this needs the `searcher` object created in
# the previous cell.
# NOTE(review): presumably this evaluates the merged model found by
# `searcher.search()` on the test task — confirm against SearcherLMEval.
searcher.test()