# Runtime and memory evaluation of the overlapping phylogenetic tree dataset pipeline

This script evaluates the **runtime performance** and **peak memory usage** of the proposed data pipeline.

The pipeline is designed to generate biologically meaningful datasets of phylogenetic trees with controlled taxon overlap, supporting applications such as supertree construction, tree comparison, and phylogenetic clustering.

### Scope of evaluation

This runtime analysis covers all major stages of the pipeline, including:

1. **Loading species lists** from a main CSV file (`all_species_lists.csv`)
2. **Random selection of species** within families to form the dataset
3. **Computation of overlapping subsets** using predefined taxon overlap percentages
4. **Unzipping pre-downloaded VertLife tree files**
5. **Conversion of raw trees to Newick format** and combination of the converted trees into a single dataset file

> The step of requesting tree data from the VertLife webserver is excluded from this evaluation, as it introduces variability from external network latency and server processing times. Pre-downloaded ZIP archives of tree files are used instead.

### Datasets evaluated

The script evaluates four datasets:

| Dataset    | Unique Species | Trees per Subset | Total Trees |
| ---------- | -------------- | ---------------- | ----------- |
| Amphibians | 120            | 55               | 550         |
| Birds      | 135            | 60               | 600         |
| Mammals    | 105            | 50               | 500         |
| Sharks     | 95             | 45               | 450         |

### Output

For each dataset, the script records:

* **Total runtime** (in seconds)
* **Peak memory usage** (in MB)
* **System specifications** for reproducibility

The results are summarized in a final performance table.


In [1]:
!pip install biopython memory_profiler

Collecting biopython
  Downloading biopython-1.85-cp39-cp39-win_amd64.whl (2.8 MB)
Collecting memory_profiler
  Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)
Installing collected packages: memory-profiler, biopython
Successfully installed biopython-1.85 memory-profiler-0.61.0


In [4]:
import os
import time
import zipfile
import random
import psutil
import platform
import pandas as pd
from Bio import Phylo
from memory_profiler import memory_usage

# === SYSTEM INFO ===
def print_system_info():
    """Print OS, CPU and RAM details of the machine running the evaluation.

    The values come from ``platform`` and ``psutil``; total RAM is shown in
    GB (bytes / 1024**3) rounded to two decimals.
    """
    total_ram_gb = round(psutil.virtual_memory().total / (1024 ** 3), 2)
    report_lines = (
        "=== System Info ===",
        f"Platform: {platform.system()} {platform.release()}",
        f"Processor: {platform.processor()}",
        f"Architecture: {platform.machine()}",
        f"CPU cores: {psutil.cpu_count(logical=False)}",
        f"Logical processors: {psutil.cpu_count(logical=True)}",
        f"Total RAM: {total_ram_gb} GB",
    )
    for line in report_lines:
        print(line)

# === GROUPING FUNCTION ===
def select_one_per_subgroup(species_list):
    """Return one species per subgroup, keeping the first one encountered.

    The subgroup of a species is the first whitespace-separated token of its
    name (for a binomial name this is the genus). Results follow the order in
    which each subgroup first appears in ``species_list``.
    """
    first_seen = {}
    for name in species_list:
        # setdefault only stores the name if this subgroup is new.
        first_seen.setdefault(name.split()[0], name)
    return list(first_seen.values())

# === OVERLAPPING SUBSET GENERATION ===
def calculate_n_for_k(k, p_values=[0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1]):
    """Return the total number of species needed for subsets of size ``k``.

    Starting from ``k`` species for the first subset, each overlap ratio ``p``
    yields ``round(2*k*p / (1 + p))`` species shared with the first subset and
    therefore an offset of ``k`` minus that count. The total grows by the
    change in offset from one ratio to the next.
    """
    total = k
    last_offset = 0
    for overlap in p_values:
        shared = round((2 * k * overlap) / (1 + overlap))
        offset = k - shared
        # Only the species beyond the previous subset's offset are new.
        total += int(offset - last_offset)
        last_offset = offset
    return int(total)

def find_k_from_n(n, max_k=1000, p_values=[0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1]):
    """Return the smallest subset size ``k`` whose required total reaches ``n``.

    Candidates 1..``max_k`` are tried in ascending order; the first ``k`` for
    which ``calculate_n_for_k(k, p_values) >= n`` wins. Returns ``None`` when
    no candidate qualifies.
    """
    candidates = range(1, max_k + 1)
    return next(
        (size for size in candidates if calculate_n_for_k(size, p_values) >= n),
        None,
    )

def generate_overlapping_subsets(species_list, n, group, p_values=[0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1]):
    """Build overlapping species subsets and save them as a CSV table.

    The first ``n`` entries of ``species_list`` are used. "Subset 1" is the
    first ``k`` of them, where ``k`` comes from ``find_k_from_n``. For every
    overlap ratio ``p`` a further subset of ``k`` species is taken, starting
    at offset ``k - round(2*k*p / (1 + p))``; a window that runs past the end
    of the selection is completed with species from the start.

    One column is produced per subset, i.e. ``len(p_values) + 1`` columns,
    each padded with ``None`` to ``n`` rows. The table is written to
    ``{group}_overlapping_subsets.csv`` in the working directory.

    Args:
        species_list: Candidate species names; must hold at least ``n`` items.
        n: Number of species to draw from the front of ``species_list``.
        group: Label used as the prefix of the output CSV file name.
        p_values: Overlap ratios, one additional subset per ratio.

    Returns:
        The ``pandas.DataFrame`` that was written to disk.

    Raises:
        AssertionError: If ``species_list`` has fewer than ``n`` entries.
        ValueError: If ``find_k_from_n`` finds no suitable ``k``.
    """
    if len(species_list) < n:
        # Raised explicitly (rather than via ``assert``) so the check is not
        # stripped under ``python -O``; the exception type is unchanged.
        raise AssertionError("Not enough species!")
    selected = species_list[:n]
    k = find_k_from_n(n, p_values=p_values)
    if k is None:
        raise ValueError("Could not find valid k for given n.")

    # Columns are derived from p_values so that a shorter list of ratios does
    # not leave empty, all-None subset columns behind.
    subsets = {"Subset 1": selected[:k]}
    for i, p in enumerate(p_values, start=2):
        common_species = round((2 * k * p) / (1 + p))
        subset_start = int(k - common_species)
        subset_species = selected[subset_start:subset_start + k]
        if len(subset_species) < k:
            # Window ran off the end: wrap around to the start of the selection.
            subset_species += selected[:k - len(subset_species)]
        subsets[f"Subset {i}"] = subset_species

    # DataFrame columns must share one length, so pad every subset to n rows.
    max_len = len(selected)
    for key in subsets:
        subsets[key] += [None] * (max_len - len(subsets[key]))

    df = pd.DataFrame(subsets)
    df.to_csv(f"{group}_overlapping_subsets.csv", index=False)
    return df

# === TREE CONVERSION ===
def unzip_all_in_folder(folder):
    """Extract the ``.nex`` members of every ``.zip`` archive in ``folder``.

    Each extracted member is written next to its archive as
    ``<archive name without .zip>.nex``. All ``.nex`` members of one archive
    share that target path, so when an archive holds several, the last one
    written is the one that remains. Archives that are not valid ZIP files
    are reported and skipped.
    """
    for entry in os.listdir(folder):
        if not entry.endswith(".zip"):
            continue

        archive_path = os.path.join(folder, entry)
        archive_stem = os.path.splitext(entry)[0]
        target_path = os.path.join(folder, f"{archive_stem}.nex")
        try:
            with zipfile.ZipFile(archive_path, 'r') as archive:
                nexus_members = [m for m in archive.namelist() if m.endswith(".nex")]
                for member in nexus_members:
                    with archive.open(member) as src, open(target_path, 'wb') as dst:
                        dst.write(src.read())
        except zipfile.BadZipFile:
            print(f"Skipping bad zip: {archive_path}")

def convert_nexus_to_newick(nexus_file, t):
    """Return up to ``t`` randomly chosen trees from a Nexus file as Newick strings.

    All trees are parsed with ``Bio.Phylo``; if the file holds fewer than
    ``t`` trees, every tree is used. A file that cannot be parsed is reported
    and yields an empty list.
    """
    try:
        parsed_trees = list(Phylo.parse(nexus_file, "nexus"))
    except Exception as e:
        print(f"Failed to parse {nexus_file}: {e}")
        return []

    # Never ask random.sample for more trees than the file contains.
    sample_size = min(t, len(parsed_trees))
    sampled = random.sample(parsed_trees, sample_size)

    newick_strings = []
    for tree in sampled:
        if tree:
            newick_strings.append(tree.format('newick').strip())
    return newick_strings

# === FULL DATASET PROCESSING ===
def process_dataset(group, folder, species_list, n, t):
    """Run the whole pipeline for one dataset and measure time and peak memory.

    The measured pipeline selects one species per subgroup, generates the
    overlapping subsets, unzips the archives in ``folder``, converts every
    ``.nex`` file found there to Newick, and writes all converted trees to
    ``overlapping_dataset_{group}.txt`` (one tree per line).

    Args:
        group: Dataset label; used in output file names and error messages.
        folder: Directory holding the pre-downloaded ``.zip`` archives.
        species_list: Candidate species names for this dataset.
        n: Number of species required for the overlapping subsets.
        t: Maximum number of trees sampled from each Nexus file.

    Returns:
        A ``(runtime_seconds, peak_memory)`` tuple, both rounded to one
        decimal. Peak memory is the value reported by
        ``memory_profiler.memory_usage(..., max_usage=True)``.

    Raises:
        ValueError: If fewer than ``n`` distinct subgroups are available.
    """
    p_values = [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]

    def full_pipeline():
        # Keep one species per subgroup (the first token of each name).
        unique_families = select_one_per_subgroup(species_list)
        if len(unique_families) < n:
            raise ValueError(f"Not enough unique subgroups to select {n} species for {group}.")
        generate_overlapping_subsets(unique_families, n, group, p_values)

        # Unzip and process Nexus files
        unzip_all_in_folder(folder)
        all_newick = []
        for file in os.listdir(folder):
            if file.endswith(".nex"):
                fpath = os.path.join(folder, file)
                all_newick.extend(convert_nexus_to_newick(fpath, t))
        with open(f"overlapping_dataset_{group}.txt", "w") as f:
            f.writelines(tree + '\n' for tree in all_newick)

    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot be distorted by system clock adjustments as with time.time().
    start = time.perf_counter()
    peak_mem = memory_usage(full_pipeline, max_usage=True, interval=0.1)
    runtime = time.perf_counter() - start
    return round(runtime, 1), round(peak_mem, 1)

# === DATASETS TO EVALUATE ===
# One column of candidate species names per group; blank cells are dropped below.
all_species_lists = pd.read_csv("all_species_lists.csv")

# Per-group settings: archive folder, trees sampled per Nexus file (t),
# and number of species used for the overlapping subsets (n).
datasets = dict(
    amphibians=dict(folder="amphibians_nexus", t=55, n=120),
    birds=dict(folder="birds_nexus", t=60, n=135),
    mammals=dict(folder="mammals_nexus", t=50, n=105),
    sharks=dict(folder="sharks_nexus", t=45, n=95),
)

# The CSV column for a group is its capitalized key (e.g. "birds" -> "Birds").
species_dict = {
    group_key: all_species_lists[group_key.capitalize()].dropna().tolist()
    for group_key in datasets
}

# === RUN EVALUATION ===
print_system_info()
print("\n=== Full Pipeline Runtime Evaluation ===")

# One summary row per dataset, in the order the datasets are declared.
results = []
for group, cfg in datasets.items():
    print(f"\nProcessing {group.capitalize()}...")
    species_list = species_dict[group]
    runtime, mem = process_dataset(
        group,
        cfg["folder"],
        species_list,
        cfg["n"],
        cfg["t"],
    )
    summary_row = {
        "Dataset": group.capitalize(),
        "Unique Species": cfg["n"],
        "Trees per Subset": cfg["t"],
        # Total is reported as t trees for each of 10 subsets.
        "Total Trees": cfg["t"] * 10,
        "Runtime (sec)": runtime,
        "Peak Memory (MB)": mem,
    }
    results.append(summary_row)

# === DISPLAY SUMMARY TABLE ===
df = pd.DataFrame(results)
print("\n=== Summary Table ===")
print(df.to_string(index=False))


=== System Info ===
Platform: Windows 10
Processor: Intel64 Family 6 Model 140 Stepping 1, GenuineIntel
Architecture: AMD64
CPU cores: 4
Logical processors: 8
Total RAM: 31.69 GB

=== Full Pipeline Runtime Evaluation ===

Processing Amphibians...

Processing Birds...

Processing Mammals...

Processing Sharks...

=== Summary Table ===
   Dataset  Unique Species  Trees per Subset  Total Trees  Runtime (sec)  Peak Memory (MB)
Amphibians             120                55          550            3.8             120.2
     Birds             135                60          600            4.7             124.7
   Mammals             105                50          500            2.9             118.9
    Sharks              95                45          450            2.0             114.8
