# Supertree Construction with Averaged Branch Lengths

As a reference tree, we constructed a supertree and assigned branch lengths through post-processing. The overall approach consisted of two main steps: (1) inferring the supertree topology, and (2) assigning branch lengths by averaging values across consistent splits from the source trees.
The following script constructs a supertree with branch lengths for each dataset (Amphibians, Birds, Mammals, and Sharks).

In [1]:
!pip install sc-supertree cogent3 dendropy

Collecting sc-supertree
  Downloading sc_supertree-2025.6.25-py3-none-any.whl.metadata (6.2 kB)
Collecting cogent3
  Downloading cogent3-2025.7.10a4-py3-none-any.whl.metadata (12 kB)
Collecting dendropy
  Downloading DendroPy-5.0.8-py3-none-any.whl.metadata (6.1 kB)
Collecting loky!=3.5.0 (from cogent3)
  Downloading loky-3.5.5-py3-none-any.whl.metadata (8.4 kB)
Collecting scitrack (from cogent3)
  Downloading scitrack-2024.10.8-py3-none-any.whl.metadata (8.9 kB)
Collecting stevedore (from cogent3)
  Downloading stevedore-5.4.1-py3-none-any.whl.metadata (2.3 kB)
Collecting pbr>=2.0.0 (from stevedore->cogent3)
  Downloading pbr-6.1.1-py2.py3-none-any.whl.metadata (3.4 kB)
Downloading sc_supertree-2025.6.25-py3-none-any.whl (12 kB)
Downloading cogent3-2025.7.10a4-py3-none-any.whl (716 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 716.3/716.3 kB 12.3 MB/s eta 0:00:00
Downloading DendroPy-5.0.8-py3-none-any.whl (465 kB)
   ━━━━━━━━━━

In [8]:
from sc_supertree import load_trees, construct_supertree
import dendropy
import numpy as np

In [18]:
def read_dendropy_trees(filename):
    """Load every Newick tree in *filename* into one DendroPy ``TreeList``.

    A fresh ``TaxonNamespace`` is created per call and shared by all trees
    read from the file. Underscores in taxon labels are kept as written
    (``preserve_underscores=True``).
    """
    return dendropy.TreeList.get(
        path=filename,
        schema='newick',
        taxon_namespace=dendropy.TaxonNamespace(),
        preserve_underscores=True,
    )

def assign_branch_lengths(supertree_dendro, source_trees):
    """Assign averaged source-tree branch lengths to the supertree's edges.

    For every supertree edge, all source-tree edges with an equal
    ``split_bitmask`` and a non-``None`` length are collected, and the
    supertree edge's length is set to their arithmetic mean. Supertree edges
    with no matching source edge keep whatever length they already had.

    Args:
        supertree_dendro: Tree whose edge lengths are updated in place.
        source_trees: Iterable of trees supplying the branch lengths. It is
            traversed exactly once, so a one-shot iterable is acceptable.

    Returns:
        The same ``supertree_dendro`` object, for convenience.
    """
    supertree_dendro.encode_bipartitions()

    # Index all usable source-tree lengths by split bitmask in a single pass,
    # instead of rescanning every source tree for every supertree edge.
    # Lengths are appended in tree order, then post-order within each tree,
    # so the averages are summed in the same order as a nested scan would.
    lengths_by_split = {}
    for tree in source_trees:
        tree.encode_bipartitions()
        for source_edge in tree.postorder_edge_iter():
            if source_edge.head_node is None or source_edge.length is None:
                continue
            lengths_by_split.setdefault(
                source_edge.split_bitmask, []
            ).append(source_edge.length)

    for edge in supertree_dendro.postorder_edge_iter():
        # Intended to skip the root edge.
        # NOTE(review): DendroPy's root edge appears to have a head node and
        # no tail node, so this guard may never fire -- confirm.
        if edge.head_node is None:
            continue

        matching_lengths = lengths_by_split.get(edge.split_bitmask)
        if matching_lengths:
            edge.length = sum(matching_lengths) / len(matching_lengths)

    return supertree_dendro

def save_tree(tree, filename):
    """Write *tree* to *filename* in Newick format.

    The tree's ``write`` method is called with ``suppress_rooting=True`` and
    ``unquoted_underscores=True``.
    """
    newick_options = {
        "schema": 'newick',
        "suppress_rooting": True,
        "unquoted_underscores": True,
    }
    tree.write(path=filename, **newick_options)


In [19]:
# Dataset selection: amphibians.
# This cell binds the same four names (source_file, output_file,
# source_trees_cogent, source_trees_dendro) as the other dataset cells, so
# the supertree-building cell operates on whichever dataset cell ran last.
source_file = "amphibians170.txt"
output_file = "supertree_amphibians.txt"

# Load input trees for both topology and branch length inference
source_trees_cogent = load_trees(source_file)  # For sc-supertree
source_trees_dendro = read_dendropy_trees(source_file)  # For averaging

In [26]:
# Dataset selection: birds.
# This cell binds the same four names (source_file, output_file,
# source_trees_cogent, source_trees_dendro) as the other dataset cells, so
# the supertree-building cell operates on whichever dataset cell ran last.
source_file = "birds100.txt"
output_file = "supertree_birds.txt"

# Load input trees for both topology and branch length inference
source_trees_cogent = load_trees(source_file)  # For sc-supertree
source_trees_dendro = read_dendropy_trees(source_file)  # For averaging

In [28]:
# Dataset selection: mammals.
# This cell binds the same four names (source_file, output_file,
# source_trees_cogent, source_trees_dendro) as the other dataset cells, so
# the supertree-building cell operates on whichever dataset cell ran last.
source_file = "mammals140.txt"
output_file = "supertree_mammals.txt"

# Load input trees for both topology and branch length inference
source_trees_cogent = load_trees(source_file)  # For sc-supertree
source_trees_dendro = read_dendropy_trees(source_file)  # For averaging

In [30]:
# Dataset selection: sharks.
# This cell binds the same four names (source_file, output_file,
# source_trees_cogent, source_trees_dendro) as the other dataset cells, so
# the supertree-building cell operates on whichever dataset cell ran last.
source_file = "sharks100.txt"
output_file = "supertree_sharks.txt"

# Load input trees for both topology and branch length inference
source_trees_cogent = load_trees(source_file)  # For sc-supertree
source_trees_dendro = read_dendropy_trees(source_file)  # For averaging

In [31]:
# Builds the supertree for the currently loaded dataset: reads
# source_trees_cogent, source_trees_dendro and output_file, which are bound
# by whichever dataset-selection cell was run most recently.
print("Building supertree...")
# Step 1: Build the supertree topology and save to file (without branch lengths)
temp_topology_file = "temp_supertree_topology.nwk"
supertree = construct_supertree(source_trees_cogent, pcg_weighting="branch")
supertree.write(temp_topology_file)
print(f"Supertree topology saved to: {temp_topology_file}")

# Step 2: Load the topology into DendroPy.
# The source trees' taxon namespace is reused so the supertree and the source
# trees refer to the same taxon objects when their splits are compared.
# preserve_underscores=True mirrors read_dendropy_trees: without it, unquoted
# labels such as "Genus_species" would be read as "Genus species" and would
# not match the taxa already registered from the source trees.
taxon_namespace = source_trees_dendro.taxon_namespace
supertree_dendro = dendropy.Tree.get(
    path=temp_topology_file,
    schema='newick',
    taxon_namespace=taxon_namespace,
    preserve_underscores=True,
)

# Step 3: Assign averaged branch lengths
print("Assigning branch lengths...")
supertree_with_lengths = assign_branch_lengths(supertree_dendro, source_trees_dendro)

# Step 4: Save final supertree with branch lengths
save_tree(supertree_with_lengths, output_file)
print(f"Supertree with branch lengths saved to: {output_file}")


#print(supertree_with_lengths.as_string(schema='newick'))

Building supertree...
Supertree topology saved to: temp_supertree_topology.nwk
Assigning branch lengths...
Supertree with branch lengths saved to: supertree_sharks.txt


### Additional checking and debugging

In [32]:
import random

def debug_compare_split_lengths(supertree, source_trees):
    """Print a spot-check of one randomly chosen internal supertree edge.

    Picks a random supertree edge whose head node exists and is not a leaf,
    gathers the lengths of all source-tree edges with the same
    ``split_bitmask`` (ignoring edges without a length), and prints the
    supertree's assigned length next to the source-tree average and the
    individual lengths.

    NOTE(review): the candidate filter only rules out leaf edges and edges
    without a head node; whether that also excludes the root edge depends on
    DendroPy's edge model -- confirm.

    Args:
        supertree: Tree to sample an edge from.
        source_trees: Re-iterable collection of source trees (it is traversed
            twice: once to encode bipartitions, once to search for matches).

    Returns:
        None. All results are printed.
    """
    # Make sure splits are encoded
    supertree.encode_bipartitions()
    for tree in source_trees:
        tree.encode_bipartitions()

    # Collect candidate edges (head node present and not a leaf)
    internal_edges = [
        edge for edge in supertree.postorder_edge_iter()
        if edge.head_node and not edge.head_node.is_leaf()
    ]

    if not internal_edges:
        print("No internal edges found.")
        return

    # Pick a random internal edge
    selected_edge = random.choice(internal_edges)
    selected_split = selected_edge.split_bitmask

    # Identify taxa under this edge
    taxa_under_split = selected_edge.head_node.leaf_nodes()
    taxon_labels = sorted(leaf.taxon.label for leaf in taxa_under_split)

    print(f"\n🔍 Debugging a split for taxa: {taxon_labels}")

    if selected_edge.length is not None:
        print(f"Supertree assigned branch length: {selected_edge.length:.4f}")
    else:
        print("⚠️ Supertree edge has no branch length assigned.")

    # Search for matching splits in source trees
    matching_lengths = []
    for tree in source_trees:
        for edge in tree.postorder_edge_iter():
            if edge.head_node and edge.split_bitmask == selected_split and edge.length is not None:
                matching_lengths.append(edge.length)

    if matching_lengths:
        avg_length = sum(matching_lengths) / len(matching_lengths)
        print(f"✅ Found {len(matching_lengths)} matching splits in source trees.")
        print(f"Average branch length from source trees: {avg_length:.4f}")
        print(f"Individual lengths: {[round(length, 4) for length in matching_lengths]}")
    else:
        print("⚠️ No matching splits found in source trees.")

In [36]:
debug_compare_split_lengths(supertree_with_lengths, source_trees_dendro)


🔍 Debugging a split for taxa: ['Arhynchobatis_asperrimus', 'Insentiraja_laxipella']
Supertree assigned branch length: 20.8122
✅ Found 10 matching splits in source trees.
Average branch length from source trees: 20.8122
Individual lengths: [35.3965, 10.2715, 25.9714, 3.4099, 32.4682, 21.4506, 42.6491, 15.1806, 19.2976, 2.0272]
