## Validation of phylogenetic tree datasets with overlapping taxa

This script performs **correctness validation** for phylogenetic tree datasets generated using the pipeline described in our study.

Each dataset is provided as a single `.txt` file (e.g., `amphibians_trees.txt`, `birds_trees.txt`, etc.), containing multiple Newick-formatted phylogenetic trees, one per line. These trees are organized internally into 10 species subsets, each representing a different level of taxon overlap.

The validation process includes the following checks:

1. **Tree Format Validation**: Ensures all trees are valid Newick strings.
2. **Branch Length Check**: Confirms that every tree includes branch length values.
3. **Intra-Subset Consistency**: Verifies that all trees within a subset share an identical taxon set.
4. **Inter-Subset Overlap**: Computes the pairwise Jaccard index between subsets to evaluate whether observed taxon overlap aligns with expected values (ranging from \~10% to \~90%).

This script supports reproducible validation of biological tree datasets and can be used to confirm that the datasets are ready for downstream analysis, including supertree construction, tree comparison, and metric validation.

In [3]:
from IPython.display import display
import os
from io import StringIO
from typing import List, Tuple
from Bio import Phylo
import numpy as np
from collections import defaultdict
from itertools import combinations
import pandas as pd

def read_all_trees(file_path: str) -> List[Phylo.BaseTree.Tree]:
    """Reads all Newick trees from a file, one per line."""
    trees = []
    with open(file_path, "r") as f:
        for line in f:
            if line.strip():
                tree = Phylo.read(StringIO(line.strip()), "newick")
                trees.append(tree)
    return trees

def extract_taxa(tree: Phylo.BaseTree.Tree) -> List[str]:
    return [leaf.name for leaf in tree.get_terminals()]

def validate_tree_format_and_branch_lengths(tree: Phylo.BaseTree.Tree) -> Tuple[bool, bool]:
    try:
        is_valid = all(clade.branch_length is not None for clade in tree.get_terminals())
        return True, is_valid
    except Exception:
        return False, False

def get_jaccard_index(set1: set, set2: set) -> float:
    intersection = set1 & set2
    union = set1 | set2
    return len(intersection) / len(union) if union else 0.0

def split_into_subsets(trees: List, num_subsets: int) -> List[List]:
    """Splits a list of trees into equal-sized subsets."""
    per_subset = len(trees) // num_subsets
    return [trees[i * per_subset : (i + 1) * per_subset] for i in range(num_subsets)]

def validate_group(file_path: str, num_subsets: int = 10) -> None:
    """Validates tree subsets inside a single group file (e.g., amphibians_trees.txt)."""
    all_trees = read_all_trees(file_path)
    subsets = split_into_subsets(all_trees, num_subsets)

    validation_results = []
    taxa_sets = []
    all_species = set()

    for i, subset in enumerate(subsets):
        subset_taxa = set(extract_taxa(subset[0]))
        all_species.update(subset_taxa)

        valid_format = True
        has_branch_lengths = True

        for tree in subset:
            taxa = set(extract_taxa(tree))
            if taxa != subset_taxa:
                valid_format = False
            _, branch_lengths_ok = validate_tree_format_and_branch_lengths(tree)
            if not branch_lengths_ok:
                has_branch_lengths = False

        validation_results.append({
            "Subset index": i + 1,
            "Num trees": len(subset),
            "Num taxa (in subset)": len(subset_taxa),
            "Valid Newick": valid_format,
            "Has branch lengths": has_branch_lengths
        })
        taxa_sets.append(subset_taxa)

    jaccard_matrix = np.zeros((num_subsets, num_subsets))
    for i, j in combinations(range(num_subsets), 2):
        jaccard = get_jaccard_index(taxa_sets[i], taxa_sets[j])
        jaccard_matrix[i][j] = jaccard_matrix[j][i] = round(jaccard, 3)
    np.fill_diagonal(jaccard_matrix, 1.0)

    df = pd.DataFrame(validation_results)

    dataset_name = os.path.basename(file_path)
    print(f"\n Validation results for {dataset_name}:")
    display(df)
    print(f"\n Total unique species in {dataset_name}: {len(all_species)}")

    print("\n Pairwise Jaccard similarity matrix between subsets:")
    print(jaccard_matrix)
    
validate_group("amphibians_trees.txt")
validate_group("birds_trees.txt")
validate_group("mammals_trees.txt")
validate_group("sharks_trees.txt")



 Validation results for amphibians_trees.txt:


Unnamed: 0,Subset index,Num trees,Num taxa (in subset),Valid Newick,Has branch lengths
0,1,55,66,True,True
1,2,55,66,True,True
2,3,55,66,True,True
3,4,55,66,True,True
4,5,55,66,True,True
5,6,55,66,True,True
6,7,55,66,True,True
7,8,55,66,True,True
8,9,55,66,True,True
9,10,55,66,True,True



 Total unique species in amphibians_trees.txt: 120

 Pairwise Jaccard similarity matrix between subsets:
[[1.    0.63  0.517 0.389 0.859 0.168 0.76  0.808 0.886 0.282]
 [0.63  1.    0.833 0.65  0.737 0.347 0.833 0.5   0.553 0.5  ]
 [0.517 0.833 1.    0.784 0.61  0.435 0.692 0.404 0.451 0.61 ]
 [0.389 0.65  0.784 1.    0.467 0.571 0.535 0.294 0.333 0.784]
 [0.859 0.737 0.61  0.467 1.    0.222 0.886 0.692 0.76  0.347]
 [0.168 0.347 0.435 0.571 0.222 1.    0.269 0.1   0.128 0.737]
 [0.76  0.833 0.692 0.535 0.886 0.269 1.    0.61  0.671 0.404]
 [0.808 0.5   0.404 0.294 0.692 0.1   0.61  1.    0.913 0.2  ]
 [0.886 0.553 0.451 0.333 0.76  0.128 0.671 0.913 1.    0.234]
 [0.282 0.5   0.61  0.784 0.347 0.737 0.404 0.2   0.234 1.   ]]

 Validation results for birds_trees.txt:


Unnamed: 0,Subset index,Num trees,Num taxa (in subset),Valid Newick,Has branch lengths
0,1,60,74,True,True
1,2,60,74,True,True
2,3,60,74,True,True
3,4,60,74,True,True
4,5,60,74,True,True
5,6,60,74,True,True
6,7,60,74,True,True
7,8,60,74,True,True
8,9,60,74,True,True
9,10,60,74,True,True



 Total unique species in birds_trees.txt: 135

 Pairwise Jaccard similarity matrix between subsets:
[[1.    0.721 0.13  0.276 0.345 0.165 0.096 0.558 0.437 0.213]
 [0.721 1.    0.244 0.423 0.51  0.287 0.203 0.783 0.626 0.345]
 [0.13  0.244 1.    0.663 0.558 0.897 0.897 0.345 0.451 0.783]
 [0.276 0.423 0.663 1.    0.85  0.741 0.591 0.558 0.701 0.85 ]
 [0.345 0.51  0.558 0.85  1.    0.626 0.495 0.663 0.827 0.721]
 [0.165 0.287 0.897 0.741 0.626 1.    0.805 0.396 0.51  0.873]
 [0.096 0.203 0.897 0.591 0.495 0.805 1.    0.298 0.396 0.701]
 [0.558 0.783 0.345 0.558 0.663 0.396 0.298 1.    0.805 0.465]
 [0.437 0.626 0.451 0.701 0.827 0.51  0.396 0.805 1.    0.591]
 [0.213 0.345 0.783 0.85  0.721 0.873 0.701 0.465 0.591 1.   ]]

 Validation results for mammals_trees.txt:


Unnamed: 0,Subset index,Num trees,Num taxa (in subset),Valid Newick,Has branch lengths
0,1,50,58,True,True
1,2,50,58,True,True
2,3,50,58,True,True
3,4,50,58,True,True
4,5,50,58,True,True
5,6,50,58,True,True
6,7,50,58,True,True
7,8,50,58,True,True
8,9,50,58,True,True
9,10,50,58,True,True



 Total unique species in mammals_trees.txt: 105

 Pairwise Jaccard similarity matrix between subsets:
[[1.    0.45  0.902 0.137 0.234 0.657 0.902 0.349 0.785 0.568]
 [0.45  1.    0.398 0.45  0.611 0.706 0.506 0.812 0.589 0.812]
 [0.902 0.398 1.    0.105 0.196 0.589 0.812 0.303 0.706 0.506]
 [0.137 0.45  0.105 1.    0.758 0.289 0.172 0.568 0.221 0.349]
 [0.234 0.611 0.196 0.758 1.    0.415 0.275 0.758 0.333 0.487]
 [0.657 0.706 0.589 0.289 0.415 1.    0.731 0.568 0.841 0.871]
 [0.902 0.506 0.812 0.172 0.275 0.731 1.    0.398 0.871 0.634]
 [0.349 0.812 0.303 0.568 0.758 0.568 0.398 1.    0.468 0.657]
 [0.785 0.589 0.706 0.221 0.333 0.841 0.871 0.468 1.    0.731]
 [0.568 0.812 0.506 0.349 0.487 0.871 0.634 0.657 0.731 1.   ]]

 Validation results for sharks_trees.txt:


Unnamed: 0,Subset index,Num trees,Num taxa (in subset),Valid Newick,Has branch lengths
0,1,45,52,True,True
1,2,45,52,True,True
2,3,45,52,True,True
3,4,45,52,True,True
4,5,45,52,True,True
5,6,45,52,True,True
6,7,45,52,True,True
7,8,45,52,True,True
8,9,45,52,True,True
9,10,45,52,True,True



 Total unique species in sharks_trees.txt: 95

 Pairwise Jaccard similarity matrix between subsets:
[[1.    0.733 0.6   0.763 0.238 0.405 0.486 0.284 0.333 0.195]
 [0.733 1.    0.425 0.552 0.13  0.268 0.333 0.169 0.209 0.095]
 [0.6   0.425 1.    0.793 0.465 0.705 0.825 0.529 0.6   0.405]
 [0.763 0.552 0.793 1.    0.351 0.552 0.651 0.405 0.465 0.3  ]
 [0.238 0.13  0.465 0.351 1.    0.677 0.576 0.891 0.793 0.891]
 [0.405 0.268 0.705 0.552 0.677 1.    0.857 0.763 0.857 0.6  ]
 [0.486 0.333 0.825 0.651 0.576 0.857 1.    0.651 0.733 0.507]
 [0.284 0.169 0.529 0.405 0.891 0.763 0.651 1.    0.891 0.793]
 [0.333 0.209 0.6   0.465 0.793 0.857 0.733 0.891 1.    0.705]
 [0.195 0.095 0.405 0.3   0.891 0.6   0.507 0.793 0.705 1.   ]]
