In [1]:
%pip install biopython ete3

Collecting ete3
  Downloading ete3-3.1.3.tar.gz (4.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: ete3
  Building wheel for ete3 (setup.py) ... [?25ldone
[?25h  Created wheel for ete3: filename=ete3-3.1.3-py3-none-any.whl size=2273785 sha256=c1b85313dabd273eb21aad88ac1507fecc5af22e335ff497b0fb65d576940b87
  Stored in directory: /Users/vidyaanildhulappanavar/Library/Caches/pip/wheels/a0/72/00/1982bd848e52b03079dbf800900120bc1c20e92e9a1216e525
Successfully built ete3
Installing collected packages: ete3
Successfully installed ete3-3.1.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may nee

In [2]:
import random
from Bio import Phylo, AlignIO, SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align.Applications import ClustalwCommandline
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor, _Matrix
from ete3 import Tree, TreeStyle, TreeComparison

p = 0.01
num_mutants = 5
num_leaves_to_delete = 10
original_sequence = 'ATG' * 300 + 'CGT' * 100  # Example sequence (900 coding + 300 non-coding)

def mutate_sequence(sequence, p):
    mutated_sequence = list(sequence)
    for i in range(len(sequence)):
        if i < 900:  # Coding region
            rate = p if (i % 3 == 0 or i % 3 == 1) else 3 * p
        else:  # Non-coding region
            rate = 5 * p
        if random.random() < rate:
            mutated_sequence[i] = random.choice(['A', 'G', 'C', 'T'])
    return ''.join(mutated_sequence)

def generate_mutants(sequence, num_mutants, p):
    mutants = [mutate_sequence(sequence, p)]
    for _ in range(num_mutants - 1):
        mutants.append(mutate_sequence(mutants[-1], p))
    return mutants

# Step 1: Generate SNP mutants
mutants = generate_mutants(original_sequence, num_mutants, p)

# Step 2: Successively mutate the sequences to generate 32 leaves
all_sequences = [original_sequence] + mutants
for _ in range(4):
    new_sequences = []
    for seq in all_sequences:
        new_sequences.extend(generate_mutants(seq, num_mutants, p))
    all_sequences = new_sequences

# Step 3: Sanity check
num_differences = [sum(1 for a, b in zip(original_sequence, seq) if a != b) for seq in all_sequences]
print(f"Sanity check - Average number of differences: {sum(num_differences) / len(num_differences)}")

# Step 4: Randomly delete 10 leaves
random.shuffle(all_sequences)
all_sequences = all_sequences[:-num_leaves_to_delete]

# Step 5: Construct phylogenetic tree using full sequences
def write_fasta(sequences, filename):
    records = [SeqRecord(Seq(seq), id=str(i)) for i, seq in enumerate(sequences)]
    with open(filename, "w") as output_handle:
        SeqIO.write(records, output_handle, "fasta")

write_fasta(all_sequences, "full_sequences.fasta")

# Align sequences
clustalw_exe = r"clustalw2"  # Ensure ClustalW is installed and the path is correct
clustalw_cline = ClustalwCommandline(clustalw_exe, infile="full_sequences.fasta")
stdout, stderr = clustalw_cline()

# Calculate distances and construct tree
aln = AlignIO.read("full_sequences.aln", "clustal")
calculator = DistanceCalculator('identity')
dm = calculator.get_distance(aln)
constructor = DistanceTreeConstructor()
tree = constructor.nj(dm)
Phylo.write(tree, "full_tree.xml", "phyloxml")

# Step 6: Split each sequence into three subsequences
def split_sequence(sequence):
    return [sequence[:400], sequence[400:800], sequence[800:]]

subsequences = [split_sequence(seq) for seq in all_sequences]

# Step 7: Construct three phylogenetic trees using subsequences
def construct_subsequence_trees(subsequences):
    trees = []
    for i in range(3):
        sub_seqs = [subseq[i] for subseq in subsequences]
        write_fasta(sub_seqs, f"subseq_{i}.fasta")
        clustalw_cline = ClustalwCommandline(clustalw_exe, infile=f"subseq_{i}.fasta")
        stdout, stderr = clustalw_cline()
        aln = AlignIO.read(f"subseq_{i}.aln", "clustal")
        dm = calculator.get_distance(aln)
        tree = constructor.nj(dm)
        Phylo.write(tree, f"sub_tree_{i}.xml", "phyloxml")
        trees.append(tree)
    return trees

sub_trees = construct_subsequence_trees(subsequences)

# Step 8: Compare trees using Robinson-Foulds distance
def compare_trees(tree1, tree2):
    rf_distance, max_rf = tree1.robinson_foulds(tree2)
    return rf_distance, rf_distance / max_rf

# Load trees for comparison
full_tree = Tree("full_tree.xml")
sub_trees_loaded = [Tree(f"sub_tree_{i}.xml") for i in range(3)]

# Compare full tree with subsequence trees
for i, sub_tree in enumerate(sub_trees_loaded):
    rf_distance, normalized_rf = compare_trees(full_tree, sub_tree)
    print(f"RF distance between full tree and subtree {i}: {rf_distance}, Normalized RF: {normalized_rf}")

# Compare subsequence trees with each other
for i in range(3):
    for j in range(i + 1, 3):
        rf_distance, normalized_rf = compare_trees(sub_trees_loaded[i], sub_trees_loaded[j])
        print(f"RF distance between subtree {i} and subtree {j}: {rf_distance}, Normalized RF: {normalized_rf}")

print("Phylogenetic tree construction and comparison complete.")



Due to the on going maintenance burden of keeping command line application
wrappers up to date, we have decided to deprecate and eventually remove these
modules.

We instead now recommend building your command line and invoking it directly
with the subprocess module.


ImportError: cannot import name 'TreeComparison' from 'ete3' (/Users/vidyaanildhulappanavar/miniconda3/lib/python3.10/site-packages/ete3/__init__.py)

In [3]:
import random
from Bio import Phylo, AlignIO, SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align.Applications import ClustalwCommandline
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
from ete3 import Tree, TreeStyle

# Parameters
p = 0.01
num_mutants = 5
num_leaves_to_delete = 10
original_sequence = 'ATG' * 300 + 'CGT' * 100  # Example sequence (900 coding + 300 non-coding)

def mutate_sequence(sequence, p):
    mutated_sequence = list(sequence)
    for i in range(len(sequence)):
        if i < 900:  # Coding region
            rate = p if (i % 3 == 0 or i % 3 == 1) else 3 * p
        else:  # Non-coding region
            rate = 5 * p
        if random.random() < rate:
            mutated_sequence[i] = random.choice(['A', 'G', 'C', 'T'])
    return ''.join(mutated_sequence)

def generate_mutants(sequence, num_mutants, p):
    mutants = [mutate_sequence(sequence, p)]
    for _ in range(num_mutants - 1):
        mutants.append(mutate_sequence(mutants[-1], p))
    return mutants

# Step 1: Generate SNP mutants
mutants = generate_mutants(original_sequence, num_mutants, p)

# Step 2: Successively mutate the sequences to generate 32 leaves
all_sequences = [original_sequence] + mutants
for _ in range(4):
    new_sequences = []
    for seq in all_sequences:
        new_sequences.extend(generate_mutants(seq, num_mutants, p))
    all_sequences = new_sequences

# Step 3: Sanity check
num_differences = [sum(1 for a, b in zip(original_sequence, seq) if a != b) for seq in all_sequences]
print(f"Sanity check - Average number of differences: {sum(num_differences) / len(num_differences)}")

# Step 4: Randomly delete 10 leaves
random.shuffle(all_sequences)
all_sequences = all_sequences[:-num_leaves_to_delete]

# Step 5: Construct phylogenetic tree using full sequences
def write_fasta(sequences, filename):
    records = [SeqRecord(Seq(seq), id=str(i)) for i, seq in enumerate(sequences)]
    with open(filename, "w") as output_handle:
        SeqIO.write(records, output_handle, "fasta")

write_fasta(all_sequences, "full_sequences.fasta")

# Align sequences
clustalw_exe = r"clustalw2"  # Ensure ClustalW is installed and the path is correct
clustalw_cline = ClustalwCommandline(clustalw_exe, infile="full_sequences.fasta")
stdout, stderr = clustalw_cline()

# Calculate distances and construct tree
aln = AlignIO.read("full_sequences.aln", "clustal")
calculator = DistanceCalculator('identity')
dm = calculator.get_distance(aln)
constructor = DistanceTreeConstructor()
tree = constructor.nj(dm)
Phylo.write(tree, "full_tree.xml", "phyloxml")

# Step 6: Split each sequence into three subsequences
def split_sequence(sequence):
    return [sequence[:400], sequence[400:800], sequence[800:]]

subsequences = [split_sequence(seq) for seq in all_sequences]

# Step 7: Construct three phylogenetic trees using subsequences
def construct_subsequence_trees(subsequences):
    trees = []
    for i in range(3):
        sub_seqs = [subseq[i] for subseq in subsequences]
        write_fasta(sub_seqs, f"subseq_{i}.fasta")
        clustalw_cline = ClustalwCommandline(clustalw_exe, infile=f"subseq_{i}.fasta")
        stdout, stderr = clustalw_cline()
        aln = AlignIO.read(f"subseq_{i}.aln", "clustal")
        dm = calculator.get_distance(aln)
        tree = constructor.nj(dm)
        Phylo.write(tree, f"sub_tree_{i}.xml", "phyloxml")
        trees.append(tree)
    return trees

sub_trees = construct_subsequence_trees(subsequences)

# Step 8: Compare trees using Robinson-Foulds distance
def compare_trees(tree1, tree2):
    rf_distance, max_rf = tree1.robinson_foulds(tree2)
    return rf_distance, rf_distance / max_rf

# Load trees for comparison
full_tree = Tree("full_tree.xml")
sub_trees_loaded = [Tree(f"sub_tree_{i}.xml") for i in range(3)]

# Compare full tree with subsequence trees
for i, sub_tree in enumerate(sub_trees_loaded):
    rf_distance, normalized_rf = full_tree.robinson_foulds(sub_tree)
    print(f"RF distance between full tree and subtree {i}: {rf_distance}, Normalized RF: {normalized_rf}")

# Compare subsequence trees with each other
for i in range(3):
    for j in range(i + 1, 3):
        rf_distance, normalized_rf = sub_trees_loaded[i].robinson_foulds(sub_trees_loaded[j])
        print(f"RF distance between subtree {i} and subtree {j}: {rf_distance}, Normalized RF: {normalized_rf}")

print("Phylogenetic tree construction and comparison complete.")


Sanity check - Average number of differences: 260.72453333333334


ApplicationError: Non-zero return code 127 from 'clustalw2 -infile=full_sequences.fasta', message '/bin/sh: clustalw2: command not found'