## Imports

In [1]:
import os
import subprocess
from Bio import AlignIO
import sys
import PyQt5
from ete3 import Tree, TreeStyle, TextFace, NodeStyle, AttrFace
import math
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# my functions
sys.path.append(os.path.join('..', 'scripts', 'python'))
from create_phyl_tree import run_mafft, alignment_stats, run_raxmlng_check, run_raxmlng_tree, run_raxmlng_bootstrap, run_raxmlng_support, filter_fasta_file, shorten_sequence_names, num_seqs, run_gblocks_grid_search, move_search_grid_files

## Variables

In [2]:
# --- Run configuration ---------------------------------------------------
# Identifiers selecting the dataset and the clustering/denoising run; they are
# used below as path components.
project, marker = 'Suthaus_2022', 'Full18S'
sim, denoise_method = 'sim_90', 'RAD'

# Top-level data locations, relative to the notebook's working directory.
raw_data = os.path.join(os.pardir, 'raw_data')
results_path = os.path.join(os.pardir, 'results')

# Directory with the vampyrellid reference alignments.
reference_align_dir = os.path.join(raw_data, 'reference_alignments', 'vamp')

# Directory with the per-sample fasta files of the new vampyrellid sequences.
new_seq_dir = os.path.join(
    results_path, 'tax_assignment_vsearch',
    project, marker, sim, denoise_method,
    'vamp_specific',
)

# Preparing input fasta file for the sequence alignment

### Create a fasta file with all the new sequences assigned to the vampyrellids

In [None]:
# Path to the output file
merged_fasta_file = os.path.join(new_seq_dir, 'all_samples.fasta')

# Create a list of per-sample fasta files, excluding those starting with 'Mock'.
# The merged output is written into this same directory, so it is excluded too:
# otherwise re-running the cell would merge the previous result into itself.
# Sub-directories are skipped, and sorting makes the merge order reproducible.
files = sorted(
    file for file in os.listdir(new_seq_dir)
    if not file.startswith('Mock')
    and file != os.path.basename(merged_fasta_file)
    and os.path.isfile(os.path.join(new_seq_dir, file))
)

# List to store the modified lines
merged_lines = []

for file in files:
    path = os.path.join(new_seq_dir, file)
    # The sample name is the part of the file name before the first underscore
    sample = file.split('_')[0]
    with open(path, 'r') as infile:
        for line in infile:
            if line.startswith('>'):
                # Replace ';' with '_' in the line as the RAxML-ng does not support the semicolon
                modified_line = line.strip().replace(';', '_')
                # Add the sample name to the line
                merged_lines.append(modified_line + '_' + sample + '\n')
            else:
                # Guarantee a trailing newline so that a file whose last line is
                # unterminated cannot run into the next file's header
                merged_lines.append(line.rstrip('\n') + '\n')

# Write the contents to the output file: results -> tax_assignment_vsearch -> vamp_specific -> all_samples.fasta
with open(merged_fasta_file, 'w') as outfile:
    outfile.writelines(merged_lines)

### Prepare vampyrella reference sequence database

In [None]:
# We have to exclude the unpublished sequences
# All the unpublished sequences started with 'XXXXXXXX' in the sequence ID

# Vampyrella reference sequences path
vamp_ref_seqs = os.path.join(reference_align_dir, '2023_VAMPYRELLIDA_SSU_annotated_PR2_AM_AS_not_aligned_version.fasta')
# Vampyrella filtered fasta file
output_dir = os.path.join(results_path, 'phyl_trees', project, marker, denoise_method, sim)
filtered_fasta_file = os.path.join(output_dir, 'vamp_ref_seqs.fasta')

# Create the output directory (including parents) if it does not exist yet;
# exist_ok avoids the separate check-then-create step
os.makedirs(output_dir, exist_ok=True)

# Filter out sequences with headers starting with "XXXXXXXX", and return the filtered lines.
filtered_lines = filter_fasta_file(file_path = vamp_ref_seqs, exclude_pattern = 'XXXXXXXX')
# Shorten the sequence names of the remaining records (project helper)
shorten_seq_names = shorten_sequence_names(filtered_lines)

# Write the contents to the output fasta file: results -> phyl_trees -> vamp_ref_seqs.fasta
with open(filtered_fasta_file, 'w') as outfile:
    outfile.writelines(shorten_seq_names)

### Combine the new sequences with our vampyrella reference sequence database

In [8]:
# Filtered reference sequences produced by the previous step
vamp_ref_seqs = os.path.join(results_path, 'phyl_trees', project, marker, denoise_method, sim, 'vamp_ref_seqs.fasta')

# New sequences to add to the reference set
# (per-project alternative: os.path.join(new_seq_dir, 'all_samples.fasta'))
new_vamp_seqs = os.path.join(results_path, 'phyl_trees', 'all_projects_combined', 'extracted_18S_merged.fasta')

# Output directory and combined fasta file
# (per-project alternative: os.path.join(results_path, 'phyl_trees', project, marker, denoise_method, sim))
output_dir = os.path.join(results_path, 'phyl_trees', 'all_projects_combined')
combined_alignment = os.path.join(output_dir, 'vamp_ref_and_new_seqs_comb.fasta')

# Read both inputs completely: reference sequences first, new sequences second
with open(vamp_ref_seqs, 'r') as ref_handle, open(new_vamp_seqs, 'r') as new_handle:
    content1 = ref_handle.read()
    content2 = new_handle.read()

# Write reference + newline + new sequences: results -> phyl_trees -> vamp_ref_and_new_seqs_comb.fasta
with open(combined_alignment, 'w') as combined_handle:
    combined_handle.write('\n'.join((content1, content2)))

print(f"Combined files {new_vamp_seqs} and {vamp_ref_seqs} into {combined_alignment}")

Combined files ../results/phyl_trees/all_projects_combined/extracted_18S_merged.fasta and ../results/phyl_trees/Suthaus_2022/Full18S/RAD/sim_90/vamp_ref_seqs.fasta into ../results/phyl_trees/all_projects_combined/vamp_ref_and_new_seqs_comb.fasta


# Create alignment

In [9]:
# Creating alignment
# Input: the combined reference + new sequences fasta written by the previous cell.
# input_sequences = os.path.join(results_path, 'phyl_trees', project, marker, denoise_method, sim, 'vamp_ref_and_new_seqs_comb.fasta')
input_sequences = os.path.join(results_path, 'phyl_trees', 'all_projects_combined', 'vamp_ref_and_new_seqs_comb.fasta')

# Output: path the aligned fasta is written to.
# output_sequences = os.path.join(results_path, 'phyl_trees', project, marker, denoise_method, sim, 'alignments', 'vamp_mafft.fasta')
output_sequences = os.path.join(results_path, 'phyl_trees', 'all_projects_combined', 'vamp_mafft.fasta')

# Log: path the MAFFT log is written to.
# log_file = os.path.join(results_path, 'phyl_trees', project, marker, denoise_method, sim, 'alignments', 'vamp_mafft.log')
log_file = os.path.join(results_path, 'phyl_trees', 'all_projects_combined', 'vamp_mafft.log')


# Run the project's MAFFT wrapper (imported from create_phyl_tree).
# NOTE(review): 'localpair' with 1000 iterations presumably maps to MAFFT's
# --localpair --maxiterate 1000 (L-INS-i) — confirm in run_mafft.
run_mafft(input_file = input_sequences, 
          output_file = output_sequences, 
          log_file = log_file, 
          method = 'localpair', 
          maxiterate_num = 1000)

Alignment was saved to: ../results/phyl_trees/all_projects_combined/vamp_mafft.fasta
Log file was saved to: ../results/phyl_trees/all_projects_combined/vamp_mafft.log


0

In [None]:
# Alignment stats
# NOTE(review): this reads the per-project 'alignments/vamp_mafft.fasta', whereas the
# MAFFT cell above writes to 'all_projects_combined/vamp_mafft.fasta' — confirm that
# this is the intended file.
alignment = os.path.join(results_path, 'phyl_trees', project, marker, denoise_method, sim, 'alignments', 'vamp_mafft.fasta')


# alignment_stats (project helper) returns five summary values, unpacked in this order
num_sequences, alignment_length, average_gaps, percentage_gaps, percentage_identity = alignment_stats(alignment_file = alignment)

print(f'''
Number of sequences: {num_sequences}
Alignment length: {alignment_length} base pairs
Average number of gaps per sequence: {round(average_gaps, 2)}
Percentage gaps: {round(percentage_gaps, 2)}%
Percentage identity: {round(percentage_identity, 2)}%
''')

In [None]:
# Alignment stats for the alignment stored under 'refrence_alignment_tree'
# (same report as the previous cell; this overwrites `alignment` and the five result variables)
# alignment = os.path.join(results_path, 'phyl_trees', project, marker, denoise_method, sim, 'alignments', 'vamp_mafft.fasta')

alignment = os.path.join(results_path, 'phyl_trees', 'refrence_alignment_tree', 'vamp_mafft.fasta')


# alignment_stats (project helper) returns five summary values, unpacked in this order
num_sequences, alignment_length, average_gaps, percentage_gaps, percentage_identity = alignment_stats(alignment_file = alignment)

print(f'''
Number of sequences: {num_sequences}
Alignment length: {alignment_length} base pairs
Average number of gaps per sequence: {round(average_gaps, 2)}
Percentage gaps: {round(percentage_gaps, 2)}%
Percentage identity: {round(percentage_identity, 2)}%
''')

# Check alignment

In [None]:
# Check the alignment manually to see what it looks like

# Gblocks: masking poorly aligned positions

In [None]:
# Define the parameter range
# Defining the range of values we want to explore for each parameter
mafft_alignment = os.path.join(results_path, 'phyl_trees', project, marker, denoise_method, sim, 'alignments', 'vamp_mafft.fasta')

# Number of sequences in the alignment; the baseline for the Gblocks b1/b2 parameters
num_seqs_ = num_seqs(mafft_alignment)
print(f'Total number of sequences in the alignment: {num_seqs_}')
# floor(n / 2) + 1 is the smallest integer strictly greater than half of the sequences
# (b1 has to be larger than half of the sequences, hence the + 1)
half_num_seqs = math.floor(num_seqs_ / 2) + 1
print(f'Half of the sequences in the alignment (rounded down): {half_num_seqs}')

# b1
# Description: Minimum Number of Sequences for a Conserved Position
# Minimum is just above half of the sequences in the alignment.
# NOTE(review): the largest value is half_num_seqs + 180; presumably Gblocks rejects
# values above the number of sequences — confirm for small alignments.
step_size_b1 = 20
start_b1 = 0
stop_b1 = step_size_b1 * 10
b1_values = [half_num_seqs + i for i in range(start_b1, stop_b1, step_size_b1)]

# b2
# Description: Minimum Number Of Sequences For A Flank Position.
# The lower limit for b2 is the value you set for b1.
step_size_b2 = 20
start_b2 = 0
stop_b2 = step_size_b2 * 10
b2_values = [half_num_seqs + i for i in range(start_b2, stop_b2, step_size_b2)]

# b3:
# Description: Maximum Number Of Contiguous Nonconserved Positions.
# Setting this value too high can include regions of the alignment that are too variable,
# whereas setting it too low can exclude potentially informative parts of the alignment.
# Default = 8.
step_size_b3 = 5
start_b3 = 8
stop_b3 = (step_size_b3 * 10) + start_b3
b3_values = list(range(start_b3, stop_b3, step_size_b3))

# b4:
# Description: Minimum Length Of A Block.
# Any block that's shorter than the number set will be excluded from the final alignment.
# Default = 10.
step_size_b4 = 5
start_b4 = 10
stop_b4 =  (step_size_b4 * 10) + start_b4
b4_values = list(range(start_b4, stop_b4, step_size_b4))

# b5
# Description: Allowed Gap Positions.
# a: all gaps allowed (the most relaxed setting)
# h: half (a balanced approach)
# n: no gaps allowed (the most strict setting)
b5_values = ['a', 'h', 'n']

print(f'''
b1: {b1_values}
b2: {b2_values}
b3: {b3_values}
b4: {b4_values}
b5: {b5_values}
''')

In [None]:
# Inputs for the Gblocks grid search: the MAFFT alignment and the Gblocks executable
alignments_path = os.path.join(results_path, 'phyl_trees', project, marker, denoise_method, sim, 'alignments')
alignment = os.path.join(alignments_path, 'vamp_mafft.fasta')
gblocks_path = os.path.join('..', 'raw_data', 'packages', 'Gblocks_0.91b', 'Gblocks')


# Run the project's grid-search helper with the b1..b5 value lists defined in the previous cell
run_gblocks_grid_search(mafft_alignment = alignment,
                        b1_values = b1_values,
                        b2_values = b2_values,
                        b3_values = b3_values,
                        b4_values = b4_values,
                        b5_values = b5_values,
                        gblocks_path = gblocks_path)

# Move all the output files to the separate 'grid_search' directory
move_search_grid_files(directory_path = alignments_path, 
                       grid_search_path = os.path.join(alignments_path, 'grid_search'))

In [None]:
# Checking some basic statistics about the Gblocks grid-search alignments
# (only files whose name contains '_h_', presumably the b5 = 'h' runs)
alignments_path = os.path.join(results_path, 'phyl_trees', project, marker, denoise_method, sim, 'alignments')
grid_search_path = os.path.join(alignments_path, 'grid_search')

# Sorted so that the report order does not depend on the directory listing order
fasta_files = sorted(fasta for fasta in os.listdir(grid_search_path) if '.fasta' in fasta and '_h_' in fasta)

for fasta_file in fasta_files:
    alignment_path = os.path.join(grid_search_path, fasta_file)
    # Drop the '.fasta' extension explicitly: str.strip('.fasta') would remove any of the
    # characters '.', 'f', 'a', 's', 't' from BOTH ends instead of the suffix
    parameters = fasta_file.split('grid_')[1]
    if parameters.endswith('.fasta'):
        parameters = parameters[:-len('.fasta')]
    print(f'Paramters: {parameters}')
    num_sequences, alignment_length, average_gaps, percentage_gaps, percentage_identity = alignment_stats(alignment_file = alignment_path)
    print(f"Number of sequences: {num_sequences}")
    print(f"Alignment length: {alignment_length}")
    print(f"Average number of gaps per sequence: {average_gaps:.2f}")
    print(f"Percentage of gaps across the sequences: {percentage_gaps:.2f}%")
    print(f"Average percentage identity: {percentage_identity:.2f}%")
    print('\n')

In [None]:
# Collect the alignment statistics of every '_h_' grid-search output into `results`
alignments_path = os.path.join(results_path, 'phyl_trees', project, marker, denoise_method, sim, 'alignments')
grid_search_path = os.path.join(alignments_path, 'grid_search')
fasta_files = [fasta for fasta in os.listdir(grid_search_path) if '.fasta' in fasta and '_h_' in fasta]
results = []

for fasta_file in fasta_files:
    metrics = alignment_stats(os.path.join(grid_search_path, fasta_file))
    # File names are expected to look like '<prefix>_grid_<b5>_<iteration>.fasta'
    grid_fields = fasta_file.split("_grid_")[1].split("_")
    b5_value = grid_fields[0]
    iteration_num = grid_fields[1]
    # Remove the extension as a suffix; str.strip('.fasta') strips a character set instead
    if iteration_num.endswith('.fasta'):
        iteration_num = iteration_num[:-len('.fasta')]
    # One row per file: (b5, iteration) followed by the values returned by alignment_stats
    results.append((b5_value, iteration_num) + metrics)

In [None]:
# Create a dataframe from the grid results
grid_df = pd.DataFrame(results, columns=['b5_value',
                                         'iteration_num',
                                         'num_sequences',
                                         'alignment_length',
                                         'avg_num_gaps',
                                         'percentage_gaps',
                                         'avg_percentage_identity'])
# The iteration numbers were parsed from file names and are strings; convert them first,
# otherwise the sort below is lexicographic ('10' before '2') and the positional
# parameter columns added afterwards end up on the wrong rows
grid_df['iteration_num'] = grid_df['iteration_num'].astype(int)
# Sort the dataframe based on the iteration number
grid_df = grid_df.sort_values(by='iteration_num').reset_index(drop=True)

# Add the values from the Gblocks parameters.
# The lists are assigned by position, so this assumes exactly one row per iteration and
# that the i-th row was produced with the i-th value of every list.
grid_df['b1_value'] = b1_values
grid_df['b2_value'] = b2_values
grid_df['b3_value'] = b3_values
grid_df['b4_value'] = b4_values

In [None]:
grid_df

In [None]:
alignments_path = os.path.join(results_path, 'phyl_trees', project, marker, denoise_method, sim, 'alignments')
grid_search_path = os.path.join(alignments_path, 'grid_search')

# Visualization: one line plot per alignment metric against the grid-search iteration

# Plain white seaborn theme
sns.set(style="white")

# 2x2 panel layout, one panel per metric
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Dataframe columns to plot and the human-readable label of each
metrics = ['alignment_length', 'avg_num_gaps', 'percentage_gaps', 'avg_percentage_identity']
labels = ['Alignment Length', 'Average Number of Gaps', 'Percentage of Gaps', 'Average Percentage Identity']

# Which axis borders stay visible: keep bottom/left, drop top/right
spine_visibility = (('top', False), ('right', False), ('bottom', True), ('left', True))

for ax, metric, label in zip(axes.ravel(), metrics, labels):
    sns.lineplot(x='iteration_num', y=metric, data=grid_df, color='lightcoral', ax=ax)
    ax.set_title(label + ' vs Iteration Number', fontweight='bold')
    ax.set_xlabel('')
    ax.set_ylabel(label)
    for side, visible in spine_visibility:
        ax.spines[side].set_visible(visible)

# # Adjust layout for better spacing
# plt.tight_layout()
# plt.show()

# Target path of the figure (saving is currently disabled)
save_path = os.path.join(grid_search_path, 'pr2_ref_align_gblocks_grid_search_h.png')
# plt.savefig(save_path, dpi=300, bbox_inches='tight')

# Create Maximum Likelihood tree

In [9]:
# Variables shared by the RAxML-NG cells below:
# alignment = os.path.join(results_path, 'phyl_trees', project, marker, denoise_method, sim, 'alignments', 'vamp_mafft.fasta')
# Substitution model string passed to the raxml-ng wrappers
model = 'GTR+G'

# alignment = os.path.join(results_path, 'phyl_trees', 'refrence_alignment_tree', 'vamp_mafft_cut.fasta')

# Alignment used for tree inference: the combined MAFFT alignment
alignment = os.path.join(results_path, 'phyl_trees', 'all_projects_combined', 'vamp_mafft.fasta')

In [12]:
# # Run raxml-ng (https://github.com/amkozlov/raxml-ng/wiki/Tutorial) to check the alignment and save the output in the specified directory.
# prefix = 'T1'
# output_dir = os.path.join(results_path, 'phyl_trees', project, marker, denoise_method, sim, prefix)

# Step T1: alignment validation; the prefix doubles as the output sub-directory name
prefix = 'T1_data_check'
# output_dir = os.path.join(results_path, 'phyl_trees', 'refrence_alignment_tree', prefix)
output_dir = os.path.join(results_path, 'phyl_trees', 'all_projects_combined', prefix)

# Check if the output directory exists, create it if not
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Project wrapper around `raxml-ng --check`; uses `alignment` and `model` from the variables cell
run_raxmlng_check(alignment = alignment, 
                  output_dir = output_dir, 
                  model = model, 
                  prefix = prefix)


RAxML-NG v. 1.1.0 released on 29.11.2021 by The Exelixis Lab.
Developed by: Alexey M. Kozlov and Alexandros Stamatakis.
Contributors: Diego Darriba, Tomas Flouri, Benoit Morel, Sarah Lutteropp, Ben Bettisworth.
Latest version: https://github.com/amkozlov/raxml-ng
Questions/problems/suggestions? Please visit: https://groups.google.com/forum/#!forum/raxml

System: 11th Gen Intel(R) Core(TM) i7-11850H @ 2.50GHz, 8 cores, 11 GB RAM

RAxML-NG was called at 11-Dec-2023 11:53:07 as follows:

raxml-ng --check --msa ../results/phyl_trees/all_projects_combined/vamp_mafft.fasta --model GTR+G --prefix T1_data_check

Analysis options:
  run mode: Alignment validation
  start tree(s): 
  random seed: 1702291987
  SIMD kernels: AVX2
  parallelization: coarse-grained (auto), PTHREADS (auto)

[00:00:00] Reading alignment from file: ../results/phyl_trees/all_projects_combined/vamp_mafft.fasta
[00:00:00] Loaded alignment with 183 taxa and 2038 sites



NOTE: Reduced alignment (with duplicates and gap-on

0

In [13]:
# # Run raxml-ng to compute the maximum likelihood tree and save the output in the specified directory
# prefix = 'T2'
# output_dir = os.path.join(results_path, 'phyl_trees', project, marker, denoise_method, sim, prefix)


# Step T2: maximum likelihood tree search; the prefix doubles as the output sub-directory name
prefix = 'T2_MLtree'
# output_dir = os.path.join(results_path, 'phyl_trees', 'refrence_alignment_tree', prefix)
output_dir = os.path.join(results_path, 'phyl_trees', 'all_projects_combined', prefix)

# Check if the output directory exists, create it if not
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Project wrapper around the raxml-ng tree search; uses `alignment` and `model` from the variables cell
run_raxmlng_tree(alignment = alignment, 
                 output_dir = output_dir, 
                 model = model, 
                 prefix= prefix)


RAxML-NG v. 1.1.0 released on 29.11.2021 by The Exelixis Lab.
Developed by: Alexey M. Kozlov and Alexandros Stamatakis.
Contributors: Diego Darriba, Tomas Flouri, Benoit Morel, Sarah Lutteropp, Ben Bettisworth.
Latest version: https://github.com/amkozlov/raxml-ng
Questions/problems/suggestions? Please visit: https://groups.google.com/forum/#!forum/raxml

System: 11th Gen Intel(R) Core(TM) i7-11850H @ 2.50GHz, 8 cores, 11 GB RAM

RAxML-NG was called at 11-Dec-2023 11:53:33 as follows:

raxml-ng --msa ../results/phyl_trees/all_projects_combined/vamp_mafft.fasta --model GTR+G --prefix T2_MLtree --threads 3 --seed 2 --tree pars{25},rand{25}

Analysis options:
  run mode: ML tree search
  start tree(s): random (25) + parsimony (25)
  random seed: 2
  tip-inner: OFF
  pattern compression: ON
  per-rate scalers: OFF
  site repeats: ON
  fast spr radius: AUTO
  spr subtree cutoff: 1.000000
  branch lengths: proportional (ML estimate, algorithm: NR-FAST)
  SIMD kernels: AVX2
  parallelization:

0

## Inferring bootstrap trees

In [14]:
# Step T3: bootstrap replicates; the prefix doubles as the output sub-directory name
prefix = 'T3_bootstraps'
# Same model as the ML search (re-assigned here, so this cell does not depend on the earlier value)
model = 'GTR+G'
# output_dir = os.path.join(results_path, 'phyl_trees', 'refrence_alignment_tree', prefix)
output_dir = os.path.join(results_path, 'phyl_trees', 'all_projects_combined', prefix)

# Check if the output directory exists, create it if not
if not os.path.exists(output_dir):
    os.makedirs(output_dir)


# Project wrapper around `raxml-ng --bootstrap`; `alignment` comes from the variables cell
run_raxmlng_bootstrap(alignment = alignment,
                      output_dir = output_dir, 
                      model = model, 
                      prefix = prefix)


RAxML-NG v. 1.1.0 released on 29.11.2021 by The Exelixis Lab.
Developed by: Alexey M. Kozlov and Alexandros Stamatakis.
Contributors: Diego Darriba, Tomas Flouri, Benoit Morel, Sarah Lutteropp, Ben Bettisworth.
Latest version: https://github.com/amkozlov/raxml-ng
Questions/problems/suggestions? Please visit: https://groups.google.com/forum/#!forum/raxml

System: 11th Gen Intel(R) Core(TM) i7-11850H @ 2.50GHz, 8 cores, 11 GB RAM

RAxML-NG was called at 11-Dec-2023 12:33:00 as follows:

raxml-ng --bootstrap --msa ../results/phyl_trees/all_projects_combined/vamp_mafft.fasta --model GTR+G --prefix T3_bootstraps --threads 3 --seed 2

Analysis options:
  run mode: Bootstrapping
  start tree(s): 
  bootstrap replicates: max: 1000 + bootstopping (autoMRE, cutoff: 0.030000)
  random seed: 2
  tip-inner: OFF
  pattern compression: ON
  per-rate scalers: OFF
  site repeats: ON
  branch lengths: proportional (ML estimate, algorithm: NR-FAST)
  SIMD kernels: AVX2
  parallelization: coarse-grained 

0

### Computing branch support

In [16]:
# main_dir = os.path.join(results_path, 'phyl_trees', 'refrence_alignment_tree')

# Directory that holds the T2 (ML tree) and T3 (bootstraps) outputs
main_dir = os.path.join(results_path, 'phyl_trees', 'all_projects_combined')

# Inputs: bootstrap trees from step T3 and the best ML tree from step T2
bootstraps = os.path.join(main_dir, 'T3_bootstraps', 'T3_bootstraps.raxml.bootstraps')
tree = os.path.join(main_dir, 'T2_MLtree', 'T2_MLtree.raxml.bestTree')
# Step T4: branch support; the prefix doubles as the output sub-directory name
prefix = 'T4_supports'
output_dir = os.path.join(main_dir, prefix)

# Check if the output directory exists, create it if not
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Project wrapper around `raxml-ng --support`: maps bootstrap support onto the ML tree
run_raxmlng_support(tree = tree,
                    output_dir = output_dir, 
                    bootstraps = bootstraps, 
                    num_threads=3, 
                    prefix=prefix)


RAxML-NG v. 1.1.0 released on 29.11.2021 by The Exelixis Lab.
Developed by: Alexey M. Kozlov and Alexandros Stamatakis.
Contributors: Diego Darriba, Tomas Flouri, Benoit Morel, Sarah Lutteropp, Ben Bettisworth.
Latest version: https://github.com/amkozlov/raxml-ng
Questions/problems/suggestions? Please visit: https://groups.google.com/forum/#!forum/raxml

System: 11th Gen Intel(R) Core(TM) i7-11850H @ 2.50GHz, 8 cores, 11 GB RAM

RAxML-NG was called at 11-Dec-2023 19:03:02 as follows:

raxml-ng --support --tree ../results/phyl_trees/all_projects_combined/T2_MLtree/T2_MLtree.raxml.bestTree --bs-trees ../results/phyl_trees/all_projects_combined/T3_bootstraps/T3_bootstraps.raxml.bootstraps --prefix T4_supports --threads 3

Analysis options:
  run mode: Compute bipartition support (Felsenstein Bootstrap)
  start tree(s): user
  random seed: 1702317782
  SIMD kernels: AVX2
  parallelization: coarse-grained (auto), PTHREADS (3 threads), thread pinning: OFF

Reading reference tree from file: 

0

# Tree visualisation

In [None]:
# NOTE(review): `tree_path` is first assigned in the following cell, so on a fresh kernel
# this cell raises NameError — confirm whether it should be moved below that cell or removed.
tree_path

In [None]:
# Variables
# Best ML tree to visualise and the alignment whose headers supply the leaf names
tree_path = os.path.join(results_path, 'phyl_trees', project, marker, denoise_method, sim, 'T2_gblocks_strong', 'T2.raxml.bestTree')
alignment_path = os.path.join(results_path, 'phyl_trees', project, marker, denoise_method, sim, 'alignments', 'vamp_mafft.fasta')
# Select Qt's 'offscreen' platform plugin; presumably needed so ete3 can render without a display — confirm
os.environ['QT_QPA_PLATFORM'] = 'offscreen'

In [None]:
# Creating name_mapping dictionary to rename the sequences in the tree:
# keys are the fasta headers (without '>'), values are the display names

name_mapping = {}

with open(alignment_path, 'r') as f:
    for line in f:
        # Only header lines carry names
        if not line.startswith('>'):
            continue
        old_name = line.strip('>').strip()
        # Reference sequences part: '<id>|...|<name>' -> '<name>_<id>'
        if 'Vampyrellida' in line:
            new_name = line.split('|')[-1].strip()
            # Named seq_id rather than id, so the builtin id() is not shadowed
            seq_id = line.split('|')[0].strip('>').strip()
            name_mapping[old_name] = new_name + '_' + seq_id
        # New environmental sequences part: 'centroid=<name>_seqs=<n>_<sample>'
        elif 'centroid' in line:
            seq_name = line.split('=')[1]
            # Remove the '_seqs' SUFFIX only. str.strip('_seqs') strips the characters
            # '_', 's', 'e', 'q' from both ends and damages names that start or end
            # with those characters (e.g. 'seq1_seqs' would become '1').
            if seq_name.endswith('_seqs'):
                seq_name = seq_name[:-len('_seqs')]
            sample_name = line.split('_')[-1].strip()
            new_name = 'env_otu_' + seq_name + '_' + sample_name
            name_mapping[old_name] = new_name

### Create inner nodes

In [None]:
# Read the best ML tree and set up a wide layout with leaf names shown
tree = Tree(tree_path)
ts = TreeStyle()
ts.scale = 2000
ts.tree_width = 60000  # Increase this for a wider tree
ts.show_leaf_name = True # display leaf names

# Number the inner nodes in traversal order so they can be referenced by name later
i = 0
for n in tree.traverse():
    if n.is_leaf():
        continue
    n.name = "internal_node_{}".format(i)
    i += 1

# for node in tree.traverse():
#     print(node.name)

# # Apply the renaming directly to the leaves of the tree
# for leaf in tree.iter_leaves():
#     if leaf.name in name_mapping:
#         leaf.name = name_mapping[leaf.name]

# One pass over all nodes: hide the node circle and print the node name above its branch
for node in tree.traverse():
    node.img_style["size"] = 0
    node.add_face(TextFace(node.name, fsize=10, fgcolor='black'), column=0, position="branch-top")

# Save a high-resolution PNG, then show the tree inline
tree.render(os.path.join(results_path, 'phyl_trees', project, marker, denoise_method, sim, 'tree_inner_nodes_gblocks_strong.png'), dpi=300, w=10000, tree_style=ts)
tree.render('%%inline', tree_style=ts)

In [None]:
# Load the tree
tree = Tree(tree_path)

# Create a tree style
ts = TreeStyle()
# Styling
ts.scale = 2000
ts.tree_width = 60000  # Increase this for a wider tree

# Assign names to inner nodes for easy reference.
# The numbering follows the traversal order of the tree as loaded, i.e. BEFORE rerooting,
# so the node names used below refer to the original topology.
i = 0
for n in tree.traverse():
    if not n.is_leaf():  # Check if it is an inner node
        n.name = 'internal_node_{}'.format(i)
        i += 1


# Find the node to use as the new outgroup (tree&'name' looks a node up by its name)
node_to_reroot = tree&'internal_node_20'  # Replace 'Node_Name' with the name of your desired node

# Use the assigned name to reroot
tree.set_outgroup(node_to_reroot)

# # Identify the node whose children you want to swap
node_to_swap_1 = tree&'internal_node_0'  
node_to_swap_2 = tree&'internal_node_12'
# node_to_swap_3 = tree&'internal_node_68'
# node_to_swap_4 = tree&'internal_node_64'
# node_to_swap_5 = tree&'internal_node_72'

# # Swap the children of the target node (changes the drawing order only)
node_to_swap_1.swap_children()
node_to_swap_2.swap_children()
# node_to_swap_3.swap_children()
# node_to_swap_4.swap_children()
# node_to_swap_5.swap_children()



# Apply the renaming directly to the leaves of the tree
# (leaves whose name is not a key of name_mapping keep their original name)
for leaf in tree.iter_leaves():
    if leaf.name in name_mapping:
        leaf.name = name_mapping[leaf.name]


# Add node names/IDs
for n in tree.traverse():
    # Create a face to display node name/ID
    name_face = TextFace(n.name, fsize=10, fgcolor='black')
    n.add_face(name_face, column=0, position="branch-top")


# Hides the node circles
for node in tree.traverse():
    node.img_style['size'] = 0

# Show the rerooted tree inline
tree.render('%%inline', tree_style=ts)
# tree.show()
# tree.render(os.path.join(results_path, 'tree_test.png'), tree_style=ts)

### Topology

In [None]:
# Load the tree
tree = Tree(tree_path)

# Create a tree style
ts = TreeStyle()
# Styling
ts.scale = 2000
ts.tree_width = 60000  # Increase this for a wider tree
line_width = 2


# Assign names to inner nodes for easy reference.
# The numbering follows the traversal order of the tree as loaded, i.e. before rerooting.
i = 0
for n in tree.traverse():
    if not n.is_leaf():  # Check if it is an inner node
        n.name = 'internal_node_{}'.format(i)
        i += 1


# Find the node to use as the new outgroup (tree&'name' looks a node up by its name)
node_to_reroot = tree&'internal_node_6'

# Use the assigned name to reroot
tree.set_outgroup(node_to_reroot)

# Identify the node whose children you want to swap
node_to_swap_1 = tree&'internal_node_0'
node_to_swap_2 = tree&'internal_node_2'
# node_to_swap_3 = tree&'internal_node_11'
# node_to_swap_4 = tree&'internal_node_64'
# node_to_swap_5 = tree&'internal_node_72'

# Swap the children of the target node.
# Only nodes that are looked up above may be swapped here: the swap of node_to_swap_3 was
# active while its lookup is commented out, which raises NameError on a fresh kernel.
node_to_swap_1.swap_children()
node_to_swap_2.swap_children()
# node_to_swap_3.swap_children()
# node_to_swap_4.swap_children()
# node_to_swap_5.swap_children()


# Apply the renaming directly to the leaves of the tree
for leaf in tree.iter_leaves():
    if leaf.name in name_mapping:
        leaf.name = name_mapping[leaf.name]


# # Add node names/IDs
# for n in tree.traverse():
#     # Create a face to display node name/ID
#     name_face = TextFace(n.name, fsize=10, fgcolor='black')
#     n.add_face(name_face, column=0, position="branch-top")


# Nodestyle: give every node a fresh style with no circle marker and uniform line
# thickness. (A separate "hide the node circles" pass is not needed; the style set here
# replaces the node's previous style and already uses size 0.)
for node in tree.traverse():
    nstyle = NodeStyle()
    nstyle["size"] = 0  # Set size to 0 to hide the default circle node
    nstyle["vt_line_width"] = line_width  # Set the thickness of vertical lines
    nstyle["hz_line_width"] = line_width  # Set the thickness of horizontal lines
    node.set_style(nstyle)

# Find the nodes to collapse (clades drawn as a single labelled tip)
node_to_collapse_1 = tree&'internal_node_6'
node_to_collapse_2 = tree&'internal_node_7'
node_to_collapse_3 = tree&'internal_node_29'
node_to_collapse_4 = tree&'internal_node_28'
node_to_collapse_5 = tree&'internal_node_1'
node_to_collapse_6 = tree&'internal_node_8'
node_to_collapse_7 = tree&'internal_node_19'

# node_to_collapse.img_style['shape'] = 'sphere'
# node_to_collapse.img_style['size'] = 50
# Do not draw anything below these nodes
node_to_collapse_1.img_style['draw_descendants'] = False
node_to_collapse_2.img_style['draw_descendants'] = False
node_to_collapse_3.img_style['draw_descendants'] = False
node_to_collapse_4.img_style['draw_descendants'] = False
node_to_collapse_5.img_style['draw_descendants'] = False
node_to_collapse_6.img_style['draw_descendants'] = False
node_to_collapse_7.img_style['draw_descendants'] = False

# Rename the collapsed nodes after the clade they stand for
node_to_collapse_1.name = 'Sericomyxidae'
node_to_collapse_2.name = 'Thalassomyxa & Arachnula'
node_to_collapse_3.name = 'Placopodidae'
node_to_collapse_4.name = 'B4'
node_to_collapse_5.name = 'Vampyrella & Leptophrys'
node_to_collapse_6.name = 'B1'
node_to_collapse_7.name = 'B2'

# Show the tree inline, then save it as a PNG
tree.render('%%inline', tree_style=ts)
# tree.show()
tree.render(os.path.join(results_path, 'phyl_trees', project, marker, denoise_method, sim, 'tree_gblocks_strong.png'), dpi=600, w=1000, tree_style=ts)

In [None]:
# NOTE(review): these imports repeat names already imported in the notebook's first cell.
import os
from ete3 import Tree, TreeStyle, NodeStyle, TextFace

# Select Qt's 'offscreen' platform plugin before rendering
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
# Branch line thickness and horizontal scale used by the styling code below
line_width = 1
tree_width = 200


# Create a Tree object from the Newick file
# NOTE(review): `tree_file` is not assigned anywhere in the visible notebook; presumably it is
# the directory that contains 'T2.raxml.bestTree' — define it before this line or confirm where it comes from.
main_tree = Tree(f'{tree_file}/T2.raxml.bestTree')


# Apply the renaming directly to the leaves of the tree
# (leaves whose name is not a key of name_mapping keep their original name)
for leaf in main_tree.iter_leaves():
    if leaf.name in name_mapping:
        leaf.name = name_mapping[leaf.name]
        

# Function to get color based on the label
def get_color(name):
    """Return the display colour for a leaf label.

    The label is tested against a fixed list of strain substrings in priority
    order; the colour of the first substring contained in ``name`` is returned.

    Parameters:
    name (str): Leaf label to classify.

    Returns:
    str: Colour name or hex code, 'black' when no substring matches.
    """
    # (substring, colour) pairs; the order matters because the first match wins
    strain_colors = (
        ('Leptophrys_vorax_strain_LV02_', 'red'),
        ('Leptophrys_vorax_strain_LV04_', '#990000'),
        ('Vampyrella_lateritia_strain_VL09_', 'green'),
        ('Vampyrella_lateritia_strain_NHR1', '#33CC33'),
        ('Pseudovampyrella_closterii_strain_VC01_', 'purple'),
        ('Placopus_flabellus_NHR1_', 'orange'),
        ('Leptophrys_vorax_strain_LV01_', '#FF6666'),
    )
    return next((shade for tag, shade in strain_colors if tag in name), 'black')

# Base style for every node: no circle marker, uniform branch thickness
for node in main_tree.traverse():
    nstyle = NodeStyle()
    for key, value in (("size", 0), ("vt_line_width", line_width), ("hz_line_width", line_width)):
        nstyle[key] = value
    node.set_style(nstyle)

# Leaves: coloured label plus a branch drawn in the same colour
for leaf in main_tree.iter_leaves():
    color = get_color(leaf.name)

    # Coloured text face carrying the leaf name, placed to the right of the branch
    leaf.add_face(TextFace(leaf.name, fgcolor=color), column=0, position="branch-right")

    # Fresh style for the leaf: marker hidden, both line directions coloured and sized
    nstyle = NodeStyle()
    nstyle["size"] = 0
    for key in ("fgcolor", "vt_line_color", "hz_line_color"):
        nstyle[key] = color
    for key in ("vt_line_width", "hz_line_width"):
        nstyle[key] = line_width
    leaf.set_style(nstyle)


# Tree-level style; leaf names are drawn by the faces added above, so the built-in ones are off
ts = TreeStyle()
ts.scale = tree_width  # Increase this value to spread out the tree more
ts.show_leaf_name = False
# main_tree.render("%%inline", tree_style=ts)
# main_tree.render(f"{tree_file}/tree_mock.png", dpi=600, w=10000, tree_style=ts)

# save as Newick
# main_tree.write(outfile=f"{tree_file}/tree_mock.newick")

In [15]:
## Cola

In [None]:
from ete3 import Tree

def collapse_low_support_nodes(tree_path, threshold=70):
    """
    Collapse internal nodes of a phylogenetic tree whose bootstrap support is below the given threshold.

    Only internal, non-root nodes are considered. Leaves do not carry a bootstrap value
    (ete3 gives them the default support of 1.0), so comparing them against the threshold
    would remove every tip from the tree.

    Parameters:
    tree_path (str): Path to the Newick tree file.
    threshold (int): Bootstrap support threshold below which nodes will be collapsed. Defaults to 70.

    Returns:
    Tree: The tree with low-support internal nodes removed; their children are attached
    to the parent of the removed node.
    """

    # Load the tree
    tree = Tree(tree_path)

    # Select the nodes first and delete afterwards, so the tree is not modified
    # while it is being traversed
    low_support_nodes = [
        node for node in tree.traverse()
        if not node.is_leaf() and not node.is_root() and node.support < threshold
    ]

    for node in low_support_nodes:
        # delete() removes the node and reconnects its children to its parent
        node.delete()

    return tree

# Example usage
# NOTE(review): both paths are placeholders; replace them with real tree paths (or disable
# this example) before running the whole notebook.
collapsed_tree = collapse_low_support_nodes('path/to/your/tree.nw', threshold=70)
collapsed_tree.write(outfile='path/to/your/collapsed_tree.nw')


# Subtrees