In [4]:
# imports
import Bio
from Bio import AlignIO
from Bio.Align.Applications import ClustalwCommandline
from Bio.Align import MultipleSeqAlignment
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import subprocess
from ete3 import Tree

In [98]:
# variables

# Identifiers of the data set; they also name the directory levels used below
project = 'Suthaus_2022'
cell = 'cellCombined'
marker = 'Full18S'
sim = 'sim95'
denoise_method = 'FAD'

# Root folder of all inputs/outputs, relative to the notebook
raw_data = '../raw_data'

# Sub-path shared by the per-run folders: <project>/<marker>/<cell>/<sim>
_run_subpath = '/'.join([project, marker, cell, sim])

# Input sequences and external tools
vamp_spec_dir = '/'.join([raw_data, 'vamp_specific_seqs', _run_subpath, denoise_method])
vamp_ref_seqs = raw_data + '/reference_alignments/vamp/2023_VAMPYRELLIDA_SSU_annotated_PR2_AM_AS_not_aligned_version.fasta'
clustalw_path = raw_data + '/packages/clustalw-2.1-linux-x86_64-libcppstatic/clustalw2'

# Result folders (note: tree_file is used as a directory prefix by the tree cells)
tree_file = '/'.join([raw_data, 'phylogenetic_inference', _run_subpath])
pics_dir = '/'.join([raw_data, 'OTU_results', project])
tax_assign_dir = '/'.join([raw_data, 'tax_assign_results', _run_subpath, denoise_method])

In [None]:
# functions

def strip(file_path, output_path='out_stripped.fasta'):
    '''
    Remove line breaks inside the sequences of a FASTA file.

    Header lines (starting with '>') are kept on their own line; all other
    lines have trailing whitespace removed and are concatenated, so each
    record ends up as one header line followed by one sequence line.
    The text after the last header is written without a trailing newline.

    Parameters
    ----------
    file_path : str
        FASTA file to read.
    output_path : str
        File to write. May be the same as ``file_path``: the input is read
        completely and closed before the output is opened.

    An empty input file produces an empty output file.
    '''
    pieces = []
    # read the file line by line and unwrap the sequences
    with open(file_path, 'rt') as f:
        for line in f:
            if line.startswith('>'):
                # start every header on a fresh line
                pieces.append('\n' + line)
            else:
                pieces.append(line.rstrip())
    # drop the line break that was put in front of the very first record;
    # guarded so that an empty input does not raise IndexError
    if pieces:
        pieces[0] = pieces[0].lstrip()
    # write the unwrapped records
    with open(output_path, 'w') as fp:
        fp.write(''.join(pieces))

# Creating alignments

In [None]:
# Unwrap the reference FASTA in place (only needed if its sequences span several lines)

strip(vamp_ref_seqs, output_path=vamp_ref_seqs)

In [None]:
# Merge the reference sequences and the denoised sequences into one FASTA file

# Denoised sequences (input) and combined FASTA (output)
denoised_sequences = f'{vamp_spec_dir}/Mock_18S_otu.fasta'
file_combined_path = f'{vamp_spec_dir}/reference_and_mock_seqs.fasta'

# Load both inputs: reference sequences first, denoised sequences second
with open(vamp_ref_seqs, 'r') as file1, open(denoised_sequences, 'r') as file2:
    content1 = file1.read()
    content2 = file2.read()

# A line break separates the two parts in the combined file
with open(file_combined_path, 'w') as file_combined:
    file_combined.write('\n'.join((content1, content2)))

print(f"Combined files {denoised_sequences} and {vamp_ref_seqs} into {file_combined_path}")

In [None]:
# Align the combined sequences with ClustalW

# Input: the combined reference + mock FASTA
input_sequences = f'{vamp_spec_dir}/reference_and_mock_seqs.fasta'

# ClustalW options (FASTA output written next to the input)
# see more here: https://manpages.ubuntu.com/manpages/impish/man1/clustalw.1.html
clustalw_options = {
    'infile': input_sequences,
    'output': 'fasta',
    'outfile': f'{vamp_spec_dir}/aligned_reference_and_mock_seqs.fasta',
}
cmd = ClustalwCommandline(clustalw_path, **clustalw_options)

# Show the equivalent command line, then run the alignment
print(cmd)
stdout, stderr = cmd()

In [None]:
# Optional check: load the resulting alignment with Biopython and print it

aligned_fasta = f'{vamp_spec_dir}/aligned_reference_and_mock_seqs.fasta'
align = AlignIO.read(aligned_fasta, 'fasta')
print(align)

# Creating a maximum likelihood phylogenetic tree

In [None]:
# Path to your alignment file
alignment_path = f'{vamp_spec_dir}/aligned_reference_and_mock_seqs.fasta'

# Output prefix for the tree; RAxML-NG names its outputs '<prefix>.raxml.<suffix>'
output_prefix = f'{vamp_spec_dir}/raxml_tree'

# The command to run RAxML-NG, as an argument list so that paths containing
# spaces or shell metacharacters are passed through unchanged (no shell involved)
cmd = [
    'raxml-ng', '--all',
    '--msa', alignment_path,
    '--model', 'GTR+G',
    '--prefix', output_prefix,
    '--threads', '2',
]

# Run the command
try:
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except FileNotFoundError as exc:
    # raxml-ng is not installed / not on PATH: report it like any other failure
    print(f"RAxML-NG encountered an error:\n{exc}")
else:
    stdout, stderr = process.communicate()

    # Check if the command was successful
    if process.returncode != 0:
        print(f"RAxML-NG encountered an error:\n{stderr.decode()}")
    else:
        print(f"RAxML-NG ran successfully. Tree(s) saved to {output_prefix}.raxml.bestTree and other files.")


# Tree visualization

In [44]:
# Creating name_mapping dictionary to rename the new sequences in the tree
# key:   query id from the BLAST table with ';' replaced by '_'
#        (this is the form that is looked up against the tree leaf names)
# value: '<text after ",s:" in the hit>_<query id without "centroid=" and size part>'

name_mapping = {}
centroid_prefix = 'centroid='

with open(f'{tax_assign_dir}/blast6_Mock_18S.tab') as f:
    for line in f:
        # only hits assigned to Vampyrellida are renamed
        if 'Vampyrellida' not in line:
            continue
        old_name = line.strip().split('\t')[0].replace(';', '_')
        # sequence id = everything before the first ';', minus the literal
        # 'centroid=' prefix (str.strip('centroid=') would strip a character
        # set from both ends rather than the prefix)
        new_name_second_part = line.strip().split(';')[0]
        if new_name_second_part.startswith(centroid_prefix):
            new_name_second_part = new_name_second_part[len(centroid_prefix):]
        # text between ',s:' and the next tab
        new_name_first_part = line.split(',s:')[1].split('\t')[0]
        name_mapping[old_name] = new_name_first_part + "_" + new_name_second_part

name_mapping

{'centroid=seq12_300_seqs=5': 'Vampyrella_lateritia_strain_VL09_seq12_300',
 'centroid=seq173_22_seqs=10': 'Leptophrys_vorax_strain_LV02_seq173_22',
 'centroid=seq199_18_seqs=1': 'Leptophrys_vorax_strain_LV02_seq199_18',
 'centroid=seq179_25_seqs=3': 'Pseudovampyrella_closterii_strain_VC01_seq179_25',
 'centroid=seq18_128_seqs=2': 'Placopus_flabellus_HR01_seq18_128',
 'centroid=seq254_8_seqs=21': 'Leptophrys_vorax_strain_LV02_seq254_8',
 'centroid=seq308_6_seqs=1': 'Leptophrys_vorax_strain_LV04_seq308_6'}

In [45]:
import os
from ete3 import Tree, TreeStyle, NodeStyle, TextFace

# Render without a display
os.environ['QT_QPA_PLATFORM'] = 'offscreen'

# Drawing parameters: branch thickness and horizontal scale of the tree
line_width = 1
tree_width = 200

# Load the best ML tree from its Newick file
# NOTE(review): the RAxML-NG cell writes to f'{vamp_spec_dir}/raxml_tree', while this
# reads 'T2.raxml.bestTree' from tree_file - presumably copied/renamed by hand; confirm
main_tree = Tree(f'{tree_file}/T2.raxml.bestTree')

# Give the new sequences their readable labels; all other leaves keep their name
for leaf in main_tree.iter_leaves():
    leaf.name = name_mapping.get(leaf.name, leaf.name)

# Function to get color based on the label
def get_color(name):
    '''
    Return the drawing color for a leaf label.

    The label is checked against the markers below in order; the color of the
    first marker contained in ``name`` is returned, 'black' if none matches.
    '''
    marker_colors = (
        ('Leptophrys_vorax_strain_LV02_', 'red'),
        ('Leptophrys_vorax_strain_LV04_', '#990000'),
        ('Vampyrella_lateritia_strain_VL09_', 'green'),
        ('Vampyrella_lateritia_strain_NHR1', '#33CC33'),
        ('Pseudovampyrella_closterii_strain_VC01_', 'purple'),
        # any Placopus_flabellus strain: the previous 'Placopus_flabellus_NHR1_'
        # marker did not match the renamed label 'Placopus_flabellus_HR01_...'
        ('Placopus_flabellus_', 'orange'),
        ('Leptophrys_vorax_strain_LV01_', '#FF6666'),
    )
    for label_part, color in marker_colors:
        if label_part in name:
            return color
    return 'black'  # Default color if no match

# Baseline style for every node: hidden node circle, uniform branch thickness
base_style = {
    "size": 0,
    "vt_line_width": line_width,
    "hz_line_width": line_width,
}
for node in main_tree.traverse():
    nstyle = NodeStyle()
    for key, value in base_style.items():
        nstyle[key] = value
    node.set_style(nstyle)

# Leaves additionally get a colored label and a colored branch
for leaf in main_tree.iter_leaves():
    color = get_color(leaf.name)

    # Leaf name drawn as a colored text face to the right of the branch
    name_face = TextFace(leaf.name, fgcolor=color)
    leaf.add_face(name_face, column=0, position="branch-right")

    # Same color for the branch leading to the leaf
    leaf_style = {
        "fgcolor": color,
        "size": 0,
        "vt_line_color": color,
        "hz_line_color": color,
        "vt_line_width": line_width,
        "hz_line_width": line_width,
    }
    nstyle = NodeStyle()
    for key, value in leaf_style.items():
        nstyle[key] = value
    leaf.set_style(nstyle)


# Tree-wide settings: the default leaf names are hidden because the colored
# text faces above already show them; a larger scale spreads the tree out more
ts = TreeStyle()
ts.show_leaf_name = False
ts.scale = tree_width
# main_tree.render("%%inline", tree_style=ts)
# main_tree.render(f"{tree_file}/tree_mock.png", dpi=600, w=10000, tree_style=ts)

# save as Newick
# NOTE(review): a later cell loads f"{tree_file}/tree_mock.newick"; it presumably comes
# from the call below, so enable it when running the notebook from scratch - confirm
# main_tree.write(outfile=f"{tree_file}/tree_mock.newick")

## Subtrees

### Subtrees tree

In [91]:
# Function to get color based on the label
def get_color(name, subtaxa):
    '''
    Return the drawing color of a leaf label within the subtree of one subtaxa.

    Parameters
    ----------
    name : str
        Leaf label.
    subtaxa : str
        One of 'Leptophrys_vorax', 'Vampyrella_lateritia',
        'Pseudovampyrella_closterii' or 'Placopus_flabellus'.

    Returns
    -------
    str
        Color of the first marker of ``subtaxa`` contained in ``name``;
        'black' when no marker matches. For an unknown ``subtaxa`` a hint is
        printed and 'black' is returned.

    Note: this two-argument definition replaces the one-argument ``get_color``
    defined earlier in the notebook once this cell has been run.
    '''
    default_color = 'black'  # Default color if no match
    palettes = {
        # color code for Leptophrys_vorax
        'Leptophrys_vorax': (
            ('Leptophrys_vorax_strain_LV02_', 'red'),
            ('Leptophrys_vorax_strain_LV04_', '#990000'),
            ('Leptophrys_vorax_strain_LV01_', '#FF6666'),
        ),
        # color code for Vampyrella_lateritia
        'Vampyrella_lateritia': (
            ('Vampyrella_lateritia_strain_VL09_', 'green'),
            # ('Vampyrella_lateritia_strain_NHR1', '#33CC33'),  # kept disabled
        ),
        # color code for Pseudovampyrella_closterii
        'Pseudovampyrella_closterii': (
            ('Pseudovampyrella_closterii_strain_VC01_', 'purple'),
        ),
        # color code for Placopus_flabellus
        'Placopus_flabellus': (
            ('Placopus_flabellus', 'orange'),
        ),
    }

    if subtaxa not in palettes:
        print('Subtaxa name is probably incorrect. Choose among: Leptophrys_vorax, Vampyrella_lateritia, Pseudovampyrella_closterii, or Placopus_flabellus')
        return default_color

    for label_part, color in palettes[subtaxa]:
        if label_part in name:
            return color
    # every subtaxa now falls back to the same default instead of returning None
    return default_color

In [99]:
# Subtaxa
subtaxa = 'Pseudovampyrella_closterii'

# Branch thickness used in the subtree figure
subtree_line_width = 2

# load the tree
main_tree = Tree(f"{tree_file}/tree_mock.newick")

# Collect leaves that match the condition for new sequences
matching_leaves = [leaf for leaf in main_tree.iter_leaves()
                   if f'{subtaxa}_strain_' in leaf.name and 'seq' in leaf.name]
if not matching_leaves:
    # Some renamed labels carry no '_strain_' part (e.g. 'Placopus_flabellus_HR01_seq18_128'),
    # so fall back to the bare subtaxa prefix before giving up
    matching_leaves = [leaf for leaf in main_tree.iter_leaves()
                       if f'{subtaxa}_' in leaf.name and 'seq' in leaf.name]
if not matching_leaves:
    raise ValueError(f"No new sequences of '{subtaxa}' found among the leaves of {tree_file}/tree_mock.newick")

# Find the common ancestor of these leaves
# NOTE(review): with a single matching leaf ete3 appears to take the ancestor shared with
# main_tree itself, i.e. the whole tree is rendered - confirm this is intended
common_ancestor = main_tree.get_common_ancestor(matching_leaves)

# Use this common ancestor as the root of a new subtree
subtree = common_ancestor.copy(method="newick")


for node in subtree.traverse():
    # Set the thickness for all nodes in the subtree and hide the default circle node
    nstyle = NodeStyle()
    nstyle["size"] = 0
    nstyle["vt_line_width"] = subtree_line_width
    nstyle["hz_line_width"] = subtree_line_width
    node.set_style(nstyle)

for leaf in subtree.iter_leaves():
    color = get_color(name = leaf.name, subtaxa = subtaxa)

    # Apply the color to the text (leaf name)
    name_face = TextFace(leaf.name, fgcolor=color)
    leaf.add_face(name_face, column=0, position="branch-right")

    # Apply the color to the branch leading to the leaf
    nstyle = NodeStyle()
    nstyle["fgcolor"] = color
    nstyle["size"] = 0  # Set size to 0 to hide the default circle node
    nstyle["vt_line_color"] = color
    nstyle["hz_line_color"] = color
    nstyle["vt_line_width"] = subtree_line_width
    nstyle["hz_line_width"] = subtree_line_width
    leaf.set_style(nstyle)


# The colored text faces replace the default leaf names
ts = TreeStyle()
ts.show_leaf_name = False
# subtree.render("%%inline", tree_style=ts)
subtree.render(f"{tree_file}/subtree_mock_{subtaxa}.png", dpi=600, w=6000, tree_style=ts)

{'nodes': [],
 'faces': [[291.0749442908125,
   8.206594517438232,
   4624.156849498198,
   147.71870131388818,
   3,
   'KF141791|Vampyrellida|Leptophryidae|Vernalophrys|Vernalophrys_algivore'],
  [301.8253593246155,
   147.71870131388818,
   4208.164349625214,
   287.2308081103381,
   4,
   'JF720748|Vampyrellida|Leptophryidae|Vernalophrys|Vernalophrys_X'],
  [605.3515571892315,
   287.2308081103381,
   4963.053245948932,
   426.74291490678803,
   8,
   'DQ409121|Vampyrellida|Leptophryidae|Leptophryidae_X|Leptophryidae_XX'],
  [726.5086627751365,
   426.74291490678803,
   5084.210351534837,
   566.255021703238,
   9,
   'DQ409086|Vampyrellida|Leptophryidae|Leptophryidae_X|Leptophryidae_XX'],
  [233.74019572224998,
   566.255021703238,
   4673.507829656333,
   705.7671284996879,
   10,
   'MF621964|Vampyrellida|Leptophryidae|Planctomyxa|Planctomyxa_polycarya'],
  [238.67834582242796,
   705.7671284996879,
   4571.760251029814,
   845.2792352961379,
   12,
   'EU567260|Vampyrellida|Lep