In [3]:
# Libraries
from Bio import AlignIO
import os

In [2]:
def find_molecular_characters(alignment_file, reference_names):
    alignment = AlignIO.read(alignment_file, "fasta")
    reference_seqs = [record for record in alignment if record.id in reference_names]
    query_seqs = [record for record in alignment if record.id not in reference_names]

    asymmetric_positions = []
    binary_positions = []

    for i in range(len(alignment[0].seq)):
        ref_bases = [record.seq[i] for record in reference_seqs]
        query_bases = [record.seq[i] for record in query_seqs]

        # Check for binary character state
        if len(set(ref_bases)) == 1 and set(ref_bases) != set(query_bases):
            binary_positions.append(i)

        # Check for asymmetric character state
        elif len(set(ref_bases)) > 1 and all(base not in ref_bases for base in query_bases):
            asymmetric_positions.append(i)

    return binary_positions, asymmetric_positions

### Create groups

In [50]:
# Variables
results_dir = os.path.join('../results')
phyl_tree_dir = os.path.join(results_dir, 'phyl_trees')
alignment_path = os.path.join(phyl_tree_dir, 'molecular_signatures', 'vamp_mafft.fasta')

In [46]:
# grep the seq names


def get_seq_names(alignment_path):
    seq_names = []
    with open(alignment_path, 'r') as infile:
        for line in infile:
            if line.startswith('>'):
                seq_names.append(line.strip())
    return seq_names

def find_subtaxa(alignment_path, tax_pattern):
    subtaxa = []
    seq_names = get_seq_names(alignment_path = alignment_path)
    for seq_name in seq_names:
        if tax_pattern.lower() in seq_name.lower():
            subtaxa.append(seq_name)
    return subtaxa

In [51]:
# sericomyxidae
tax_pattern = 'Serico'

serico_taxa = find_subtaxa(alignment_path = alignment_path, 
                       tax_pattern = tax_pattern)

serico_taxa

['>Sericomyxa_perlucida_strain_BWH1_7_MW969735',
 '>Sericomyxidae_XX_LC150164',
 '>Sericomyxidae_XX_AF372743',
 '>Sericomyxidae_XX_AF372744',
 '>Sericomyxidae_X_strain_NVam1_Penardia_KC779511',
 '>Sericomyxidae_XX_GU479951']

In [55]:
binary_pos, asymmetric_pos = find_molecular_characters(alignment_file = alignment_path, 
                                                       reference_names = serico_taxa)


print("Binary positions:", binary_pos)
print("Asymmetric positions:", asymmetric_pos)

Binary positions: []
Asymmetric positions: []
