In [7]:
from Bio import SeqIO
import os
import glob

In [5]:
# Switch the working directory to OrthoFinder's Single_Copy_Orthologue_Sequences
# results folder; the relative "OG*.fa" glob and the "renamed_*" outputs of the
# renaming step resolve against it.
os.chdir("/data/huyou/orthofinder/selected/OrthoFinder/Results_Aug08/Single_Copy_Orthologue_Sequences")

In [6]:
# Labels assigned positionally to the records of each orthogroup FASTA file:
# the i-th record of a file receives the i-th name of this list.
ordered_species = [
    "Hap1",
    "Hap2",
    "SD1",
    "SD2",
    "Aegle_marmelos",
    "Citrus_australasica",
    "Citropsis_gilletiana",
    "Citrus_clementina",
    "Citrus_hongheensis",
    "Atalantia_buxfoliata",
    "Clausena_lansium",
    "Citrus_grandis_wanbaiyou",
    "Murraya_paniculata",
    "Citrus_reticulata",
    "Citrus_linwuensis",
    "Citrus_mangshanensis",
    "Citrus_medica",
    "Citrus_sinensis",
    "Luvunga_scandens",
    "Citrus_maxima_majia",
    "Citrus_ichangensis",
    "Poncirus_trifoliata",
]

In [8]:
def rename_sequences(fasta_file, species_names):
    """Relabel the records of a FASTA file positionally and write a renamed copy.

    The i-th record receives ``species_names[i]`` as its ID and its description
    is cleared, so the output header is just the species name. Records beyond
    the end of ``species_names`` are written with their original header.

    The result is written next to the input file as ``renamed_<file name>``.

    Args:
        fasta_file: Path to the input FASTA file.
        species_names: Sequence of new record IDs, in the same order as the
            records appear in the file.
            NOTE(review): the mapping is purely positional; confirm that the
            record order of every input file matches this list.
    """
    records = list(SeqIO.parse(fasta_file, "fasta"))

    # zip stops at the shorter of the two, so surplus records stay untouched.
    for record, species in zip(records, species_names):
        record.id = species
        record.description = ""

    # A count mismatch means the positional mapping is suspect; report it once
    # per file rather than once per surplus record.
    if len(records) > len(species_names):
        print(f"Warning: More sequences in {fasta_file} than species names provided.")
    elif len(records) < len(species_names):
        print(f"Warning: Fewer sequences in {fasta_file} than species names provided.")

    # Prefix only the file name so that inputs given with a directory component
    # ("dir/OG1.fa") become "dir/renamed_OG1.fa" instead of "renamed_dir/OG1.fa".
    directory, file_name = os.path.split(fasta_file)
    SeqIO.write(records, os.path.join(directory, f"renamed_{file_name}"), "fasta")

if __name__ == "__main__":
    # Relabel every orthogroup FASTA file (OG*.fa) found in the current
    # working directory using the species order defined above.
    fasta_files = glob.glob("OG*.fa")
    for fasta_file in fasta_files:
        rename_sequences(fasta_file, ordered_species)

In [9]:
# Switch the working directory for the alignment, conversion and concatenation
# steps; their relative globs and output files resolve against it.
# NOTE(review): the renaming step writes its renamed_*.fa files relative to the
# previous working directory, not this one -- presumably they were copied or
# moved here outside of this notebook; confirm.
os.chdir("/data/huyou/orthofinder/selected/timetree/renamed_OG_sequences")

In [None]:
## perform sequence alignment using muscle command
!ls renamed_OG00*|while read R;do muscle -align $R -output "aligned_"$R;done

In [11]:
## convert aligned fasta to nexus format
!ls aligned_renamed_OG00*|while read R;do seqmagick convert --output-format nexus --alphabet protein $R $R".nex";done

In [13]:
from Bio.Nexus import Nexus

In [14]:
## concatenate multiple sequences into a single nexus file
# the combine function takes a list of tuples [(name, nexus instance)...],
# if we provide the file names in a list we can use a list comprehension to
# create these tuples

# glob returns names in arbitrary order; sort them so the alignments are
# concatenated in the same order on every run.
file_list = sorted(glob.glob("*.nex"))
if not file_list:
    # Fail with a clear message instead of an AttributeError further down
    # when there is nothing to combine.
    raise FileNotFoundError("No *.nex files found in the current working directory.")

nexi = [(fname, Nexus.Nexus(fname)) for fname in file_list]

combined = Nexus.combine(nexi)

# "combined.nexus" does not match the "*.nex" glob above, so a re-run does not
# pick up its own output.
with open("combined.nexus", "w") as f:
    combined.write_nexus_data(filename=f)

In [None]:
def wrap_nexus_sequences(input_file, output_file, line_length=80):
    """Copy a NEXUS file, wrapping the sequences of its MATRIX section.

    Lines outside the matrix are copied unchanged. Inside the matrix (from the
    line carrying the ``matrix`` keyword up to the first line containing
    ``;``), every line of the form ``<taxon> <sequence...>`` is rewritten as
    chunks of at most ``line_length`` characters. The taxon name, left-justified
    to 10 columns, is written on the first chunk only; continuation lines start
    with blanks of the same width.

    Blank lines and lines holding a single token (e.g. a taxon name on its own
    line) are copied verbatim. A matrix line that contains ``;`` is treated as
    the terminator and copied verbatim as well.

    Args:
        input_file: Path of the NEXUS file to read.
        output_file: Path of the wrapped NEXUS file to write. Must differ from
            ``input_file``, since the output is opened for writing before the
            input is read.
        line_length: Maximum number of sequence characters per output line.

    Raises:
        ValueError: If ``line_length`` is smaller than 1.

    NOTE(review): taxon names are taken as the first whitespace-separated
    token, so quoted names containing spaces are not supported.
    """
    # Validate before opening the output so a bad argument cannot truncate it.
    if line_length < 1:
        raise ValueError(f"line_length must be a positive integer, got {line_length!r}")

    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        in_matrix = False
        for line in infile:
            if not in_matrix:
                # NEXUS keywords are case-insensitive, so accept "matrix" as
                # well as "MATRIX". The keyword is only looked for outside the
                # matrix: protein data may itself contain the letters MATRIX.
                if 'matrix' in line.lower().split():
                    in_matrix = True
                outfile.write(line)
                continue

            if ';' in line:
                # End of the matrix command.
                in_matrix = False
                outfile.write(line)
                continue

            fields = line.split()
            sequence = ''.join(fields[1:])
            if not sequence:
                # Blank line or a lone token: nothing to wrap, keep it as is
                # rather than dropping it.
                outfile.write(line)
                continue

            taxon_name = fields[0]
            for start in range(0, len(sequence), line_length):
                outfile.write(f'{taxon_name: <10} {sequence[start:start + line_length]}\n')
                taxon_name = ''  # Only write the taxon name once

