In [17]:
import cogent3
from cogent3 import load_unaligned_seqs
from cogent3 import get_app
from cogent3 import make_aligned_seqs
from os import listdir
from os.path import isfile, join

@cogent3.app.composable.define_app
def renamer(seqs: cogent3.app.typing.UnalignedSeqsType) -> cogent3.app.typing.UnalignedSeqsType:
    """
    A function to rename sequences in a dataset.
    """

    name_map = {
        "homo_sapiens": "Human",
        "pan_troglodytes": "Chimpanzee",
        "gorilla_gorilla": "Gorilla"
    }

    seqs = seqs.rename_seqs(lambda x: name_map.get(x.split("-")[0], x))

    return seqs.take_seqs(list(name_map.values()))


In [21]:
folder_in = '/xdisk/masel/uliseshmc/EstimatingUd/primates10_112/aligns_ortho/'
folder_out = '/xdisk/masel/uliseshmc/EstimatingUd/primates10_112/aligns_ortho/outdir'
in_dstore = cogent3.open_data_store(folder_in, suffix='fa')
print(in_dstore)

100x member DataStoreDirectory(source='/xdisk/masel/uliseshmc/EstimatingUd/primates10_112/aligns_ortho', members=[DataMember(data_store=/xdisk/masel/uliseshmc/EstimatingUd/primates10_112/aligns_ortho, unique_id=ENSG00000117054.fa), DataMember(data_store=/xdisk/masel/uliseshmc/EstimatingUd/primates10_112/aligns_ortho, unique_id=ENSG00000035687.fa)]...)


In [22]:
loader = cogent3.get_app("load_unaligned", moltype="dna")
rename = renamer()
nt_align = get_app("progressive_align", "nucleotide", guide_tree="(Human:0.06,Chimpanzee:0.06,Gorilla:0.1)")
out_dstore = cogent3.open_data_store(folder_out, suffix='fa', mode="w")
writer = cogent3.get_app("write_seqs", data_store = out_dstore)
app = loader + rename + nt_align + writer
_ = list(app.apply_to(in_dstore[:5], parallel = True, show_progress=True))
out_dstore.describe

   0%|          |00:00<?

record type,number
completed,5
not_completed,0
logs,1


In [23]:
out_dstore.summary_logs

time,name,python version,who,command,composable
2025-05-21 17:11:51,logs/load_unaligned-renamer-progressive_align-write_seqs-9b49a4ca.log,3.10.16,uliseshmc,/home/u12/uliseshmc/.conda/envs/EstimatingUd/lib/python3.10/site-packages/ipykernel_launcher.py -f /home/u12/uliseshmc/.local/share/jupyter/runtime/kernel-ab7b873a-caed-422b-b374-4a21acd5b4c9.json,"load_unaligned(moltype='dna', format='fasta') + renamer() +progressive_align(model='nucleotide', gc=None, param_vals=None,guide_tree='(Human:0.06,Chimpanzee:0.06,Gorilla:0.1)', unique_guides=False,indel_length=0.1, indel_rate=1e-10, distance='pdist', iters=None,approx_dists=True) +write_seqs(data_store=DataStoreDirectory(source=/xdisk/masel/uliseshmc/EstimatingUd/primates10_112/aligns_ortho/outdir,mode=Mode.w, suffix=fa, limit=None, verbose=False), id_from_source=, format='fasta')"


In [2]:
concat_alns_app = get_app("concat", moltype="dna")
codon_align = get_app("progressive_align", "nucleotide", guide_tree="(Human:0.06,Chimpanzee:0.06,Gorilla:0.1)")
ffold = get_app("take_codon_positions", fourfold_degenerate=True)

In [3]:
#store all the alignments on onlyfiles list
folder = '/xdisk/masel/uliseshmc/EstimatingUd/primates10_112/aligns_ortho/'
onlyfiles = [f for f in listdir(folder) if isfile(join(folder, f))]    

#I want to reach the next line. Before I have to construct an auxilliary function that removes orthologs where one of the species is missing
#for i in range(0, len(onlyfiles)):
for i in range(0, 5):
    #Read alignments
    aln = load_unaligned_seqs(folder + onlyfiles[i], moltype='dna')
    aln = aln.rename_seqs(lambda x: x.split("-")[0])

    #Rename alignments rows
    hcm = aln.take_seqs(["homo_sapiens", "pan_troglodytes", "gorilla_gorilla"]).degap()
    name_map = {
        "homo_sapiens": "Human",
        "pan_troglodytes": "Chimpanzee",
        "gorilla_gorilla": "Gorilla"
    }
    #auxilliary function to rename sequences
    def renamer(name):
        return name_map.get(name, name)

    hcm = hcm.rename_seqs(renamer)
    hcm

    #Doing nucleotide model meanwhile. I got to change this for codon eventually
    aln_codonaligned = codon_align(hcm)

    #Extract four fold sites
    #Do I need omit_gap_pos() or degap()
    aln_ffold = ffold(aln_codonaligned).omit_gap_pos()

    if i==0:
        concat_aln = aln_ffold
    else:
        concat_aln = concat_alns_app([concat_aln, aln_ffold])

In [4]:
#Get distances of concatenated sequences 
sm = get_app("model", "GN", time_het="max", optimise_motif_probs=True, show_progress=True)
result = sm(concat_aln)
lengthstree = result.lf.get_lengths_as_ens()
lengthstree

   0%|          |00:00<?

   0%|          |00:00<?

{'Chimpanzee': np.float64(9.999983869577135e-07),
 'Gorilla': np.float64(1.0000069447833995e-06),
 'Human': np.float64(9.999983428411142e-07)}

In [None]:
def renamer(seqs):
    seqs = seqs.rename_seqs(lambda x: x.split("-")[0])

    #Rename alignments rows
    seqs = seqs.take_seqs(["homo_sapiens", "pan_troglodytes", "gorilla_gorilla"]).degap()
    name_map = {
        "homo_sapiens": "Human",
        "pan_troglodytes": "Chimpanzee",
        "gorilla_gorilla": "Gorilla"
    }
    #auxilliary function to rename sequences
    def renamer_aux(name):
        return name_map.get(name, name)

    seqs = seqs.renamer_aux(renamer)
    return seqs