In [1]:
import cogent3
from cogent3 import load_unaligned_seqs
from cogent3 import get_app
from cogent3 import make_aligned_seqs
from cogent3 import open_data_store

@cogent3.app.composable.define_app
def renamer(seqs: cogent3.app.typing.UnalignedSeqsType) -> cogent3.app.typing.UnalignedSeqsType:
    """
    A function to rename sequences in a dataset.
    """

    name_map = {
        "homo_sapiens": "Human",
        "pan_troglodytes": "Chimpanzee",
        "gorilla_gorilla": "Gorilla"
    }

    seqs = seqs.rename_seqs(lambda x: name_map.get(x.split("-")[0], x))

    return seqs.take_seqs(list(name_map.values()))

In [2]:
folder_in = '/xdisk/masel/uliseshmc/EstimatingUd/primates10_114/aligns_cds/'
dstore = open_data_store(folder_in, suffix="fa", mode="r")

print(dstore.describe)
print(dstore.summary_not_completed)

Directory datastore
record type      number
-----------------------
completed          1916
not_completed         0
logs                  1
-----------------------
Cannot summarise not_completed as they are all bytes, use an appropriate reader
type    origin    message    num    source
------------------------------------------
------------------------------------------


In [40]:
whichseq = 1
loader = get_app("load_unaligned", moltype="dna")
aln = loader(dstore[whichseq])
aln

0,1
,0
pan_paniscus-ENSPPAG00000035584,ATGGCGTTAGCCAAGACCTACCCGGCGACATCCTCCCTGCCCAACGGCGATTGCGGCCGC
gorilla_gorilla-ENSGGOG00000011109,ATGGCGTTAGCCAAGACCTACCCGGCGACATCCTCCCTGCCCAACGGCGATTGCGGCCGC
nomascus_leucogenys-ENSNLEG00000028310,ATGGCGTTCGCCGAGACCTACCCGGCGGCATCCTCCCTGCCCAACGGCGATTGCGGCCGC
chlorocebus_sabaeus-ENSCSAG00000009306,ATGGCGTTAGCCAAGACCTACCCGGCGGCATCCTCCCTGCCCAACGGCGATTGCGGCCGC
pan_troglodytes-ENSPTRG00000002178,ATGGCGTTAGCCAAGACCTACCCGGCGACATCCTCCCTGCCCAACGGCGATTGCGGCCGC
pan_troglodytes-ENSPTRG00000044720,ATGTCGTTCGACGAGACCAACCCGCCGGCATCCTCCCTGCCCAACGGCGACTACGGCCGC
homo_sapiens-ENSG00000035687,ATGGCGTTAGCCAAGACCTACCCGGCGACATCCTCCCTGCCCAACGGCGATTGCGGCCGC
macaca_mulatta-ENSMMUG00000000016,ATGGCGTTAGCCAAGACCTACCCGGCGACATCCTCCCTGCCCAACGGCGATTGCGGCCGC
pongo_abelii-ENSPPYG00000000057,ATGGCGTTAGCCAAGACCTACCCGGCGACATCCTCCCTGCCCAACGGCGATTGCGGCCGC


In [39]:
whichseq = 1
loader = get_app("load_unaligned", moltype="dna")
aln = loader(dstore[whichseq])

rename = renamer()
trim_stops = get_app("trim_stop_codons")
codon_align = get_app("progressive_align", "codon", guide_tree="(Human:0.06,Chimpanzee:0.06,Gorilla:0.1)")
#writer = get_app("write_seqs", data_store = out_dstore)
app = rename 
app(aln)

NotCompleted(type=ERROR, origin=renamer, source="ENSG00000035687.fa", message="Traceback (most recent call last):
  File "/home/u12/uliseshmc/repos/cogent3/src/cogent3/app/composable.py", line 407, in _call
    result = self.main(val, *args, **kwargs)
  File "/home/u12/uliseshmc/repos/cogent3/src/cogent3/app/composable.py", line 468, in _main
    return self._user_func(**bound.arguments)
           ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_1879/4222804600.py", line 19, in renamer
    seqs = seqs.rename_seqs(lambda x: name_map.get(x.split("-")[0], x))
  File "/home/u12/uliseshmc/repos/cogent3/src/cogent3/core/alignment.py", line 1046, in rename_seqs
    raise ValueError(msg)
ValueError: non-unique names produced by renamer=<function renamer.<locals>.<lambda> at 0x7f0a6e7e3a60>
")

In [14]:
cogent3.app_help("load_unaligned")

Overview
--------
Loads unaligned sequences. Returns a SequenceCollection.

Options for making the app
--------------------------
load_unaligned_app = get_app(
    'load_unaligned',
    moltype=None,
    format='fasta',
)

Parameters
----------
moltype
    molecular type, string or instance
format
    sequence file format

Examples
--------
See https://cogent3.org/doc/app/app_cookbook/load-unaligned.html

Input type
----------
IdentifierType

Output type
-----------
SequenceCollection, SerialisableType


Directory datastore
record type      number
-----------------------
completed             0
not_completed       100
logs                  1
-----------------------
not completed records
type     origin         message                                          num    source                               
---------------------------------------------------------------------------------------------------------------------
ERROR    collect_cds    'TypeError: make_unal... positional argument'    100    ENSG00000243480, ENSG00000240038, ...
---------------------------------------------------------------------------------------------------------------------


In [56]:
hcm = aln.take_seqs(["homo_sapiens", "pan_troglodytes", "gorilla_gorilla"]).degap()
name_map = {
    "homo_sapiens": "Human",
    "pan_troglodytes": "Chimpanzee",
    "gorilla_gorilla": "Gorilla"
}
def renamer(name):
    return name_map.get(name, name)
hcm = hcm.rename_seqs(renamer)
hcm

0,1
,0
Human,TGACCCTTCTGTACCCCACCAGAACATGCCCGGGTGACCTCCTCCCAGATCTTCCTTGTG
Chimpanzee,TGACCCTTCTGTACCCCACCAGAACATGCCCGGGTGACCTCCTCCCAGATCTTCCTTGTG
Gorilla,TGACCCTTCTGTACCCCACCAGAACATGCCCGGGTGACCTCCTCCCAGATCTTCCTTGTG


In [57]:
homoseq = hcm.get_seq("Human")
translated = homoseq.get_translation(include_stop=True)
for i in range(0, len(translated)):
    if translated[i] == "*":
        print(i)

0
522


In [58]:
trimmed_hcm = trim_stops(hcm)

homoseq = trimmed_hcm.get_seq("Human")
translated = homoseq.get_translation(include_stop=True)
for i in range(0, len(translated)):
    if translated[i] == "*":
        print(i)

0


In [59]:
#Doing nucleotide model meanwhile. I got to change this for codon eventually
trim_stops = get_app("trim_stop_codons")

#Check the tree! 
tree = "(Human:0.06,Chimpanzee:0.06,Gorilla:0.1)"
codon_aligner = get_app("progressive_align", "codon", guide_tree=tree)

align_app = trim_stops + codon_aligner

aln_codonaligned = align_app(hcm)
aln_codonaligned

NotCompleted(type=ERROR, origin=progressive_align, source="ENSG00000181754.fa", message="'TGA' at 'Human':0 not in alphabet")

In [60]:
ffold = get_app("take_codon_positions", fourfold_degenerate=True)
#Do I need omit_gap_pos() or degap()
aln_ffold = ffold(aln_codonaligned).omit_gap_pos()

AttributeError: 'NotCompleted' object has no attribute 'omit_gap_pos'

In [None]:
sm = get_app("model", "GN", time_het="max", optimise_motif_probs=True, show_progress=True)
result = sm(aln_ffold)
lengthstree = result.lf.get_lengths_as_ens()
lengthstree

   0%|          |00:00<?

   0%|          |00:00<?

{'Human': np.float64(9.99997582679352e-07),
 'Chimpanzee': np.float64(9.999981644403043e-07),
 'Gorilla': np.float64(1.0000012313770403e-06)}