In [1]:
import cogent3
from cogent3 import load_unaligned_seqs
from cogent3 import load_aligned_seqs
from cogent3 import get_app
from cogent3 import make_aligned_seqs

@cogent3.app.composable.define_app
def renamer(seqs: cogent3.app.typing.UnalignedSeqsType) -> cogent3.app.typing.UnalignedSeqsType:
    """Relabel the three focal species and keep only those sequences.

    Each sequence name is cut at its first "-" and the leading part is looked
    up in a fixed species table; names without an entry are left unchanged.
    The returned collection is restricted to the records named "Human",
    "Chimpanzee" and "Gorilla".

    NOTE(review): presumably `take_seqs` raises when one of the three names is
    absent from the collection — confirm against the cogent3 documentation.
    """
    common_names = {
        "homo_sapiens": "Human",
        "pan_troglodytes": "Chimpanzee",
        "gorilla_gorilla": "Gorilla"
    }

    def to_common_name(label):
        # The species key is whatever precedes the first "-" in the label.
        species_key = label.partition("-")[0]
        return common_names.get(species_key, label)

    relabelled = seqs.rename_seqs(to_common_name)
    wanted = list(common_names.values())
    return relabelled.take_seqs(wanted)


In [37]:
# Data store locations: unaligned CDS records are read from folder_in and the
# codon alignments are written to folder_out.
folder_in = '/xdisk/masel/uliseshmc/EstimatingUd/primates10_114/aligns_cds/'
folder_out = '/xdisk/masel/uliseshmc/EstimatingUd/primates10_114/aligns_cds/codon_aligned'
in_dstore = cogent3.open_data_store(folder_in, suffix='fa', mode='r')
out_dstore = cogent3.open_data_store(folder_out, suffix='fa', mode="w")

# Pipeline stages, listed in the order they are applied to each record.
loader = get_app("load_unaligned", moltype="dna")
rename = renamer()
# TODO(open question from the author): why trim stop codons? Is only the stop
# codon erased while the rest of the sequence is still analysed?
# NOTE(review): presumably only a terminal stop codon is removed — confirm
# with cogent3.app_help("trim_stop_codons").
trim_stops = get_app("trim_stop_codons")
codon_align = get_app(
    "progressive_align",
    "codon",
    guide_tree="(Human:0.06,Chimpanzee:0.06,Gorilla:0.1)",
)
writer = get_app("write_seqs", data_store=out_dstore)
app = loader + rename + trim_stops + codon_align + writer

# The writer persists every result, so the list returned here is not needed;
# binding it to "_" marks it as intentionally unused.
_ = list(
    app.apply_to(
        in_dstore[:],
        parallel=True,
        show_progress=True,
    )
)

# Summarise what was written and which records did not complete.
print(out_dstore.describe)
print(out_dstore.summary_not_completed)

   0%|          |00:00<?

Directory datastore
record type      number
-----------------------
completed          1694
not_completed       444
logs                 11
-----------------------
not completed records
type     origin     message                                          num    source                                     
-----------------------------------------------------------------------------------------------------------------------
ERROR    renamer    'ValueError: The foll...> at 0x7f351babac00>'    444    ENSG00000243480.fa, ENSG00000185668.fa, ...
-----------------------------------------------------------------------------------------------------------------------


In [2]:
# Read the codon alignments written by the alignment step.
folder_in = '/xdisk/masel/uliseshmc/EstimatingUd/primates10_114/aligns_cds/codon_aligned'
in_dstore = cogent3.open_data_store(folder_in, suffix='fa', mode='r')

loader = get_app("load_aligned", moltype="dna")
# NOTE(review): motif_length=3 presumably makes the filter work on whole
# codons rather than single columns — confirm with app_help("omit_degenerates").
omit_degs = get_app("omit_degenerates", moltype="dna", motif_length=3)
ffold = get_app("take_codon_positions", fourfold_degenerate=True)
concat = get_app("concat", moltype="dna")
# Second name for the same concatenation app. It was previously a separate but
# identically configured instance; kept only so any cell using this name still works.
concat_alns_app = concat

# Concatenated alignment with all coding positions.
allposprocess = loader + omit_degs
# Falsy results (NotCompleted records) are dropped, leaving valid alignments only.
results_allpos = [r for r in allposprocess.as_completed(in_dstore, parallel=True) if r]
alpos_alns = concat(results_allpos)

# Concatenated alignment with fourfold-degenerate positions only.
# TODO(open question from the author): should 0-fold positions be extracted too?
# omit_degs is still linked to loader from the pipeline above. An app can belong
# to only one composed function at a time, so it must be released before being
# composed again. The disconnect used to sit before the first composition,
# where it had no effect.
omit_degs.disconnect()
ffoldprocess = loader + omit_degs + ffold
# Falsy results (NotCompleted records) are dropped, leaving valid alignments only.
results_ffold = [r for r in ffoldprocess.as_completed(in_dstore, parallel=True) if r]
ffold_alns = concat(results_ffold)


reason='has been renamed'
  from cogent3.core import new_alignment as c3_alignment
reason='has been renamed'
  from cogent3.core import new_alphabet as c3_alphabet
reason='has been renamed'
  from cogent3.core import new_moltype as c3_moltype
reason='has been renamed'
  from cogent3.core import new_sequence as c3_sequence


   0%|          |00:00<?

   0%|          |00:00<?

In [31]:
# Check that the variable positions of the concatenated alignment are free of
# gaps and degenerate characters.
new_subset = alpos_alns.take_seqs(["Human", "Gorilla"])
#new_subset = alpos_alns.take_seqs(["Human", "Chimpanzee"])

# Alignment methods return new objects rather than modifying in place. The
# previous bare `new_subset.degap()` call discarded its result, which is why
# the gaps in Gorilla stayed. allowed_gap_frac=0 drops every column that
# contains a gap in either sequence.
new_subset = new_subset.omit_gap_pos(allowed_gap_frac=0)
varpos = new_subset.variable_positions()
just_variable_aln = new_subset.take_positions(varpos)
just_variable_aln

0,1
,0
Gorilla,TAATGACTCTCGCTACACACAGACGAGATCGCTCATCGAGCTCAGAGTATGTCGTCCTTT
Human,CTGGTTGGTCTCGCTACACACAGACGAGATCGCTCATCGAGCTCAGAGTAAGTTGTTCGC


In [56]:
# Fit a general nucleotide (GN) model to the concatenated all-positions alignment.
# time_het=max sets a 36 params substitution model
# split_codons=True fits each codon position separately, so result_allpos.lf is
# a mapping keyed by position (1, 2, 3) rather than a single likelihood function.
checker = get_app("phylim")
sm = get_app("model", "GN", time_het="max", optimise_motif_probs=True, show_progress=True, split_codons=True)
result_allpos = sm(alpos_alns)
print("Model results : \n" + str(result_allpos.lf) + "\n")

# Branch lengths for each codon position.
print("Distances: ")
lengthstree_pos1 = result_allpos[1].get_lengths_as_ens()
print(lengthstree_pos1)

lengthstree_pos2 = result_allpos[2].get_lengths_as_ens()
print(lengthstree_pos2)

lengthstree_pos3 = result_allpos[3].get_lengths_as_ens()
# Fixed: this previously printed lengthstree_pos1 a second time.
print(lengthstree_pos3)

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

Model results : 
OrderedDict({1: GN pos-1
log-likelihood = -1367959.0612
number of free parameters = 39
edge          parent    length     A>C     A>G     A>T     C>A     C>G
----------------------------------------------------------------------
Human         root        0.00    3.44    1.29    0.65    1.38    1.38
Chimpanzee    root        0.00    0.91    3.01    0.39    0.41    0.43
Gorilla       root        0.00    1.17    1.88    0.70    1.51    1.55
----------------------------------------------------------------------

continued: 
 C>T     G>A     G>C     G>T     T>A     T>C
--------------------------------------------
1.38    0.00    2.53    1.10    0.00    6.27
1.47    1.10    0.55    0.67    0.20    1.60
2.75    3.62    0.86    1.13    0.61    2.06
--------------------------------------------

   A       C       G       T
----------------------------
0.27    0.25    0.31    0.17
----------------------------, 2: GN pos-2
log-likelihood = -1375737.2225
number of free parameters 

In [54]:
# FIXME: this call returns NotCompleted instead of a check result.
# The traceback shows phylim reading `model_result.lf.tree`, but result_allpos
# was fitted with split_codons=True, so its `.lf` is an OrderedDict of
# per-codon-position fits and has no `tree` attribute.
# NOTE(review): presumably each codon position has to be checked on its own —
# confirm against the phylim documentation before changing this cell.
checked = checker(result_allpos)
checked

NotCompleted(type=ERROR, origin=phylim, source="unknown", message="Traceback (most recent call last):
  File "/home/u12/uliseshmc/.conda/envs/EstimatingUd/lib/python3.13/site-packages/cogent3/app/composable.py", line 407, in _call
    result = self.main(val, *args, **kwargs)
  File "/home/u12/uliseshmc/.conda/envs/EstimatingUd/lib/python3.13/site-packages/phylim/apps.py", line 136, in main
    tree = model_result.lf.tree  # type: ignore
           ^^^^^^^^^^^^^^^^^^^^
AttributeError: 'collections.OrderedDict' object has no attribute 'tree'
")

In [26]:
# Fit the GN model to the concatenated fourfold-degenerate alignment, run the
# phylim check on the fit, and report the branch lengths.
# time_het=max sets a 36 params substitution model
# `checker` is the phylim app created in the all-positions model cell.
sm = get_app(
    "model",
    "GN",
    time_het="max",
    optimise_motif_probs=True,
    show_progress=True,
)
result_ffold = sm(ffold_alns)
checked = checker(result_ffold)
lengthstree = result_ffold.lf.get_lengths_as_ens()
print(f"Is model identifiable? {checked.is_identifiable!s}\n")
print(f"Model results : \n{result_ffold.lf!s}\n")
print(f"Distances: {lengthstree!s}")

   0%|          |00:00<?

   0%|          |00:00<?

Is model identifiable? True

Model results : 
GN
log-likelihood = -658330.4160
number of free parameters = 39
edge          parent    length      A>C      A>G     A>T     C>A     C>G
------------------------------------------------------------------------
Human         root        0.00    29.23    29.23    0.00    0.00    0.00
Chimpanzee    root        0.00     0.42     5.09    0.21    0.73    1.17
Gorilla       root        0.00     2.65    14.31    1.06    2.92    1.82
------------------------------------------------------------------------

continued: 
  C>T      G>A     G>C     G>T     T>A      T>C
-----------------------------------------------
20.07    50.00    0.00    0.00    0.00    13.77
 1.17     2.77    1.48    0.37    0.20     4.60
 8.39     5.08    4.62    3.23    3.50    13.99
-----------------------------------------------

   A       C       G       T
----------------------------
0.21    0.31    0.25    0.23
----------------------------

Distances: {'Human': np.float64(2

In [None]:
# app_help writes the app's documentation to stdout itself and returns None,
# so wrapping it in print() would add a stray "None" line after the help text.
cogent3.app_help("take_codon_positions")