This code requires codon_aligner.ipynb to be run before

In [29]:
import cogent3
from cogent3 import get_app

#this is a workaround to use phylim on models with splitted codons
#I'm using a workaround to check identifiability of a nucleotide model split by position
#Latter I should check this workaround in case I actually use this model
#Phylim is getting updated to fix such bug

from cogent3.app.result import model_result
from cogent3.app.composable import define_app
from phylim.apps import phylim, PhyloLimitRec

@define_app
def phylim_split_codon(result: model_result, check_one: phylim) -> PhyloLimitRec:
    """checks individual likelihood functions from a split_codon model_result"""
    for k in range (1, 4):
        value = result[k]
        one = model_result(name=result.name, source=result.source)
        one['value'] = value
        checked = check_one(one)
        if not checked.is_identifiable:
            return checked
    return checked

In [36]:
folder_in = '/xdisk/masel/uliseshmc/EstimatingUd/primates10_114/aligns_cds/codon_aligned'
in_dstore = cogent3.open_data_store(folder_in, suffix='fa', mode='r')

loader = get_app("load_aligned", moltype="dna")   
omit_degs = get_app("omit_degenerates", moltype="dna", motif_length=3)
ffold = get_app("take_codon_positions", fourfold_degenerate=True)
concat_alns_app = get_app("concat", moltype="dna")
concat = get_app("concat", moltype="dna")

#create a concatenated alignment with all coding positions
allposprocess = loader+omit_degs
# result below is only valid alignments (no NotCompleted)
results_allpos = [r for r in allposprocess.as_completed(in_dstore[:10], parallel=True) if r]
alpos_alns = concat(results_allpos)
alpos_alns.source = "cds_test_alignments"

#create a concatenated alignment only with four fold positions (should I also do 0fold)
omit_degs.disconnect()
ffoldprocess = loader+omit_degs+ffold
# result below is only valid alignments (no NotCompleted)
results_ffold = [r for r in ffoldprocess.as_completed(in_dstore[:10], parallel=True) if r]
ffold_alns = concat(results_ffold)
ffold_alns.source = "cds_ffold_test_alignments"



   0%|          |00:00<?

   0%|          |00:00<?

In [None]:
#Testing that the concatenated alignemnts have non degenerate variable positions 
new_subset = alpos_alns.take_seqs(["Human", "Gorilla"])
#new_subset = alpos_alns.take_seqs(["Human", "Chimpanzee"])

#after degap and omit_gap_pos the gaps in Gorilla stay there
new_subset.degap()
varpos = new_subset.variable_positions()
just_variable_aln = new_subset.take_positions(varpos)
just_variable_aln

0,1
,0
Gorilla,TAATGACTCTCGCTACACACAGACGAGATCGCTCATCGAGCTCAGAGTATGTCGTCCTTT
Human,CTGGTTGGTCTCGCTACACACAGACGAGATCGCTCATCGAGCTCAGAGTAAGTTGTTCGC


In [37]:
#Do a general nucleotide model of conncatenated sequences concatenated sequences 
#time_het=max sets a 36 params substitution model
sm = get_app("model", "GN", time_het="max", optimise_motif_probs=True, show_progress=True, split_codons=True)
result_allpos = sm(alpos_alns)
print("Model results : \n" + str(result_allpos.lf) + "\n")

print("Distances: ")
lengthstree_pos1 = result_allpos[1].get_lengths_as_ens()
print(lengthstree_pos1)

lengthstree_pos2 = result_allpos[2].get_lengths_as_ens()
print(lengthstree_pos2)

lengthstree_pos3 = result_allpos[3].get_lengths_as_ens()
print(lengthstree_pos1)

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

Model results : 
OrderedDict({1: GN pos-1
log-likelihood = -8466.8883
number of free parameters = 39
edge          parent    length     A>C     A>G     A>T      C>A      C>G
------------------------------------------------------------------------
Human         root        0.00    0.00    0.00    0.00    49.90    49.96
Gorilla       root        0.00    0.00    0.00    0.00    49.90    49.96
Chimpanzee    root        0.00    0.00    0.00    0.00    49.90    49.96
------------------------------------------------------------------------

continued: 
  C>T     G>A     G>C     G>T     T>A     T>C
---------------------------------------------
50.00    0.00    0.00    0.00    0.00    0.00
50.00    0.00    0.00    0.00    0.00    0.00
50.00    0.00    0.00    0.00    0.00    0.00
---------------------------------------------

   A       C       G       T
----------------------------
0.27    0.24    0.31    0.18
----------------------------, 2: GN pos-2
log-likelihood = -8462.1306
number of free

In [41]:
checker = get_app("phylim")


checked_fitted = phylim_split_codon(checker)
checked = checked_fitted(result_allpos)
checked

source,model name,identifiable,has boundary values,version
cds_test_alignments,GN,True,False,2025.7.24


In [None]:
#Get distances of concatenated sequences 
#time_het=max sets a 36 params substitution model
sm = get_app("model", "GN", time_het="max", optimise_motif_probs=True, show_progress=True)
result_ffold = sm(ffold_alns)
checked = checker(result_ffold)
lengthstree = result_ffold.lf.get_lengths_as_ens()
print("Is model identifiable? " + str(checked.is_identifiable) + "\n")
print("Model results : \n" + str(result_ffold.lf) + "\n")
print("Distances: " + str(lengthstree))

   0%|          |00:00<?

   0%|          |00:00<?

Is model identifiable? True

Model results : 
GN
log-likelihood = -658330.4160
number of free parameters = 39
edge          parent    length      A>C      A>G     A>T     C>A     C>G
------------------------------------------------------------------------
Human         root        0.00    29.23    29.23    0.00    0.00    0.00
Gorilla       root        0.00     2.65    14.33    1.06    2.92    1.83
Chimpanzee    root        0.00     0.42     5.10    0.21    0.73    1.17
------------------------------------------------------------------------

continued: 
  C>T      G>A     G>C     G>T     T>A      T>C
-----------------------------------------------
20.07    50.00    0.00    0.00    0.00    13.77
 8.40     5.09    4.63    3.24    3.50    14.01
 1.17     2.78    1.48    0.37    0.20     4.61
-----------------------------------------------

   A       C       G       T
----------------------------
0.21    0.31    0.25    0.23
----------------------------

Distances: {'Human': np.float64(2

In [None]:
print(cogent3.app_help("take_codon_positions"))

Overview
--------
Extracts the specified codon position(s) from an alignment.

Options for making the app
--------------------------
take_codon_positions_app = get_app(
    'take_codon_positions',
    *positions: int,
    fourfold_degenerate=False,
    gc='Standard',
    moltype='dna',
)

Parameters
----------
positions
    either a single integer from (1, 2, 3), or additional keyword
    arguments of position numbers, e.g. 3 is third position, (1,2)
    is first and second codon position
fourfold_degenerate
    if True, returns third positions from four-fold degenerate codons.
    Overrides positions.
gc
    identifier for a genetic code or a genetic code instance.
    see https://cogent3.org/doc/cookbook/what_codes.html
moltype
    molecular type, must be either DNA or RNA

Examples
--------

Create a sample alignment and an app that extracts the 3rd codon
position from an alignment.

>>> from cogent3 import make_aligned_seqs, get_app
>>> aln = make_aligned_seqs(
...     {"s1": "ACGA