This code requires to run codon_aligner.ipynb before

In [1]:
import cogent3
from cogent3 import get_app
import paths
import libs


In [2]:
folder_in = paths.DATA_APES114 + 'cds/codon_aligned/'
in_dstore = cogent3.open_data_store(folder_in, suffix='fa', mode='r')

print(in_dstore.describe)

Directory datastore
record type      number
-----------------------
completed            36
not_completed         8
logs                  5
-----------------------


In [3]:
loader = get_app("load_aligned", moltype="dna")   
omit_degs = get_app("omit_degenerates", moltype="dna", motif_length=3)
ffold = get_app("take_codon_positions", fourfold_degenerate=True)
concat_alns_app = get_app("concat", moltype="dna")
concat = get_app("concat", moltype="dna")

#create a concatenated alignment with all coding positions
allposprocess = loader+omit_degs
# result below is only valid alignments (no NotCompleted).
results_allpos = [r for r in allposprocess.as_completed(in_dstore[:10], parallel=True) if r]
alpos_alns = concat(results_allpos)
alpos_alns.source = "cds_alignments"

#create a concatenated alignment only with four fold positions (should I also do 0fold)
omit_degs.disconnect()
ffoldprocess = loader+omit_degs+ffold
# result below is only valid alignments (no NotCompleted)
results_ffold = [r for r in ffoldprocess.as_completed(in_dstore, parallel=True) if r]
ffold_alns = concat(results_ffold)
ffold_alns.source = "cds_ffold_test_alignments"



   0%|          |00:00<?

   0%|          |00:00<?

In [4]:
alpos_alns

0,1
,0
Chimpanzee,ATGAATCCACAGATCAGAAATCCGATGGAGCGGATGTATCGACGCACATTCTACAACCAC
Human,............................................................
Gorilla,............................................................


In [5]:
#Testing that the concatenated alignemnts have non degenerate variable positions 
new_subset = alpos_alns.take_seqs(["Human", "Gorilla"])
#new_subset = alpos_alns.take_seqs(["Human", "Chimpanzee"])

#after degap and omit_gap_pos the gaps in Gorilla stay there
new_subset.degap()
varpos = new_subset.variable_positions()
just_variable_aln = new_subset.take_positions(varpos)
just_variable_aln

ValueError: Cannot apply_along_axis when any iteration dimensions are 0

2 x 0 dna alignment: Human[], Gorilla[]

In [6]:
#Do a general nucleotide model of conncatenated sequences concatenated sequences 
#time_het=max sets a 36 params substitution model
sm = get_app("model", "GN", time_het="max", optimise_motif_probs=True, show_progress=True, split_codons=True)
result_allpos = sm(alpos_alns)
print("Model results : \n" + str(result_allpos.lf) + "\n")

print("Distances: ")
lengthstree_pos1 = result_allpos[1].get_lengths_as_ens()
print(lengthstree_pos1)

lengthstree_pos2 = result_allpos[2].get_lengths_as_ens()
print(lengthstree_pos2)

lengthstree_pos3 = result_allpos[3].get_lengths_as_ens()
print(lengthstree_pos1)

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

Model results : 
OrderedDict({1: GN pos-1
log-likelihood = -7345.8711
number of free parameters = 39
edge          parent    length     A>C     A>G     A>T      C>A      C>G
------------------------------------------------------------------------
Chimpanzee    root        0.00    0.00    0.00    0.00    49.99    49.90
Human         root        0.00    0.00    0.00    0.00    49.99    49.90
Gorilla       root        0.00    0.00    0.00    0.00    49.99    49.90
------------------------------------------------------------------------

continued: 
  C>T     G>A     G>C     G>T     T>A     T>C
---------------------------------------------
50.00    0.00    0.00    0.00    0.00    0.00
50.00    0.00    0.00    0.00    0.00    0.00
50.00    0.00    0.00    0.00    0.00    0.00
---------------------------------------------

   A       C       G       T
----------------------------
0.27    0.25    0.32    0.16
----------------------------, 2: GN pos-2
log-likelihood = -7336.9831
number of free

In [7]:
checker = get_app("phylim")


checked_fitted = libs.phylim_split_codon(checker)
checked = checked_fitted(result_allpos)
checked

source,model name,identifiable,has boundary values,version
cds_alignments,GN,True,True,2025.8.27


In [8]:
#Get distances of concatenated sequences 
#time_het=max sets a 36 params substitution model
sm = get_app("model", "GN", time_het="max", optimise_motif_probs=True, show_progress=True)
result_ffold = sm(ffold_alns)
checked = checker(result_ffold)
lengthstree = result_ffold.lf.get_lengths_as_ens()
print("Is model identifiable? " + str(checked.is_identifiable) + "\n")
print("Model results : \n" + str(result_ffold.lf) + "\n")
print("Distances: " + str(lengthstree))

   0%|          |00:00<?

   0%|          |00:00<?

Is model identifiable? True

Model results : 
GN
log-likelihood = -10509.5818
number of free parameters = 39
edge          parent    length     A>C     A>G     A>T      C>A      C>G
------------------------------------------------------------------------
Chimpanzee    root        0.00    0.00    0.00    0.00     0.00     0.01
Human         root        0.00    0.00    0.00    0.00     0.00     0.00
Gorilla       root        0.00    0.00    0.00    0.00    24.39    24.40
------------------------------------------------------------------------

continued: 
  C>T      G>A     G>C     G>T     T>A      T>C
-----------------------------------------------
24.20     0.00    0.00    0.00    0.00    49.99
11.92     0.00    0.00    0.00    0.00    50.00
24.41    29.23    0.00    0.00    0.00    50.00
-----------------------------------------------

   A       C       G       T
----------------------------
0.16    0.36    0.30    0.17
----------------------------

Distances: {'Chimpanzee': np.float