This notebook requires codon_aligner.ipynb to be run first.

In [1]:
import cogent3
from cogent3 import get_app
import paths
import libs


In [2]:
# Folder holding the codon-aligned CDS files (presumably the output of
# codon_aligner.ipynb -- confirm).
folder_in = paths.DATA_APES114 + 'cds/codon_aligned/'
# Open that folder read-only as a data store limited to members with the 'fa' suffix.
in_dstore = cogent3.open_data_store(folder_in, suffix='fa', mode='r')

# Print the store summary: counts of completed, not_completed and log records.
print(in_dstore.describe)

Directory datastore
record type      number
-----------------------
completed           348
not_completed        48
logs                  3
-----------------------


In [3]:
loader = get_app("load_aligned", moltype="dna")
omit_degs = get_app("omit_degenerates", moltype="dna", motif_length=3)
ffold = get_app("take_codon_positions", fourfold_degenerate=True)
concat = get_app("concat", moltype="dna")
# A second, identical concat app is unnecessary; the old name is kept as an
# alias so anything still referring to it keeps working.
concat_alns_app = concat


def concat_completed(process, dstore, source):
    """Run ``process`` over ``dstore`` and concatenate the successful results.

    Parameters
    ----------
    process
        composed app (loader + filters) applied to each data store member
    dstore
        data store whose members are processed
    source
        value assigned to the ``source`` attribute of the concatenated alignment

    Returns
    -------
    tuple
        ``(results, alignment)``: the list of per-member results that were
        kept, and their concatenation.

    Raises
    ------
    ValueError
        if no member produced a usable result, or the concatenation itself
        came back falsy.
    """
    # Falsy results (NotCompleted records) are dropped, so only valid
    # alignments reach the concatenation step.
    results = [r for r in process.as_completed(dstore[:], parallel=True) if r]
    if not results:
        raise ValueError(f"no alignments completed for {source!r}")

    aln = concat(results)
    if not aln:
        raise ValueError(f"concatenation failed for {source!r}: {aln}")

    aln.source = source
    return results, aln


# Concatenated alignment with all coding positions.
allposprocess = loader + omit_degs
results_allpos, alpos_alns = concat_completed(
    allposprocess, in_dstore, "cds_alignments"
)

# Concatenated alignment with only four-fold degenerate positions.
# TODO: decide whether a 0-fold alignment is also needed.
# disconnect() releases omit_degs from the first composition so that it can be
# composed again below.
omit_degs.disconnect()
ffoldprocess = loader + omit_degs + ffold
results_ffold, ffold_alns = concat_completed(
    ffoldprocess, in_dstore, "cds_ffold_test_alignments"
)



   0%|          |00:00<?

   0%|          |00:00<?

In [4]:
# Bare expression: the notebook displays a preview of the concatenated alignment.
alpos_alns

0,1
,0
Chimpanzee,ATGGCATTCAGGAGACAAGTGAAAAACTTTGTGAAAAATTACTCAGATGCTGAAATAAAA
Human,............................................................
Gorilla,............................................................


In [6]:
# Check that the concatenated alignment has non-degenerate variable positions.
# To restrict the check to a pair of species, start from
# alpos_alns.take_seqs(["Human", "Gorilla"]) instead of alpos_alns.

# Alignment methods return new objects, so the gap-free alignment has to be
# bound to a name; a bare `aln.degap()` call discards its result and leaves the
# gaps in place. omit_gap_pos(allowed_gap_frac=0) drops every column that
# contains a gap while keeping the sequences aligned, which
# variable_positions() / take_positions() require.
new_subset = alpos_alns.omit_gap_pos(allowed_gap_frac=0)
varpos = new_subset.variable_positions()
just_variable_aln = new_subset.take_positions(varpos)
just_variable_aln

0,1
,0
Chimpanzee,AGGCTTATCCCTCCACGCAGATCGCAGCCCGAGCTAGGTACAGAGGACCATTTGTGACGT
Gorilla,GAATCCGCTTTCGAGTAGTTGCAATTCTGTAGAGAGTACTTTTTATTTTGCACCCAGTCC
Human,...........................................................C


In [7]:
# Fit a general nucleotide (GN) model to the concatenated alignment, with
# split_codons=True so that each codon position gets its own fit
# (the result is indexed by position: 1, 2, 3).
# time_het="max": each edge has its own set of rate parameters, as the
# per-edge columns of the fitted table show.
sm = get_app(
    "model",
    "GN",
    time_het="max",
    optimise_motif_probs=True,
    show_progress=True,
    split_codons=True,
)
result_allpos = sm(alpos_alns)
print(f"Model results : \n{result_allpos.lf}\n")

print("Distances: ")
# Branch lengths (edge name -> length) for codon positions 1, 2 and 3.
lengthstree_pos1, lengthstree_pos2, lengthstree_pos3 = (
    result_allpos[codon_pos].get_lengths_as_ens() for codon_pos in (1, 2, 3)
)
print(lengthstree_pos1, lengthstree_pos2, lengthstree_pos3, sep="\n\n")

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

Model results : 
OrderedDict({1: GN pos-1
log-likelihood = -258252.4844
number of free parameters = 39
edge          parent    length      A>C      A>G     A>T     C>A      C>G
-------------------------------------------------------------------------
Gorilla       root        0.00     1.48     1.48    0.21    0.74     0.56
Human         root        0.00     0.00    50.00    0.00    0.00     0.00
Chimpanzee    root        0.00    10.13    50.00    0.00    8.87    26.62
-------------------------------------------------------------------------

continued: 
  C>T      G>A      G>C     G>T      T>A      T>C
-------------------------------------------------
 1.48     1.43     1.59    1.12     0.33     1.00
 0.00     0.00     0.00    0.00     0.00     0.00
44.36    38.17    22.90    0.00    31.97    47.96
-------------------------------------------------

   A       C       G       T
----------------------------
0.24    0.28    0.32    0.15
----------------------------, 2: GN pos-2
log-likeli

In [8]:
# phylim app: its report (displayed below) lists whether the fitted model is
# identifiable and whether it has boundary values.
checker = get_app("phylim")


# libs.phylim_split_codon is a project helper; presumably it adapts the checker
# to the split-codon result (one fit per codon position) -- confirm in libs.
checked_fitted = libs.phylim_split_codon(checker)
checked = checked_fitted(result_allpos)
# Bare expression: display the check report as the cell output.
checked

source,model name,identifiable,has boundary values,version
cds_alignments,GN,True,False,2025.8.27


In [9]:
# Fit the GN model to the concatenated four-fold degenerate alignment and
# report identifiability, the fitted model and the branch lengths.
# time_het="max": each edge has its own set of rate parameters.
sm = get_app(
    "model",
    "GN",
    time_het="max",
    optimise_motif_probs=True,
    show_progress=True,
)
result_ffold = sm(ffold_alns)
checked = checker(result_ffold)
lengthstree = result_ffold.lf.get_lengths_as_ens()

print(f"Is model identifiable? {checked.is_identifiable}\n")
print(f"Model results : \n{result_ffold.lf}\n")
print(f"Distances: {lengthstree}")

   0%|          |00:00<?

   0%|          |00:00<?

Is model identifiable? True

Model results : 
GN
log-likelihood = -125901.6556
number of free parameters = 39
edge          parent    length      A>C      A>G     A>T     C>A     C>G
------------------------------------------------------------------------
Gorilla       root        0.00     0.00     6.38    2.13    1.26    1.26
Human         root        0.00     0.00    50.00    0.00    0.00    0.00
Chimpanzee    root        0.00    12.76    50.00    0.00    5.03    0.00
------------------------------------------------------------------------

continued: 
  C>T     G>A     G>C     G>T     T>A      T>C
----------------------------------------------
 2.93    3.56    0.51    0.00    1.00     7.03
 0.00    0.00    0.00    0.00    0.00     0.00
15.09    6.11    0.00    6.11    0.00    12.05
----------------------------------------------

   A       C       G       T
----------------------------
0.15    0.38    0.31    0.16
----------------------------

Distances: {'Gorilla': np.float64(0.000

In [11]:
# Keep only the Human and Chimpanzee sequences of the concatenated alignment.
hum_chimps_aln = alpos_alns.take_seqs(["Human", "Chimpanzee"])
# Bare expression: display a preview of the two-sequence alignment.
hum_chimps_aln

0,1
,0
Chimpanzee,ATGGCATTCAGGAGACAAGTGAAAAACTTTGTGAAAAATTACTCAGATGCTGAAATAAAA
Human,............................................................


In [15]:
# Fit a Kimura 1980 (K80) substitution model to the Human-Chimpanzee pair.
# NOTE(review): the fitted table reports base frequencies of 0.25 each and only
# 3 free parameters, so optimise_motif_probs=True appears to have no effect
# for K80 -- confirm.
sm = get_app("model", "K80", optimise_motif_probs=True, show_progress=True)
result = sm(hum_chimps_aln)
print(f"Model results : \n{result.lf}\n")

print("Distances: ")
# Stored under its own name: binding these K80 lengths to `lengthstree_pos1`
# would overwrite the codon-position-1 GN branch lengths computed earlier.
lengthstree_k80 = result.lf.get_lengths_as_ens()
print(lengthstree_k80)


   0%|          |00:00<?

Model results : 
K80
log-likelihood = -791665.0255
number of free parameters = 3
=====
kappa
-----
 3.46
-----
edge          parent    length
------------------------------
Human         root        0.00
Chimpanzee    root        0.00
------------------------------
   A       C       G       T
----------------------------
0.25    0.25    0.25    0.25
----------------------------

Distances: 
{'Human': np.float64(0.00015084262330534678), 'Chimpanzee': np.float64(2.6280982611490735e-05)}
