Testing the analysis of CDS data without aligning it first

In [1]:
import cogent3
from cogent3 import get_app
import paths
import libs


In [2]:
# Open the directory of CDS FASTA files (suffix "fa") read-only and
# summarise how many records it holds.
folder_in = paths.DATA_APES114 + "cds"
in_dstore = cogent3.open_data_store(folder_in, mode="r", suffix="fa")

print(in_dstore.describe)

Directory datastore
record type      number
-----------------------
completed           396
not_completed         0
logs                  1
-----------------------


In [3]:
# Per-file pipeline: load an alignment, drop degenerate characters, then rename.
loader = get_app("load_aligned", moltype="dna")
omit_degs = get_app("omit_degenerates", moltype="dna")
# Some alignment files include duplicate sequences for a species. The rename
# app makes sure each alignment keeps only one sequence per species: one is
# taken at random and the others are discarded.
rename = libs.temp_renamer_aligned_cds()
allposprocess = loader + omit_degs + rename

# Run the pipeline over every member of the data store and keep only the valid
# alignments (NotCompleted results are falsy, so filter(None, ...) drops them).
results_allpos = list(
    filter(None, allposprocess.as_completed(in_dstore[:], parallel=True))
)

# Create a single concatenated alignment with all coding positions.
concat = get_app("concat", moltype="dna")
alpos_alns = concat(results_allpos)
alpos_alns.source = "cds_alignments"


   0%|          |00:00<?

In [4]:
# Display the concatenated alignment built in the previous cell.
alpos_alns

0,1
,0
Chimpanzee,ATGCAGGCGGTGCGGCACGTCGTGTGCGCCCTGTCCGGCGGCGTGGACAGCGCCGTGGCC
Gorilla,............................................................
Human,............................................................


In [5]:
# Check that the concatenated alignment has non-degenerate variable positions.
# To restrict the check to a pair of species, bind `new_subset` to
# alpos_alns.take_seqs(["Human", "Gorilla"]) instead.
new_subset = alpos_alns

# NOTE: a bare `new_subset.degap()` call used to sit here. `degap()` returns a
# new, unaligned sequence collection and does not modify the alignment it is
# called on, so discarding its result had no effect; the dead call was removed.
# NOTE(review): in the rendered alignment "." appears to mark a base identical
# to the first sequence rather than a gap -- confirm against the cogent3 docs.
varpos = new_subset.variable_positions()
just_variable_aln = new_subset.take_positions(varpos)
just_variable_aln

0,1
,0
Chimpanzee,ACTTACTAAAGACCACAGTTACGCCAG
Gorilla,GTCCGTCCGTAGTTGTGAGCCAATTGA
Human,.......................T...


In [6]:
# Fit a general nucleotide (GN) model to the concatenated sequences, with
# split_codons=True so the result holds one fit per codon position (1, 2, 3).
# time_het="max" sets a 36 params substitution model
sm = get_app(
    "model",
    "GN",
    time_het="max",
    optimise_motif_probs=True,
    show_progress=True,
    split_codons=True,
)
result_allpos = sm(alpos_alns)
print(f"Model results : \n{result_allpos.lf!s}\n")

print("Distances: ")
# Collect and print the lengths (as ENS) for each codon position in order.
ens_by_position = {}
for codon_position in (1, 2, 3):
    ens_by_position[codon_position] = result_allpos[codon_position].get_lengths_as_ens()
    print(ens_by_position[codon_position])

# Keep the per-position names available to later cells.
lengthstree_pos1 = ens_by_position[1]
lengthstree_pos2 = ens_by_position[2]
lengthstree_pos3 = ens_by_position[3]

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

Model results : 
OrderedDict({1: GN pos-1
log-likelihood = -257562.2835
number of free parameters = 39
edge          parent    length      A>C      A>G     A>T     C>A     C>G
------------------------------------------------------------------------
Chimpanzee    root        0.00     0.00    50.00    0.00    0.00    0.00
Gorilla       root        0.00    50.00    50.00    0.00    0.00    0.00
Human         root        0.00     0.00    50.00    0.00    0.00    0.00
------------------------------------------------------------------------

continued: 
 C>T      G>A     G>C     G>T     T>A     T>C
---------------------------------------------
0.00     0.00    0.00    0.00    0.00    0.00
0.00    23.56    0.00    0.00    0.00    0.00
0.00     0.00    0.00    0.00    0.00    0.00
---------------------------------------------

   A       C       G       T
----------------------------
0.26    0.27    0.28    0.19
----------------------------, 2: GN pos-2
log-likelihood = -259018.3287
number of 

In [7]:
# Build the phylim app; `checker` is reused later on the non-split fit.
checker = get_app("phylim")

# Wrap the checker with the project helper and run it on the split-codon fit.
# NOTE(review): presumably `libs.phylim_split_codon` adapts the checker to a
# result holding one fit per codon position -- confirm in `libs`.
checked_fitted = libs.phylim_split_codon(checker)
checked = checked_fitted(result_allpos)
# Bare last expression so the notebook renders the check result.
checked

source,model name,identifiable,has boundary values,version
cds_alignments,GN,True,True,2025.8.27


In [8]:
# Get distances for the concatenated sequences by fitting GN to the whole
# alignment (no split_codons here, so there is a single likelihood function).
# time_het="max" sets a 36 params substitution model
sm = get_app(
    "model",
    "GN",
    time_het="max",
    optimise_motif_probs=True,
    show_progress=True,
)
result_allcodons = sm(alpos_alns)

# Run the phylim check on the fit and extract the lengths as ENS.
checked = checker(result_allcodons)
lengthstree = result_allcodons.lf.get_lengths_as_ens()
print(f"Is model identifiable? {checked.is_identifiable!s}\n")
print(f"Model results : \n{result_allcodons.lf!s}\n")
print(f"Distances: {lengthstree!s}")

   0%|          |00:00<?

   0%|          |00:00<?

Is model identifiable? True

Model results : 
GN
log-likelihood = -770287.6904
number of free parameters = 39
edge          parent    length     A>C     A>G     A>T     C>A     C>G
----------------------------------------------------------------------
Chimpanzee    root        0.00    0.00    0.00    0.00    0.00    0.00
Gorilla       root        0.00    1.68    5.88    0.84    0.64    0.00
Human         root        0.00    0.00    0.00    0.00    0.00    0.00
----------------------------------------------------------------------

continued: 
  C>T     G>A     G>C     G>T     T>A      T>C
----------------------------------------------
 0.00    0.00    0.00    0.00    0.00    50.00
 3.84    2.62    0.00    0.00    0.00     4.01
49.97    0.00    0.00    0.00    0.00     0.00
----------------------------------------------

   A       C       G       T
----------------------------
0.23    0.30    0.29    0.19
----------------------------

Distances: {'Chimpanzee': np.float64(1.779192803976