Run location_inter_intragenic.ipynb before running this notebook.

In [1]:
import cogent3
from cogent3 import load_aligned_seqs
from cogent3 import get_app
from phylim.apps import phylim

import libs
import paths

In [2]:
# Open the directory of intergenic FASTA (.fa) alignments in read-only mode.
folder_in = f"{paths.DATA_APES114}intergenic/"
in_dstore = cogent3.open_data_store(folder_in, mode="r", suffix="fa")

# Show the data store's summary of its records.
print(in_dstore.describe)

Directory datastore
record type      number
-----------------------
completed           706
not_completed         0
logs                  1
-----------------------


In [3]:
# Per-alignment pipeline: load each file as a DNA alignment, run the
# omit_degenerates app on it, then run the project's renamer_aligned app.
loader = get_app("load_aligned", moltype="dna")
omit_degs = get_app("omit_degenerates", moltype="dna")
# Author's notes on renamer_aligned: some alignment files include duplicate
# sequences for a species; the app is meant to leave a single sequence per
# species (one taken at random, the others discarded), which throws away
# paralogs.
# NOTE(review): the helper's implementation is not visible here -- confirm.
rename = libs.renamer_aligned()
app = loader + omit_degs + rename

# Run the pipeline over every data store member in parallel, keeping only
# the truthy results (falsy results are dropped before concatenation).
results_allpos = list(filter(None, app.as_completed(in_dstore[:], parallel=True)))

# Join the surviving alignments into one alignment and label its source.
concat = get_app("concat", moltype="dna")
alpos_alns = concat(results_allpos)
alpos_alns.source = "intergenic_alignments"
alpos_alns


   0%|          |00:00<?

0,1
,0
Chimpanzee,GCGTCGCCGACGTGGGCGTGGACTGTCGTCATTGGATGGAGTCGGTCAAGGCTGGGGACG
Gorilla,.......T..................T................C................
Human,....................................C......C...............A


In [4]:
# Fit the "GN" substitution model to the concatenated alignment to obtain
# branch-length distances.
# Author's note: time_het="max" sets a 36-parameter substitution model.
sm = get_app(
    "model",
    "GN",
    time_het="max",
    optimise_motif_probs=True,
    show_progress=True,
)
result_intergenic = sm(alpos_alns)

# Pass the fitted model through the phylim app; its is_identifiable flag is
# reported below.
checker = get_app("phylim")
checked = checker(result_intergenic)

# Branch lengths from the fitted likelihood function, via get_lengths_as_ens().
lengthstree = result_intergenic.lf.get_lengths_as_ens()

print(f"Is model identifiable? {checked.is_identifiable!s}\n")
print(f"Model results : \n{result_intergenic.lf!s}\n")
print(f"Distances: {lengthstree!s}")

   0%|          |00:00<?

   0%|          |00:00<?

Is model identifiable? True

Model results : 
GN
log-likelihood = -2982321.1610
number of free parameters = 39
edge          parent    length     A>C     A>G     A>T     C>A     C>G
----------------------------------------------------------------------
Human         root        0.01    1.07    4.01    0.70    1.30    1.60
Gorilla       root        0.01    0.95    4.33    0.67    1.18    1.52
Chimpanzee    root        0.01    0.98    3.69    0.66    1.12    1.48
----------------------------------------------------------------------

continued: 
 C>T     G>A     G>C     G>T     T>A     T>C
--------------------------------------------
6.30    6.56    1.46    1.31    0.63    4.02
6.09    6.13    1.53    1.22    0.64    4.29
5.70    5.73    1.44    1.08    0.68    3.38
--------------------------------------------

   A       C       G       T
----------------------------
0.24    0.26    0.26    0.24
----------------------------

Distances: {'Human': np.float64(0.007351086704508361), 'Gorill

In [None]:
# Restrict the concatenated alignment to the Human and Chimpanzee sequences.
hum_chimps_aln = alpos_alns.take_seqs(["Human", "Chimpanzee"])

# Fit a Kimura 1980 (K80) substitution model to the two-sequence alignment.
# NOTE(review): K80 is normally defined with equal base frequencies, so
# optimise_motif_probs=True may have no effect here -- confirm.
sm = get_app(
    "model",
    "K80",
    optimise_motif_probs=True,
    show_progress=True,
)
result = sm(hum_chimps_aln)
print(f"Model results : \n{result.lf!s}\n")

# NOTE(review): the "_pos1" suffix looks carried over from another analysis;
# nothing in this cell selects a codon position -- confirm the intended name.
print("Distances: ")
lengthstree_pos1 = result.lf.get_lengths_as_ens()
print(lengthstree_pos1)