In [3]:
import cogent3
from cogent3 import get_app
from cogent3 import load_aligned_seqs
from cogent3 import load_unaligned_seqs
#import paths
import libs

# CDS sample alignment

I sampled the chromosome 22 coding sequences (CDS) using:
```
eti homologs -i install/ --outdir cds --ref human --coord_names 22 
```
I then trimmed the stop codons and aligned all the sequences with a codon alignment model using:
```
trim_stops = get_app("trim_stop_codons")
codon_align = get_app("progressive_align", "codon", guide_tree="(Human:0.06,Chimpanzee:0.06,Gorilla:0.1)")
```
I store the aligned sequences in the folder cds/codon_aligned/

In [2]:
# Folder holding the codon-aligned CDS FASTA files, relative to the notebook.
# Alternative rooted at the shared data directory (needs `import paths`):
#   folder_in = paths.DATA_APES114 + 'cds/codon_aligned/'
folder_in = "cds/codon_aligned/"

# FASTA file to analyse; the file is named after the gene identifier.
sequence = "ENSG00000100304.fa"

# Read the alignment from disk as DNA.
aln = load_aligned_seqs(
    filename=f"{folder_in}{sequence}",
    format="fasta",
    moltype="dna",
)

# Leaving the alignment as the last expression renders it in the cell output.
aln

reason='no longer has an effect'
  deprecated(


0,1
,0
Chimpanzee,ATGGAGGCCGAGCGGGGTCCCGAGCGCCCGCCTGCGGAGCGTAGCAGCCCGGGCCAGACG
Human,............................................................
Gorilla,............................................................


In [3]:
# Filter the alignment with the "omit_degenerates" app, handling the
# sequences as motifs of length 3 so the filtering works in codon-sized units.
omit_degs = get_app("omit_degenerates", moltype="dna", motif_length=3)
aln = omit_degs(aln)

# Fit a general nucleotide (GN) model to the concatenated sequences.
# time_het="max" sets a 36-parameter substitution model.
# split_codons=True gives one fitted likelihood function per codon position;
# they are accessed below with the keys 1, 2 and 3.
sm = get_app(
    "model",
    "GN",
    time_het="max",
    optimise_motif_probs=True,
    show_progress=True,
    split_codons=True,
)
result_sm = sm(aln)
print(f"Model results : \n{result_sm.lf!s}\n")

print("Distances: ")
# Branch lengths (as expected numbers of substitutions) for each codon position.
lengthstree_pos1, lengthstree_pos2, lengthstree_pos3 = (
    result_sm[codon_position].get_lengths_as_ens()
    for codon_position in (1, 2, 3)
)
print(lengthstree_pos1, lengthstree_pos2, lengthstree_pos3, sep="\n")


   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

   0%|          |00:00<?

Model results : 
OrderedDict({1: GN pos-1
log-likelihood = -865.6992
number of free parameters = 39
edge          parent    length     A>C     A>G     A>T      C>A      C>G
------------------------------------------------------------------------
Human         root        0.00    0.00    0.00    0.00    50.00    50.00
Chimpanzee    root        0.00    0.00    0.00    0.00    50.00    50.00
Gorilla       root        0.00    0.00    0.00    0.00    50.00    50.00
------------------------------------------------------------------------

continued: 
  C>T     G>A     G>C     G>T     T>A     T>C
---------------------------------------------
50.00    0.00    0.00    0.00    0.00    0.00
50.00    0.00    0.00    0.00    0.00    0.00
50.00    0.00    0.00    0.00    0.00    0.00
---------------------------------------------

   A       C       G       T
----------------------------
0.21    0.31    0.33    0.16
----------------------------, 2: GN pos-2
log-likelihood = -874.7426
number of free p

# Intron sample alignment

I sampled the chromosome 22 introns, masking ancestral repeats and CDS, using:
```
eti alignments -i install -od introns --align_name 10_primates* --ref human --mask cds_allAR_1column.txt --coord_names 22 
```
where `cds_allAR_1column.txt` is a list containing all the biotypes for ancestral repeats and CDS.

In [5]:
# Folder holding the sampled intron alignments, relative to the notebook.
# Alternative rooted at the shared data directory (needs `import paths`):
#   folder_in = paths.DATA_APES114 + 'introns/'
folder_in = "introns/"

# FASTA file to analyse; the file is named after the gene identifier.
sequence = "ENSG00000100373.fa"

# Read the alignment from disk as DNA.
aln = load_aligned_seqs(
    filename=f"{folder_in}{sequence}",
    format="fasta",
    moltype="dna",
)

# Leaving the alignment as the last expression renders it in the cell output.
aln

reason='no longer has an effect'
  deprecated(


0,1
,0
homo_sapiens:22:45284948-45295874:1,AGGTC-------------------------------------------------------
gorilla_gorilla:22:28853875-28862222:1,............................................................
pan_troglodytes:22:32060815-32072454:1,............................................................


In [5]:
# Helper from the project-local `libs` module. Judging by the edge names in
# the model output (Human, Chimpanzee, Gorilla), it presumably replaces the
# long "species:chrom:start-end:strand" labels with short names — confirm in libs.
rename = libs.renamer_aligned()
omit_degs = get_app("omit_degenerates", moltype="dna")

# Relabel the sequences first, then apply the degenerate-character filter.
aln = omit_degs(rename(aln))

# General nucleotide (GN) model with time_het="max", fitted to the whole
# alignment (no codon splitting for intronic sequence).
sm = get_app(
    "model",
    "GN",
    time_het="max",
    optimise_motif_probs=True,
    show_progress=True,
)
# NOTE(review): the result is named "intergenic" although this section
# analyses introns — consider renaming once downstream uses are checked.
result_intergenic = sm(aln)

lengthstree = result_intergenic.lf.get_lengths_as_ens()
print(f"Model results : \n{result_intergenic.lf!s}\n")
print(f"Distances: {lengthstree!s}")

   0%|          |00:00<?

   0%|          |00:00<?

Model results : 
GN
log-likelihood = -5179.6405
number of free parameters = 39
edge          parent    length      A>C      A>G     A>T     C>A     C>G
------------------------------------------------------------------------
Human         root        0.01     0.90     3.60    0.89    0.00    3.03
Chimpanzee    root        0.01    10.92    21.52    5.35    4.58    4.56
Gorilla       root        0.01     0.59     1.80    0.00    0.25    0.50
------------------------------------------------------------------------

continued: 
  C>T      G>A     G>C     G>T     T>A      T>C
-----------------------------------------------
 6.90     4.72    0.00    0.66    0.00     2.95
50.00    16.28    3.96    0.00    0.00    11.76
 2.05     2.91    0.00    0.66    0.00     0.00
-----------------------------------------------

   A       C       G       T
----------------------------
0.23    0.27    0.30    0.20
----------------------------

Distances: {'Human': np.float64(0.00935349688630193), 'Chimpanze

# Intron AR sample alignment

I sampled the chromosome 22 introns, masking everything but ancestral repeats, using:
```
eti alignments -i install -od intronsAR --align_name 10_primates* --ref human --mask_shadow ancestralrepeats_1column.txt --coord_names 22 
```
where ancestralrepeats_1column.txt is a list containing the ancestral repeats biotypes: 
Type I Transposons/LINE,
Type I Transposons/SINE,
Type II Transposons,
LTRs

In [6]:
# Folder holding the intron alignments restricted to ancestral repeats,
# relative to the notebook.
# Alternative rooted at the shared data directory (needs `import paths`):
#   folder_in = paths.DATA_APES114 + 'intronsAR/'
folder_in = "intronsAR/"

# FASTA file to analyse; the file is named after the gene identifier.
sequence = "ENSG00000100373.fa"

# Read the alignment from disk as DNA.
aln = load_aligned_seqs(
    filename=f"{folder_in}{sequence}",
    format="fasta",
    moltype="dna",
)

# Leaving the alignment as the last expression renders it in the cell output.
aln

reason='no longer has an effect'
  deprecated(


0,1
,0
pan_troglodytes:22:32060815-32072454:1,?????-------------------------------------------------------
homo_sapiens:22:45284948-45295874:1,............................................................
gorilla_gorilla:22:28853875-28862222:1,............................................................


In [7]:
# Helper from the project-local `libs` module. Judging by the edge names in
# the model output (Human, Chimpanzee, Gorilla), it presumably replaces the
# long "species:chrom:start-end:strand" labels with short names — confirm in libs.
rename = libs.renamer_aligned()
omit_degs = get_app("omit_degenerates", moltype="dna")

# Relabel the sequences first, then apply the degenerate-character filter.
aln = omit_degs(rename(aln))

# General nucleotide (GN) model with time_het="max", fitted to the whole
# ancestral-repeat alignment.
sm = get_app(
    "model",
    "GN",
    time_het="max",
    optimise_motif_probs=True,
    show_progress=True,
)
# NOTE(review): the result is named "intergenic" although this section
# analyses intronic ancestral repeats — consider renaming once downstream
# uses are checked.
result_intergenic = sm(aln)

lengthstree = result_intergenic.lf.get_lengths_as_ens()
print(f"Model results : \n{result_intergenic.lf!s}\n")
print(f"Distances: {lengthstree!s}")

   0%|          |00:00<?

   0%|          |00:00<?

Model results : 
GN
log-likelihood = -4155.1759
number of free parameters = 39
edge          parent    length     A>C      A>G      A>T     C>A      C>G
-------------------------------------------------------------------------
Human         root        0.02    1.00     2.54     0.00    0.46     0.47
Chimpanzee    root        0.01    1.03     3.08     1.03    0.93     3.88
Gorilla       root        0.01    0.00    47.72    17.84    0.00    11.08
-------------------------------------------------------------------------

continued: 
  C>T      G>A     G>C     G>T     T>A      T>C
-----------------------------------------------
 3.80     4.55    2.21    0.00    0.49     3.03
 5.90     9.27    0.91    0.00    2.04     7.25
50.00    32.30    0.00    0.00    0.00    41.75
-----------------------------------------------

   A       C       G       T
----------------------------
0.24    0.25    0.27    0.24
----------------------------

Distances: {'Human': np.float64(0.016518668601997945), 'Ch