In [None]:
import numpy as np

from data_simulation import generate_case, rate_sub_HKY, scale_branches_length
from felsenstein import pruning
from tree_serialisation import load_tree

In [None]:
# SIMULATION PARAMETERS
# Every value a reader might want to tune for the simulation is set here.
# NOTE(review): no RNG seed is set in this cell, so `pi` is redrawn on every
# run unless a seed is set elsewhere -- confirm whether that is intended.
tree_path = "tree.json"
number_of_nucleotids = 100
alphabet = ['A', 'C', 'T', 'G']
alphabetSize = len(alphabet)

nbState = 4
# Transition matrix of the toy gene finder (each row sums to 1):
#   state 0 -> state 1 and state 1 -> state 2 with probability 1,
#   state 2 -> state 3 with probability 0.011, otherwise back to state 0,
#   state 3 stays in state 3 with probability 0.33, otherwise back to state 0.
A = np.zeros((nbState, nbState))
A[0, 1] = 1
A[1, 2] = 1
A[2, 3] = 0.011
A[2, 0] = 1 - A[2, 3]
A[3, 3] = 0.33  # 0.9999  # unrealistic ...
A[3, 0] = 1 - A[3, 3]

# Initial probability of each state (sums to 1).
b = np.array([0.25, 0.25, 0.26, 0.24])

animalNames = ["dog", "cat", "pig", "cow", "rat", "mouse", "baboon",
               "human"]
n_species = len(animalNames)
# The bare string below is a quoted background note; it is an expression
# statement with no effect on the computation.
"""[...], such as the higher average rate of substitution and the greater
transition/transversion ratio, in noncoding and third-codon-position sites
than in firstand second- codon-position sites[...]"""

# One row of nucleotide weights per state, drawn at random.
pi = np.zeros((nbState, alphabetSize))
# Raw draws for rows 0 and 1 are uniform in [0, 0.001) ...
pi[0] = np.random.rand(alphabetSize) * 0.001
pi[1] = np.random.rand(alphabetSize) * 0.001
# ... and uniform in [0, 0.01) for rows 2 and 3.
pi[2] = np.random.rand(alphabetSize) * 0.01
pi[3] = np.random.rand(alphabetSize) * 0.01
# Normalise every row so that it sums to 1. keepdims=True keeps the row sums
# as a (nbState, 1) column, so row i is divided by its own sum. Without it
# the (nbState,) vector broadcasts along the last axis and column j gets
# divided by the sum of row j, which only "works" silently because
# nbState == alphabetSize here.
# Note: after this normalisation the 0.001 / 0.01 scale factors cancel out.
pi /= pi.sum(axis=1, keepdims=True)

# Transition/transversion rate, one value per state.
kappa = np.array([2.3, 2.7, 4.3, 5.4])

In [None]:
# Read the phylogenetic tree from the JSON file named in the parameters cell.
tree = load_tree(tree_path)

# Build one tree per hidden state by passing the loaded tree through
# scale_branches_length.
# NOTE(review): the helper is called with the same argument for every state;
# whether the resulting trees differ depends on that helper -- confirm.
trees = [scale_branches_length(tree) for _ in range(nbState)]

# Simulate the aligned strands together with the hidden state sequence.
strands, states = generate_case(
    A, b, pi, kappa, trees, number_of_nucleotids
)

In [None]:
# Turn every strand of integer codes into a string by looking each code up
# in `alphabet` and concatenating the resulting letters.
str_strands = [
    ''.join(alphabet[code] for code in int_strand)
    for int_strand in strands
]

In [None]:
# Regroup the strands by site: entry k of `sites` is the string made of the
# k-th character of each of the first n_species strands, in strand order.
sites = [
    ''.join(str_strands[species][position] for species in range(n_species))
    for position in range(number_of_nucleotids)
]

In [None]:
# Compute, with Felsenstein's pruning algorithm, one value per
# (state, site) pair and store it in `likelihoods`.
Qs = rate_sub_HKY(pi, kappa)
likelihoods = np.zeros((nbState, number_of_nucleotids))
for state in range(nbState):
    # Model pieces belonging to this state: its entry of Qs, its row of pi
    # and its tree.
    Q, p, tree = Qs[state], pi[state], trees[state]
    for site_ind, site in enumerate(sites):
        likelihoods[state, site_ind] = pruning(Q, p, tree, site)