In [1]:
import random

import numpy as np

from data_simulation import generate_case, rate_sub_HKY, scale_branches_length
from felsenstein import pruning
from tree_serialisation import load_tree
from viterbi_sumproduct import viterbi

In [2]:
# SIMULATION PARAMETERS
tree_path = "tree.json"
number_of_nucleotids = 100
alphabet = ['A', 'C', 'T', 'G']
alphabetSize = len(alphabet)

nbState = 4
# transition matrix of the toy gene finder
A = np.zeros((nbState, nbState))
A[0, 1] = 1
A[1, 2] = 1
A[2, 3] = 0.33
A[2, 0] = 1 - A[2, 3]
A[3, 3] = 0.33  # 0.9999  # unrealistic ...
A[3, 0] = 1 - A[3, 3]

# state initial probability
b = np.array([0.25, 0.25, 0.26, 0.24])

animalNames = ["dog", "cat", "pig", "cow", "rat", "mouse", "baboon",
               "human"]
n_species = len(animalNames)
"""[...], such as the higher average rate of substitution and the greater
transition/transversion ratio, in noncoding and third-codon-position sites
than in firstand second- codon-position sites[...]"""

pi = np.zeros((nbState, alphabetSize))
# substitution rates for pi 0 and 1 are between 0 and 0.001
pi[0] = np.random.rand(alphabetSize) * 0.001
pi[1] = np.random.rand(alphabetSize) * 0.001
# but between 0 and 0.01 for pi 2 and 3
pi[2] = np.random.rand(alphabetSize) * 0.01
pi[3] = np.random.rand(alphabetSize) * 0.01
pi /= pi.sum(axis=1)[:, None]

# translation/transversion rate
kappa = np.array([2.3, 2.7, 4.3, 5.4])

In [3]:
# load the phylogenetic model from JSON
tree = load_tree(tree_path)
trees = []

for j in range(nbState):
    scaling_factor = random.random()
    trees.append(scale_branches_length(tree, scale=scaling_factor))

strands, states = generate_case(A, b, pi, kappa,
                                trees, number_of_nucleotids)

In [4]:
# Transform strands from ints to strings
str_strands = list()
for strand in strands:
    str_strand = ""
    for acid_int in strand:
        str_strand = ''.join([str_strand, alphabet[acid_int]])
    str_strands += [str_strand]

In [5]:
# Transform strands in sites
sites = list()
for site_ind in range(number_of_nucleotids):
    sites += [''.join([str_strands[species_ind][site_ind] for species_ind in range(n_species)])]

In [6]:
# Process likelihoods with Felsenstein's algorithm
Qs = rate_sub_HKY(pi, kappa)
likelihoods = np.zeros((nbState, number_of_nucleotids))
for state in range(nbState):
    tree = trees[state]
    Q = Qs[state]
    p = pi[state]
    for site_ind, site in enumerate(sites):
        likelihoods[state, site_ind] = pruning(Q, p, tree, site)

In [7]:
# VITERBI PARAMETERS
S = range(nbState)
state_sequence_viterbi = viterbi(S, A, b, likelihoods)

  prob = np.log(A[:, s]) + alpha_log[:, t - 1]


In [8]:
# Precision
np.sum(states == state_sequence_viterbi) / number_of_nucleotids

1.0

In [9]:
states

array([3, 3, 3, 3, 3, 0, 1, 2, 3, 3, 0, 1, 2, 3, 3, 0, 1, 2, 0, 1, 2, 3, 3,
       0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1,
       2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 3, 3, 3, 0, 1, 2, 0, 1, 2, 3, 0, 1,
       2, 3, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 0, 1,
       2, 0, 1, 2, 3, 3, 0, 1], dtype=uint8)

In [10]:
state_sequence_viterbi

array([ 3.,  3.,  3.,  3.,  3.,  0.,  1.,  2.,  3.,  3.,  0.,  1.,  2.,
        3.,  3.,  0.,  1.,  2.,  0.,  1.,  2.,  3.,  3.,  0.,  1.,  2.,
        0.,  1.,  2.,  0.,  1.,  2.,  0.,  1.,  2.,  3.,  0.,  1.,  2.,
        3.,  0.,  1.,  2.,  3.,  0.,  1.,  2.,  0.,  1.,  2.,  0.,  1.,
        2.,  0.,  1.,  2.,  3.,  3.,  3.,  3.,  0.,  1.,  2.,  0.,  1.,
        2.,  3.,  0.,  1.,  2.,  3.,  0.,  1.,  2.,  0.,  1.,  2.,  0.,
        1.,  2.,  0.,  1.,  2.,  0.,  1.,  2.,  3.,  0.,  1.,  2.,  0.,
        1.,  2.,  0.,  1.,  2.,  3.,  3.,  0.,  1.])

In [11]:
state_sequence_likelihoods = np.argmax(likelihoods, axis=0)
state_sequence_likelihoods

array([1, 3, 3, 3, 2, 0, 1, 2, 3, 3, 0, 1, 2, 1, 1, 0, 1, 2, 0, 1, 2, 3, 1,
       0, 1, 2, 0, 1, 2, 0, 1, 1, 0, 1, 2, 1, 0, 3, 2, 2, 0, 0, 2, 3, 0, 1,
       2, 0, 2, 2, 0, 1, 0, 0, 1, 2, 3, 3, 3, 3, 2, 1, 1, 0, 1, 2, 3, 0, 1,
       2, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 0, 1,
       2, 0, 1, 1, 2, 2, 0, 1])