In [19]:
import numpy as np

from tree_serialisation import load_tree
from data_simulation import scale_branches_length, rate_sub_HKY, generate_case
from felsenstein import pruning
from viterbi_sumproduct import viterbi, sum_product

Example 1: a toy gene finder

In [20]:
## Definition of the parameters

# Seed NumPy's global generator so the random draws below (and any later
# sampling that relies on the same global generator) are repeatable under
# Restart Kernel -> Run All.
SEED = 0
np.random.seed(SEED)

alphabet = ['A', 'C', 'T', 'G']
alphabetSize = len(alphabet)
nbState = 4
animalNames = ["dog", "cat", "pig", "cow", "rat", "mouse", "baboon", "human"]
n_species = len(animalNames)
number_of_nucleotids = 200

# State-transition matrix; every row is built so that it sums to 1.
A = np.zeros((nbState, nbState))
A[0, 1] = 1
A[1, 2] = 1
A[2, 3] = 0.011
A[2, 0] = 1 - A[2, 3]
A[3, 3] = 0.33  # 0.9999  # unrealistic ...
A[3, 0] = 1 - A[3, 3]
assert np.allclose(A.sum(axis=1), 1.0), "transition matrix rows must sum to 1"

# Initial-state probabilities
b = np.array([0.25, 0.25, 0.26, 0.24])
assert np.isclose(b.sum(), 1.0), "initial-state probabilities must sum to 1"

# Loading the phylogenetic model from JSON
tree_path = 'tree.json'
tree = load_tree(tree_path)

# One rescaled tree per hidden state, all derived from the same loaded tree.
trees = [scale_branches_length(tree) for _ in range(nbState)]

# One row of `pi` per hidden state, one column per letter of the alphabet.
# Rows 0 and 1 are drawn in [0, 0.001), rows 2 and 3 in [0, 0.01).
# NOTE(review): every row is normalised to sum to 1 just below, so these
# scale factors cancel out and all four rows end up identically distributed.
# If different magnitudes per state were intended, they are lost here.
pi = np.zeros((nbState, alphabetSize))
pi[0] = np.random.rand(alphabetSize) * 0.001
pi[1] = np.random.rand(alphabetSize) * 0.001
pi[2] = np.random.rand(alphabetSize) * 0.01
pi[3] = np.random.rand(alphabetSize) * 0.01
pi /= pi.sum(axis=1)[:, None]

# Transition/transversion rate (presumably one value per hidden state, since
# there are nbState entries -- confirm against rate_sub_HKY).
kappa = np.array([2.3, 2.7, 4.3, 5.4])

# Rate substitution matrix
Q = rate_sub_HKY(pi, kappa)

In [21]:
strands, states = generate_case(A, b, pi, kappa, trees, number_of_nucleotids)

# Decode each integer-encoded strand into a string over `alphabet`.
str_strands = [
    ''.join(alphabet[acid_int] for acid_int in strand)
    for strand in strands
]

# Regroup by position: one string per site, holding the letter of each of
# the n_species strands at that position, in strand order.
sites = [
    ''.join(str_strands[species_ind][site_ind] for species_ind in range(n_species))
    for site_ind in range(number_of_nucleotids)
]

In [22]:
# Emission probabilities: entry [state, site] is the value returned by
# `pruning` for that site under that state's rate matrix, `pi` row and tree.
emission_probas = np.zeros((nbState, number_of_nucleotids))
for state in range(nbState):
    # Per-state model pieces do not depend on the site, so look them up once.
    state_Q, state_pi, state_tree = Q[state], pi[state], trees[state]
    for site_ind, site in enumerate(sites):
        emission_probas[state, site_ind] = pruning(state_Q, state_pi, state_tree, site)

In [23]:
# Viterbi decoding over the nbState states, using the transition matrix A,
# the initial distribution b and the per-site emission probabilities.
# The result is compared element-wise with `states` further down, so it is
# presumably one decoded state per site -- confirm against viterbi_sumproduct.
# NOTE(review): A contains exact zeros; the output captured below this cell
# points at np.log(A[:, s]), which looks like a log-of-zero warning -- confirm.
state_sequence_viterbi = viterbi(range(nbState), A, b, emission_probas)

  prob = np.log(A[:, s]) + alpha_log[:, t - 1]


In [14]:
# Precision (as named by the author): number of sites where the decoded state
# equals the matching entry of `states`, divided by the number of sites.
# NOTE(review): this is plain per-site agreement (accuracy), not precision in
# the classification sense. `states` comes from generate_case and is presumably
# the simulated hidden path -- confirm.
np.sum(states == state_sequence_viterbi) / number_of_nucleotids 
# Weird score... (author's remark, left unresolved)

0.995

Example 2: identification of highly conserved regions

In [26]:
## Definition of the parameters

animalNames = ["dog", "cat", "pig", "cow", "rat", "mouse", "baboon", "human", "chimp"]
# NOTE(review): the number of hidden states is taken from the number of
# species here -- confirm this is intended.
n_states = len(animalNames)
n_nucleotids = 500

# State-transition matrix: with weight lmbda the chain keeps its state,
# otherwise the remaining mass (1 - lmbda) is spread uniformly over all
# n_states states (the current one included).
# The per-state share must be computed with n_states, the size of this
# matrix; using the Example 1 state count makes every row sum to more than 1.
lmbda = 0.94
off_diag = (1 - lmbda) / n_states  # probability of moving to any given other state
a = lmbda + off_diag               # probability of staying in the same state

A = np.full((n_states, n_states), off_diag)
np.fill_diagonal(A, a)
assert np.allclose(A.sum(axis=1), 1.0), "transition matrix rows must sum to 1"

# Initial-state probabilities (hypothesis: uniform distribution)
b = np.ones(n_states) / n_states



In [None]:
# Emission probas computation
# NOTE(review): this cell looks unfinished and mixes Example 1 and Example 2:
#   - the array has n_states rows, but the loop only fills the first nbState;
#   - `sites`, `Q`, `pi` and `trees` are, as far as this notebook shows, still
#     the Example 1 objects (sites has number_of_nucleotids entries), while the
#     array has n_nucleotids columns.
# Every entry not visited by the loops stays at 0. Confirm the intended
# model and data for Example 2 before relying on the result.
emission_probas = np.zeros((n_states, n_nucleotids))
for state in range(nbState):
    for site_ind, site in enumerate(sites):
        emission_probas[state, site_ind] = pruning(Q[state], pi[state], trees[state], site)

In [None]:
# Posterior probas computation
# Uses whatever A, b and emission_probas are currently bound in the kernel;
# after the cells above these are the Example 2 transition matrix and
# initial distribution. The shape of the result is defined by sum_product
# (presumably one row per state and one column per site -- TODO confirm).
post_probas = sum_product(A, b, emission_probas)

In [None]:
# Keep the first row of the posterior array (presumably the posterior of
# state 0 at every site -- TODO confirm the axis order of sum_product).
interest = post_probas[0, :]