In [1]:
import numpy as np
from hmmlearn import hmm
from enum import Enum   
from scipy import special

In [2]:
# Enumeration used to tell apart the per-sequence states while counting transitions
class states(Enum):
    """Integer codes for the state a sequence occupies at an alignment column."""

    M = 0       # match
    D = 1       # delete
    I = 2       # insert
    I_gap = 3   # gap inside an insert region, seen after an insertion

In [3]:
class ProfileHMM(hmm.CategoricalHMM):
    """Profile HMM whose parameters are estimated from a multiple sequence alignment.

    With ``n = n_match_states`` the ``3*n + 3`` hidden states are laid out as:

    * ``0``                 -> begin state (silent)
    * ``1 .. n``            -> match states ``M_1 .. M_n``
    * ``n+1 .. 2n``         -> delete states ``D_1 .. D_n``
    * ``2n+1 .. 3n+1``      -> insert states ``I_0 .. I_n``
    * ``3n+2``              -> end state (silent, absorbing)

    The emission matrix has ``len(alphabet) + 2`` columns: one per alphabet
    symbol, then the gap symbol (emitted by delete states) and finally the
    silent emission (emitted by the begin and end states).
    """

    def __init__(self, alphabet, gap_symbol, alignment, insertion_criteria=None, emission_pseudocounts=None, transition_pseudocounts=None, show_probabilities=False):
        """Estimate the model from ``alignment``.

        Parameters
        ----------
        alphabet : iterable
            Symbols of the alphabet; each one is converted with ``str``.
        gap_symbol : str
            Character that marks a gap in the alignment.
        alignment : sequence of str
            Aligned sequences, all with the same length.
        insertion_criteria : callable, optional
            Called with every alignment column (a string with one character
            per sequence); must return True for insert columns. When it is not
            callable the default criterion of this class is used.
        emission_pseudocounts : callable, optional
            Maps a vector of emission counts to probabilities. Defaults to
            Laplace's rule.
        transition_pseudocounts : callable, optional
            Maps a vector of transition counts to probabilities. Defaults to
            Laplace's rule.
        show_probabilities : bool
            When true, the estimated probability tables are printed.

        Raises
        ------
        ValueError
            If the alignment is empty, contains symbols outside the alphabet,
            has sequences of different lengths or has no match column.
        """
        # Turn the alphabet into a list of strings
        alphabet = list(map(str, alphabet))
        self.alphabet = alphabet
        self.gap = gap_symbol

        # The gap and the silent emission are added as possible emissions
        n_features = len(alphabet) + 2

        if len(alignment) == 0:
            raise ValueError("El alineamiento no puede estar vacío")

        # Length of the aligned sequences
        sequence_length = len(alignment[0])

        if not all(c in alphabet or c == gap_symbol for c in "".join(alignment)):
            raise ValueError("Existen caracteres en el alineamiento que no se encuentran en el alfabeto")

        # Every sequence must have the same length
        if not all(len(seq) == sequence_length for seq in alignment):
            raise ValueError("Todos las secuencias deben tener la misma longitud")

        # Columns of the alignment, one string per column
        regions = ["".join([seq[i] for seq in alignment]) for i in range(sequence_length)]

        # Fall back to the default criterion when none is supplied
        if not callable(insertion_criteria):
            insert_regions = list(map(self.__default_insertion_criteria, regions))
        else:
            insert_regions = list(map(insertion_criteria, regions))

        # The number of match states is the number of non-insert columns
        self.n_match_states = insert_regions.count(False)

        # Without match columns the probability tables below are empty and the
        # estimation fails with an opaque NumPy error, so fail early instead.
        if self.n_match_states == 0:
            raise ValueError("El alineamiento debe contener al menos una columna de alineamiento")

        super().__init__(n_components=3*self.n_match_states+3, n_features=n_features)
        self.startprob_ = np.zeros(self.n_components)
        self.startprob_[0] = 1
        self.transmat_ = np.zeros((self.n_components, self.n_components))
        self.emissionprob_ = np.zeros((self.n_components, self.n_features))

        match_emissions_pr, insert_emissions_pr, transition_pr, final_pr = self.__compute_probabilities(alignment, regions, insert_regions, emission_pseudocounts, transition_pseudocounts)

        if show_probabilities:
            print("Probabilidades de emisión de estados de alineamiento:\n", match_emissions_pr)
            print("Probabilidades de emisión de estados de inserción:\n", insert_emissions_pr)
            print("Probabilidades de transición entre estados:\n", transition_pr)
            print("Probabilidades de transición al estado fin:\n", final_pr)

        # Begin and end states only produce the silent emission
        self.emissionprob_[0, -1] = 1
        self.emissionprob_[-1, -1] = 1
        self.emissionprob_[1:self.n_match_states+1, 0:len(self.alphabet)] = match_emissions_pr
        # Delete states only produce the gap symbol
        self.emissionprob_[self.n_match_states+1: 2*self.n_match_states+1, -2] = 1
        self.emissionprob_[2*self.n_match_states+1:self.n_components-1, 0:len(self.alphabet)] = insert_emissions_pr

        # Rows (M_i, D_i, I_i) and target columns (M_{i+1}, D_{i+1}, I_i) for i = 0
        current_positions = np.array([0, self.n_match_states, 2*self.n_match_states+1])
        columns = np.array([0, self.n_match_states, 2*self.n_match_states]) + 1

        for i in range(self.n_match_states):
            self.transmat_[current_positions[0], columns] = transition_pr[0, :, i]
            # There is no D_0: for i == 0 that row index is the last match state
            if i > 0:
                self.transmat_[current_positions[1], columns] = transition_pr[1, :, i]
            self.transmat_[current_positions[2], columns] = transition_pr[2, :, i]

            current_positions = current_positions + 1
            columns = columns + 1

        # The last M, D and I states can only reach the end state or the last insert state
        self.transmat_[current_positions[0], [self.n_components-1, self.n_components-2]] = final_pr[0]
        self.transmat_[current_positions[1], [self.n_components-1, self.n_components-2]] = final_pr[1]
        self.transmat_[current_positions[2], [self.n_components-1, self.n_components-2]] = final_pr[2]
        self.transmat_[-1, -1] = 1

    def __decodify(self, x):
        """Convert a sequence of alphabet symbols into an array of alphabet indices.

        Every element is upper-cased before it is looked up. A ValueError is
        re-raised (after printing a message) when a symbol is not in the alphabet.
        """
        x = list(map(str.upper, x))
        try:
            decoded = np.array(list(map(self.alphabet.index, x)), dtype=int)
        except ValueError:
            print("¡Error, existe elemento de la secuencia que no se encuentra en el alfabeto!")
            raise
        return decoded

    def __codify(self, sequence):
        """Map state codes to labels: 0 -> 'M', 1 -> 'D', anything else -> 'I'."""
        coded_seq = ['M' if elem == 0 else 'D' if elem == 1 else 'I' for elem in sequence]
        return coded_seq

    def __default_insertion_criteria(self, region):
        """Return True when the column ``region`` is an insert column.

        A column is an insert column when more than half of it is gaps or when
        no alphabet symbol fills at least half of it.
        """
        return region.count(self.gap) > (len(region))/2 or max([region.count(elem) for elem in self.alphabet]) < (len(region))/2

    def __laplace_rule(self, row):
        """Turn a vector of counts into probabilities using add-one smoothing."""
        row = row + 1
        return row / np.sum(row)

    def __detect_state(self, element, prev_state, in_insert_region):
        """Return the state code of one sequence at one alignment column.

        States are 0=M, 1=D, 2=I, 3=I_gap. I_gap marks a gap that follows an
        insertion inside an insert region, so those gaps are not accumulated
        as transitions by mistake. A gap in an insert region that does not
        follow an insertion leaves the previous state unchanged.
        """
        state = prev_state

        if in_insert_region:
            if element != self.gap:
                state = states.I.value
            elif prev_state == states.I.value or prev_state == states.I_gap.value:
                state = states.I_gap.value
        else:
            if element == self.gap:
                state = states.D.value
            else:
                state = states.M.value

        return state

    def __compute_probabilities(self, alignment, regions, insert_regions, emission_pseudocounts, transition_pseudocounts):
        """Count emissions and transitions in the alignment and normalise them.

        Returns
        -------
        match_emissions_pr : array, shape (n_match_states, len(alphabet))
        insert_emissions_pr : array, shape (n_match_states + 1, len(alphabet))
        transition_pr : array, shape (3, 3, n_match_states)
            ``transition_pr[k, :, i]`` are the probabilities of going from
            state ``k`` (M, D, I) at position ``i`` to ``M_{i+1}``,
            ``D_{i+1}`` and ``I_i``.
        final_pr : array, shape (3, 2)
            ``final_pr[k]`` are the probabilities of going from the last
            state ``k`` to the end state and to the last insert state.
        """
        M = states.M.value
        D = states.D.value
        I = states.I.value
        I_gap = states.I_gap.value

        match_emissions = np.zeros((self.n_match_states, len(self.alphabet)), dtype=int)
        insert_emissions = np.zeros((self.n_match_states+1, len(self.alphabet)), dtype=int)
        # One extra slot: insert columns after the last match column are counted
        # at index n_match_states and moved into final_transitions afterwards.
        transition_counts = np.zeros((3, 3, self.n_match_states+1), dtype=int)
        final_transitions = np.zeros((3, 2), dtype=int)

        region_index = 0  # number of match columns already processed
        n_sequence = len(alignment)
        previous_states = []

        for i in range(len(insert_regions)):
            column = regions[i]
            symbol_counts = [column.count(elem) for elem in self.alphabet]

            # Special case: the first column is reached from the begin state
            if i == 0:
                gaps = column.count(self.gap)

                if not insert_regions[i]:
                    transition_counts[M, M, region_index] = n_sequence-gaps
                    transition_counts[M, D, region_index] = gaps
                    match_emissions[region_index] = symbol_counts

                    region_index += 1
                else:
                    insert_emissions[region_index] = np.add(insert_emissions[region_index], symbol_counts)
                    transition_counts[M, I, region_index] = n_sequence-gaps

                previous_states = [self.__detect_state(column[j], M, insert_regions[i]) for j in range(n_sequence)]

            else:
                actual_states = [self.__detect_state(column[j], previous_states[j], insert_regions[i]) for j in range(n_sequence)]

                # When leaving an insert region, I_gap counts as I
                if not insert_regions[i] and insert_regions[i-1]:
                    previous_states = [I if elem == I_gap else elem for elem in previous_states]

                transitions = list(zip(previous_states, actual_states))

                if not insert_regions[i]:
                    match_emissions[region_index] = symbol_counts

                    for k in range(3):
                        transition_counts[k, M, region_index] = transitions.count((k, M))
                        transition_counts[k, D, region_index] = transitions.count((k, D))

                    region_index += 1
                else:
                    insert_emissions[region_index] = np.add(insert_emissions[region_index], symbol_counts)

                    for k in range(3):
                        transition_counts[k, I, region_index] += transitions.count((k, I))

                    if insert_regions[i-1]:
                        # Transitions from I_gap to I are I -> I transitions as well
                        transition_counts[I, I, region_index] += transitions.count((I_gap, I))

                previous_states = actual_states

        # Transitions to the end state: every sequence leaves from the state it
        # holds after the last column (I_gap counts as I).
        last_states = [I if elem == I_gap else elem for elem in previous_states]
        for k in range(3):
            final_transitions[k, 0] = last_states.count(k)

        # Transitions into the last insert state were counted in the extra slot
        final_transitions[:, 1] = transition_counts[:, I, self.n_match_states]
        transition_counts = transition_counts[:, :, :self.n_match_states]

        if not callable(emission_pseudocounts):
            match_emissions_pr = np.array(list(map(self.__laplace_rule, match_emissions)))
            insert_emissions_pr = np.array(list(map(self.__laplace_rule, insert_emissions)))
        else:
            match_emissions_pr = np.array(list(map(emission_pseudocounts, match_emissions)))
            insert_emissions_pr = np.array(list(map(emission_pseudocounts, insert_emissions)))

        if not callable(transition_pseudocounts):
            transition_pr = np.apply_along_axis(self.__laplace_rule, 1, transition_counts)
            final_pr = np.array(list(map(self.__laplace_rule, final_transitions)))
        else:
            transition_pr = np.apply_along_axis(transition_pseudocounts, 1, transition_counts)
            final_pr = np.array(list(map(transition_pseudocounts, final_transitions)))

        return match_emissions_pr, insert_emissions_pr, transition_pr, final_pr

    def modifiedViterbi(self, x):
        """Return the most likely state path for ``x`` and its probability.

        Parameters
        ----------
        x : sequence of str
            Query sequence made of alphabet symbols (no gaps).

        Returns
        -------
        path : list of str
            One label per visited state, 'M', 'D' or 'I'.
        probability : float
            Probability of that path, including the transition to the end state.
        """
        decoded_x = self.__decodify(x)
        n = len(decoded_x)
        n_match = self.n_match_states

        # v[k, j, i]: best log-probability of being in state k (0=M, 1=D, 2=I)
        # of position j after emitting the first i symbols.
        v = np.zeros((3, n_match+1, n+1))
        pointer = np.zeros((3, n_match+1, n+1), dtype=int)
        v[0, 0, 0] = 1

        # Silence the division-by-zero warning raised by log(0)
        with np.errstate(divide='ignore'):
            log_emissionprob_ = np.log(self.emissionprob_)
            log_transmat_ = np.log(self.transmat_)
            v = np.log(v)

        # Before any symbol is emitted only the chain of delete states is reachable
        delete_position = n_match+1
        v[1, 1, 0] = v[0, 0, 0] + log_transmat_[0, delete_position]
        pointer[1, 1, 0] = 0

        for j in range(2, n_match+1):
            delete_position = delete_position + 1
            v[1, j, 0] = v[1, j-1, 0] + log_transmat_[delete_position-1, delete_position]
            pointer[1, j, 0] = 1

        for i in range(1, n+1):
            symbol = decoded_x[i-1]
            current_positions = np.array([0, n_match, 2*n_match+1], dtype=int)

            # I_0 is reached from the begin state or from itself. The middle
            # entry keeps the argmax aligned with the state codes (2 = I).
            candidates = [v[0, 0, i-1] + log_transmat_[current_positions[0], current_positions[2]],
                          -np.inf,
                          v[2, 0, i-1] + log_transmat_[current_positions[2], current_positions[2]]]
            v[2, 0, i] = log_emissionprob_[current_positions[2], symbol] + np.max(candidates)
            pointer[2, 0, i] = np.argmax(candidates)

            for j in range(1, n_match+1):
                current_positions = current_positions + 1
                m_pos, d_pos, i_pos = current_positions

                # M_j comes from position j-1 and consumes a symbol
                candidates = [v[0, j-1, i-1] + log_transmat_[m_pos-1, m_pos],
                              v[1, j-1, i-1] + log_transmat_[d_pos-1, m_pos],
                              v[2, j-1, i-1] + log_transmat_[i_pos-1, m_pos]]
                v[0, j, i] = log_emissionprob_[m_pos, symbol] + np.max(candidates)
                pointer[0, j, i] = np.argmax(candidates)

                # D_j comes from position j-1 and consumes nothing
                candidates = [v[0, j-1, i] + log_transmat_[m_pos-1, d_pos],
                              v[1, j-1, i] + log_transmat_[d_pos-1, d_pos],
                              v[2, j-1, i] + log_transmat_[i_pos-1, d_pos]]
                v[1, j, i] = np.max(candidates)
                pointer[1, j, i] = np.argmax(candidates)

                # I_j comes from position j and consumes a symbol
                candidates = [v[0, j, i-1] + log_transmat_[m_pos, i_pos],
                              v[1, j, i-1] + log_transmat_[d_pos, i_pos],
                              v[2, j, i-1] + log_transmat_[i_pos, i_pos]]
                v[2, j, i] = log_emissionprob_[i_pos, symbol] + np.max(candidates)
                pointer[2, j, i] = np.argmax(candidates)

        # Termination: the last state is chosen together with its transition to
        # the end state so the path and the returned probability agree.
        end_positions = [n_match, 2*n_match, 3*n_match+1]
        final_scores = [v[k, n_match, n] + log_transmat_[end_positions[k], -1] for k in range(3)]
        viterbi_prob = np.max(final_scores)
        sequence = [int(np.argmax(final_scores))]

        # Traceback towards cell (0, 0); the last element appended is the begin state
        k = n
        l = n_match
        while k > 0 or l > 0:
            next_state = pointer[sequence[-1], l, k]
            if sequence[-1] != 1:
                k = k-1
            if sequence[-1] != 2:
                l = l-1
            sequence.append(next_state)

        del sequence[-1]
        sequence.reverse()

        return self.__codify(sequence), np.exp(viterbi_prob)

    def modifiedFoward(self, x):
        """Return the total probability of ``x`` summed over every state path.

        Parameters
        ----------
        x : sequence of str
            Query sequence made of alphabet symbols (no gaps).
        """
        decoded_x = self.__decodify(x)
        n = len(decoded_x)
        n_match = self.n_match_states

        # v[k, j, i]: log-probability of emitting the first i symbols and being
        # in state k (0=M, 1=D, 2=I) of position j.
        v = np.zeros((3, n_match+1, n+1))
        v[0, 0, 0] = 1

        # Silence the division-by-zero warning raised by log(0)
        with np.errstate(divide='ignore'):
            log_emissionprob_ = np.log(self.emissionprob_)
            log_transmat_ = np.log(self.transmat_)
            v = np.log(v)

        # Before any symbol is emitted only the chain of delete states is reachable
        delete_position = n_match+1
        v[1, 1, 0] = v[0, 0, 0] + log_transmat_[0, delete_position]

        for j in range(2, n_match+1):
            delete_position = delete_position + 1
            v[1, j, 0] = v[1, j-1, 0] + log_transmat_[delete_position-1, delete_position]

        for i in range(1, n+1):
            symbol = decoded_x[i-1]
            current_positions = np.array([0, n_match, 2*n_match+1], dtype=int)

            # I_0 is reached from the begin state or from itself
            v[2, 0, i] = log_emissionprob_[current_positions[2], symbol] + special.logsumexp([
                v[0, 0, i-1] + log_transmat_[current_positions[0], current_positions[2]],
                v[2, 0, i-1] + log_transmat_[current_positions[2], current_positions[2]]])

            for j in range(1, n_match+1):
                current_positions = current_positions + 1
                m_pos, d_pos, i_pos = current_positions

                # M_j comes from position j-1 and consumes a symbol
                v[0, j, i] = log_emissionprob_[m_pos, symbol] + special.logsumexp([
                    v[0, j-1, i-1] + log_transmat_[m_pos-1, m_pos],
                    v[1, j-1, i-1] + log_transmat_[d_pos-1, m_pos],
                    v[2, j-1, i-1] + log_transmat_[i_pos-1, m_pos]])

                # D_j comes from position j-1 and consumes nothing
                v[1, j, i] = special.logsumexp([
                    v[0, j-1, i] + log_transmat_[m_pos-1, d_pos],
                    v[1, j-1, i] + log_transmat_[d_pos-1, d_pos],
                    v[2, j-1, i] + log_transmat_[i_pos-1, d_pos]])

                # I_j comes from position j and consumes a symbol
                v[2, j, i] = log_emissionprob_[i_pos, symbol] + special.logsumexp([
                    v[0, j, i-1] + log_transmat_[m_pos, i_pos],
                    v[1, j, i-1] + log_transmat_[d_pos, i_pos],
                    v[2, j, i-1] + log_transmat_[i_pos, i_pos]])

        # Termination: add the transition from each last state to the end state.
        # The rows are computed explicitly so an empty query works as well.
        end_positions = [n_match, 2*n_match, 3*n_match+1]
        v_fin = special.logsumexp([v[k, n_match, n] + log_transmat_[end_positions[k], -1] for k in range(3)])

        return np.exp(v_fin)

    # Correctly spelled alias; the original name is kept for existing callers
    modifiedForward = modifiedFoward
        

# Ejemplos: 

In [4]:
# Build a profile HMM over the alphabet A/C/G/T from four aligned sequences
# ('-' is the gap symbol) and print the estimated probability tables.
model = ProfileHMM(alphabet=["A", "C", "G", "T"], gap_symbol='-', alignment=['TA--TC', 'TAG-TC', 'TAGA-C', '-AG-TG'], show_probabilities=True)

Probabilidades de emisión de estados de alineamiento:
 [[0.14285714 0.14285714 0.14285714 0.57142857]
 [0.625      0.125      0.125      0.125     ]
 [0.14285714 0.14285714 0.57142857 0.14285714]
 [0.14285714 0.14285714 0.14285714 0.57142857]
 [0.125      0.5        0.25       0.125     ]]
Probabilidades de emisión de estados de inserción:
 [[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.4  0.2  0.2  0.2 ]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]]
Probabilidades de transición entre estados:
 [[[0.57142857 0.66666667 0.57142857 0.5        0.66666667]
  [0.28571429 0.16666667 0.28571429 0.16666667 0.16666667]
  [0.14285714 0.16666667 0.14285714 0.33333333 0.16666667]]

 [[0.33333333 0.5        0.33333333 0.5        0.5       ]
  [0.33333333 0.25       0.33333333 0.25       0.25      ]
  [0.33333333 0.25       0.33333333 0.25       0.25      ]]

 [[0.33333333 0.33333333 0.33333333 0.25       0.33333333]
  [0.33333333 0.33333333 0.33333333 0.5        0.3333333

In [5]:
# Most likely state path (list of 'M'/'D'/'I' labels) for a query sequence;
# `prob` holds the probability of that path.
x = "ATATGTC"
result, prob = model.modifiedViterbi(x)
print(result)

['I', 'M', 'M', 'I', 'M', 'M', 'M']


In [6]:
# Total probability of the current query `x` (set in an earlier cell).
total_prob=model.modifiedFoward(x)
print(total_prob)

1.393704882159438e-05


In [7]:
# Same decoding for a query shorter than the number of match states.
# Note: this rebinds the global `x` used by the following cell.
x = "ATC"
result, prob = model.modifiedViterbi(x)
print(result)

['D', 'M', 'D', 'M', 'M']


In [8]:
# Total probability of the current query `x` (set in an earlier cell).
total_prob=model.modifiedFoward(x)
print(total_prob)

0.004601382312893029


In [9]:
# Decode a second short query, kept in its own variable `y`.
y = "AAC"
result, prob = model.modifiedViterbi(y)
print(result)

['D', 'M', 'D', 'M', 'M']


In [10]:
# Total probability of the query `y` (set in an earlier cell).
total_prob=model.modifiedFoward(y)
print(total_prob)

0.003040138977867695
