In [537]:
import numpy as np
from hmmlearn import hmm
import time

In [538]:
class PairwiseHMM(hmm.CategoricalHMM):
    """Pair hidden Markov model for aligning two nucleotide sequences.

    The five states are (Begin, M, X, Y, End), stored in that order as rows and
    columns 0-4 of ``transmat_``. M emits an aligned pair of symbols, X emits a
    symbol of the first sequence against a gap and Y emits a symbol of the
    second sequence against a gap.

    ``emissionprob_`` is not set by the constructor; it must be assigned before
    calling the alignment methods. The methods read it with this column layout:
    ``x*4 + y`` for a pair emitted by M (row 1), ``16 + x`` for a symbol emitted
    by X (row 2) and ``20 + y`` for a symbol emitted by Y (row 3), where ``x``
    and ``y`` are integer symbol codes.
    """

    # Column layout of the emission matrix used by the alignment routines.
    _ALPHABET_SIZE = 4
    _X_GAP_OFFSET = 16
    _Y_GAP_OFFSET = 20

    def __init__ (self, n_features, mu, epsilon ,tau):
        """Build the transition matrix of the pair HMM.

        Parameters
        ----------
        n_features : int
            Number of emission symbols, forwarded to ``CategoricalHMM``.
        mu : float
            Probability of opening a gap (Begin/M -> X and Begin/M -> Y).
        epsilon : float
            Probability of extending a gap (X -> X and Y -> Y).
        tau : float
            Probability of moving to the End state from any other state.

        Raises
        ------
        ValueError
            If any parameter is negative, or if ``1-2*mu-tau`` or
            ``1-epsilon-tau`` is negative.
        """
        # The five states are (Begin, M, X, Y, End)
        super().__init__(n_components=5, n_features=n_features)

        for label, value in (("mu", mu), ("epsilon", epsilon), ("tau", tau)):
            if value < 0.0:
                raise ValueError(f"{label} must be non negative (got {value:.4f})")

        # Remaining probability mass of the Begin/M rows and of the X/Y rows.
        match_to_ins = 1.0 - (2.0*mu + tau)
        ins_to_match = 1.0 - (epsilon + tau)

        if match_to_ins<0.0 :
            raise ValueError(f"1-2*mu-tau must be non negative (got {match_to_ins:.4f})")

        if ins_to_match<0.0 :
            raise ValueError(f"1-epsilon-tau must be non negative (got {ins_to_match:.4f})")

        # Mirror the constructor arguments as attributes: scikit-learn style
        # estimators (the base class of hmmlearn models) expect every __init__
        # argument to be readable under the same name.
        self.mu = mu
        self.epsilon = epsilon
        self.tau = tau

        self.transmat_  = np.array([[0, match_to_ins, mu, mu, tau],
                                    [0, match_to_ins, mu, mu, tau],
                                    [0, ins_to_match, epsilon, 0, tau],
                                    [0, ins_to_match, 0, epsilon, tau],
                                    [0, 0, 0, 0, 1]])

    def modifiedViterbi(self, x, y):
        """Find the most probable alignment path of two encoded sequences.

        Parameters
        ----------
        x, y : sequence of int
            Symbol codes of the two sequences (used as emission column indices).

        Returns
        -------
        v : ndarray, shape (3, len(x)+1, len(y)+1)
            ``v[k, i, j]`` is the best log-probability of aligning ``x[:i]``
            with ``y[:j]`` ending in state k (0 = M, 1 = X, 2 = Y); ``-inf``
            marks unreachable cells.
        sequence : list of int
            State of every alignment column, first column first.
        pointer : ndarray of int, same shape as ``v``
            Predecessor state chosen for every cell.
        """
        # The tables hold values for (0,0) up to (n,m)
        n = len(x)
        m = len(y)
        size = self._ALPHABET_SIZE
        x_gap = self._X_GAP_OFFSET
        y_gap = self._Y_GAP_OFFSET

        # log(0) = -inf is the intended encoding of impossible moves, so the
        # divide-by-zero warning is silenced for every logarithm taken here.
        # The logs are loop invariant and therefore computed only once.
        with np.errstate(divide='ignore'):
            log_trans = np.log(self.transmat_)
            log_emit = np.log(self.emissionprob_)
            v = np.log(np.zeros((3, n+1, m+1)))
        pointer = np.zeros((3, n+1, m+1), dtype=int)

        # The Begin state is stored in the M layer at (0,0); its outgoing
        # transitions equal those of M.
        v[0,0,0] = 0

        # A single row-major sweep also fills the borders: with j == 0 only the
        # X recurrence applies and with i == 0 only the Y recurrence applies.
        for i in range(n+1):
            for j in range(m+1):
                if i>0 and j>0:
                    options = [log_trans[1,1]+v[0,i-1,j-1], log_trans[2,1]+v[1,i-1,j-1], log_trans[3,1]+v[2,i-1,j-1]]
                    best = np.argmax(options)
                    pointer[0,i,j] = best
                    v[0,i,j] = log_emit[1, x[i-1]*size+y[j-1]] + options[best]

                if i>0:
                    # The -inf entry keeps the list index equal to the state index.
                    options = [log_trans[1,2]+v[0,i-1,j], log_trans[2,2]+v[1,i-1,j], -np.inf]
                    best = np.argmax(options)
                    pointer[1,i,j] = best
                    v[1,i,j] = log_emit[2, x_gap+x[i-1]] + options[best]

                if j>0:
                    options = [log_trans[1,3]+v[0,i,j-1], -np.inf, log_trans[3,3]+v[2,i,j-1]]
                    best = np.argmax(options)
                    pointer[2,i,j] = best
                    v[2,i,j] = log_emit[3, y_gap+y[j-1]] + options[best]

        # Trace back from the best final state to the origin.
        state = int(np.argmax(v[:, n, m]))
        sequence = [state]
        i = n
        j = m
        while i>0 or j>0:
            previous = int(pointer[state, i, j])
            # M consumes a symbol of both sequences, X only of x, Y only of y.
            if i>0 and state in (0, 1):
                i -= 1
            if j>0 and state in (0, 2):
                j -= 1
            sequence.append(previous)
            state = previous

        # Drop the last element: it is always the initial state
        del sequence[-1]
        sequence.reverse()
        return v, sequence, pointer


    def modifiedFoward(self, x, y):
        """Probability of the sequence pair summed over all alignments.

        Works in plain probability space, so the result shrinks quickly with
        sequence length and can underflow to 0 for long inputs.

        Parameters
        ----------
        x, y : sequence of int
            Symbol codes of the two sequences (used as emission column indices).

        Returns
        -------
        float
            Sum of the three final forward values multiplied by the Begin -> End
            entry of ``transmat_``. The constructor gives the Begin, M, X and Y
            rows the same End probability, so that entry stands for all of them.
        """
        # The table holds values for (0,0) up to (n,m)
        n = len(x)
        m = len(y)
        size = self._ALPHABET_SIZE
        x_gap = self._X_GAP_OFFSET
        y_gap = self._Y_GAP_OFFSET
        trans = self.transmat_
        emit = self.emissionprob_

        f = np.zeros((3, n+1, m+1))
        f[0,0,0] = 1

        # Same sweep as in modifiedViterbi, with sums in place of maxima.
        for i in range(n+1):
            for j in range(m+1):
                if i>0 and j>0:
                    f[0,i,j] = emit[1, x[i-1]*size+y[j-1]]*(trans[1,1]*f[0,i-1,j-1] + trans[2,1]*f[1,i-1,j-1] + trans[3,1]*f[2,i-1,j-1])

                if i>0:
                    f[1,i,j] = emit[2, x_gap+x[i-1]]*(trans[1,2]*f[0,i-1,j] + trans[2,2]*f[1,i-1,j])

                if j>0:
                    f[2,i,j] = emit[3, y_gap+y[j-1]]*(trans[1,3]*f[0,i,j-1] + trans[3,3]*f[2,i,j-1])

        return trans[0,4]*(f[0,n,m] + f[1,n,m] + f[2,n,m])

    # Correctly spelled alias; the original name is kept for existing callers.
    modifiedForward = modifiedFoward

In [539]:
# Scratch check of list slicing: k[0:3] yields the first three elements of k.
k=[0,1,2,3,4,5,6]
k[0:3]

[0, 1, 2]

In [540]:
# Positional arguments are (n_features, mu, epsilon, tau):
# 25 emission symbols, mu=0.4, epsilon=0.5, tau=0.1.
model = PairwiseHMM(25, 0.4, 0.5, 0.1)

In [541]:
# Column layout shared by every emission row ("x symbol : y symbol", "_" is a gap):
#    0-15 -> A:A, A:C, A:G, A:T, C:A, C:C, C:G, C:T, G:A, G:C, G:G, G:T, T:A, T:C, T:G, T:T
#   16-19 -> A:_, C:_, G:_, T:_
#   20-23 -> _:A, _:C, _:G, _:T
#      24 -> silent emission
# Rows follow the state order (Begin, M, X, Y, End).
model.emissionprob_ = np.zeros((5, 25))
# Begin and End emit only the silent symbol.
model.emissionprob_[[0, 4], 24] = 1
# M emits aligned pairs.
model.emissionprob_[1, :16] = [0.125 , 0.0375, 0.0125, 0.075 , 0.0375, 0.125 , 0.075 , 0.0125, 0.0125, 0.075 , 0.125 , 0.0375, 0.075 , 0.0125, 0.0375, 0.125]
# X and Y emit one symbol against a gap, uniformly over the four nucleotides.
model.emissionprob_[2, 16:20] = 0.25
model.emissionprob_[3, 20:24] = 0.25

In [542]:
def decodify(x):
    """Encode a nucleotide string as an integer array (A=0, C=1, G=2, T=3).

    The input is upper-cased first, so lower-case sequences are accepted.

    Parameters
    ----------
    x : str
        Sequence made of the letters A, C, G and T in any case.

    Returns
    -------
    numpy.ndarray
        Integer codes, one per character; an empty integer array for ``""``.

    Raises
    ------
    ValueError
        If the sequence contains any other character (gaps, N, whitespace...),
        which would otherwise be silently encoded as T.
    """
    codes = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    letters = x.upper()

    unknown = sorted(set(letters) - codes.keys())
    if unknown:
        raise ValueError(f"unsupported nucleotide symbol(s): {''.join(unknown)!r}")

    # An explicit dtype keeps the empty sequence an integer array as well.
    return np.array([codes[letter] for letter in letters], dtype=int)

In [543]:
def codify(s,x,y):
    """Render a state path as two gapped alignment rows.

    Parameters
    ----------
    s : iterable of int
        One state per alignment column: 0 pairs the next symbols of ``x`` and
        ``y``, 1 puts the next symbol of ``x`` over a gap, 2 puts a gap over the
        next symbol of ``y``. Any other value is skipped.
    x, y : str
        The two ungapped sequences.

    Returns
    -------
    tuple of str
        The gapped rows for ``x`` and ``y``, with "-" marking gaps.
    """
    top_row = []
    bottom_row = []
    pos_x = 0
    pos_y = 0

    for state in s:
        if state == 0:
            top_row.append(x[pos_x])
            bottom_row.append(y[pos_y])
            pos_x += 1
            pos_y += 1
        elif state == 1:
            top_row.append(x[pos_x])
            bottom_row.append("-")
            pos_x += 1
        elif state == 2:
            top_row.append("-")
            bottom_row.append(y[pos_y])
            pos_y += 1

    return "".join(top_row), "".join(bottom_row)

In [544]:
# Short example pair; both sequences are 7 nucleotides long.
x="CACGAAT"
y="AGTTCAA"

In [545]:
# Most probable alignment of the pair: v is the score table, s the state path
# and p the back-pointer table. codify reads the states in s as
# 0 = aligned pair, 1 = x symbol over a gap, 2 = gap over a y symbol.
v, s, p=model.modifiedViterbi( decodify(x), decodify(y) )
print(s)

[1, 0, 2, 2, 2, 0, 1, 0, 1, 0]


In [546]:
# Show the state path as two gapped rows: x on the first line, y on the second.
print("\n".join(codify(s,x,y)))

CA---CGAAT
-AGTTC-A-A


In [547]:
# Probability of the pair summed over all alignments, not only the best path.
model.modifiedFoward(decodify(x), decodify(y))

1.850365127897263e-11

In [548]:
# Same emission model as before, with mu, epsilon and tau all set to 0.1.
model = PairwiseHMM(n_features=25, mu=0.1, epsilon=0.1, tau=0.1)

# Rows follow the state order (Begin, M, X, Y, End); columns 0-15 are aligned
# pairs, 16-19 an x symbol over a gap, 20-23 a gap over a y symbol, 24 is silent.
model.emissionprob_ = np.zeros((5, 25))
# Begin and End emit only the silent symbol.
model.emissionprob_[[0, 4], 24] = 1
# M emits aligned pairs.
model.emissionprob_[1, :16] = [0.125 , 0.0375, 0.0125, 0.075 , 0.0375, 0.125 , 0.075 , 0.0125, 0.0125, 0.075 , 0.125 , 0.0375, 0.075 , 0.0125, 0.0375, 0.125]
# X and Y emit one symbol against a gap, uniformly over the four nucleotides.
model.emissionprob_[2, 16:20] = 0.25
model.emissionprob_[3, 20:24] = 0.25

In [549]:
# Pair of unequal length (12 vs 7 nucleotides), so any alignment needs gaps.
x="TTTAACTTATCG"
y="TTACTCG"

In [550]:
# Time the Viterbi alignment. perf_counter is a monotonic, high-resolution
# clock intended for measuring short intervals, unlike the wall clock time.time().
start_time = time.perf_counter()
v, s, p=model.modifiedViterbi( decodify(x), decodify(y) )
print("--- %s seconds ---" % (time.perf_counter() - start_time))
print(s)

--- 0.005984306335449219 seconds ---
[1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0]


In [551]:
# Show the state path as two gapped rows: x on the first line, y on the second.
print("\n".join(codify(s,x,y)))

TTTAACTTATCG
-TT-AC-T--CG


In [552]:
# Probability of the pair summed over all alignments, not only the best path.
model.modifiedFoward(decodify(x), decodify(y))

3.8758596420363915e-15

In [553]:
# Same emission model, with a smaller End probability (tau=0.02).
model = PairwiseHMM(n_features=25, mu=0.1, epsilon=0.1, tau=0.02)

# Rows follow the state order (Begin, M, X, Y, End); columns 0-15 are aligned
# pairs, 16-19 an x symbol over a gap, 20-23 a gap over a y symbol, 24 is silent.
model.emissionprob_ = np.zeros((5, 25))
# Begin and End emit only the silent symbol.
model.emissionprob_[[0, 4], 24] = 1
# M emits aligned pairs.
model.emissionprob_[1, :16] = [0.125 , 0.0375, 0.0125, 0.075 , 0.0375, 0.125 , 0.075 , 0.0125, 0.0125, 0.075 , 0.125 , 0.0375, 0.075 , 0.0125, 0.0375, 0.125]
# X and Y emit one symbol against a gap, uniformly over the four nucleotides.
model.emissionprob_[2, 16:20] = 0.25
model.emissionprob_[3, 20:24] = 0.25

In [554]:
# Two 120-nucleotide sequences in lower case; decodify upper-cases its input
# before encoding, so the case does not matter.
x="gcgcgtgcgcggaaggagccaaggtgaagttgtagcagtgtgtcagaagaggtgcgtggcaccatgctgtcccccgaggcggagcgggtgctgcggtacctggtcgaagtagaggagttg"
y="gacttgtggaacctacttcctgaaaataaccttctgtcctccgagctctccgcacccgtggatgacctgctcccgtacacagatgttgccacctggctggatgaatgtccgaatgaagcg"

In [555]:
# Time the Viterbi alignment. perf_counter is a monotonic, high-resolution
# clock intended for measuring short intervals, unlike the wall clock time.time().
start_time = time.perf_counter()
v, s, p=model.modifiedViterbi( decodify(x), decodify(y) )
print("--- %s seconds ---" % (time.perf_counter() - start_time))
print("\n".join(codify(s,x,y)))

--- 0.78891921043396 seconds ---
gcgcgtgcgcggaaggagccaaggtgaagttgtagcagtgtgtcagaagaggtgcgtggcacca-tgctgtcccccgaggcggagcgggtgctgcggtacctgg-tcgaagta-ga-gg-a-gtt--g
ga-cttg-t-ggaacct-acttcctgaa-aataacct-tctgtcctccgagctctc-cgcacccgtggatgacctgctcccgtacacagatgttgcc-acctggctggatgaatgtccgaatgaagcg


In [556]:
# Probability of the pair summed over all alignments. The value is computed in
# plain probability space, so it becomes extremely small for sequences this long.
model.modifiedFoward(decodify(x), decodify(y))

5.859574024342029e-150

In [557]:
# Second alignment of the same two sequences (a is the gapped x, b the gapped y),
# scored below for comparison with the Viterbi alignment.
# NOTE(review): its origin is not recorded in this notebook - presumably an external alignment tool; confirm.
a="g-cgcgt-gcgcggaaggagccaaggtga-agt-tgt-agcagtg-tgtcagaagaggtgcgtggcaccatgctgtcccccgaggcggagcgggtgctgcggtacctggtcgaagtagaggagttg"

b="gacttgtggaacctacttcctgaaaataaccttctgtcctccgagctctccgcacccgtggatgacctgctcccgtacacagatgttgccacctggctg-gatgaatgtccgaa-t-ga--agc-g"

In [558]:
def score(a, b, match=2, mismatch=1, gap=-1):
    """Score a pairwise alignment column by column.

    Parameters
    ----------
    a, b : str
        The two gapped alignment rows, with "-" marking gaps. They must have
        the same length.
    match : int, optional
        Added for a column whose two characters are equal (default 2). Two
        gap characters facing each other also count as equal.
    mismatch : int, optional
        Added for a column with two different non-gap characters (default 1).
    gap : int, optional
        Added for a column where exactly one side is a gap (default -1).

    Returns
    -------
    int
        Sum of the column scores; 0 for two empty rows.

    Raises
    ------
    ValueError
        If the rows differ in length, since columns could not be paired up.
    """
    if len(a) != len(b):
        raise ValueError(
            f"alignment rows must have the same length (got {len(a)} and {len(b)})"
        )

    total = 0
    for top, bottom in zip(a, b):
        if top == bottom:
            total += match
        elif top != "-" and bottom != "-":
            total += mismatch
        else:
            total += gap

    return total

In [559]:
# Score of the alignment stored in a and b.
score(a,b)

159

In [560]:
# Score the Viterbi alignment (state path s rendered as rows c and d) with the
# same function, so it can be compared with the score of a and b.
c,d = codify(s,x,y)
print(c)
print(d)
score(c,d)

gcgcgtgcgcggaaggagccaaggtgaagttgtagcagtgtgtcagaagaggtgcgtggcacca-tgctgtcccccgaggcggagcgggtgctgcggtacctgg-tcgaagta-ga-gg-a-gtt--g
ga-cttg-t-ggaacct-acttcctgaa-aataacct-tctgtcctccgagctctc-cgcacccgtggatgacctgctcccgtacacagatgttgcc-acctggctggatgaatgtccgaatgaagcg


150

In [561]:
# Same emission model, with a higher gap-opening probability (mu=0.2).
model = PairwiseHMM(n_features=25, mu=0.2, epsilon=0.1, tau=0.1)

# Rows follow the state order (Begin, M, X, Y, End); columns 0-15 are aligned
# pairs, 16-19 an x symbol over a gap, 20-23 a gap over a y symbol, 24 is silent.
model.emissionprob_ = np.zeros((5, 25))
# Begin and End emit only the silent symbol.
model.emissionprob_[[0, 4], 24] = 1
# M emits aligned pairs.
model.emissionprob_[1, :16] = [0.125 , 0.0375, 0.0125, 0.075 , 0.0375, 0.125 , 0.075 , 0.0125, 0.0125, 0.075 , 0.125 , 0.0375, 0.075 , 0.0125, 0.0375, 0.125]
# X and Y emit one symbol against a gap, uniformly over the four nucleotides.
model.emissionprob_[2, 16:20] = 0.25
model.emissionprob_[3, 20:24] = 0.25

In [562]:
# Small pair of unequal length (5 vs 3 nucleotides).
x="TTACG"
y="TAG"

In [563]:
# Time the Viterbi alignment. perf_counter is a monotonic, high-resolution
# clock intended for measuring short intervals, unlike the wall clock time.time().
start_time = time.perf_counter()
v, s, p=model.modifiedViterbi( decodify(x), decodify(y) )
print("--- %s seconds ---" % (time.perf_counter() - start_time))
print("\n".join(codify(s,x,y)))

--- 0.0009975433349609375 seconds ---
TTACG
-TA-G


In [564]:
# Probability of the pair summed over all alignments.
# NOTE: this rebinds p, which held the Viterbi back-pointer table after the
# previous modifiedViterbi call.
p = model.modifiedFoward(decodify(x), decodify(y))
print(p)

5.712099609375003e-07
