In [4]:
import nltk
import numpy as np
import sys
import pickle

In [134]:
class HMM2:
        def __init__(self, state_list, observation_list,
                 transition_proba = None,
                 observation_proba = None,
                 initial_state_proba = None):
            """Builds a new Hidden Markov Model
            state_list is the list of state symbols [q_0...q_(N-1)]
            observation_list is the list of observation symbols [v_0...v_(M-1)]
            transition_proba is the transition probability matrix
                [a_ij] a_ijk = Pr(Y_(t+1)=q_i|Y_t=q_j, Y_t-1=q_k)
            observation_proba is the observation probablility matrix
                [b_ki] b_ki = Pr(X_t=v_k|Y_t=q_i)
            initial_state_proba is the initial state distribution
                [pi_i] pi_i = Pr(Y_0=q_i)"""
            print ("HMM creating with: ")
            self.N = len(state_list) # The number of states
            self.M = len(observation_list) # The number of words in the vocabulary
            print (str(self.N)+" states")
            print (str(self.M)+" observations")
            self.omega_Y = state_list # Keep the vocabulary of tags
            self.omega_X = observation_list # Keep the vocabulary of tags
            # Init. of the 3 distributions : observation, transition and initial states
            if transition_proba is None:
                self.transition_proba = np.zeros( (self.N, self.N, self.N), float) 
            else:
                self.transition_proba=transition_proba
            if observation_proba is None:
                self.observation_proba = np.zeros( (self.M, self.N), float) 
            else:
                self.observation_proba=observation_proba
            if initial_state_proba is None:
                self.initial_state_proba = np.zeros( (self.N, self.N), float ) 
            else:
                self.initial_state_proba=initial_state_proba
            # Since everything will be stored in numpy arrays, it is more convenient and compact to 
            # handle words and tags as indices (integer) for a direct access. However, we also need 
            # to keep the mapping between strings (word or tag) and indices. 
            self.__make_indexes()

        def __make_indexes(self):
            """Creates the reverse table that maps states/observations names
            to their index in the probabilities arrays"""
            self.Y_index = {}
            for i in range(self.N):
                self.Y_index[self.omega_Y[i]] = i
            self.X_index = {}
            for i in range(self.M):
                self.X_index[self.omega_X[i]] = i
                
        def __calculate_transition_proba(self,data):
            """calculate the transition matrix, never call it from outside"""
            for word in range(len(data)):
                actual_alphabet = None
                last1_alphabet = None
                for alphabet in range(0,len(data[word])):
                    last2_alphabet = last1_alphabet #t-2
                    last1_alphabet = actual_alphabet #t-1
                    actual_alphabet = data[word][alphabet][1] #t
                    if alphabet>1:    
                        self.transition_proba[self.Y_index[actual_alphabet],self.Y_index[last1_alphabet],self.Y_index[last2_alphabet]] += 1
        
        def __calculate_observation_proba(self,data):
            """calculate de observation matrix, never call it from outside"""
            for word in range(len(data)):
                for alphabet in range(0,len(data[word])):
                    real_alphabet = data[word][alphabet][1]
                    observed_alphabet = data[word][alphabet][0]
                    self.observation_proba[self.X_index[observed_alphabet],self.Y_index[real_alphabet]] += 1 
        
        def __calculate_initial_state(self,data):
            """calculate initial state distribution, never call it from outside"""
            for word in range(len(data)):
                if len(data[word])>1:
                    initial1_alphabet = data[word][1][1]
                    initial2_alphabet = data[word][0][1]
                    self.initial_state_proba[self.Y_index[initial1_alphabet]][self.Y_index[initial2_alphabet]] += 1
        
        def __proba_normalization(self):
            """using for normalize the probability to 1, to use in the end of training, never call from it outside"""
            for i in range(self.N):
                for j in range(self.N):
                    if self.transition_proba[:][i][j].sum()!=0:
                        self.transition_proba[:][i][j]=self.transition_proba[:][i][j]/self.transition_proba[:][i][j].sum()
                self.observation_proba[:][i]=self.observation_proba[:][i]/self.observation_proba[:][i].sum()
            self.initial_state_proba = self.initial_state_proba / self.initial_state_proba.sum()
                
        def train(self, data):
            """a simple function to train the HMM"""
            self.__calculate_transition_proba(data)
            self.__calculate_observation_proba(data)
            self.__calculate_initial_state(data)
            self.__proba_normalization()
            
            
        def viterbi(self, sequence):
            """second order viterbi algorithms implementation"""
            T1 = np.zeros((self.N, self.N, len(sequence)), float) 
            T2 = np.zeros((self.N, self.N, len(sequence)), float)
            path = np.zeros(len(sequence))
            
            #inital state
            for i in range(self.N):
                for j in range(self.N):
                    T1[i,j,0] = self.initial_state_proba[i][j]*self.observation_proba[self.omega_Y.index(sequence[1])][i]*self.observation_proba[self.omega_Y.index(sequence[0])][j]
                    T2[i,j,0] = 0
                    
            #calculate other states
            for t in range(1,len(sequence)-1):
                for i in range(self.N):
                    for j in range(self.N):
                        #i is actual state t, j is t-1, k is t-2
                        (T1[i,j,t],T2[i,j,t]) = max([(T1[j,k,t-1]*self.transition_proba[i][j][k]*self.observation_proba[state_liste.index(sequence[t+1])][i], k) for k in range(self.N)])        
            z=np.argmax(T1[:,:,len(sequence)-2])
            print (z)
            path[len(sequence)-1]=z[0]
            path[len(sequence)-2]=z[1]
            z=z.astype(int)
            z=T2[z[0],z[1],len(sequence)-3]
            path[len(sequence)-4]=z
            for i in range(len(sequence)-4,0,-1):
                path[i-1]=z
            return path
        
        
        
        

In [120]:
with open('./typos-data/train10.pkl', 'rb') as f:
    train_data = pickle.load(f)
with open('./typos-data/train20.pkl', 'rb') as f:
    train_data2 = pickle.load(f)

train_data.extend(train_data2)

state_liste = ['a','b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
observation_list = ['a','b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [135]:
Typo = HMM2(state_liste,observation_list)
Typo.train(train_data)

sequence=[]
for i in range(len(train_data[3])):
    sequence.append(train_data[3][i][0])
X=Typo.viterbi(sequence)

HMM creating with: 
26 states
26 observations
[ 0  0  0  0  0 13 13  0  0  0  0  0  0  0  0  0  0 13  0 13  0  0  0  0
 13  0]


In [101]:
Typo.observation_proba

array([[9.57825679e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.15772645e-03, 0.00000000e+00, 3.18099123e-02, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 8.71051326e-03, 0.00000000e+00,
        0.00000000e+00, 4.96168477e-04],
       [0.00000000e+00, 7.12440092e-01, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 3.14648885e-02, 8.39758283e-02,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 1.48155866e-01, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 2.39633257e-02, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 9.34831196e-01

In [133]:
print (train_data[3])
print (X)

[('a', 'a'), ('c', 'c'), ('v', 'c'), ('o', 'o'), ('u', 'u'), ('n', 'n'), ('t', 't')]
[25. 25. 25. 25.  0.  0.  0.]


In [116]:
Typo.transition_proba[14,2,2]

0.0967741935483871

1.0000000000000004e-20
