In [None]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import skspeech

## HMM definition

- $N$: number of states in the model (e.g., # of urns)
- $S=S_{1...N}$: the states in the model
- $q_t$: the state at time $t$
- $M$: the number of observation symbols per state (i.e., alphabet size)
- $V=v_{1...M}$: the observation symbols
- $A=\{a_{ij}\}; a_{ij} = P[q_{t+1} = S_j|q_t = S_i]$: state transition probability distribution
- $B=\{b_j(k)\}; b_j(k) = P[v_k\text{ at }t|q_t = S_j]$: observation symbol probability distribution in state $j$
- $\pi=\{\pi_i\}; \pi_i = P[q_1 = S_i]$: initial state distribution

In [None]:
def sample_pdf(pdf, size=1):
    """Draw `size` state indices from a discrete distribution.

    Parameters
    ----------
    pdf : array_like
        Probabilities of outcomes ``0 .. len(pdf) - 1``; flattened before use.
    size : int, optional
        Number of independent draws (default 1).

    Returns
    -------
    ndarray
        Array of `size` sampled indices.
    """
    pdf = np.asarray(pdf, dtype=float).ravel()
    states = np.arange(pdf.size)
    dist = stats.rv_discrete(name='pdf', values=(states, pdf))
    # `size` must be passed by keyword: the first positional argument of
    # rvs() on a values-based rv_discrete is `loc`, which would shift every
    # sample by `size` and return a single draw.
    return dist.rvs(size=size)


def observe(n_t, a, b, pi):
    """Simulate `n_t` steps of a discrete HMM.

    Parameters
    ----------
    n_t : int
        Number of observations to generate.
    a : array_like
        Transition matrix; row ``a[i]`` is sampled to pick the state after `i`.
    b : array_like
        Emission matrix; row ``b[i]`` is sampled to pick the symbol in state `i`.
    pi : array_like
        Distribution sampled for the initial state.

    Returns
    -------
    observations : ndarray of int, length n_t
        Emitted symbol indices.
    states : ndarray of int, length n_t
        Hidden state that emitted each observation.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    state = int(sample_pdf(pi)[0])  # initial state
    observations = []
    states = []
    for _ in range(n_t):
        states.append(state)
        observations.append(int(sample_pdf(b[state])[0]))
        state = int(sample_pdf(a[state])[0])
    return np.array(observations, dtype=int), np.array(states, dtype=int)


class HMM(object):
    """Discrete-observation hidden Markov model.

    Parameters
    ----------
    a : array_like, shape (N, N)
        Transition matrix, ``a[i, j] = P(q_{t+1} = S_j | q_t = S_i)``.
    b : array_like, shape (N, M)
        Emission matrix, ``b[j, k] = P(v_k at t | q_t = S_j)``.
    pi : array_like, shape (N,)
        Initial state distribution.

    The inputs are copied as float arrays, so `update` re-estimates the
    model's own parameters without touching the caller's arrays.

    The forward/backward recursions are unscaled, so very long observation
    sequences can underflow to zero probability.
    """

    def __init__(self, a, b, pi):
        a = np.array(a, dtype=float)
        b = np.array(b, dtype=float)
        pi = np.array(pi, dtype=float)
        assert a.ndim == 2 and a.shape[0] == a.shape[1]
        assert b.ndim == 2 and b.shape[0] == a.shape[0]
        assert pi.size == a.shape[0]
        self.a = a
        self.b = b
        self.pi = pi
        self.N = self.a.shape[0]
        self.M = self.b.shape[1]
        self.states = np.arange(self.N, dtype=int)

    def observe(self, n_t):
        """Sample `n_t` observations by calling the module-level `observe`."""
        return observe(n_t, self.a, self.b, self.pi)

    def forward(self, obs):
        """Return the forward variables as an array of shape (len(obs), N).

        ``alpha_t(i) = P(o_1 .. o_t, q_t = S_i | hmm)``
        """
        f_prev = self.b.T[obs[0]] * self.pi
        fwd = [f_prev]
        for o_i in obs[1:]:
            # sum_i alpha_{t-1}(i) * a[i, j], then weight by b_j(o_t)
            f_prev = self.b.T[o_i] * np.dot(f_prev, self.a)
            fwd.append(f_prev)
        return np.array(fwd)

    def backward(self, obs):
        """Return the backward variables as an array of shape (len(obs), N).

        ``beta_t(i) = P(o_{t+1} .. o_T | q_t = S_i, hmm)``
        """
        b_next = np.ones(self.N)
        bkw = [b_next]
        for o_next in reversed(obs[1:]):
            # sum_j a[i, j] * b_j(o_{t+1}) * beta_{t+1}(j)
            b_next = np.dot(self.a, self.b.T[o_next] * b_next)
            bkw.append(b_next)
        bkw.reverse()
        return np.array(bkw)

    def obs_probability(self, obs):
        """Return P(obs | hmm), computed with the forward procedure."""
        return np.sum(self.forward(obs)[-1])

    def optimal_path(self, obs):
        """Return the most likely state sequence (Viterbi algorithm)."""
        return self.viterbi(obs)[1]

    def viterbi(self, obs):
        """Return ``(probability, path)`` of the single best state sequence."""
        delta = self.pi * self.b.T[obs[0]]
        path = [[s] for s in self.states]

        for t in range(1, len(obs)):
            # scores[j, i] = delta[i] * a[i, j] * b[j, obs[t]]
            scores = delta * (self.a * self.b.T[obs[t]]).T
            best_prev = np.argmax(scores, axis=1)
            delta = np.amax(scores, axis=1)
            path = [path[best_prev[j]] + [j] for j in self.states]
        # Return the most likely sequence
        best = np.argmax(delta)
        return delta[best], path[best]

    def update(self, obs, n_iters=1):
        """Update a, b, pi so that `obs` is more likely.

        Uses the Baum-Welch algorithm. The probability of `obs` is printed
        before and after the update.

        Parameters
        ----------
        obs : sequence of int
            Observed symbol indices.
        n_iters : int, optional
            Number of re-estimation passes (default 1).

        Notes
        -----
        Iteration stops early if the model assigns `obs` a probability that
        is not positive, because the re-estimation formulas divide by it.
        A state that receives no posterior mass keeps its previous row of
        `a` and `b` instead of becoming NaN.
        """
        obs = np.asarray(obs)
        print("Before P(obs) = %f" % self.obs_probability(obs))

        n_steps = len(obs) - 1
        for _ in range(n_iters):
            alpha = self.forward(obs)
            beta = self.backward(obs)
            if not np.sum(alpha[-1]) > 0:
                break

            # xi[i, j, t] = P(q_t = S_i, q_{t+1} = S_j | O, hmm)
            xi = np.zeros((self.N, self.N, n_steps))
            for t in range(n_steps):
                numer = (alpha[t][:, np.newaxis]
                         * self.a
                         * (self.b.T[obs[t + 1]] * beta[t + 1]))
                xi[:, :, t] = numer / np.sum(numer)

            # gamma[i, t] = P(q_t = S_i | O, hmm); the final time step has no
            # outgoing transition, so it comes from alpha * beta directly.
            last = alpha[-1] * beta[-1]
            gamma = np.hstack((np.sum(xi, axis=1),
                               (last / np.sum(last))[:, np.newaxis]))

            new_pi = gamma[:, 0]

            new_a = self.a.copy()
            from_mass = np.sum(gamma[:, :-1], axis=1)
            left = from_mass > 0
            new_a[left] = (np.sum(xi, axis=2)[left]
                           / from_mass[left][:, np.newaxis])

            new_b = self.b.copy()
            occupancy = np.sum(gamma, axis=1)
            seen = occupancy > 0
            for lev in range(self.M):
                emitted = np.sum(gamma[seen][:, obs == lev], axis=1)
                new_b[seen, lev] = emitted / occupancy[seen]

            self.pi[...] = new_pi
            self.a[...] = new_a
            self.b[...] = new_b
        print("After P(obs) = %f" % self.obs_probability(obs))

# Toy model: two hidden states, three observation symbols.
N, M = 2, 3
pi = np.array([0.6, 0.4])
a = np.array([[0.7, 0.3],
              [0.4, 0.6]])
b = np.array([[0.5, 0.4, 0.1],
              [0.1, 0.3, 0.6]])
obs = np.array([0, 1, 2])

hmm = HMM(a, b, pi)
fwd, bwd = hmm.forward(obs), hmm.backward(obs)
# Per-time-step state posterior: alpha * beta normalised by P(obs).
posterior = (fwd * bwd) / fwd[-1].sum()
# Exercise decoding, then re-fit the model to a constant sequence.
hmm.viterbi(obs)
hmm.update([0, 0, 0], n_iters=20)

In [None]:
# Show the re-estimated parameters. Each row of `a` and `b`, and `pi`
# itself, is a probability distribution, so the printed sums should be 1.
print(hmm.a)
print(np.sum(hmm.a, axis=1))
print(hmm.b)
print(np.sum(hmm.b, axis=1))
print(hmm.pi)
print(np.sum(hmm.pi))

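### Continuous-time Baum-Welch

The listing below is MATLAB reference code; it sits in a Markdown cell and is not executed by this notebook.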

```
function res = BaumWelch(x0)
  % BAUMWELCH  Baum-Welch re-estimation for a two-state chain observed at
  % irregularly spaced times.
  %
  %   x0  - two-element vector [a b]: starting off-diagonal rates of the
  %         generator Q = [-a a; b -b].
  %   res - the rates [a b] reached when the iteration stops, in the same
  %         layout as x0.
  %
  % Reads the first N rows of 'ProcessedTrace.txt'. Column 1 is used as the
  % observation times and column 2 as the observed symbols; the symbols
  % index the columns of the 2x2 matrix G, so they must be 1 or 2.
  % Requires fsolve (Optimization Toolbox).
  N=100; %Number of points
  S = textread('ProcessedTrace.txt');
  S = S(1:N,1:2);
  Times = S(:,1);
  Obs = S(:,2);

  %%%%%% initialization of parameters %%%%%%%%%%%%%%
  a=x0(1);
  b=x0(2);
  Q = [[-a a];[b -b]];   % generator matrix; expm(dt*Q) gives transition probabilities over dt
  nu = [1/2 1/2];        % initial state distribution
  G = [nu ; nu];         % G(state, symbol): observation probabilities
  alpha = ones(N,2);     % scaled forward variables
  beta = alpha;          % backward variables, scaled with the same factors D
  phi = zeros(N-1,4);    % per-step weights for the transitions 1->1, 1->2, 2->1, 2->2
  Gmem = 0;
  D=zeros(N,1);          % forward normalisation constants
  options = optimset('LargeScale','off','MaxFunEvals', 5000);
  
  %%%%%% Baum-Welch algorithm
  Nits = 0;
  while (norm(G-Gmem)>10^(-5) || Nits == 0) %%%% STOP condition
    Gmem = G
    
    %*********** Update the alphas *************%
    alpha(1,:) = [nu(1)*G(1,Obs(1)) nu(2)*G(2,Obs(1))];
    alpha(1,:) = alpha(1,:)/sum(alpha(1,:));
    for t= 2:N
      E = expm((Times(t)-Times(t-1))*Q);
      for j=1:2
        alpha(t,j) = G(j,Obs(t))*E(1,j)*alpha(t-1,1) + G(j,Obs(t))*E(2,j)*alpha(t-1,2);
      end
      D(t) = sum(alpha(t,:));
      alpha(t,:) = alpha(t,:)/D(t);
    end

    %*********** Update the betas *************%
    for t=1:N-1
      t1 = N-t;   % walk backwards from N-1 down to 1
      E = expm((Times(t1+1)-Times(t1))*Q);
      for j=1:2
        beta(t1,j) = G(1,Obs(t1+1))*E(j,1)*beta(t1+1,1) + G(2,Obs(t1+1))*E(j,2)*beta(t1+1,2);
      end
      beta(t1,:) = beta(t1,:)/D(t1+1);
    end
    beta;

    %*********** Update the phis *************%
    for t=1:N-1
      E = expm((Times(t+1)-Times(t))*Q);
      phi(t,1) = E(1,1)*alpha(t,1)*G(1,Obs(t+1))*beta(t+1,1)/D(t+1);
      phi(t,2) = E(1,2)*alpha(t,1)*G(2,Obs(t+1))*beta(t+1,2)/D(t+1);
      phi(t,3) = E(2,1)*alpha(t,2)*G(1,Obs(t+1))*beta(t+1,1)/D(t+1);
      phi(t,4) = E(2,2)*alpha(t,2)*G(2,Obs(t+1))*beta(t+1,2)/D(t+1);
    end
    phi;

    %\\\\\\\\\\\\\\\\\\\\\\\Update the parameters //////////////////////////%
 
    %***************Updating the initial distribution nu *******************%
    nu(1) = alpha(1,1)*beta(1,1) / (alpha(1,1)*beta(1,1) + alpha(1,2)*beta(1,2)) ;
    nu(2) = alpha(1,2)*beta(1,2) / (alpha(1,1)*beta(1,1) + alpha(1,2)*beta(1,2)) ;
   
    %*************** Updating the G matrix ****************************%
    for i=1:2
      for j = 1:2
        temp1 = 0;   % alpha*beta mass of state i at times where symbol j was observed
        temp2 = 0;   % alpha*beta mass of state i over all times
        for k = 1:N
          if (Obs(k)) == j
            temp1 = temp1 + alpha(k,i)*beta(k,i);
          end
          temp2 = temp2 + alpha(k,i)*beta(k,i);
        end
        G(i,j) = temp1/temp2;
      end
    end

    %**************** Updating the Generator matrix Q********************%
    % Solve myfun(x) = 0 for the new rates, starting from the current ones.
    s = fsolve(@myfun, [a b], options)
    a=s(1);
    b=s(2);
    Q = [[-a a];[b -b]]
    Nits = Nits + 1
  end

  % Assign the declared output: the final rates, laid out like x0.
  res = [a b];

  %**************** Definition of Equations********************%
  % Nested function: reads N, Times and phi from the enclosing workspace and
  % returns the two residuals [F1 F2] that fsolve drives to zero for the
  % candidate rates x = [a b].
  % NOTE(review): presumably the stationarity conditions of the expected
  % log-likelihood with respect to a and b - confirm against the derivation.
  function F = myfun(x)
    F1 = 0;
    F2 = 0;
    for i=1:N-1
      d = Times(i+1)-Times(i);
      e = exp(-(x(1)+x(2))*d);
      F1 = F1 + phi(i,1) * ((1-d*x(1)- x(1)/(x(1)+x(2)))*e - x(2)/(x(1)+x(2)))/(x(2)+x(1)*e) + ...
                phi(i,2) * ((d*x(1)-1+ x(1)/(x(1)+x(2)))*e - x(1)/(x(1)+x(2))+1)/(x(1)-x(1)*e) + ...
                phi(i,3)*((d*x(2)+ x(2)/(x(1)+x(2)))*e - x(2)/(x(1)+x(2)))/(x(2)-x(2)*e) + ...
                phi(i,4)*((-d*x(2)- x(2)/(x(1)+x(2)))*e - x(1)/(x(1)+x(2))+1)/(x(1)+x(2)*e);
      F2 = F2 + phi(i,1) * ((-x(1)/(x(1)+x(2))-d*x(1))*e - x(2)/(x(1)+x(2))+1)/(x(2)+x(1)*e) + ...
                phi(i,2) * ((d*x(1)+ x(1)/(x(1)+x(2)))*e - x(1)/(x(1)+x(2)))/(x(1)-x(1)*e) + ...
                phi(i,3)*((d*x(2)+ x(2)/(x(1)+x(2))-1)*e - x(2)/(x(1)+x(2))+1)/(x(2)-x(2)*e) + ...
                phi(i,4)*((1-x(2)/(x(1)+x(2))-d*x(2))*e - x(1)/(x(1)+x(2)))/(x(1)+x(2)*e);
    end
    F=[F1 F2];
  end
end
```

## HMMs for syllable recognition

In [None]:
# Get a gesture score and sample it as a trajectory every `dt` time units.
dt = 0.02
gs = skspeech.vtl.parse_ges('ges-de-cvc/das.ges')
print(gs.t_end)
traj = gs.trajectory(dt=dt)
# For dot products, we change this slightly so that non-gestures are -1
# and gestures are 1: every positive entry becomes 2, then 1 is subtracted
# from everything (assumes inactive entries are exactly 0 - TODO confirm).
traj[traj > 0] = 2.
traj -= 1.

plt.pcolormesh(traj.T)
plt.colorbar()
traj.shape

In [None]:
# Build the state inventory for this syllable's HMM.

# One state per distinct stretch of the trajectory. A row of zeros is
# appended so the difference has one row per timestep; the last row
# compares the final sample with zero.
onoff = np.diff(
    np.concatenate([traj, np.zeros((1, traj.shape[1]))], axis=0),
    axis=0,
)

# Timesteps at which at least one channel changes.
transitions = np.flatnonzero(onoff.any(axis=1))
# Transitions only one timestep apart are erroneous (data problem):
# keep an index only when the next one is more than one step away.
ix = np.diff(np.append(transitions, transitions[-1] + 2)) > 1
transitions = transitions[ix]
n_states = len(transitions)

# The observation for each state is its VTG. The VTG itself is stored
# because similarity is later measured with a dot product.
vtgs = traj[transitions - 1]  # Subtract 1 to get VTG before transition

# A "null" observation could also be added for VTGs that are not
# useful for the syllable.
# NB! This is probably important.
# It should score higher than the other VTGs for
# any VTGs that we don't care about.
# XXX try with this
#null_vtg = -np.ones(vtgs.shape[1])
#vtgs = np.vstack([vtgs, null_vtg])
plt.pcolormesh(vtgs.T)

In [None]:
# As a quick test, let's find the normalized
# dot product of the VTGs across the trajectory.
# similarity[t, s] is the dot product of trajectory row t with stored VTG s,
# divided by the number of channels.
similarity = np.dot(traj, vtgs.T) / vtgs.shape[1]

plt.pcolormesh(traj.T)
plt.figure()
plt.plot(similarity)
# One curve is drawn per stored VTG (column of `similarity`), so the legend
# needs vtgs.shape[0] labels, not the number of channels.
plt.legend(np.arange(vtgs.shape[0]))
plt.figure()
# Truncating to int leaves only the exact matches (similarity of +/-1) non-zero.
plt.plot(similarity.astype(int))

In [None]:
# This is a simple left-right Markov process.
# The probability of staying in the current state is the fraction of the
# syllable that the state lasts; otherwise the chain moves to the next state.
p_stay = np.hstack([transitions[0], np.diff(transitions)]) / float(traj.shape[0])
a = np.zeros((n_states, n_states))
di_0, di_1 = np.diag_indices(n_states)
a[di_0, di_1] = p_stay
# np.roll shifts the column index by one, so state i moves to i + 1 and
# the last state wraps around to the first.
a[di_0, np.roll(di_1, -1)] = 1 - p_stay
print(a)

# The probability of an observation
# is 1 for the observations we want.
b = np.identity(n_states)
# b = np.hstack([b, np.zeros(n_states)[:, np.newaxis]])
print(b)

# We always start in the first state
pi = np.zeros(n_states)
pi[0] = 1.
print(pi)

In [None]:
# Make the HMM, and see how likely our input sequence is.
hmm = HMM(a, b, pi)
# Each timestep's observation is the index of its most similar stored VTG.
obs = np.argmax(similarity, axis=1)
print(obs)
hmm.update(obs, n_iters=10000)
fwd = hmm.forward(obs)
print(fwd[:20])

In [None]:
# Transition matrix after re-estimation.
print(hmm.a)