In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from hsmmlearn.hsmm import GaussianHSMM

In [2]:
# Read the raw table and label its four columns in one chained expression.
df = pd.read_csv('hsmm_data.csv').set_axis(
    ['num', 'id', 'states', 'durations'], axis='columns'
)
df

Unnamed: 0,num,id,states,durations
0,0,16000277,0,68.0
1,1,16000277,0,22.0
2,2,16000277,0,69.0
3,3,16000277,0,9.0
4,4,16000277,0,58.0
...,...,...,...,...
6394,27,3004067,2,108.0
6395,28,3004067,1,24.0
6396,29,3004067,1,3.0
6397,30,3004067,1,3.0


In [3]:
# Cap every duration at 120 seconds; smaller values are left untouched.
df['durations'] = df.durations.clip(upper=120)
# Bin the durations on a log scale: ln(duration + 1), truncated to an integer.
log_durations = np.log(df.durations + 1)
df['durations_log'] = log_durations.astype(int)
df

Unnamed: 0,num,id,states,durations,durations_log
0,0,16000277,0,68.0,4
1,1,16000277,0,22.0,3
2,2,16000277,0,69.0,4
3,3,16000277,0,9.0,2
4,4,16000277,0,58.0,4
...,...,...,...,...,...
6394,27,3004067,2,108.0,4
6395,28,3004067,1,24.0,3
6396,29,3004067,1,3.0,1
6397,30,3004067,1,3.0,1


In [4]:
# Distinct log-duration bins present in the data, in order of first appearance.
df['durations_log'].unique()

array([4, 3, 2, 1, 0])

In [5]:
# Number of duration bins. The bins are used below as column indices
# (`range(n_durations)`), so the count must cover every index up to the largest
# bin, even if some intermediate bin never occurs in the data. Counting the
# distinct values would under-size the matrix in that case.
n_durations = int(df.durations_log.max()) + 1
n_durations

5

In [6]:
# Number of hidden states; the cells below iterate over `range(n_states)`
# and compare each value against the `states` column.
n_states = 3

In [7]:
# Estimate the durations matrix directly from the data
# (TODO confirm: it could possibly be estimated by the `fit` method instead).
# Each row corresponds to a hidden state; entry `i` of a row is the empirical
# probability of seeing log-duration bin `i` while in that state.
def empirical_duration_matrix(frame, n_states, n_durations):
    """Estimate per-state duration-bin probabilities from observed rows.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with a `states` column and a `durations_log` column.
    n_states : int
        Number of states; states are matched against `0 .. n_states - 1`.
    n_durations : int
        Number of bins; bins are matched against `0 .. n_durations - 1`.

    Returns
    -------
    numpy.ndarray
        Array of shape `(n_states, n_durations)`. Entry `[s, d]` is the
        fraction of rows in state `s` whose `durations_log` equals `d`.
        A state with no rows in `frame` gets an all-zero row instead of
        NaN (0/0); such a row is not a valid distribution and must be
        filled in before it is handed to a model.
    """
    state_values = frame['states'].to_numpy()
    bin_values = frame['durations_log'].to_numpy()
    matrix = np.zeros((n_states, n_durations))
    for state in range(n_states):
        in_state = state_values == state
        total = in_state.sum()
        if total == 0:
            # State never observed in this frame: avoid dividing by zero.
            continue
        bins_in_state = bin_values[in_state]
        for duration in range(n_durations):
            matrix[state, duration] = (bins_in_state == duration).sum() / total
    return matrix


durations = empirical_duration_matrix(df, n_states, n_durations)
durations

array([[0.06111444, 0.26363092, 0.16237268, 0.13780707, 0.3750749 ],
       [0.12596685, 0.35211786, 0.15801105, 0.12559853, 0.23830571],
       [0.14789082, 0.44466501, 0.14937965, 0.09330025, 0.16476427]])

In [8]:
# TODO(confirm): the `fit` method seems to expect one observation per time step, so each
# observation (here, a `states` value) would have to be repeated `duration` times.
# These observation sequences should be built for each student separately, so that outliers
# can be found by comparing the model parameters fitted for each student.

In [9]:
# Keep only the rows of one student: the one whose id sits in the row labelled 0.
first_id = df.loc[0, 'id']
df0 = df[df['id'] == first_id]
df0

Unnamed: 0,num,id,states,durations,durations_log
0,0,16000277,0,68.0,4
1,1,16000277,0,22.0,3
2,2,16000277,0,69.0,4
3,3,16000277,0,9.0,2
4,4,16000277,0,58.0,4
...,...,...,...,...,...
202,202,16000277,2,1.0,0
203,203,16000277,2,3.0,1
204,204,16000277,2,5.0,1
205,205,16000277,2,5.0,1


In [10]:
# Empirical duration matrix for the first student only: entry [s, d] is the share
# of that student's rows in state `s` whose log-duration bin equals `d`.
state_masks = [df0.states == s for s in range(n_states)]
durations0 = np.array(
    [
        [(in_state & (df0.durations_log == d)).sum() / in_state.sum()
         for d in range(n_durations)]
        for in_state in state_masks
    ],
    dtype=float,
).reshape(n_states, n_durations)
durations0

array([[0.01923077, 0.25      , 0.13461538, 0.09615385, 0.5       ],
       [0.01538462, 0.18461538, 0.32307692, 0.21538462, 0.26153846],
       [0.15555556, 0.6       , 0.16666667, 0.03333333, 0.04444444]])

In [11]:
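# NOTE: disabled draft. It expands the first student's rows into a flat sequence by
# repeating each state label `durations_log + 1` times.
# observations0 = []
# for i in range(len(df0)):
#     observations0.extend([df0.states[i]] * int(df0.durations_log[i] + 1))
# observations0 = np.array(observations0)
# len(observations0)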

In [12]:
# Hand-picked starting transition matrix: one row per state, each row sums to 1
# and the diagonal is zero (no state transitions to itself).
tmat = np.vstack([
    (0.0, 0.5, 0.5),
    (0.3, 0.0, 0.7),
    (0.6, 0.4, 0.0),
])

# Starting Gaussian parameters: means 0, 1, 2 and a unit scale for each of them.
means = np.arange(3, dtype=float)
scales = np.ones(means.shape, dtype=means.dtype)

In [13]:
# Build the first student's model from the starting Gaussian means/scales, that
# student's empirical duration matrix (`durations0`) and the matrix `tmat`.
hsmm0 = GaussianHSMM(means, scales, durations0, tmat)

In [14]:
# Inspect the first student's sequence of state labels.
df0['states']

0      0
1      0
2      0
3      0
4      0
      ..
202    2
203    2
204    2
205    2
206    2
Name: states, Length: 207, dtype: int64

In [15]:
# `hsmm0` was built from the first student's duration matrix, so fit it on that
# student's own state sequence (`df0`), not on the whole-cohort `df`, which
# concatenates every student. The sequence is passed as a plain NumPy array.
# NOTE(review): the return value is presumably (converged flag, log-likelihood);
# confirm against the hsmmlearn documentation.
hsmm0.fit(df0['states'].to_numpy())

NoConvergenceError: The forward-backward algorithm encountered an internal error after 1 steps. Try reducing the `num_iter` parameter. Log-likelihood procession: [].

In [16]:
# Fit `hsmm0` using the first student's `durations_log` values as the observation sequence.
# NOTE(review): the displayed tuple looks like (converged flag, log-likelihood); if so,
# `False` means the fit stopped before converging. Confirm against the hsmmlearn docs.
hsmm0.fit(df0.durations_log)

(False, 1132.0419557693583)