In [1]:
import numpy as np
from hmmlearn import hmm
from sklearn.preprocessing import LabelEncoder

# Sample DNA sequences (observations) and labels (states)
sequences = ['ATGCG', 'CGTGC', 'ATTCG']  # Example DNA sequences
states = [['G', 'G', 'N', 'N', 'G'], ['N', 'N', 'G', 'G', 'N'], ['G', 'N', 'N', 'G', 'G']]  # G=Gene, N=Non-Gene

# Define possible observations and states
observations = list("ACGT")
hidden_states = ['G', 'N']

# Encode observations and states numerically
obs_encoder = LabelEncoder().fit(observations)
state_encoder = LabelEncoder().fit(hidden_states)

# Encode all sequences
X = np.concatenate([obs_encoder.transform(list(seq)) for seq in sequences]).reshape(-1, 1)
lengths = [len(seq) for seq in sequences]
Y = np.concatenate([state_encoder.transform(label_seq) for label_seq in states])

# Initialize and train HMM using MLE
model = hmm.MultinomialHMM(n_components=len(hidden_states), n_iter=100, tol=0.01)
model.fit(X, lengths)

# Print learned parameters
print("Transition matrix:\n", model.transmat_)
print("Emission matrix:\n", model.emissionprob_)
print("Start probabilities:\n", model.startprob_)

# Predict hidden states for a new sequence
test_seq = 'ATCGT'
encoded_seq = obs_encoder.transform(list(test_seq)).reshape(-1, 1)
logprob, hidden = model.decode(encoded_seq, algorithm="viterbi")
decoded_states = state_encoder.inverse_transform(hidden)

print("Input sequence:", test_seq)
print("Predicted states:", ''.join(decoded_states))

MultinomialHMM has undergone major changes. The previous version was implementing a CategoricalHMM (a special case of MultinomialHMM). This new implementation follows the standard definition for a Multinomial distribution (e.g. as in https://en.wikipedia.org/wiki/Multinomial_distribution). See these issues for details:
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340


Transition matrix:
 [[0.26685518 0.73314482]
 [0.98941115 0.01058885]]
Emission matrix:
 [[1.]
 [1.]]
Start probabilities:
 [8.26211692e-04 9.99173788e-01]
Input sequence: ATCGT
Predicted states: NGNGN
