In [1]:
from hmmlearn import hmm
import numpy as np

## 准备数据

每个字对应一个状态（B: Begin, M: Middle, E: End, S: Single）

In [2]:
# 示例数据
data = [
    ("我们", "BE"),
    ("喜欢", "BE"),
    ("学习", "BE"),
    ("人工智能", "BMME"),
    ("是", "S"),
    ("未来", "BE"),
    ("的", "S"),
    ("方向", "BE"),
]

将数据转换为观测序列和状态序列

In [3]:
observations = []
states = []

In [None]:
for sentence, state_seq in data:
    observations.extend(list(sentence))
    states.extend(list(state_seq))
observations, states

(['我',
  '们',
  '喜',
  '欢',
  '学',
  '习',
  '人',
  '工',
  '智',
  '能',
  '是',
  '未',
  '来',
  '的',
  '方',
  '向',
  '我',
  '们',
  '喜',
  '欢',
  '学',
  '习',
  '人',
  '工',
  '智',
  '能',
  '是',
  '未',
  '来',
  '的',
  '方',
  '向'],
 ['B',
  'E',
  'B',
  'E',
  'B',
  'E',
  'B',
  'M',
  'M',
  'E',
  'S',
  'B',
  'E',
  'S',
  'B',
  'E',
  'B',
  'E',
  'B',
  'E',
  'B',
  'E',
  'B',
  'M',
  'M',
  'E',
  'S',
  'B',
  'E',
  'S',
  'B',
  'E'])

将字符和状态转换为数字

In [None]:
char_to_idx = {char: idx for idx, char in enumerate(set(observations))}
state_to_idx = {state: idx for idx, state in enumerate(set(states))}
char_to_idx, state_to_idx

({'未': 0,
  '我': 1,
  '能': 2,
  '工': 3,
  '喜': 4,
  '学': 5,
  '欢': 6,
  '智': 7,
  '是': 8,
  '们': 9,
  '人': 10,
  '方': 11,
  '向': 12,
  '来': 13,
  '习': 14,
  '的': 15},
 {'M': 0, 'E': 1, 'B': 2, 'S': 3})

In [None]:
X = np.array([[char_to_idx[char]] for char in observations])
y = np.array([state_to_idx[state] for state in states])
X, y

(array([[ 1],
        [ 9],
        [ 4],
        [ 6],
        [ 5],
        [14],
        [10],
        [ 3],
        [ 7],
        [ 2],
        [ 8],
        [ 0],
        [13],
        [15],
        [11],
        [12],
        [ 1],
        [ 9],
        [ 4],
        [ 6],
        [ 5],
        [14],
        [10],
        [ 3],
        [ 7],
        [ 2],
        [ 8],
        [ 0],
        [13],
        [15],
        [11],
        [12]]),
 array([2, 1, 2, 1, 2, 1, 2, 0, 0, 1, 3, 2, 1, 3, 2, 1, 2, 1, 2, 1, 2, 1,
        2, 0, 0, 1, 3, 2, 1, 3, 2, 1]))

## 模型训练

In [58]:
model = hmm.MultinomialHMM(n_components=len(state_to_idx), n_iter=100)
model.fit(X, lengths=[len(sentence) for sentence, _ in data])

Fitting a model with 75 free scalar parameters with only 32 data points will result in a degenerate solution.


In [11]:
def segment_sentence(sentence, model, char_to_idx, state_to_idx):
    # 将句子转换为观测序列
    X_new = np.array([[char_to_idx[char]] for char in sentence])

    # 预测状态序列
    logprob, state_seq = model.decode(X_new)

    print(logprob)

    # 将状态序列转换为标签
    idx_to_state = {idx: state for state, idx in state_to_idx.items()}
    state_seq = [idx_to_state[idx] for idx in state_seq]

    # 根据状态序列进行分词
    segmented_sentence = []
    for char, state in zip(sentence, state_seq):
        segmented_sentence.append(char)
        if state == 'E' or state == 'S':
            segmented_sentence.append(' ')

    return ''.join(segmented_sentence).strip()


# 测试分词
test_sentence = "我们喜欢学习人工智能"
segmented_sentence = segment_sentence(
    test_sentence, model, char_to_idx, state_to_idx)
print(segmented_sentence)  # 输出: "我们 喜欢 学习 人工智能"

-106.914136642546
我 们 喜欢 学习 人工 智 能


## 使用自定义的参数

In [13]:
n_state = len(set(states))
n_observation = len(set(observations))
n_state, n_observation

(4, 16)

In [25]:
start_prob = []
str_data = [v[0] for v in data]
state_data = [v[1] for v in data]
str_data, state_data

(['我们', '喜欢', '学习', '人工智能', '是', '未来', '的', '方向'],
 ['BE', 'BE', 'BE', 'BMME', 'S', 'BE', 'S', 'BE'])

In [30]:
char_to_idx = {char: idx for idx, char in enumerate(set(observations))}
state_to_idx = {state: idx for idx, state in enumerate(set(states))}
char_to_idx, state_to_idx

({'未': 0,
  '我': 1,
  '能': 2,
  '工': 3,
  '喜': 4,
  '学': 5,
  '欢': 6,
  '智': 7,
  '是': 8,
  '们': 9,
  '人': 10,
  '方': 11,
  '向': 12,
  '来': 13,
  '习': 14,
  '的': 15},
 {'M': 0, 'E': 1, 'B': 2, 'S': 3})

### 初始概率

In [52]:
start_prob = []
c_map = {state: 0 for state in set(state_to_idx.keys())}
count = 0
for v in state_data:
    for c in v:
        c_map[c] += 1
        count += 1
print(c_map)
for k, v in c_map.items():
    start_prob.append(v / count)
start_prob

{'M': 2, 'E': 6, 'B': 6, 'S': 2}


[0.125, 0.375, 0.375, 0.125]

In [53]:
state_to_idx

{'M': 0, 'E': 1, 'B': 2, 'S': 3}

### 状态转移矩阵

In [70]:
trans_mat = np.zeros((n_state, n_state))
for s1, v1 in state_to_idx.items():  # 计算每一个状态到另一个状态的转移概率
    total = 0
    for s2, v2 in state_to_idx.items():
        co = 0
        for row in range(len(state_data)):  # 遍历每一个词语
            for i in range(len(state_data[row]) - 1):
                if state_data[row][i] == s1 and state_data[row][i + 1] == s2:
                    co += 1
                    total += 1
        trans_mat[v1][v2] = co
    trans_mat[v1] = trans_mat[v1] / total if total != 0 else 0.25

trans_mat, model.transmat_

(array([[0.5       , 0.5       , 0.        , 0.        ],
        [0.25      , 0.25      , 0.25      , 0.25      ],
        [0.16666667, 0.83333333, 0.        , 0.        ],
        [0.25      , 0.25      , 0.25      , 0.25      ]]),
 array([[0.00000000e+000, 7.17356037e-001, 1.98191190e-263,
         2.82643963e-001],
        [0.00000000e+000, 0.00000000e+000, 1.00000000e+000,
         5.54182877e-121],
        [0.00000000e+000, 6.25815306e-244, 1.00000000e+000,
         3.70815361e-010],
        [0.00000000e+000, 0.00000000e+000, 1.00000000e+000,
         5.67282588e-122]]))

### 发射矩阵

In [71]:
emit_mat = np.zeros((n_state, n_observation))
for s1, v1 in state_to_idx.items():
    for s2, v2 in char_to_idx.items():
        co = 0
        for row in range(len(state_data)):
            for i in range(len(state_data[row])):
                if state_data[row][i] == s1 and str_data[row][i] == s2:
                    co += 1
        emit_mat[v1][v2] = co
for i in range(len(emit_mat)):
    total = 0
    for j in range(len(emit_mat[i])):
        total += emit_mat[i][j]
    emit_mat[i] = emit_mat[i] / total
emit_mat

array([[0.        , 0.        , 0.        , 0.5       , 0.        ,
        0.        , 0.        , 0.5       , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.16666667, 0.        , 0.        ,
        0.        , 0.16666667, 0.        , 0.        , 0.16666667,
        0.        , 0.        , 0.16666667, 0.16666667, 0.16666667,
        0.        ],
       [0.16666667, 0.16666667, 0.        , 0.        , 0.16666667,
        0.16666667, 0.        , 0.        , 0.        , 0.        ,
        0.16666667, 0.16666667, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.5       , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5       ]])

In [72]:
model_div = hmm.MultinomialHMM(n_components=n_state, n_iter=100)
model_div.startprob_ = np.array(start_prob)
model_div.transmat_ = np.array(trans_mat)
model_div.emissionprob_ = np.array(emit_mat)

In [74]:
test_sentence = "我们喜欢学习人工智能"
X_test = np.array([[char_to_idx[char] for char in test_sentence]])
logprob, state_seq = model_div.decode(X_test.T)
state_seq

array([2, 1, 2, 1, 2, 1, 2, 0, 0, 1])

In [75]:
segment_sentence(test_sentence, model_div, char_to_idx, state_to_idx)

-24.58510095204553


'我们 喜欢 学习 人工智能'