In [181]:
import numpy as np
from collections import defaultdict
from check import check

In [182]:
class CRF:
    def __init__(self, tags, feature_templates):
        self.tags = tags
        self.tag2idx = {tag: idx for idx, tag in enumerate(tags)}
        self.feature_templates = feature_templates
        self.weights = defaultdict(float)                               # 特征权重
        self.transition = np.random.randn(len(tags), len(tags)) * 0.01  # 转移矩阵

    # 提取特征
    def extract_features(self, seq, pos, prev_tag, current_tag):
        features = []
        for template in self.feature_templates:
            if ":" not in template:
                print("模板格式错误：", template)
                continue
            # Unigram特征
            if template.startswith("U"):
                parts = template.split(":")[1].split("/")
                context = []
                for part in parts:
                    off = int(part[3:-1].split(",")[0])
                    idx = pos + off
                    if idx < 0:
                        context.append("[BEG]")
                    elif idx >= len(seq):
                        context.append("[END]")
                    else:
                        context.append(seq[idx])
                features.append(f"{current_tag}::{template}:{'/'.join(context)}")
            # Bigram特征
            elif template.startswith("B"):
                if prev_tag is not None:
                    parts = template.split(":")[1].split("/")
                    context = []
                    for part in parts:
                        off = int(part[3:-1].split(",")[0])
                        idx = pos + off
                        if idx < 0:
                            context.append("[BEG]")
                        elif idx >= len(seq):
                            context.append("[END]")
                        else:
                            context.append(seq[idx])
                    features.append(f"{prev_tag}->{current_tag}::{template}:{'/'.join(context)}")
        return features
                
    # 前向算法
    # alpha[t, tag]: 给定观测序列的前t+1个词时，以tag作为第t个词的标签的所有路径的“分数”之和
    # Z: 配分函数（所有可能路径的得分和）
    def forward(self, seq):
        T = len(seq)
        N = len(self.tags)
        alpha = np.zeros((T, N))

        # 初始化
        for i in range(N):
            features = self.extract_features(seq, 0, None, self.tags[i])
            alpha[0][i] = sum(self.weights[f] for f in features)
        
        # 递推
        for t in range(1, T):
            for tag in range(N):
                score = 0
                for prev_tag in range(N):
                    features = self.extract_features(seq, t, self.tags[prev_tag], self.tags[tag])
                    trans_score = self.transition[prev_tag, tag]
                    emit_score = sum(self.weights[f] for f in features)
                    score += np.exp(alpha[t - 1][prev_tag] + trans_score + emit_score)
                alpha[t, tag] = np.log(score) if score > 0 else -np.inf

        log_Z = np.log(sum(np.exp(alpha[-1]))) if any(np.isfinite(alpha[-1])) else -np.inf
        return alpha, log_Z

    # 后向算法
    # beta[t, tag]: 在给定观测序列的第t个词处，已知该词的标签为tag，从t到序列末尾的所有可能标签路径的“分数”之和。
    def backward(self, seq):
        T = len(seq)
        N = len(self.tags)
        beta = np.zeros((T, N))
        
        # 初始化
        beta[T-1, :] = 0

        # 递推
        for t in range(T-2, -1, -1):
            for tag in range(N):
                score = 0
                for next_tag in range(N):
                    features = self.extract_features(seq, t+1, self.tags[tag], self.tags[next_tag])
                    trans_score = self.transition[tag, next_tag]
                    emit_score = sum(self.weights[f] for f in features)
                    score += np.exp(beta[t+1, next_tag] + trans_score + emit_score)
                beta[t, tag] = np.log(score) if score > 0 else -np.inf
        return beta
    
    # 维特比解码
    def viterbi_decode(self, seq):
        T = len(seq)
        N = len(self.tags)
        viterbi = np.zeros((T, N))
        backptrs = np.zeros((T, N), dtype=int)    # 回溯指针

        # 初始化
        for tag in range(N):
            features = self.extract_features(seq, 0, None, self.tags[tag])
            viterbi[0, tag] = sum(self.weights[f] for f in features)

        # 递推
        for t in range(1, T):
            for i in range(N):        # 当前标签
                max_score = -np.inf
                best_prev_tag = 0
                for j in range(N):     # 前一个标签
                    trans_score = self.transition[j][i]
                    features = self.extract_features(seq, t, self.tags[j], self.tags[i])
                    emit_score = sum(self.weights[f] for f in features)
                    score = viterbi[t-1][j] + trans_score + emit_score
                    if score > max_score:
                        max_score = score
                        best_prev_tag = j
                viterbi[t][i] = max_score
                backptrs[t][i] = best_prev_tag

        # 回溯
        best_path = [np.argmax(viterbi[-1])]
        for t in reversed(range(1, T)):
            best_path.append(backptrs[t][best_path[-1]])
        best_path.reverse()

        return [self.tags[i] for i in best_path]
    
    def compute_gradients(self, seq, tag_seq):
        T = len(seq)
        N = len(self.tags)

        # 经验特征（真实情况中出现过的特征）
        empirical_features = set()
        for t in range(T):
            prev_tag = tag_seq[t-1] if t > 0 else None
            features = self.extract_features(seq, t, prev_tag, tag_seq[t])
            empirical_features.update(features)

        # 期望特征（模型算出来的特征）
        alpha, log_Z = self.forward(seq)
        beta = self.backward(seq)
        expected_features = defaultdict(float)

        # t=0
        for i in range(N):
            features = self.extract_features(seq, 0, None, self.tags[i])
            p = np.exp(alpha[0][i] + beta[0][i] - log_Z) if log_Z != -np.inf else 0
            for f in features:
                expected_features[f] += p

        # t>=1
        for t in range(1, T):
            for i in range(N):      # current tag
                for j in range(N):  # prev tag
                    features = self.extract_features(seq, t, self.tags[j], self.tags[i])
                    trans_score = self.transition[i][j]
                    emit_score = sum(self.weights[f] for f in features)
                    p = np.exp(alpha[t-1][i] + trans_score + emit_score + beta[t][j] - log_Z) if log_Z != -np.inf else 0
                    for f in features:
                        expected_features[f] += p
        
        # 计算 weights 梯度
        w_gradient = defaultdict(float)
        for f in empirical_features:
            w_gradient[f] += 1  # 真实特征计数
        for f in expected_features:
            w_gradient[f] -= expected_features[f]

        # 计算 transition 矩阵梯度
        t_gradient = np.zeros((N, N))
        for t in range(1, T):
            # empirical
            i = self.tag2idx[tag_seq[t-1]]
            j = self.tag2idx[tag_seq[t]]
            t_gradient[i][j] += 1

            # expected
            for i_ in range(N):
                for j_ in range(N):
                    features = self.extract_features(seq, t, self.tags[i_], self.tags[j_])
                    p = np.exp(alpha[t-1][i_] + self.transition[i_][j_] + sum(self.weights[f] for f in features) + beta[t][j_] - log_Z) if log_Z != -np.inf else 0
                    t_gradient[i_][j_] -= p

        return w_gradient, t_gradient, log_Z


    # 训练CRF模型
    def train(self, seqs, tag_seqs, lr=0.1, max_iter=10):
        for iter in range(max_iter):
            avg_loss = 0
            for seq, tag_seq in zip(seqs, tag_seqs):
                w_grad, t_grad, log_Z = self.compute_gradients(seq, tag_seq)

                for f in w_grad:
                    self.weights[f] += lr * w_grad[f]
                
                self.transition += lr * t_grad
                
                # 计算损失
                score = 0
                for t in range(len(seq)):
                    prev_tag = tag_seq[t-1] if t > 0 else None
                    features = self.extract_features(seq, t, prev_tag, tag_seq[t])
                    score += sum(self.weights[f] for f in features)
                    if t > 0:
                        i = self.tag2idx[tag_seq[t-1]]
                        j = self.tag2idx[tag_seq[t]]
                        score += self.transition[i, j]
                avg_loss += log_Z - score

            avg_loss /= len(seqs)
            print(f"Iteration {iter}, Loss: {avg_loss}")

In [183]:
feature_templates = [
    "U00:%x[-2,0]",
    "U01:%x[-1,0]",
    "U02:%x[0,0]",
    "U03:%x[1,0]",
    "U04:%x[2,0]",
    "U05:%x[-2,0]/%x[-1,0]",
    "U06:%x[-1,0]/%x[0,0]",
    "U07:%x[-1,0]/%x[1,0]",
    "U08:%x[0,0]/%x[1,0]",
    "U09:%x[1,0]/%x[2,0]",
]

In [184]:
# 定义标签集
tags = ["O", "B-NAME", "I-NAME", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

# 初始化CRF
crf = CRF(tags, feature_templates)

# 训练数据示例
train_sentences = [["张", "三", "在", "北京", "工作"], ["李", "四", "是", "腾讯", "员工"]]
train_tags = [["B-NAME", "I-NAME", "O", "B-LOC", "O"], ["B-NAME", "I-NAME", "O", "B-ORG", "O"]]

# 训练模型
crf.train(train_sentences, train_tags, max_iter=50)

# 预测新句子
test_sentence = ["王", "五", "来自", "上海"]
predicted_tags = crf.viterbi_decode(test_sentence)
print("预测结果:", list(zip(test_sentence, predicted_tags)))

Iteration 0, Loss: 4.843256297009441
Iteration 1, Loss: 1.162976847863514
Iteration 2, Loss: -1.0869853298223768
Iteration 3, Loss: -2.5001258395600354
Iteration 4, Loss: -3.2525668685552915
Iteration 5, Loss: -3.5916067878546905
Iteration 6, Loss: -3.7321245985288627
Iteration 7, Loss: -3.7889379629642352
Iteration 8, Loss: -3.811516540115367
Iteration 9, Loss: -3.8197554907994764
Iteration 10, Loss: -3.821629253774411
Iteration 11, Loss: -3.8204288592528926
Iteration 12, Loss: -3.8175880221557357
Iteration 13, Loss: -3.8137842825752024
Iteration 14, Loss: -3.8093746650367706
Iteration 15, Loss: -3.804572129219892
Iteration 16, Loss: -3.799519182463861
Iteration 17, Loss: -3.7943197592669122
Iteration 18, Loss: -3.7890537305995906
Iteration 19, Loss: -3.7837839854542494
Iteration 20, Loss: -3.7785602487129566
Iteration 21, Loss: -3.7734214270216384
Iteration 22, Loss: -3.768397271996932
Iteration 23, Loss: -3.763509712582234
Iteration 24, Loss: -3.7587740111429824
Iteration 25, Loss: 

In [185]:
def process_data(file_path):
    # 初始化数据结构
    tags = set()
    words = set()
    tag_seqs = []
    word_seqs = []
    
    current_state_seq = []
    current_obs_seq = []
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:  # 空行表示句子结束
                if current_state_seq and current_obs_seq:
                    tag_seqs.append(current_state_seq)
                    word_seqs.append(current_obs_seq)
                    current_state_seq = []
                    current_obs_seq = []
                continue
                
            parts = line.split()
            if len(parts) >= 2:  # 确保有词和标签
                word = parts[0]
                tag = parts[-1]  # 假设标签在最后
                
                # 更新状态和观测集合
                tags.add(tag)
                words.add(word)
                
                # 添加到当前序列
                current_state_seq.append(tag)
                current_obs_seq.append(word)
    
    # 处理最后一个句子（如果文件不以空行结尾）
    if current_state_seq and current_obs_seq:
        tag_seqs.append(current_state_seq)
        word_seqs.append(current_obs_seq)
    
    # 转换为列表并排序（为了确定性）
    tags = sorted(tags)
    words = sorted(words)
    
    return {
        'tags': tags,
        'words': words,
        'tag_seqs': tag_seqs,
        'word_seqs': word_seqs
    }

In [193]:
# 加载训练数据
train_data_path = "./NER/Chinese/mytrain.txt"
train_data = process_data(train_data_path)

tags = train_data['tags']
words = train_data['words']
tag_seqs = train_data['tag_seqs']
word_seqs = train_data['word_seqs']

In [202]:
# 训练CRF模型
crf = CRF(tags, feature_templates)
crf.train(word_seqs, tag_seqs, max_iter=60)

Iteration 0, Loss: 21.494153608466444
Iteration 1, Loss: 4.447593915613142
Iteration 2, Loss: -0.11345884484030933
Iteration 3, Loss: -2.7877959686463543
Iteration 4, Loss: -4.235524016701729
Iteration 5, Loss: -4.958916460977056
Iteration 6, Loss: -5.326304825290416
Iteration 7, Loss: -5.533958328757754
Iteration 8, Loss: -5.670547631783288
Iteration 9, Loss: -5.770158177852323
Iteration 10, Loss: -5.847598767109531
Iteration 11, Loss: -5.9108032322493305
Iteration 12, Loss: -5.964327642571564
Iteration 13, Loss: -6.010911011760049
Iteration 14, Loss: -6.052302272184852
Iteration 15, Loss: -6.089690704707027
Iteration 16, Loss: -6.123947397093207
Iteration 17, Loss: -6.155782444065613
Iteration 18, Loss: -6.18586259288104
Iteration 19, Loss: -6.214900141041698
Iteration 20, Loss: -6.243690862156052
Iteration 21, Loss: -6.273044819068491
Iteration 22, Loss: -6.303570931209265
Iteration 23, Loss: -6.335407527190602
Iteration 24, Loss: -6.368129108352713
Iteration 25, Loss: -6.4009509054

In [200]:
# valid_data_path = "./NER/Chinese/validation.txt"
valid_data_path = "./NER/Chinese/myvalid.txt"
valid_data = process_data(valid_data_path)

output_path = "./crf_validation_output.txt"
with open(output_path, "w", encoding="utf-8") as fout:
    for seq, tag_seq in zip(valid_data['word_seqs'], valid_data['tag_seqs']):
        predicted_tags = crf.viterbi_decode(seq)
        for word, tag in zip(seq, predicted_tags):
            fout.write(f"{word} {tag}\n")
        fout.write("\n")  # 句子间空行

In [201]:
check(language = "English", gold_path="NER/Chinese/myvalid.txt", my_path="crf_validation_output.txt")

              precision    recall  f1-score   support

       B-PER     0.0000    0.0000    0.0000         0
       I-PER     0.0000    0.0000    0.0000         0
       B-ORG     0.0000    0.0000    0.0000         2
       I-ORG     0.0000    0.0000    0.0000         0
       B-LOC     0.0000    0.0000    0.0000         0
       I-LOC     0.0000    0.0000    0.0000         0
      B-MISC     0.0000    0.0000    0.0000         0
      I-MISC     0.0000    0.0000    0.0000         0

   micro avg     0.0000    0.0000    0.0000         2
   macro avg     0.0000    0.0000    0.0000         2
weighted avg     0.0000    0.0000    0.0000         2

