In [26]:
import numpy as np
from collections import defaultdict
from check import check

In [27]:
class CRF:
    def __init__(self, tags, feature_templates):
        self.tags = tags
        self.tag2idx = {tag: idx for idx, tag in enumerate(tags)}
        self.feature_templates = feature_templates
        self.weights = defaultdict(float)                               # 特征权重
        self.transition = np.random.randn(len(tags), len(tags)) * 0.01  # 转移矩阵

    # 提取特征
    def extract_features(self, seq, pos, prev_tag, current_tag):
        features = []
        for template in self.feature_templates:
            if ":" not in template:
                print("模板格式错误：", template)
                continue
            # Unigram特征
            if template.startswith("U"):
                parts = template.split(":")[1].split("/")
                context = []
                for part in parts:
                    off = int(part[3:-1].split(",")[0])
                    idx = pos + off
                    if idx < 0:
                        context.append("[BEG]")
                    elif idx >= len(seq):
                        context.append("[END]")
                    else:
                        context.append(seq[idx])
                features.append(f"{current_tag}::{template}:{'/'.join(context)}")
            # Bigram特征
            elif template.startswith("B"):
                if prev_tag is not None:
                    parts = template.split(":")[1].split("/")
                    context = []
                    for part in parts:
                        off = int(part[3:-1].split(",")[0])
                        idx = pos + off
                        if idx < 0:
                            context.append("[BEG]")
                        elif idx >= len(seq):
                            context.append("[END]")
                        else:
                            context.append(seq[idx])
                    features.append(f"{prev_tag}->{current_tag}::{template}:{'/'.join(context)}")
        return features
                
    # 前向算法
    # alpha[t, tag]: 给定观测序列的前t+1个词时，以tag作为第t个词的标签的所有路径的“分数”之和
    # Z: 配分函数（所有可能路径的得分和）
    def forward(self, seq):
        T = len(seq)
        N = len(self.tags)
        alpha = np.zeros((T, N))

        # 初始化
        for i in range(N):
            features = self.extract_features(seq, 0, None, self.tags[i])
            alpha[0][i] = sum(self.weights[f] for f in features)
        
        # 递推
        for t in range(1, T):
            for tag in range(N):
                log_scores = []
                for prev_tag in range(N):
                    features = self.extract_features(seq, t, self.tags[prev_tag], self.tags[tag])
                    trans_score = self.transition[prev_tag, tag]
                    emit_score = sum(self.weights[f] for f in features)
                    log_scores.append(alpha[t - 1][prev_tag] + trans_score + emit_score)
                alpha[t, tag] = np.logaddexp.reduce(log_scores) if log_scores else -np.inf

        log_Z = np.log(sum(np.exp(alpha[-1]))) if any(np.isfinite(alpha[-1])) else -np.inf

        return alpha, log_Z

    # 后向算法
    # beta[t, tag]: 在给定观测序列的第t个词处，已知该词的标签为tag，从t到序列末尾的所有可能标签路径的“分数”之和。
    def backward(self, seq):
        T = len(seq)
        N = len(self.tags)
        beta = np.zeros((T, N))
        
        # 初始化
        beta[T-1, :] = 0

        # 递推
        for t in range(T-2, -1, -1):
            for tag in range(N):
                log_scores = []
                for next_tag in range(N):
                    features = self.extract_features(seq, t+1, self.tags[tag], self.tags[next_tag])
                    trans_score = self.transition[tag, next_tag]
                    emit_score = sum(self.weights[f] for f in features)
                    log_scores.append(beta[t+1, next_tag] + trans_score + emit_score)
                beta[t, tag] = np.logaddexp.reduce(log_scores) if log_scores else -np.inf
        return beta
    
    # 维特比解码
    def viterbi_decode(self, seq):
        T = len(seq)
        N = len(self.tags)
        viterbi = np.zeros((T, N))
        backptrs = np.zeros((T, N), dtype=int)    # 回溯指针

        # 初始化
        for tag in range(N):
            features = self.extract_features(seq, 0, None, self.tags[tag])
            viterbi[0, tag] = sum(self.weights[f] for f in features)

        # 递推
        for t in range(1, T):
            for i in range(N):        # 当前标签
                max_score = -np.inf
                best_prev_tag = 0
                for j in range(N):     # 前一个标签
                    trans_score = self.transition[j][i]
                    features = self.extract_features(seq, t, self.tags[j], self.tags[i])
                    emit_score = sum(self.weights[f] for f in features)
                    score = viterbi[t-1][j] + trans_score + emit_score
                    if score > max_score:
                        max_score = score
                        best_prev_tag = j
                viterbi[t][i] = max_score
                backptrs[t][i] = best_prev_tag

        # 回溯
        best_path = [np.argmax(viterbi[-1])]
        for t in reversed(range(1, T)):
            best_path.append(backptrs[t][best_path[-1]])
        best_path.reverse()

        return [self.tags[i] for i in best_path]
    
    def compute_gradients(self, seq, tag_seq):
        T = len(seq)
        N = len(self.tags)

        # 经验特征（真实情况中出现过的特征）
        empirical_features = set()
        for t in range(T):
            prev_tag = tag_seq[t-1] if t > 0 else None
            features = self.extract_features(seq, t, prev_tag, tag_seq[t])
            empirical_features.update(features)

        # 期望特征（模型算出来的特征）
        alpha, log_Z = self.forward(seq)
        beta = self.backward(seq)
        expected_features = defaultdict(float)

        # t=0
        for i in range(N):
            features = self.extract_features(seq, 0, None, self.tags[i])
            p = np.exp(alpha[0][i] + beta[0][i] - log_Z) if log_Z != -np.inf else 0
            for f in features:
                expected_features[f] += p

        # t>=1
        for t in range(1, T):
            for i in range(N):      # current tag
                for j in range(N):  # prev tag
                    features = self.extract_features(seq, t, self.tags[j], self.tags[i])
                    trans_score = self.transition[i][j]
                    emit_score = sum(self.weights[f] for f in features)
                    p = np.exp(alpha[t-1][i] + trans_score + emit_score + beta[t][j] - log_Z) if log_Z != -np.inf else 0
                    for f in features:
                        expected_features[f] += p
        
        # 计算 weights 梯度
        w_gradient = defaultdict(float)
        for f in empirical_features:
            w_gradient[f] += 1  # 真实特征计数
        for f in expected_features:
            w_gradient[f] -= expected_features[f]

        # 计算 transition 矩阵梯度
        t_gradient = np.zeros((N, N))
        for t in range(1, T):
            # empirical
            i = self.tag2idx[tag_seq[t-1]]
            j = self.tag2idx[tag_seq[t]]
            t_gradient[i][j] += 1

            # expected
            for i_ in range(N):
                for j_ in range(N):
                    features = self.extract_features(seq, t, self.tags[i_], self.tags[j_])
                    p = np.exp(alpha[t-1][i_] + self.transition[i_][j_] + sum(self.weights[f] for f in features) + beta[t][j_] - log_Z) if log_Z != -np.inf else 0
                    t_gradient[i_][j_] -= p

        return w_gradient, t_gradient, log_Z
    # def compute_gradients(self, sentence, true_tags):
    #     """计算梯度"""
    #     # 提取真实路径特征
    #     true_features = set()
    #     for t in range(len(sentence)):
    #         prev_tag = true_tags[t - 1] if t > 0 else None
    #         features = self.extract_features(sentence, t, prev_tag, true_tags[t])
    #         true_features.update(features)

    #     # 前向后向算法
    #     alpha, log_Z = self.forward(sentence)
    #     beta = self.backward(sentence)
    #     expected_features = defaultdict(float)

    #     # 计算特征期望
    #     # P(y_0=j|x)            = alpha_0(j) * E_0(j) / Z
    #     # P(y_{t-1}=i, y_t=j|x) = alpha_{t-1}(i) * T(i,j) * E_t(j) * beta_t(j) / Z
    #     for t in range(len(sentence)):
    #         for i in range(len(self.tags)):
    #             for j in range(len(self.tags)):
    #                 # 提取特征
    #                 cur_tag = self.tags[j]
    #                 prev_tag = self.tags[i] if t > 0 else None
    #                 features = self.extract_features(sentence, t, prev_tag, cur_tag)

    #                 # 计算概率
    #                 if t == 0:
    #                     prob = np.exp(alpha[t][j] + beta[t][j] - log_Z) if log_Z != -np.inf else 0
    #                 else:
    #                     trans_score = self.transition[i][j]
    #                     emit_score = sum(self.weights[f] for f in features)
    #                     prob = np.exp(alpha[t - 1][i] + trans_score + emit_score + beta[t][j] - log_Z) if log_Z != -np.inf else 0

    #                 # 累加特征期望
    #                 for f in features:
    #                     expected_features[f] += prob

    #     # 计算权重梯度
    #     # weight_grad[f] = true - expected
    #     weight_grad = defaultdict(float)
    #     for f in true_features:
    #         weight_grad[f] += 1
    #     for f in expected_features:
    #         weight_grad[f] -= expected_features[f]

    #     # 计算转移矩阵梯度
    #     # transition_grad[f] = true - expected
    #     transition_grad = np.zeros_like(self.transition)
    #     for t in range(1, len(sentence)):
    #         i = self.tag2idx[true_tags[t - 1]]
    #         j = self.tag2idx[true_tags[t]]
    #         transition_grad[i][j] += 1

    #         for i_ in range(len(self.tags)):
    #             for j_ in range(len(self.tags)):
    #                 features = self.extract_features(sentence, t, self.tags[i_], self.tags[j_])
    #                 prob = np.exp(alpha[t - 1][i_] + self.transition[i_][j_] + sum(self.weights[f] for f in features) + beta[t][j_] - log_Z) if log_Z != -np.inf else 0
    #                 transition_grad[i_][j_] -= prob

    #     return weight_grad, transition_grad, log_Z


    # 训练CRF模型
    def train(self, seqs, tag_seqs, lr=0.1, max_iter=10):
        for iter in range(max_iter):
            avg_loss = 0
            for seq, tag_seq in zip(seqs, tag_seqs):
                w_grad, t_grad, log_Z = self.compute_gradients(seq, tag_seq)

                for f in w_grad:
                    self.weights[f] += lr * w_grad[f]
                
                self.transition += lr * t_grad
                
                # 计算损失
                score = 0
                for t in range(len(seq)):
                    prev_tag = tag_seq[t-1] if t > 0 else None
                    features = self.extract_features(seq, t, prev_tag, tag_seq[t])
                    score += sum(self.weights[f] for f in features)
                    if t > 0:
                        i = self.tag2idx[tag_seq[t-1]]
                        j = self.tag2idx[tag_seq[t]]
                        score += self.transition[i, j]
                avg_loss += log_Z - score

            avg_loss /= len(seqs)
            print(f"Iteration {iter}, Loss: {avg_loss:.2f}")
    # def train(self, sentences, true_tags_seq, max_iter=10, learning_rate=0.1):
    #     for iteration in range(max_iter):
    #         total_loss = 0
    #         for sentence, tags in zip(sentences, true_tags_seq):
    #             # 计算梯度
    #             weights_grad, transition_grad, log_Z = self.compute_gradients(sentence, tags)

    #             # 更新权重
    #             for f in weights_grad:
    #                 self.weights[f] += learning_rate * weights_grad[f]

    #             # 更新转移矩阵(带约束)
    #             for i in range(len(self.tags)):
    #                 for j in range(len(self.tags)):
    #                     self.transition[i][j] += learning_rate * transition_grad[i][j]

    #             # 计算真实路径得分
    #             true_score = 0
    #             for t in range(len(sentence)):
    #                 prev_tag = tags[t - 1] if t > 0 else None
    #                 features = self.extract_features(sentence, t, prev_tag, tags[t])
    #                 true_score += sum(self.weights[f] for f in features)
    #                 if t > 0:
    #                     i = self.tag2idx[tags[t - 1]]
    #                     j = self.tag2idx[tags[t]]
    #                     true_score += self.transition[i][j]

    #             # 累加损失
    #             total_loss += log_Z - true_score

    #         print(f"迭代次数={iteration},  Loss={total_loss/len(sentences):.2f}")

In [28]:
feature_templates = [
    "U00:%x[-2,0]",
    "U01:%x[-1,0]",
    "U02:%x[0,0]",
    "U03:%x[1,0]",
    "U04:%x[2,0]",
    "U05:%x[-2,0]/%x[-1,0]",
    "U06:%x[-1,0]/%x[0,0]",
    "U07:%x[-1,0]/%x[1,0]",
    "U08:%x[0,0]/%x[1,0]",
    "U09:%x[1,0]/%x[2,0]",
    "B00:%x[-2,0]",
    "B01:%x[-1,0]",
    "B02:%x[0,0]",
    "B03:%x[1,0]",
    "B04:%x[2,0]",
]

In [29]:
# 定义标签集
tags = ["O", "B-NAME", "I-NAME", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

# 初始化CRF
crf = CRF(tags, feature_templates)

# 训练数据示例
train_sentences = [["张", "三", "在", "北京", "工作"], ["李", "四", "是", "腾讯", "员工"]]
train_tags = [["B-NAME", "I-NAME", "O", "B-LOC", "O"], ["B-NAME", "I-NAME", "O", "B-ORG", "O"]]

# 训练模型
crf.train(train_sentences, train_tags, max_iter=10)

# 预测新句子
test_sentence = ["王", "五", "来自", "上海"]
predicted_tags = crf.viterbi_decode(test_sentence)
print("预测结果:", list(zip(test_sentence, predicted_tags)))

Iteration 0, Loss: 2.86
Iteration 1, Loss: -2.39
Iteration 2, Loss: -5.10
Iteration 3, Loss: -5.81
Iteration 4, Loss: -5.94
Iteration 5, Loss: -5.98
Iteration 6, Loss: -5.99
Iteration 7, Loss: -5.99
Iteration 8, Loss: -5.99
Iteration 9, Loss: -6.00
预测结果: [('王', 'B-NAME'), ('五', 'I-NAME'), ('来自', 'B-ORG'), ('上海', 'O')]


In [30]:
def process_data(file_path):
    # 初始化数据结构
    tags = set()
    words = set()
    tag_seqs = []
    word_seqs = []
    
    current_state_seq = []
    current_obs_seq = []
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:  # 空行表示句子结束
                if current_state_seq and current_obs_seq:
                    tag_seqs.append(current_state_seq)
                    word_seqs.append(current_obs_seq)
                    current_state_seq = []
                    current_obs_seq = []
                continue
                
            parts = line.split()
            if len(parts) >= 2:  # 确保有词和标签
                word = parts[0]
                tag = parts[-1]  # 假设标签在最后
                
                # 更新状态和观测集合
                tags.add(tag)
                words.add(word)
                
                # 添加到当前序列
                current_state_seq.append(tag)
                current_obs_seq.append(word)
    
    # 处理最后一个句子（如果文件不以空行结尾）
    if current_state_seq and current_obs_seq:
        tag_seqs.append(current_state_seq)
        word_seqs.append(current_obs_seq)
    
    # 转换为列表并排序（为了确定性）
    tags = sorted(tags)
    words = sorted(words)
    
    return {
        'tags': tags,
        'words': words,
        'tag_seqs': tag_seqs,
        'word_seqs': word_seqs
    }

In [31]:
# 加载训练数据
train_data_path = "./NER/Chinese/mytrain.txt"
train_data = process_data(train_data_path)

tags = train_data['tags']
words = train_data['words']
tag_seqs = train_data['tag_seqs']
word_seqs = train_data['word_seqs']

In [32]:
# 训练CRF模型
crf = CRF(tags, feature_templates)
crf.train(word_seqs, tag_seqs, lr=0.1,max_iter=10)

Iteration 0, Loss: 7.79
Iteration 1, Loss: -10.33
Iteration 2, Loss: -6.43
Iteration 3, Loss: -13.45
Iteration 4, Loss: -5.18
Iteration 5, Loss: -7.44
Iteration 6, Loss: -4.33
Iteration 7, Loss: -11.04
Iteration 8, Loss: inf
Iteration 9, Loss: inf


In [33]:
# valid_data_path = "./NER/Chinese/validation.txt"
valid_data_path = "./NER/Chinese/myvalid.txt"
valid_data = process_data(valid_data_path)

output_path = "./crf_validation_output.txt"
with open(output_path, "w", encoding="utf-8") as fout:
    for seq, tag_seq in zip(valid_data['word_seqs'], valid_data['tag_seqs']):
        predicted_tags = crf.viterbi_decode(seq)
        for word, tag in zip(seq, predicted_tags):
            fout.write(f"{word} {tag}\n")
        fout.write("\n")  # 句子间空行

In [34]:
check(language = "Chinese", gold_path="NER/Chinese/myvalid.txt", my_path="crf_validation_output.txt")

              precision    recall  f1-score   support

      B-NAME     0.0000    0.0000    0.0000         1
      M-NAME     0.0000    0.0000    0.0000         1
      E-NAME     0.0000    0.0000    0.0000         1
      S-NAME     0.0000    0.0000    0.0000         1
      B-CONT     0.0000    0.0000    0.0000         0
      M-CONT     0.0000    0.0000    0.0000         0
      E-CONT     0.0000    0.0000    0.0000         0
      S-CONT     0.0000    0.0000    0.0000         0
       B-EDU     0.0000    0.0000    0.0000         7
       M-EDU     0.0000    0.0000    0.0000         6
       E-EDU     0.0000    0.0000    0.0000         7
       S-EDU     0.0000    0.0000    0.0000         0
     B-TITLE     0.8750    0.5385    0.6667        13
     M-TITLE     0.7222    0.4483    0.5532        29
     E-TITLE     0.6000    0.6923    0.6429        13
     S-TITLE     0.0000    0.0000    0.0000         0
       B-ORG     0.7143    0.3333    0.4545        15
       M-ORG     0.6196    