In [2]:
import numpy as np
from collections import defaultdict
from check import check
import pickle
import dill

In [2]:
class CRF:
    def __init__(self, tags, feature_templates):
        self.tags = tags
        self.tag2idx = {tag: idx for idx, tag in enumerate(tags)}
        self.feature_templates = feature_templates
        self.weights = defaultdict(float)                               # 特征权重
        self.transition = np.random.randn(len(tags), len(tags)) * 0.01  # 转移矩阵

    def save(self, filepath):
        # 保存权重和转移矩阵
        with open(filepath, "wb") as f:
            pickle.dump({
                "weights": dict(self.weights),
                "transition": self.transition,
                "tags": self.tags,
                "feature_templates": self.feature_templates
            }, f)

    def load(self, filepath):
        # 加载权重和转移矩阵
        with open(filepath, "rb") as f:
            data = pickle.load(f)
            self.weights = defaultdict(float, data["weights"])
            self.transition = data["transition"]
            self.tags = data["tags"]
            self.tag2idx = {tag: idx for idx, tag in enumerate(self.tags)}
            self.feature_templates = data["feature_templates"]

    # 提取特征
    def extract_features(self, seq, pos, prev_tag, current_tag):
        features = []
        for template in self.feature_templates:
            if ":" not in template:
                print("模板格式错误：", template)
                continue
            # Unigram特征
            if template.startswith("U"):
                parts = template.split(":")[1].split("/")
                context = []
                for part in parts:
                    off = int(part[3:-1].split(",")[0])
                    idx = pos + off
                    if idx < 0:
                        context.append("[BEG]")
                    elif idx >= len(seq):
                        context.append("[END]")
                    else:
                        context.append(seq[idx])
                features.append(f"{current_tag}::{template}:{'/'.join(context)}")
            # Bigram特征
            elif template.startswith("B"):
                if prev_tag is not None:
                    parts = template.split(":")[1].split("/")
                    context = []
                    for part in parts:
                        off = int(part[3:-1].split(",")[0])
                        idx = pos + off
                        if idx < 0:
                            context.append("[BEG]")
                        elif idx >= len(seq):
                            context.append("[END]")
                        else:
                            context.append(seq[idx])
                    features.append(f"{prev_tag}->{current_tag}::{template}:{'/'.join(context)}")
        return features
                
    # 前向算法
    # alpha[t, tag]: 给定观测序列的前t+1个词时，以tag作为第t个词的标签的所有路径的“分数”之和
    # Z: 配分函数（所有可能路径的得分和）
    def forward(self, seq):
        T = len(seq)
        N = len(self.tags)
        alpha = np.zeros((T, N))

        # 初始化
        for i in range(N):
            features = self.extract_features(seq, 0, None, self.tags[i])
            alpha[0][i] = sum(self.weights[f] for f in features)
        
        # 递推
        for t in range(1, T):
            for tag in range(N):
                log_scores = []
                for prev_tag in range(N):
                    features = self.extract_features(seq, t, self.tags[prev_tag], self.tags[tag])
                    trans_score = self.transition[prev_tag, tag]
                    emit_score = sum(self.weights[f] for f in features)
                    log_scores.append(alpha[t - 1][prev_tag] + trans_score + emit_score)
                alpha[t, tag] = np.logaddexp.reduce(log_scores) if log_scores else -np.inf

        log_Z = np.log(sum(np.exp(alpha[-1]))) if any(np.isfinite(alpha[-1])) else -np.inf

        return alpha, log_Z

    # 后向算法
    # beta[t, tag]: 在给定观测序列的第t个词处，已知该词的标签为tag，从t到序列末尾的所有可能标签路径的“分数”之和。
    def backward(self, seq):
        T = len(seq)
        N = len(self.tags)
        beta = np.zeros((T, N))
        
        # 初始化
        beta[T-1, :] = 0

        # 递推
        for t in range(T-2, -1, -1):
            for tag in range(N):
                log_scores = []
                for next_tag in range(N):
                    features = self.extract_features(seq, t+1, self.tags[tag], self.tags[next_tag])
                    trans_score = self.transition[tag, next_tag]
                    emit_score = sum(self.weights[f] for f in features)
                    log_scores.append(beta[t+1, next_tag] + trans_score + emit_score)
                beta[t, tag] = np.logaddexp.reduce(log_scores) if log_scores else -np.inf
        return beta
    
    # 维特比解码
    def viterbi_decode(self, seq):
        T = len(seq)
        N = len(self.tags)
        viterbi = np.zeros((T, N))
        backptrs = np.zeros((T, N), dtype=int)    # 回溯指针

        # 初始化
        for tag in range(N):
            features = self.extract_features(seq, 0, None, self.tags[tag])
            viterbi[0, tag] = sum(self.weights[f] for f in features)

        # 递推
        for t in range(1, T):
            for i in range(N):        # 当前标签
                max_score = -np.inf
                best_prev_tag = 0
                for j in range(N):     # 前一个标签
                    trans_score = self.transition[j][i]
                    features = self.extract_features(seq, t, self.tags[j], self.tags[i])
                    emit_score = sum(self.weights[f] for f in features)
                    score = viterbi[t-1][j] + trans_score + emit_score
                    if score > max_score:
                        max_score = score
                        best_prev_tag = j
                viterbi[t][i] = max_score
                backptrs[t][i] = best_prev_tag

        # 回溯
        best_path = [np.argmax(viterbi[-1])]
        for t in reversed(range(1, T)):
            best_path.append(backptrs[t][best_path[-1]])
        best_path.reverse()

        return [self.tags[i] for i in best_path]
    
    # 计算梯度
    def compute_gradients(self, seq, tag_seq):
        T = len(seq)
        N = len(self.tags)

        # 真实路径特征
        true_features = set()
        for t in range(T):
            prev_tag = tag_seq[t - 1] if t > 0 else None
            features = self.extract_features(seq, t, prev_tag, tag_seq[t])
            true_features.update(features)

        # 前向后向算法
        alpha, log_Z = self.forward(seq)
        beta = self.backward(seq)
        expected_features = defaultdict(float)

        # 计算特征期望
        for t in range(T):
            for i in range(N):
                for j in range(N):
                    # 提取特征
                    cur_tag = self.tags[j]
                    prev_tag = self.tags[i] if t > 0 else None
                    features = self.extract_features(seq, t, prev_tag, cur_tag)

                    # 计算概率
                    if t == 0:
                        prob = np.exp(alpha[t][j] + beta[t][j] - log_Z) if log_Z != -np.inf else 0
                    else:
                        trans_score = self.transition[i][j]
                        emit_score = sum(self.weights[f] for f in features)
                        prob = np.exp(alpha[t - 1][i] + trans_score + emit_score + beta[t][j] - log_Z) if log_Z != -np.inf else 0

                    # 累加特征期望
                    for f in features:
                        expected_features[f] += prob

        # 计算权重梯度
        # weight_grad[f] = true - expected
        w_grad = defaultdict(float)
        for f in true_features:
            w_grad[f] += 1
        for f in expected_features:
            w_grad[f] -= expected_features[f]

        # 计算转移矩阵梯度
        # transition_grad[f] = true - expected
        t_grad = np.zeros_like(self.transition)
        for t in range(1, T):
            i = self.tag2idx[tag_seq[t - 1]]
            j = self.tag2idx[tag_seq[t]]
            t_grad[i][j] += 1

            for i_ in range(N):
                for j_ in range(N):
                    features = self.extract_features(seq, t, self.tags[i_], self.tags[j_])
                    prob = np.exp(alpha[t - 1][i_] + self.transition[i_][j_] + sum(self.weights[f] for f in features) + beta[t][j_] - log_Z) if log_Z != -np.inf else 0
                    t_grad[i_][j_] -= prob

        return w_grad, t_grad, log_Z

    


    # 训练CRF模型
    def train(self, seqs, tag_seqs, batch_size, max_iter, learning_rate):
        for iter in range(max_iter):
            total_loss = 0
            batch_indices = range(0, len(seqs), batch_size)

            for start_idx in batch_indices:
                end_idx = start_idx + batch_size
                batch_sentences = seqs[start_idx:end_idx]
                batch_tags = tag_seqs[start_idx:end_idx]
                print(f"正在处理批次 {start_idx}-{end_idx}/{len(seqs)} ...")

                # 初始化
                batch_weights_grad = defaultdict(float)
                batch_transition_grad = np.zeros_like(self.transition)
                batch_loss = 0.0

                # 计算批次内所有样本的梯度
                for sentence, tags in zip(batch_sentences, batch_tags):
                    # 计算单个样本的梯度
                    weights_grad, transition_grad, log_Z = self.compute_gradients(sentence, tags)

                    # 累积权重梯度
                    for f in weights_grad:
                        batch_weights_grad[f] += weights_grad[f]

                    # 累积转移矩阵梯度
                    batch_transition_grad += transition_grad

                    # 计算单个样本的损失
                    true_score = self.compute_single_score(sentence, tags)
                    batch_loss += log_Z - true_score

                # 计算批次平均梯度
                batch_size_actual = len(batch_sentences)
                for f in batch_weights_grad:
                    batch_weights_grad[f] /= batch_size_actual
                batch_transition_grad /= batch_size_actual
                batch_loss /= batch_size_actual

                # 使用平均梯度
                for f in batch_weights_grad:
                    self.weights[f] += learning_rate * batch_weights_grad[f]

                for i in range(len(self.tags)):
                    for j in range(len(self.tags)):
                        self.transition[i][j] += learning_rate * batch_transition_grad[i][j]

                loss = batch_loss * batch_size_actual
                print(f"批次损失: {loss/batch_size_actual:.2f}")
                total_loss += loss
                
            print(f"第 {iter} 次迭代, Loss={total_loss/len(seqs):.2f}")
            print("-" * 50)

    # 计算单个样本的经验路径得分
    def compute_single_score(self, seq, tag_seq):
        score = 0
        for t in range(len(seq)):
            prev_tag = tag_seq[t - 1] if t > 0 else None
            features = self.extract_features(seq, t, prev_tag, tag_seq[t])
            score += sum(self.weights[f] for f in features)
            if t > 0:
                i = self.tag2idx[tag_seq[t - 1]]
                j = self.tag2idx[tag_seq[t]]
                score += self.transition[i][j]
        return score

In [63]:
feature_templates = [
    "U00:%x[-2,0]",
    "U01:%x[-1,0]",
    "U02:%x[0,0]",
    "U03:%x[1,0]",
    "U04:%x[2,0]",
    "U05:%x[-2,0]/%x[-1,0]",
    "U06:%x[-1,0]/%x[0,0]",
    "U07:%x[-1,0]/%x[1,0]",
    "U08:%x[0,0]/%x[1,0]",
    "U09:%x[1,0]/%x[2,0]",
    "B00:%x[-2,0]",
    "B01:%x[-1,0]",
    "B02:%x[0,0]",
    "B03:%x[1,0]",
    "B04:%x[2,0]",
]

In [64]:
def process_data(file_path):
    # 初始化数据结构
    tags = set()
    words = set()
    tag_seqs = []
    word_seqs = []
    
    current_state_seq = []
    current_obs_seq = []
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:  # 空行表示句子结束
                if current_state_seq and current_obs_seq:
                    tag_seqs.append(current_state_seq)
                    word_seqs.append(current_obs_seq)
                    current_state_seq = []
                    current_obs_seq = []
                continue
                
            parts = line.split()
            if len(parts) >= 2:  # 确保有词和标签
                word = parts[0]
                tag = parts[-1]  # 假设标签在最后
                
                # 更新状态和观测集合
                tags.add(tag)
                words.add(word)
                
                # 添加到当前序列
                current_state_seq.append(tag)
                current_obs_seq.append(word)
    
    # 处理最后一个句子（如果文件不以空行结尾）
    if current_state_seq and current_obs_seq:
        tag_seqs.append(current_state_seq)
        word_seqs.append(current_obs_seq)
    
    # 转换为列表并排序（为了确定性）
    tags = sorted(tags)
    words = sorted(words)
    
    return {
        'tags': tags,
        'words': words,
        'tag_seqs': tag_seqs,
        'word_seqs': word_seqs
    }

In [5]:
# 加载训练数据
# train_data_path = "./NER/Chinese/mytrain.txt"
train_data_path = "./NER/English/mytrain.txt"
train_data = process_data(train_data_path)

tags = train_data['tags']
words = train_data['words']
tag_seqs = train_data['tag_seqs']
word_seqs = train_data['word_seqs']

NameError: name 'process_data' is not defined

In [71]:
# 训练CRF模型
crf = CRF(tags, feature_templates)
crf.train(word_seqs, tag_seqs, batch_size=16, max_iter=5, learning_rate=0.1)

正在处理批次 0-16/344 ...
批次损失: 33.90
正在处理批次 16-32/344 ...
批次损失: 14.15
正在处理批次 32-48/344 ...
批次损失: 21.70
正在处理批次 48-64/344 ...
批次损失: 9.93
正在处理批次 64-80/344 ...
批次损失: 13.43
正在处理批次 80-96/344 ...
批次损失: 12.58
正在处理批次 96-112/344 ...
批次损失: 9.48
正在处理批次 112-128/344 ...
批次损失: 10.14
正在处理批次 128-144/344 ...
批次损失: 8.77
正在处理批次 144-160/344 ...
批次损失: 9.21
正在处理批次 160-176/344 ...
批次损失: 11.81
正在处理批次 176-192/344 ...
批次损失: 12.54
正在处理批次 192-208/344 ...
批次损失: 8.80
正在处理批次 208-224/344 ...
批次损失: 11.25
正在处理批次 224-240/344 ...
批次损失: 10.92
正在处理批次 240-256/344 ...
批次损失: 9.40
正在处理批次 256-272/344 ...
批次损失: 6.21
正在处理批次 272-288/344 ...
批次损失: 10.00
正在处理批次 288-304/344 ...
批次损失: 10.88
正在处理批次 304-320/344 ...
批次损失: 6.84
正在处理批次 320-336/344 ...
批次损失: 8.37
正在处理批次 336-352/344 ...
批次损失: 12.60
第 0 次迭代, Loss=11.94
--------------------------------------------------
正在处理批次 0-16/344 ...
批次损失: 9.88
正在处理批次 16-32/344 ...
批次损失: 8.38
正在处理批次 32-48/344 ...
批次损失: 11.38
正在处理批次 48-64/344 ...
批次损失: 8.38
正在处理批次 64-80/344 ...
批次损失: 11.34
正在处理批次 80-96/344 ...


In [4]:
def process_validation_file(input_file, output_file, crf):
    current_sentence = []
    with open(input_file, "r", encoding="utf-8") as fin, open(output_file, "w", encoding="utf-8") as fout:
        for line in fin:
            if line.strip() == "":
                # 处理一个完整句子
                if current_sentence:
                    words = [word.lower() for word, _ in current_sentence]
                    predicted_tags = crf.viterbi_decode(words)
                    for (word, _), tag in zip(current_sentence, predicted_tags):
                        fout.write(f"{word} {tag}\n")
                    fout.write("\n")
                    current_sentence = []
            else:
                # 非空行，读取单词
                parts = line.split()
                word = parts[0]
                current_sentence.append((word, None))

In [None]:
# crf.save("model/crf_cn.pkl")
# crf.save("model/crf_en.pkl")

In [67]:
# 验证集
valid_data_path = "NER/English/validation.txt"
valid_data = process_data(valid_data_path)

output_path = "output/crf_validation_output.txt"
with open(output_path, "w", encoding="utf-8") as fout:
    for seq, tags in zip(valid_data['word_seqs'], valid_data['tag_seqs']):
        predicted_tags = crf.viterbi_decode(seq)
        for word, tag in zip(seq, predicted_tags):
            fout.write(f"{word} {tag}\n")
        fout.write("\n")  # 句子间空行

check(language = "English", gold_path=valid_data_path, my_path=output_path)

              precision    recall  f1-score   support

       B-PER     0.2385    0.1661    0.1958      1842
       I-PER     0.1935    0.1951    0.1943      1307
       B-ORG     0.3429    0.0626    0.1059      1341
       I-ORG     0.0452    0.0120    0.0189       751
       B-LOC     0.4892    0.1475    0.2267      1837
       I-LOC     0.0000    0.0000    0.0000       257
      B-MISC     1.0000    0.0011    0.0022       922
      I-MISC     0.0000    0.0000    0.0000       346

   micro avg     0.2516    0.1076    0.1508      8603
   macro avg     0.2887    0.0731    0.0930      8603
weighted avg     0.3495    0.1076    0.1382      8603



In [7]:
crf = CRF(tags, feature_templates)
crf.load("model/crf_en.pkl")
# crf.load("model/crf_cn.pkl")

NameError: name 'tags' is not defined

In [5]:
# 从文件加载CRF模型
with open("model/crf_English.pkl", "rb") as f:
    crf = dill.load(f)

process_validation_file("pj2_test/english_test.txt", "english_test_CRF.txt", crf)
check(language="English", gold_path="pj2_test/english_test.txt", my_path="english_test_CRF.txt")

Micro F1 Score = 0.8247
              precision    recall  f1-score   support

       B-PER     0.4400    0.0068    0.0134      1617
       I-PER     0.0000    0.0000    0.0000      1156
       B-ORG     0.0294    0.0006    0.0012      1661
       I-ORG     0.0000    0.0000    0.0000       835
       B-LOC     0.0000    0.0000    0.0000      1668
       I-LOC     0.0000    0.0000    0.0000       257
      B-MISC     0.0000    0.0000    0.0000       702
      I-MISC     0.0000    0.0000    0.0000       216

   micro avg     0.1818    0.0015    0.0029      8112
   macro avg     0.0587    0.0009    0.0018      8112
weighted avg     0.0937    0.0015    0.0029      8112



In [68]:
# 测试集
# test_data_path = "pj2_test/chinese_test.txt"
test_data_path = "pj2_test/english_test.txt"
test_data = process_data(test_data_path)

output_path = "output/crf_test_output.txt"
with open(output_path, "w", encoding="utf-8") as fout:
    for seq, tags in zip(test_data['word_seqs'], test_data['tag_seqs']):
        predicted_tags = crf.viterbi_decode(seq)
        for word, tag in zip(seq, predicted_tags):
            fout.write(f"{word} {tag}\n")
        fout.write("\n")  # 句子间空行

# check(language = "Chinese", gold_path=test_data_path, my_path=output_path)
check(language = "English", gold_path=test_data_path, my_path=output_path)

              precision    recall  f1-score   support

       B-PER     0.2090    0.1583    0.1802      1617
       I-PER     0.1600    0.1756    0.1674      1156
       B-ORG     0.2812    0.0488    0.0831      1661
       I-ORG     0.0355    0.0120    0.0179       835
       B-LOC     0.4453    0.1463    0.2202      1668
       I-LOC     0.0000    0.0000    0.0000       257
      B-MISC     0.0000    0.0000    0.0000       702
      I-MISC     0.0000    0.0000    0.0000       216

   micro avg     0.2158    0.0979    0.1347      8112
   macro avg     0.1414    0.0676    0.0836      8112
weighted avg     0.2172    0.0979    0.1239      8112

