In [25]:
from collections import Counter
import pycrfsuite
import random
from alive_progress import alive_bar
import progressbar
import tqdm
from tqdm import tqdm
from IPython.display import clear_output


class CRF:
    """Linear-chain sequence tagger scored by sparse string features.

    Every position is scored by summing the weights of the features that
    ``word2features`` produces for a (current tag, previous tag) pair, and
    decoding picks the best tag path with the Viterbi algorithm.

    Training is mistake-driven: whenever the decoded path differs from the
    gold path, the features of the gold path are rewarded and the features of
    the predicted path are penalised (see ``update_weights``).
    """

    def __init__(self, tags):
        """Create an untrained tagger.

        Args:
            tags: Sequence of label strings the decoder may assign. Ties
                during decoding are resolved in favour of the earliest tag.
        """
        self.weight = {}      # feature string -> weight
        self.tags = tags
        self.start = 'BOS'    # pseudo token/tag used before the first word
        self.stop = 'EOS'     # pseudo token used after the last word

    def viterbi_decoder(self, sentence):
        """Return the highest-scoring tag sequence for ``sentence``.

        Args:
            sentence: Sequence of word strings.

        Returns:
            A list with one tag per word; ``[]`` for an empty sentence.
        """
        if len(sentence) == 0:
            # Nothing to tag; the tables below would be empty and the
            # backtrace would index into a missing last row.
            return []

        n_tags = len(self.tags)
        # score[i][k]: best score of any path ending at word i with tag k.
        score = [[0] * n_tags for _ in sentence]
        # prev[i][k]: index of the tag at word i-1 on that best path.
        prev = [[0] * n_tags for _ in sentence]

        for i in range(len(sentence)):
            # Try labelling the i-th word with self.tags[k].
            for k in range(n_tags):
                best_score = float('-inf')
                best_prev = -1
                # The first word has a single predecessor: the start tag.
                n_prev = 1 if i == 0 else n_tags
                for j in range(n_prev):
                    prev_tag = self.tags[j] if i != 0 else self.start
                    prev_score = score[i - 1][j] if i > 0 else 0
                    candidate = prev_score + self.features_scoring(
                        self.word2features(
                            sentence, i, self.tags[k], prev_tag))
                    # Strict '>' keeps the earliest predecessor on ties.
                    if candidate > best_score:
                        best_score = candidate
                        best_prev = j

                score[i][k] = best_score
                prev[i][k] = best_prev

        # Pick the best final tag (earliest one on ties).
        tail = 0
        best_score = float('-inf')
        for j in range(n_tags):
            if score[-1][j] > best_score:
                best_score = score[-1][j]
                tail = j

        # Follow the back-pointers from the last word to the first; collect
        # in reverse and flip once instead of inserting at the front.
        path = [tail]
        for i in range(len(sentence) - 1, 0, -1):
            tail = prev[i][tail]
            path.append(tail)
        path.reverse()

        return [self.tags[j] for j in path]

    def update_weights(self, sentence, true_labels, predict_labels, learning_rate):
        """Move the weights toward the gold labelling of one sentence.

        For every position, each feature's weight changes by
        ``learning_rate * (count in gold features - count in predicted
        features)``. Features seen for the first time get a weight entry
        starting at 0.

        Args:
            sentence: Sequence of word strings.
            true_labels: Gold tag for every word.
            predict_labels: Predicted tag for every word.
            learning_rate: Step size of the update.
        """
        for i in range(len(sentence)):
            pred_prev = predict_labels[i - 1] if i >= 1 else self.start
            true_prev = true_labels[i - 1] if i >= 1 else self.start

            # delta[f] = (#f in gold features) - (#f in predicted features);
            # Counter.subtract keeps zero and negative counts.
            delta = Counter(self.word2features(
                sentence, i, true_labels[i], true_prev))
            delta.subtract(self.word2features(
                sentence, i, predict_labels[i], pred_prev))

            for feature, diff in delta.items():
                self.weight[feature] = (
                    self.weight.get(feature, 0) + learning_rate * diff)

    def train(self, training_data, epochs=5, learning_rate=0.1,L1=0,L2 = 0.1):
        """Train on tagged sentences and return per-epoch accuracies.

        Args:
            training_data: Iterable of sentences, each a sequence of
                ``(word, label)`` pairs. It is copied, so the caller's
                object is not reordered by the per-epoch shuffle.
            epochs: Number of passes over the data.
            learning_rate: Step size passed to ``update_weights``.
            L1: After each epoch every weight is shrunk toward zero by this
                amount; weights whose magnitude is at most ``L1`` become 0.
                Values <= 0 disable the step.
            L2: After each epoch every weight is multiplied by ``1 - L2``.

        Returns:
            List with one entry per epoch: the percentage of sentences
            whose decoded tags matched the gold tags exactly during that
            epoch (0.0 when there is no training data).
        """
        # Work on a private copy: shuffling must not reorder the caller's data.
        data = list(training_data)
        progress_bar = tqdm(total=epochs * len(data),
                            desc='Training Progress')
        acc_list = []
        try:
            for _ in range(epochs):
                random.shuffle(data)
                correct = 0
                for tagged_sentence in data:
                    sentence = [word for word, _label in tagged_sentence]
                    true_labels = [label for _word, label in tagged_sentence]
                    predicted_labels = self.viterbi_decoder(sentence)
                    if predicted_labels == true_labels:
                        correct += 1
                    else:
                        # Only wrongly decoded sentences change the weights.
                        self.update_weights(
                            sentence, true_labels, predicted_labels,
                            learning_rate)
                    progress_bar.update(1)

                accuracy = correct * 100 / len(data) if data else 0.0
                progress_bar.set_postfix_str(
                    "Train_accuracy:" + str(accuracy) + "%")
                acc_list.append(accuracy)
                self._apply_regularization(L1, L2)
        finally:
            # Release the progress bar even if training is interrupted.
            progress_bar.close()
        return acc_list

    def _apply_regularization(self, L1, L2):
        """Apply multiplicative L2 decay, then L1 soft-thresholding, to all weights."""
        decay = 1 - L2
        # Only values of existing keys are replaced, which is safe while
        # iterating over the dictionary.
        for feature, value in self.weight.items():
            value *= decay
            if L1 > 0:
                # Shrink toward zero regardless of sign; a plain subtraction
                # would drive every weight negative.
                if value > L1:
                    value -= L1
                elif value < -L1:
                    value += L1
                else:
                    value = 0.0
            self.weight[feature] = value

    def features_scoring(self, features):
        """Return the sum of the weights of ``features`` (unknown ones count 0)."""
        get_weight = self.weight.get
        score = 0
        for feature in features:
            score = score + get_weight(feature, 0)
        return score

    def _word_at(self, sent, index):
        """Return ``sent[index]``, or the start/stop marker outside the sentence."""
        if index < 0:
            return self.start
        if index > len(sent) - 1:
            return self.stop
        return sent[index]

    # Feature templates.
    def word2features(self, sent, i, current_label, previous_label):
        """Build the feature strings for tagging ``sent[i]``.

        Active templates: U01 (previous word), U02 (current word) and U06
        (previous/current word pair), each joined with the current label,
        plus B06 (word pair joined with the current and previous labels).

        Args:
            sent: Sequence of word strings.
            i: Position of the word being tagged.
            current_label: Candidate tag for ``sent[i]``.
            previous_label: Tag of the preceding word (the start marker for
                the first word).

        Returns:
            List of four feature strings, in the order U01, U02, U06, B06.
        """
        prev_word = self._word_at(sent, i - 1)
        word = self._word_at(sent, i)
        return [
            "U01:{}-{}".format(prev_word, current_label),
            "U02:{}-{}".format(word, current_label),
            "U06:{}/{}-{}".format(prev_word, word, current_label),
            "B06:{}/{}-{}/{}".format(
                prev_word, word, current_label, previous_label),
        ]


In [45]:
# test
def _read_tagged_sentences(path):
    """Read a one-token-per-line file into a list of tagged sentences.

    Sentences are separated by blank (or whitespace-only) lines. On every
    other line the first space-separated field is taken as the word and the
    last field, stripped of surrounding whitespace, as its label.

    Args:
        path: Path of the text file to read.

    Returns:
        List of sentences, each a list of ``(word, label)`` tuples.
    """
    sentences = []
    current = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Sentence boundary; ignore repeated blank lines.
                if current:
                    sentences.append(current)
                    current = []
                continue
            fields = line.split(" ")
            current.append((fields[0], fields[-1].strip()))
    # Keep the last sentence when the file does not end with a blank line.
    if current:
        sentences.append(current)
    return sentences


train_data = _read_tagged_sentences("NER/English/train.txt")
valid_data = _read_tagged_sentences("NER/English/validation.txt")

valid_data


[[('Bradford', 'B-ORG'), ('1', 'O'), ('Tranmere', 'B-ORG'), ('0', 'O')],
 [('"', 'O'),
  ('It', 'O'),
  ('appears', 'O'),
  ('that', 'O'),
  ('August', 'O'),
  ('is', 'O'),
  ('showing', 'O'),
  ('an', 'O'),
  ('economy', 'O'),
  ('again', 'O'),
  ('reversing', 'O'),
  ('course', 'O'),
  ('and', 'O'),
  ('is', 'O'),
  ('not', 'O'),
  ('moving', 'O'),
  ('onto', 'O'),
  ('a', 'O'),
  ('significantly', 'O'),
  ('slower', 'O'),
  ('track', 'O'),
  ('at', 'O'),
  ('this', 'O'),
  ('point', 'O'),
  (',', 'O'),
  ('"', 'O'),
  ('said', 'O'),
  ('economist', 'O'),
  ('Lynn', 'B-PER'),
  ('Reaser', 'I-PER'),
  ('of', 'O'),
  ('Barnett', 'B-ORG'),
  ('Banks', 'I-ORG'),
  ('Inc.', 'I-ORG'),
  ('in', 'O'),
  ('Jacksonville', 'B-LOC'),
  (',', 'O'),
  ('Fla', 'B-LOC'),
  ('.', 'O')],
 [('Result', 'O'), ('in', 'O'), ('an', 'O'), ('international', 'O')],
 [('But', 'O'),
  ('I', 'O'),
  ('think', 'O'),
  ('it', 'O'),
  ("'s", 'O'),
  ('not', 'O'),
  ('that', 'O'),
  ('.', 'O')],
 [('More', 'O'),
  ('

In [27]:
import random
# Tag inventory: "O" plus a B-/I- pair for each of PER, ORG, LOC and MISC.
crf = CRF(["O"] + [
    "{}-{}".format(prefix, entity)
    for entity in ("PER", "ORG", "LOC", "MISC")
    for prefix in ("B", "I")
])
# Split the second validation sentence into parallel word and label lists.
words = [token for token, _ in valid_data[1]]
labels = [tag for _, tag in valid_data[1]]


In [28]:
# Smoke test: run one weight update against randomly drawn "predicted" tags
# and print the words before and after the call.
print(words)
random_prediction = random.sample(crf.tags * 20, len(labels))
crf.update_weights(words, labels, random_prediction, learning_rate=1)
print(words)

['"', 'It', 'appears', 'that', 'August', 'is', 'showing', 'an', 'economy', 'again', 'reversing', 'course', 'and', 'is', 'not', 'moving', 'onto', 'a', 'significantly', 'slower', 'track', 'at', 'this', 'point', ',', '"', 'said', 'economist', 'Lynn', 'Reaser', 'of', 'Barnett', 'Banks', 'Inc.', 'in', 'Jacksonville', ',', 'Fla', '.']
['"', 'It', 'appears', 'that', 'August', 'is', 'showing', 'an', 'economy', 'again', 'reversing', 'course', 'and', 'is', 'not', 'moving', 'onto', 'a', 'significantly', 'slower', 'track', 'at', 'this', 'point', ',', '"', 'said', 'economist', 'Lynn', 'Reaser', 'of', 'Barnett', 'Banks', 'Inc.', 'in', 'Jacksonville', ',', 'Fla', '.']


In [29]:

# Decode the sample sentence once and show it next to the gold labels
# (length first, then the sequence) instead of running Viterbi twice.
predicted = crf.viterbi_decoder(words)
print(len(predicted))
print(predicted)
print(len(labels))
print(labels)


39
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'B-LOC', 'O', 'B-LOC', 'O']
39
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'B-LOC', 'O', 'B-LOC', 'O']


In [30]:
# Train for 5 epochs with a step size of 0.01; the value of this expression
# is the list of per-epoch training accuracies returned by train().
crf.train(train_data, epochs=5, learning_rate=0.01)

Training Progress: 100%|██████████| 70200/70200 [03:50<00:00, 304.82it/s, Train_accuracy:96.08262108262109%]


[46.35327635327636,
 83.78917378917379,
 91.4031339031339,
 94.49430199430199,
 96.08262108262109]

In [31]:
# Number of features that currently have a weight entry.
print(len(crf.weight))
def pair2label(pair):
    """Return the labels of a sequence of ``(word, label)`` pairs, in order."""
    labels_only = []
    for _word, tag in pair:
        labels_only.append(tag)
    return labels_only
def pair2lword(pair):
    """Return the words of a sequence of ``(word, label)`` pairs, in order."""
    words_only = []
    for token, _tag in pair:
        words_only.append(token)
    return words_only

# Sentence-level accuracy on the validation set: a sentence counts as
# correct only when every decoded tag matches the gold tag.
acc = 0
for test in valid_data:
    pred = crf.viterbi_decoder(pair2lword(test))
    gold = pair2label(test)
    acc += int(' '.join(pred) == ' '.join(gold))
print(acc/len(valid_data))
# 0.19759926131117267
# 0.1948291782086796
# 0.14035087719298245

301937
0.6244998461064943


In [63]:
from sklearn_crfsuite import CRF


# NOTE(review): the `from sklearn_crfsuite import CRF` import in this cell
# rebinds the name CRF, shadowing the hand-written CRF class defined earlier
# in this file; re-running earlier cells afterwards will pick up this class.
# crfmodel is not referenced again in the visible part of this file.
sklearn_crf_options = {
    'algorithm': 'lbfgs',
    'c1': 0.1,
    'c2': 0.1,
    'max_iterations': 100,
    'all_possible_transitions': False,
    'keep_tempfiles': True,
}
crfmodel = CRF(**sklearn_crf_options)
import pycrfsuite

# Define the feature function
def word2features(sent, i):
    """Build the feature strings describing position ``i`` of ``sent``.

    ``sent`` is a sequence whose items carry the word at index 0. The result
    lists, in this order: the context templates U01, U02, U03, U06 and U08
    (with 'BOS'/'EOS' standing in for positions outside the sentence), the
    shape features of the current word, the features of the previous word
    (or 'BOS' when ``i`` is not greater than 0) and the features of the next
    word (or 'EOS' when ``i`` is not before the last index).
    """
    last_index = len(sent) - 1

    def token_at(position):
        # Word at an absolute position, or a boundary marker outside the sentence.
        if position < 0:
            return 'BOS'
        if position > last_index:
            return 'EOS'
        return sent[position][0]

    word = sent[i][0]
    before, here, after = token_at(i - 1), token_at(i), token_at(i + 1)

    # Context templates currently in use (single words and adjacent pairs).
    result = [
        "U01:{}".format(before),
        "U02:{}".format(here),
        "U03:{}".format(after),
        "U06:{}/{}".format(before, here),
        "U08:{}/{}".format(here, after),
    ]

    # Shape features of the current word.
    result += [
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
    ]

    # Left neighbour, or the sentence-start marker.
    if i > 0:
        left = sent[i - 1][0]
        result.append('-1:word.lower=' + left.lower())
        result.append('-1:word.istitle=%s' % left.istitle())
        result.append('-1:word.isupper=%s' % left.isupper())
    else:
        result.append('BOS')

    # Right neighbour, or the sentence-end marker.
    if i < last_index:
        right = sent[i + 1][0]
        result.append('+1:word.lower=' + right.lower())
        result.append('+1:word.istitle=%s' % right.istitle())
        result.append('+1:word.isupper=%s' % right.isupper())
    else:
        result.append('EOS')

    return result

# Define the label lookup
def word2label(sent, i):
    """Return the label (item at index 1) of the ``i``-th entry of ``sent``."""
    entry = sent[i]
    return entry[1]

# Extract per-sentence features and labels
def sent2features(sent):
    """Return the ``word2features`` result for every position of ``sent``."""
    per_position = []
    for index in range(len(sent)):
        per_position.append(word2features(sent, index))
    return per_position

def sent2labels(sent):
    """Return the ``word2label`` result for every position of ``sent``."""
    per_position = []
    for index in range(len(sent)):
        per_position.append(word2label(sent, index))
    return per_position

# Create the CRF trainer (L-BFGS algorithm, verbose training log).
trainer = pycrfsuite.Trainer(algorithm='lbfgs', verbose=True)

# Add every training sentence as a (feature sequence, label sequence) pair.
for sent in train_data:
    xseq = sent2features(sent)
    yseq = sent2labels(sent)
    trainer.append(xseq, yseq)

# Set the training parameters.
training_params = {
    'c1': 1.0,
    'c2': 1e-3,
    'max_iterations': 50,
    'feature.possible_states': True,
    'feature.possible_transitions': True,
}
trainer.set_params(training_params)

# Train the model; it is written to model_file.
model_file = 'ner_model.crfsuite'
trainer.train(model_file)

# Load the trained model into a tagger.
tagger = pycrfsuite.Tagger()
tagger.open(model_file)

# Evaluate on the validation data: a sentence counts as correct only when
# every predicted tag matches the gold tag.
acc = 0
for test in valid_data:
    pred = tagger.tag(sent2features(test))
    gold = pair2label(test)
    acc += int(' '.join(pred) == ' '.join(gold))
print(acc/len(valid_data))


Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 1
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 3260142
Seconds required: 18.663

L-BFGS optimization
c1: 1.000000
c2: 0.001000
num_memories: 6
max_iterations: 50
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 164563.835582
Feature norm: 1.000000
Error norm: 113448.211809
Active features: 269326
Line search trials: 1
Line search step: 0.000002
Seconds required for this iteration: 0.792

***** Iteration #2 *****
Loss: 142907.858644
Feature norm: 1.395098
Error norm: 35944.055546
Active features: 209987
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.359

***** Iteration #3 *****
Loss: 138462.210144
Feature norm: 1.432076
Error norm: 34490.893357
Active features: 176322
Line search trials: 1
Line search step: 1.000000
Seconds req