In [1]:
import re
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import joblib
import yaml
import warnings
from tqdm import tqdm
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

## 加载数据

In [2]:
def load_data(data_path):
    data = list()
    data_sent_with_label = list()
    with open(data_path, mode='r', encoding="utf-8") as f:
        for line in tqdm(f):
            if line.strip() == "":
                data.append(data_sent_with_label.copy())
                data_sent_with_label.clear()
            else:
                row_data=line.strip().split(" ")
                if len(row_data)==1:
                    data_sent_with_label.append((' ',row_data[0]))
                else:
                    data_sent_with_label.append(tuple(line.strip().split(" ")))
    return data

In [3]:
print("'  O'.split()结果为：",'  O'.split())
print("'  O'.strip()结果为：",'  O'.strip())
# print("' \n'.strip()结果为：",' \n'.strip())

'  O'.split()结果为： ['O']
'  O'.strip()结果为： O


In [4]:
train=load_data('data/train_data/train.txt')


2288790it [00:02, 894592.00it/s] 


In [5]:
len(train)

40000

In [6]:
train,valid=train_test_split(train,test_size=0.2,shuffle=True,random_state=42)
print(len(train),len(valid))

32000 8000


In [7]:
' '.isspace()

True

## 构造ngram特征

In [8]:
def word2features(sent, i):
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word': word,
        'word.isdigit()': word.isdigit(),
        'word.isspace()': word.isspace(),
        'word.isalpha()': word.isalpha(),
        
    }
    if i > 0:
        word1 = sent[i-1][0]
        words = word1 + word
        features.update({
            '-1:word': word1,
            '-1:words': words,
            '-1:word.isdigit()': word1.isdigit(),
            '-1:word.isspace()': word1.isalpha(),

        })
    else:
        features['BOS'] = True

#     if i > 1:
#         word2 = sent[i-2][0]
#         word1 = sent[i-1][0]
#         words = word1 + word2 + word
#         features.update({
#             '-2:word': word2,
#             '-2:words': words,
#             '-2:word.isdigit()': word2.isdigit(),
#             '-2:word.isspace()': word2.isalpha(),

#         })

    # if i > 2:
    #     word3 = sent[i - 3][0]
    #     word2 = sent[i - 2][0]
    #     word1 = sent[i - 1][0]
    #     words = word1 + word2 + word3 + word
    #     features.update({
    #         '-3:word': word3,
    #         '-3:words': words,
    #         '-3:word.isdigit()': word3.isdigit(),
    #         '-3:word.isspace()': word3.isalpha(),
    #     })

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        words = word1 + word
        features.update({
            '+1:word': word1,
            '+1:words': words,
            '+1:word.isdigit()': word1.isdigit(),
            '+1:word.isspace()': word1.isalpha(),
        })
    else:
        features['EOS'] = True

#     if i < len(sent)-2:
#         word2 = sent[i + 2][0]
#         word1 = sent[i + 1][0]
#         words = word + word1 + word2
#         features.update({
#             '+2:word': word2,
#             '+2:words': words,
#             '+2:word.isdigit()': word2.isdigit(),
#         })

    # if i < len(sent)-3:
    #     word3 = sent[i + 3][0]
    #     word2 = sent[i + 2][0]
    #     word1 = sent[i + 1][0]
    #     words = word + word1 + word2 + word3
    #     features.update({
    #         '+3:word': word3,
    #         '+3:words': words,
    #         '+3:word.isdigit()': word3.isdigit(),
    #     })

    return features

In [9]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]


def sent2labels(sent):
    return [ele[-1] for ele in sent]


In [10]:
# 生成特征
X_train = [sent2features(s) for s in tqdm(train)]
y_train = [sent2labels(s) for s in tqdm(train)]

X_dev = [sent2features(s) for s in tqdm(valid)]
y_dev = [sent2labels(s) for s in tqdm(valid)]

100%|██████████| 32000/32000 [00:03<00:00, 10023.82it/s]
100%|██████████| 32000/32000 [00:00<00:00, 197827.91it/s]
100%|██████████| 8000/8000 [00:00<00:00, 8029.21it/s]
100%|██████████| 8000/8000 [00:00<00:00, 225008.76it/s]


In [11]:
X_train[0]

[{'bias': 1.0,
  'word': '毕',
  'word.isdigit()': False,
  'word.isspace()': False,
  'word.isalpha()': True,
  'BOS': True,
  '+1:word': '加',
  '+1:words': '加毕',
  '+1:word.isdigit()': False,
  '+1:word.isspace()': True},
 {'bias': 1.0,
  'word': '加',
  'word.isdigit()': False,
  'word.isspace()': False,
  'word.isalpha()': True,
  '-1:word': '毕',
  '-1:words': '毕加',
  '-1:word.isdigit()': False,
  '-1:word.isspace()': True,
  '+1:word': '索',
  '+1:words': '索加',
  '+1:word.isdigit()': False,
  '+1:word.isspace()': True},
 {'bias': 1.0,
  'word': '索',
  'word.isdigit()': False,
  'word.isspace()': False,
  'word.isalpha()': True,
  '-1:word': '加',
  '-1:words': '加索',
  '-1:word.isdigit()': False,
  '-1:word.isspace()': True,
  '+1:word': '旗',
  '+1:words': '旗索',
  '+1:word.isdigit()': False,
  '+1:word.isspace()': True},
 {'bias': 1.0,
  'word': '旗',
  'word.isdigit()': False,
  'word.isspace()': False,
  'word.isalpha()': True,
  '-1:word': '索',
  '-1:words': '索旗',
  '-1:word.isdigit(

In [None]:
# **表示该位置接受任意多个关键字（keyword）参数，在函数**位置上转化为词典 [key:value, key:value ]
crf_model = sklearn_crfsuite.CRF(algorithm='lbfgs',c1=0.25,c2=0.018,max_iterations=100,
                                 all_possible_transitions=True,verbose=True)
crf_model.fit(X_train, y_train)

loading training data to CRFsuite: 100%|██████████| 32000/32000 [00:17<00:00, 1860.63it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 729973
Seconds required: 4.920

L-BFGS optimization
c1: 0.250000
c2: 0.018000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=93.28 loss=7610708.28 active=717990 feature_norm=1.00
Iter 2   time=96.01 loss=7462314.02 active=712997 feature_norm=8.14
Iter 3   time=48.23 loss=5971845.88 active=706005 feature_norm=7.29
Iter 4   time=47.82 loss=5763859.18 active=717400 feature_norm=6.56
Iter 5   time=48.08 loss=5298840.34 active=714661 feature_norm=6.93
Iter 6   time=48.04 loss=5078078.89 active=714850 feature_norm=10.80
Iter 7   time=47.46 loss=4505330.91 active=714735 feature_norm=11.59
Iter 8   time=96.92 loss=4373063.73 active=719234 feature_norm=11.03
Iter 9   time=48.44 loss=4102882.89 active=722290 f

In [None]:
labels=list(crf_model.classes_)
labels.remove("O")
y_pred = crf_model.predict(X_dev)
metrics.flat_f1_score(y_dev, y_pred,
                      average='weighted', labels=labels)
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0]))
print(metrics.flat_classification_report(
    y_dev, y_pred, labels=sorted_labels, digits=3
))

In [None]:
import joblib
joblib.dump(crf_model, "./product_crf_model.joblib")

In [None]:
text = 'OPPO闪充充电器 X9070 X9077 R5 快充头通用手机数据线 套餐【2.4充电头+数据线 】 安卓 1.5m'

NER_tagger = joblib.load('./product_crf_model.joblib')
list_result = []
new_sents = re.split(u'(。|！|\!|？|\?)', text)
sents_feature = [sent2features(sent) for sent in new_sents]
y_pred = NER_tagger.predict(sents_feature)
for sent, ner_tag in zip(new_sents, y_pred):
    for word, tag in zip(sent, ner_tag):
        list_result.append((word,tag))
list_result    

In [None]:
import sklearn

In [None]:
sklearn.__version__

In [None]:
text = 'OPPO闪充充电器 X9070 X9077 R5 快充头通用手机数据线 套餐【2.4充电头+数据线 】 安卓 1.5m'

NER_tagger = joblib.load('./chinese_crf_model.joblib')
list_result = []
new_sents = re.split(u'(。|！|\!|？|\?)', text)
sents_feature = [sent2features(sent) for sent in new_sents]
y_pred = NER_tagger.predict(sents_feature)
for sent, ner_tag in zip(new_sents, y_pred):
    for word, tag in zip(sent, ner_tag):
        list_result.append((word,tag))
list_result    

In [None]:
text=[c[0] for c in list_result]
tag_pre=[c[1] for c in list_result]
# text,tag_pre

In [None]:
def bulid_result_line(sentence, tag_pred):
    result_list = []
    for index, tag in zip(range(len(tag_pred)), tag_pred):
        if tag[0] == 'B':
            start = index
            end = index
            label_type = tag[2:]
            if end != len(tag_pred) - 1:
                while tag_pred[end + 1][0] == 'I' and tag_pred[end + 1][2:] == label_type:
                    end += 1
                    if end == len(tag_pred) - 1:
                        break
            result_list.append({'start': start,
                                'end': end,
                                'lable_type': label_type

                                })
    nouns = []
    line = ''.join(sentence)
    if len(result_list) != 0:
        for index, item in enumerate(result_list):
            nouns.append(''.join(sentence[result_list[index]['start']:result_list[index]['end'] + 1]))
    return nouns


In [None]:
bulid_result_line(text,tag_pre)