In [1]:
import re
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import joblib
import yaml
import warnings
from tqdm import tqdm
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

## 加载数据

In [2]:
def load_data(data_path):
    data = list()
    data_sent_with_label = list()
    with open(data_path, mode='r', encoding="utf-8") as f:
        for line in tqdm(f):
            if line.strip() == "":
                data.append(data_sent_with_label.copy())
                data_sent_with_label.clear()
            else:
                row_data=line.strip().split(" ")
                if len(row_data)==1:
                    data_sent_with_label.append((' ',row_data[0]))
                else:
                    data_sent_with_label.append(tuple(line.strip().split(" ")))
    return data

In [3]:
print("'  O'.split()结果为：",'  O'.split())
print("'  O'.strip()结果为：",'  O'.strip())
# print("' \n'.strip()结果为：",' \n'.strip())

'  O'.split()结果为： ['O']
'  O'.strip()结果为： O


In [4]:
train=load_data('data/train_data/train.txt')


2288790it [00:02, 938249.80it/s] 


In [5]:
len(train)

40000

In [6]:
train,valid=train_test_split(train,test_size=0.2,shuffle=True,random_state=42)
print(len(train),len(valid))

32000 8000


In [7]:
' '.isspace()

True

## 构造ngram特征

In [8]:
def word2features(sent, i):
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word': word,
        'word.isdigit()': word.isdigit(),
        'word.isspace()': word.isspace(),
        'word.isalpha()': word.isalpha(),
        
    }
    if i > 0:
        word1 = sent[i-1][0]
        words = word1 + word
        features.update({
            '-1:word': word1,
            '-1:words': words,
            '-1:word.isdigit()': word1.isdigit(),
            '-1:word.isspace()': word1.isalpha(),

        })
    else:
        features['BOS'] = True

    if i > 1:
        word2 = sent[i-2][0]
        word1 = sent[i-1][0]
        words = word1 + word2 + word
        features.update({
            '-2:word': word2,
            '-2:words': words,
            '-2:word.isdigit()': word2.isdigit(),
            '-2:word.isspace()': word2.isalpha(),

        })

    if i > 2:
        word3 = sent[i - 3][0]
        word2 = sent[i - 2][0]
        word1 = sent[i - 1][0]
        words = word1 + word2 + word3 + word
        features.update({
            '-3:word': word3,
            '-3:words': words,
            '-3:word.isdigit()': word3.isdigit(),
            '-3:word.isspace()': word3.isalpha(),
        })

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        words = word1 + word
        features.update({
            '+1:word': word1,
            '+1:words': words,
            '+1:word.isdigit()': word1.isdigit(),
            '+1:word.isspace()': word1.isalpha(),
        })
    else:
        features['EOS'] = True

    if i < len(sent)-2:
        word2 = sent[i + 2][0]
        word1 = sent[i + 1][0]
        words = word + word1 + word2
        features.update({
            '+2:word': word2,
            '+2:words': words,
            '+2:word.isdigit()': word2.isdigit(),
        })

    if i < len(sent)-3:
        word3 = sent[i + 3][0]
        word2 = sent[i + 2][0]
        word1 = sent[i + 1][0]
        words = word + word1 + word2 + word3
        features.update({
            '+3:word': word3,
            '+3:words': words,
            '+3:word.isdigit()': word3.isdigit(),
        })

    return features

In [9]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]


def sent2labels(sent):
    return [ele[-1] for ele in sent]


In [10]:
# 生成特征
X_train = [sent2features(s) for s in tqdm(train)]
y_train = [sent2labels(s) for s in tqdm(train)]

X_dev = [sent2features(s) for s in tqdm(valid)]
y_dev = [sent2labels(s) for s in tqdm(valid)]

100%|██████████| 32000/32000 [00:08<00:00, 3696.70it/s]
100%|██████████| 32000/32000 [00:00<00:00, 193892.65it/s]
100%|██████████| 8000/8000 [00:02<00:00, 3452.29it/s]
100%|██████████| 8000/8000 [00:00<00:00, 153150.85it/s]


In [11]:
X_train[0]

[{'bias': 1.0,
  'word': '毕',
  'word.isdigit()': False,
  'word.isspace()': False,
  'word.isalpha()': True,
  'BOS': True,
  '+1:word': '加',
  '+1:words': '加毕',
  '+1:word.isdigit()': False,
  '+1:word.isspace()': True,
  '+2:word': '索',
  '+2:words': '毕加索',
  '+2:word.isdigit()': False,
  '+3:word': '旗',
  '+3:words': '毕加索旗',
  '+3:word.isdigit()': False},
 {'bias': 1.0,
  'word': '加',
  'word.isdigit()': False,
  'word.isspace()': False,
  'word.isalpha()': True,
  '-1:word': '毕',
  '-1:words': '毕加',
  '-1:word.isdigit()': False,
  '-1:word.isspace()': True,
  '+1:word': '索',
  '+1:words': '索加',
  '+1:word.isdigit()': False,
  '+1:word.isspace()': True,
  '+2:word': '旗',
  '+2:words': '加索旗',
  '+2:word.isdigit()': False,
  '+3:word': '下',
  '+3:words': '加索旗下',
  '+3:word.isdigit()': False},
 {'bias': 1.0,
  'word': '索',
  'word.isdigit()': False,
  'word.isspace()': False,
  'word.isalpha()': True,
  '-1:word': '加',
  '-1:words': '加索',
  '-1:word.isdigit()': False,
  '-1:word.isspa

In [12]:
# **表示该位置接受任意多个关键字（keyword）参数，在函数**位置上转化为词典 [key:value, key:value ]
crf_model = sklearn_crfsuite.CRF(algorithm='lbfgs',c1=0.25,c2=0.018,max_iterations=300,
                                 all_possible_transitions=True,verbose=True)
crf_model.fit(X_train, y_train)

loading training data to CRFsuite: 100%|██████████| 32000/32000 [00:50<00:00, 633.02it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 4274092
Seconds required: 21.410

L-BFGS optimization
c1: 0.250000
c2: 0.018000
num_memories: 6
max_iterations: 300
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=119.79 loss=7485482.04 active=4247633 feature_norm=1.00
Iter 2   time=119.79 loss=6607541.48 active=4233368 feature_norm=5.81
Iter 3   time=59.85 loss=5597225.84 active=4210680 feature_norm=5.62
Iter 4   time=60.70 loss=5350647.52 active=4238032 feature_norm=5.56
Iter 5   time=61.35 loss=5055685.05 active=4244462 feature_norm=6.38
Iter 6   time=62.56 loss=4605034.53 active=4234333 feature_norm=9.66
Iter 7   time=59.53 loss=4195952.40 active=4252519 feature_norm=11.17
Iter 8   time=55.37 loss=4015708.22 active=4257162 feature_norm=11.89
Iter 9   time=54.83 loss=3717151.89 acti

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.25, c2=0.018, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=300,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=True)

In [13]:
labels=list(crf_model.classes_)
labels.remove("O")
y_pred = crf_model.predict(X_dev)
metrics.flat_f1_score(y_dev, y_pred,
                      average='weighted', labels=labels)
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0]))
print(metrics.flat_classification_report(
    y_dev, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

         B-1      0.890     0.904     0.897      5062
         I-1      0.891     0.925     0.908     11806
        B-10      0.558     0.527     0.542      1678
        I-10      0.563     0.603     0.582      4082
        B-11      0.792     0.808     0.800     12128
        I-11      0.779     0.794     0.786     18511
        B-12      0.807     0.825     0.816      2522
        I-12      0.800     0.826     0.813      3539
        B-13      0.725     0.724     0.725     12871
        I-13      0.706     0.698     0.702     18940
        B-14      0.881     0.907     0.894      4429
        I-14      0.870     0.896     0.883      5099
        B-15      0.664     0.712     0.688       139
        I-15      0.677     0.661     0.669       171
        B-16      0.908     0.917     0.912      4567
        I-16      0.907     0.927     0.917      5684
        B-17      1.000     0.400     0.571         5
        I-17      1.000    

In [14]:
import joblib
joblib.dump(crf_model, "./product_crf_model.joblib")

['./product_crf_model.joblib']

In [15]:
text = 'OPPO闪充充电器 X9070 X9077 R5 快充头通用手机数据线 套餐【2.4充电头+数据线 】 安卓 1.5m'

NER_tagger = joblib.load('./product_crf_model.joblib')
list_result = []
new_sents = re.split(u'(。|！|\!|？|\?)', text)
sents_feature = [sent2features(sent) for sent in new_sents]
y_pred = NER_tagger.predict(sents_feature)
for sent, ner_tag in zip(new_sents, y_pred):
    for word, tag in zip(sent, ner_tag):
        list_result.append((word,tag))
list_result    

[('O', 'B-37'),
 ('P', 'I-37'),
 ('P', 'I-37'),
 ('O', 'I-37'),
 ('闪', 'B-11'),
 ('充', 'I-11'),
 ('充', 'B-4'),
 ('电', 'I-4'),
 ('器', 'I-4'),
 (' ', 'O'),
 ('X', 'B-38'),
 ('9', 'I-38'),
 ('0', 'I-38'),
 ('7', 'I-38'),
 ('0', 'I-38'),
 (' ', 'O'),
 ('X', 'B-38'),
 ('9', 'I-38'),
 ('0', 'I-38'),
 ('7', 'I-38'),
 ('7', 'I-38'),
 (' ', 'O'),
 ('R', 'B-38'),
 ('5', 'I-38'),
 (' ', 'O'),
 ('快', 'B-4'),
 ('充', 'I-4'),
 ('头', 'I-4'),
 ('通', 'B-11'),
 ('用', 'I-11'),
 ('手', 'B-40'),
 ('机', 'I-40'),
 ('数', 'B-4'),
 ('据', 'I-4'),
 ('线', 'I-4'),
 (' ', 'O'),
 ('套', 'O'),
 ('餐', 'O'),
 ('【', 'O'),
 ('2', 'B-18'),
 ('.', 'I-18'),
 ('4', 'I-18'),
 ('充', 'B-4'),
 ('电', 'I-4'),
 ('头', 'I-4'),
 ('+', 'O'),
 ('数', 'B-4'),
 ('据', 'I-4'),
 ('线', 'I-4'),
 (' ', 'O'),
 ('】', 'O'),
 (' ', 'O'),
 ('安', 'B-37'),
 ('卓', 'I-37'),
 (' ', 'O'),
 ('1', 'B-18'),
 ('.', 'I-18'),
 ('5', 'I-18'),
 ('m', 'I-18')]

In [16]:
import sklearn

In [17]:
sklearn.__version__

'0.22.2'

## submit

In [18]:
test_file='data/preliminary_test_a/sample_per_line_preliminary_A.txt'
test_sents=[]
with open(test_file, 'r', encoding='utf-8') as f:
    for line in f.read().split('\n'):
        test_sents.append(line)

In [19]:
sents_feature = [sent2features(sent) for sent in test_sents]

In [21]:
# text = 'OPPO闪充充电器 X9070 X9077 R5 快充头通用手机数据线 套餐【2.4充电头+数据线 】 安卓 1.5m'
NER_tagger = joblib.load('./product_crf_model.joblib')
list_results = []

y_pred = NER_tagger.predict(sents_feature)

for sent, ner_tag in zip(test_sents, y_pred):
    line_result=[]
    for word, tag in zip(sent, ner_tag):
        line_result.append((word,tag))
    list_results.append(line_result)    

In [22]:
with open('crf.txt','w',encoding='utf-8') as f:
    for i,line_result in enumerate(list_results):
        for word,tag in line_result:
            f.write(f'{word} {tag}\n')
        if i<len(list_results)-1:
            f.write('\n')