## 使用传统方法做特征
- 正则
- crf++
#### 要想使用正则表达式，就必须先把各个类别的数据分类聚合起来，即看看每个类别的数据有什么特点，然后在看如何使用正则表达式

In [1]:
import pandas as pd
import numpy as np
import re
import os
from tqdm import tqdm

In [2]:
train_data_path = '../data/train_data/'
train_data_label = '../data/train_label/'
test_data_path = '../data/test_data/'

In [3]:
def get_dataframe(data_path_dir):
    id_list = []
    data_list = []
    data_id_list = [file.split('.')[0] for file in os.listdir(data_path_dir) if '.txt' in file]
    for ID in tqdm(set(data_id_list)):
        data_path = os.path.join(data_path_dir,ID+'.txt')
        with open(data_path,'r',encoding='utf-8') as f:
            line = ''.join(f.readlines())
        id_list.append(ID)
        data_list.append(line)
    df_data = pd.DataFrame()
    df_data['id']=id_list
    df_data['data']=data_list
    return df_data

In [4]:
df_train = get_dataframe(train_data_path)
df_test = get_dataframe(test_data_path)

100%|██████████| 2515/2515 [00:20<00:00, 123.47it/s]
100%|██████████| 3956/3956 [00:30<00:00, 128.02it/s]


In [23]:
df_train
df_test

Unnamed: 0,id,data
0,2844,80后小警微博，今学习工作在网络，期待与您共同关注互联网、关注我们身边的互联网安全。微博纯属...
1,1316,華納歌手2012年最新派台歌<到此為止><好好過>／2011年電影<喜愛夜蒲>女主角／希望大...
2,2879,已出版《蝎子的猎物》《可惜我是水瓶座》《天使的嫁纱》；2009年于台湾出版个人星座系列小说：...
3,898,以一种勇敢无畏的精神，兢兢业业的作风，昂扬的斗志积极践行着“奉献、有爱、互助、进步”的志愿精...
4,2266,遊走中港台的時尚精靈，歌手，RinaBijoux訂製珠寶設計師工作聯繫:海風時尚娛樂Benn...
...,...,...
3951,856,Livethelifeyoulove，lovethelifeyoulive.，作家，《误入妻...
3952,1609,爱好排行榜：1MyPrincess，2电影，3动漫绘画，4游戏，5写作，6……兴趣排行榜：1...
3953,2440,金华市中心医院肛肠外科主任、学科带头人，省肛肠外科学分会委员，金华市肛肠外科学分会主任委员、...
3954,2837,自由的惰性气体，可以改变我的形状，却改变不了我的分子。代表作品静态电影《风声后传》《望天树，...


In [5]:
def get_label(data_dir):
    # 获取的数据
    label_list = []
    
    # 获取ID
    label_id_list = [file_name.split('.')[0] for file_name in os.listdir(data_dir) if '.csv' in file_name]
    # 循环ID
    for ID in tqdm(set(label_id_list)):
        
        # 获取标签
        label_path = os.path.join(data_dir,ID+'.csv')
        df_label = pd.read_csv(label_path)
        label_list += [df_label]
    # dataframe
    df_label = pd.concat(label_list)
    return df_label

In [6]:
df_train_label = get_label(train_data_label)
df_train_label

100%|██████████| 2515/2515 [00:31<00:00, 80.24it/s]


Unnamed: 0,ID,Category,Pos_b,Pos_e,Privacy
0,1257,movie,9,15,Believe
1,1257,movie,17,23,Lucifer
0,757,company,21,24,Nike
1,757,position,29,30,教练
2,757,position,91,92,教练
...,...,...,...,...,...
6,654,movie,79,85,《风雨上海滩》
7,654,movie,86,91,《一碗沧桑》
8,654,movie,92,97,《沙海剑魂》
9,654,movie,100,103,《敦煌》


In [28]:
print(set(df_train_label.Category))
# print(set(df_train_label['Category']))

{'company', 'address', 'game', 'position', 'organization', 'book', 'mobile', 'vx', 'government', 'scene', 'QQ', 'name', 'email', 'movie'}


In [7]:
# df_label_trian_category = df_train_label.groupby(['Category','Privacy']).agg(lambda x:' '.join([str(i) for i in list(x)]))
# df_label_trian_category = df_train_label.groupby(['Category','Privacy']).agg(lambda x:' '.join([str(i) for i in list(x)])).reset_index()
df_label_trian_category = df_train_label.groupby(['Category'])['Privacy'].agg(lambda x: ' '.join([str(i) for i in list(x)])).reset_index()
df_label_trian_category

Unnamed: 0,Category,Privacy
0,QQ,130238288 527622886 490401933 496049279 230871...
1,address,中国 中国 中国 中国 布鲁日 霍顿平原 兰卡 北京赛区 北京 好莱坞 购物中心 大工业区 ...
2,book,《commanderkeen》 《GameInformer》 《马克思佩恩3》 《别拿男人不...
3,company,Nike 美瑞克斯 TRXBOSU NIKE PowerMix core-max 北京香江国...
4,email,yuxuan1118@126.com hejingxinxiang@163.com info...
5,game,DOTA 《dirt2》 《极品飞车：变速》 王者荣耀 王者荣耀 王者荣耀 《潜龙谍影4》 ...
6,government,印度国防研究机构 印度国防研究与发展组织 银监会 保监会 海军 武汉市青山区房产管理局 央行...
7,mobile,18616873450 18621963983 400-692-0001 093390561...
8,movie,Believe Lucifer 《杀手没有假期》 《刺客信条2：世系》 《焦点导演：邱礼涛》...
9,name,张剑 陆慧明 carmack 罗钊明 angelina 李焯桃 李晨洋 maya Aiji ...


In [51]:
# 'company', 'address', 'game', 'position', 'organization', 'book', 'mobile', 'vx', 'government', 'scene', 'QQ', 'name', 'email', 'movie'
df_train_vx = df_train_label[df_train_label['Category']=='vx'][['Category','Privacy']]
df_train_vx.to_csv('../data/regex/vx_train.csv',index=False)

df_train_mobile = df_train_label[df_train_label['Category']=='mobile'][['Category','Privacy']]
df_train_mobile.to_csv('../data/regex/mobile_train.csv',index=False)

df_train_book = df_train_label[df_train_label['Category']=='book'][['Category','Privacy']]
df_train_book.to_csv('../data/regex/book_train.csv',index=False)

df_train_email = df_train_label[df_train_label['Category']=='email'][['Category','Privacy']]
df_train_email.to_csv('../data/regex/email_train.csv',index=False)

df_train_movie = df_train_label[df_train_label['Category']=='movie'][['Category','Privacy']]
df_train_movie.to_csv('../data/regex/movie_train.csv',index=False)

In [57]:
text = df_test[df_test['id']=='2984']['data'].values[0]
text

'工作邮箱联系：33119699@qq.com我的淘宝店:http://baby-amelie.taobao.com/，内地组合至上励合成员刘洲成'

In [115]:
# 使用正则表达式处理 vx,qq,mobile,email,如果使用正则，就没有训练的过程，直接从切割好的文件中找到模式即可，
# test集合需要输出s_pos,e_pos，category，id ，entity等信息
import copy
# |(\d{7,9}$)|(\d{3}-\d{4}-\d{4})
phone_regex = \
re.compile('(^1[3589][0-9]{9}$)|(^\d{3}-\d{4}-\d{4}$)|(^\d{3}[-﹣]\d{8}$)|(^\+\d{2}-?\d{11}$)')
category_list = df_train_mobile['Privacy'].values
category_ok_list = []
category_list_copy = list(copy.deepcopy(category_list))
for item in category_list:
    result = phone_regex.findall(str(item))
    if result != []:
        category_ok_list.append(item)
        category_list_copy.remove(item)
print(len(category_list_copy))
print(category_list_copy)
print('--'*50)
print(len(category_ok_list))
print(category_ok_list)

110
['+886227627111', '347-738-8723', '+886）2-2752-1874', '+85294979172', '+886-936367799', '+886963110290', '+886223755010', '+86137-5787-9227', '00886928592269', '1861060699713940087450', '+88627627111', '+85266873859', '+85223352109', '+85266873859', '85172333-606', '886921024788', '8618611732247', '60122919901', '886-4-23029888', 'zzz813222', '+886-936-092-241', '+886-952-199-882', '1817534480', '+886287713553', '764957359', '0922-652-820', '0913-659-895', '+886913881777', '0978-022-680', '+886932147992', '+886910741038', '886-2-2738-7077', '0592-3931800', '84682361', '+8522335-2385', '+852-37411033', '+886928534302', '（852）23072034', '（852）66088899（86）13818199118', '886-4-23029888', '+60128816178', '010-6362212113910492247', '139-010-66161', '+86134-0106-3493', '+886920494700', '2335-2305/9263-3317', '+85223352109', '00852-69010858', '+85224372627', '+886-2-24255385', 'r886-2-25095720', '+852-98014313', '+85223352105', '057788285332', '772845851', '6821155', '2009', '886-2369-9886

In [119]:
phone_regex = \
re.compile('^[A-Za-z0-9\u4e00-\u9fa5\._\-]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+$')
category_list = df_train_email['Privacy'].values
category_ok_list = []
category_list_copy = list(copy.deepcopy(category_list))
for item in category_list:
    result = phone_regex.findall(str(item))
    if result != []:
        category_ok_list.append(item)
        category_list_copy.remove(item)
print(len(category_list_copy))
print(category_list_copy)
print('--'*50)
print(len(category_ok_list))
print(category_ok_list)

11
['高一桉gya2012vip@sina.com...', 'kenneth', 'www.celinajade.com', '金晶kinkiemail', 'yabinstudio@sina.com@music', "'pamela@dreamstardom.com", 'GaryCheung@', 'info@wowmusic.', 'winpy.lau@tvb.com.', 'www.starmusichk.com', 'jessicafplydia@yahoo.com.hk.']
----------------------------------------------------------------------------------------------------
248
['jdfz520@yahoo.cn', 'wujing@missworld.cn', 'bcw.3people@gmail.com', 'desen66@126.com', 'aben@alegendstar.com', 'leevi0129@126.com', 'jeff.aam@gmail.com', 'arjay588@gmail.com', 'aben@alegendstar.com', 'derekgaga@gmail.com', 'sammy@catwalkagency.com', 'miki1234567@tom.com', 'toyiau@gpaa.com.hk', 'dvfuns@gmail.com', 'aimmo_pawan@hotmail.com', '1257677705@qq.com', 'linyuchun@eelin.com.tw', '79147402@qq.com', 'pccwman@hotmail.com', 'dreamux@yahoo.com.hk', 'rain_ho2656@yahoo.com.cn', 'Ld6919@126.com', 'aben@alegendstar.com', 'zhengluoqianwork@163.com', 'montager@foxmail.com', 'star2012@foxmail.com', 'nonamepunks@hotmail.com', 'wangwuli@vip.12

In [121]:
phone_regex = \
re.compile('^[a-zA-Z][a-zA-Z\d_-]{5,19}$')
category_list = df_train_vx['Privacy'].values
category_ok_list = []
category_list_copy = list(copy.deepcopy(category_list))
for item in category_list:
    result = phone_regex.findall(str(item))
    if result != []:
        category_ok_list.append(item)
        category_list_copy.remove(item)
print(len(category_list_copy))
print(category_list_copy)
print('--'*50)
print(len(category_ok_list))
print(category_ok_list)

5
['lily潘小芬', '1827967769', '57150788', '57150788', '13426245800']
----------------------------------------------------------------------------------------------------
14
['wwjshow', 'liushasha319439', 'montager', 'zhangzhizheng-zzz', 'rachel-213', 'sjyy98', 'zhuzhiyi91510', 'zhuzishanmisstin', 'tianlangyinyue', 'mengmeng6570', 'waveapp', 'songxiaobo', 'hongshui001', 'hongshui007']


## 使用crf-suite库来学习模型

In [8]:
def get_list_data(data_path):
    with open(data_path,'r',encoding='utf-8') as f:
        line = ''.join(f.readlines())
        list_data = list(line)
    return list_data

In [9]:
# 采用BIOES 标注体系
def get_BIOES(list_data, df_label):
    list_label = ['O'] * len(list_data)
    for index,d in df_label.iterrows():
        ID = d['ID']
        Category = d['Category']
        Pos_b = d['Pos_b']
        Pos_e = d['Pos_e']
        
        # 异常数据处理
        if 2162 == ID:
            Category == '前明骏女孩组合队长"'
            Pos_e = Pos_e - 1

        if Pos_b == Pos_e:
            list_label[Pos_b] = 'S_' + Category
        elif Pos_b - Pos_e == 1:
            list_label[Pos_b] = 'B_' + Category
            list_label[Pos_e] = 'E_' + Category
        else:
            # 异常数据处理
            try:
                list_label[Pos_b] = 'B_' + Category
                list_label[Pos_e] = 'E_' + Category 
                for pos_i in range(Pos_b+1,Pos_e):
                    list_label[pos_i] = 'I_' + Category 
            except:
                print(ID)
    return [(ID,data,label) for data,label in zip(list_data,list_label)]

In [10]:
def get_data_label(path_dir_data,path_dir_label):
    
    # 训练数据集
    list_BIOES = []
    # 获取训练集 ID
    list_data_ID = [file_name.split('.')[0] for file_name in os.listdir(path_dir_data) if '.txt' in file_name]
    list_label_ID = [file_name.split('.')[0] for file_name in os.listdir(path_dir_label) if '.csv' in file_name]
    
    # 循环训练集 ID
    for ID in tqdm(set(list_data_ID) & set(list_label_ID)):
        # 获取词和标签地址
        data_path = os.path.join(path_dir_data,ID+'.txt')
        label_path = os.path.join(path_dir_label,ID+'.csv')
        
        # 获取词和标签数据
        list_data = get_list_data(data_path)
#         print('list_data:',list_data)
        df_label = pd.read_csv(label_path)
        
        # BIOES 进行标注
        BIOES = get_BIOES(list_data, df_label)
        
        # 加入训练数据集
        list_BIOES += [BIOES]
        
    return list_BIOES

In [11]:
list_BIOES_train = get_data_label(train_data_path,train_data_label)

100%|██████████| 2515/2515 [00:10<00:00, 245.42it/s]


In [12]:
def get_data(path_dir_data):
    
    # 测试数据集
    list_BIOES = []
    # 获取测试集 ID
    list_data_ID = [file_name.split('.')[0] for file_name in os.listdir(path_dir_data) if '.txt' in file_name]
   
    # 循环测试集 ID
    for ID in tqdm(set(list_data_ID)):
        # 获取词
        data_path = os.path.join(path_dir_data,ID+'.txt')
        
        # 获取词
        list_data = get_list_data(data_path)
        
        # BIOES 进行标注
        BIOES = [(ID,data,'N') for data in list_data]
        
        # 加入训练数据集
        list_BIOES += [BIOES]
        
    return list_BIOES

In [13]:
list_BIOES_test = get_data(test_data_path)

100%|██████████| 3956/3956 [00:00<00:00, 5870.80it/s]


In [14]:
train_data_01, val_data_01 = list_BIOES_train[:int(0.8 * len(list_BIOES_train))], list_BIOES_train[int(0.8 * len(list_BIOES_train)):]

In [15]:
import nltk
import sklearn 
import sklearn_crfsuite
from sklearn_crfsuite import scorers,metrics

In [16]:
def word2features(sent, i):
    word = sent[i][1]
    
    features = {
        'bias': 1.0,
        'word': word
    }
    
    if i > 0:
        word = sent[i-1][1]
        features.update({
            '-1:word': word
        })
    else:
        features['BOS'] = True

    if i > 1:
        word = sent[i-2][1]
        features.update({
            '-2:word': word
        })
    else:
        features['BOS'] = True        
        
        
    if i < len(sent)-1:
        word = sent[i+1][1]
        features.update({
            '+1:word': word
        })
    else:
        features['EOS'] = True

    if i < len(sent)-2:
        word = sent[i+2][1]
        features.update({
            '+2:word': word
        })
    else:
        features['EOS'] = True
        
    return features

In [17]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for ID, token, label in sent]

def sent2tokens(sent):
    return [token for ID, token, label in sent]

In [18]:
%%time
X_train = [sent2features(s) for s in train_data_01]
y_train = [sent2labels(s) for s in train_data_01]

X_val = [sent2features(s) for s in val_data_01]
y_val = [sent2labels(s) for s in val_data_01]

Wall time: 755 ms


In [19]:
from itertools import chain
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

In [20]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=200,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)
labels = list(crf.classes_)
labels.remove('O')
labels

Wall time: 6min


In [21]:
y_pred = crf.predict(X_val)
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print(metrics.flat_classification_report(
    y_val, y_pred, labels=sorted_labels, digits=3
))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                precision    recall  f1-score   support

          B_QQ      0.000     0.000     0.000         4
          E_QQ      0.000     0.000     0.000         4
          I_QQ      0.000     0.000     0.000        28
     B_address      0.599     0.489     0.538       372
     E_address      0.523     0.427     0.470       372
     I_address      0.662     0.582     0.619      1005
        B_book      0.700     0.522     0.598       134
        E_book      0.687     0.507     0.584       134
        I_book      0.701     0.571     0.629       583
     B_company      0.694     0.596     0.641       396
     E_company      0.685     0.585     0.631       398
     I_company      0.680     0.643     0.661      1134
     S_company      1.000     1.000     1.000         4
       B_email      0.915     0.782     0.843        55
       E_email      0.896     0.782     0.835        55
       I_email      0.969     0.946     0.958       970
        B_game      0.816     0.739     0.776  

### 使用grid search 和 cv来提高模型表现

In [150]:
%%time
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 29.5min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 102.4min finished


Wall time: 1h 45min 24s


In [151]:
# crf = rs.best_estimator_
# best params: {'c1': 0.23180081778963083, 'c2': 0.14637816579797572}
# best CV score: 0.6564354114392703
# model size: 2.13M
    
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.23180081778963083, 'c2': 0.14637816579797572}
best CV score: 0.6564354114392703
model size: 2.13M


In [161]:
import pickle
print(crf.modelfile.name)
pickle.dump(crf.modelfile , "train_model.m")

C:\Users\wvbx\AppData\Local\Temp\modeletbar7dg.crfsuite


TypeError: file must have a 'write' attribute

In [155]:
crf = rs.best_estimator_
y_pred = crf.predict(X_val)
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

print(metrics.flat_classification_report(
    y_val, y_pred, labels=sorted_labels, digits=3
))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                precision    recall  f1-score   support

          B_QQ      1.000     0.286     0.444         7
          E_QQ      0.500     0.143     0.222         7
          I_QQ      1.000     0.240     0.387        50
     B_address      0.614     0.476     0.536       435
     E_address      0.543     0.421     0.474       435
     I_address      0.612     0.539     0.574      1166
     S_address      0.000     0.000     0.000         0
        B_book      0.667     0.565     0.611       124
        E_book      0.648     0.548     0.594       124
        I_book      0.691     0.514     0.590       622
     B_company      0.742     0.614     0.672       430
     E_company      0.733     0.607     0.664       430
     I_company      0.697     0.683     0.690      1307
     S_company      1.000     0.800     0.889         5
       B_email      0.745     0.667     0.704        57
       E_email      0.942     0.845     0.891        58
       I_email      0.945     0.950     0.948  