In [1]:
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from random import choice
import random
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
import re, os
import codecs
from keras.layers import *
from keras.models import Model
import keras.backend as K
import keras.backend.tensorflow_backend as KTF
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import keras
from keras.callbacks import EarlyStopping
from sklearn import metrics

Using TensorFlow backend.


In [2]:
os.environ["CUDA_VISIBLE_DEVICES"]='0' # 使用编号为1，2号的GPU
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.7
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)
KTF.set_session(session)

In [3]:
maxlen = 25
config_path = './RoBERTa-wwm-ext, Chinese/bert_config.json'
checkpoint_path = './RoBERTa-wwm-ext, Chinese/bert_model.ckpt'
dict_path = './RoBERTa-wwm-ext, Chinese/vocab.txt'

In [4]:
# 读数据
datafile = r'gdtrian.csv'
data1 = pd.read_csv(datafile, encoding='utf-8', sep='|')
data1['absD'] = abs(data1['flag'] - data1['pred'])
data2 = data1[data1['absD'] < 0.2]
data2 = data2[['contents', 'flag']]
data3 = pd.read_excel('gdtrian.xlsx')
data4 = data3[['contents','最终结果']]
data4.columns = ['contents', 'flag']

In [5]:
#data5 = data2.sample(n=10000)
data = pd.concat([data2,data4])
data.head()

Unnamed: 0,contents,flag
0,恩施跨网工单调度岗adddisp 丰-黔江中继段主用衰耗较大，系统目前倒换至备用，加派恩施检...,1
1,省noc调度监控岗(集团故障)inceptbill nan feedbackscene 集团...,0
3,省noc调度监控岗(集团故障)inceptbill nan feedbackscene 集团...,0
5,省noc调度监控岗(集团故障)inceptbill nan feedbackscene 集团...,0
7,省noc调度监控岗(集团故障)inceptbill nan feedbackscene 待传...,0


In [6]:
# 去掉回车，修改标签,分词
data['contents'] = data['contents'].apply(lambda x: x.replace('\r\n',''))
data = data.reindex()
data = data[['contents','flag']]
data = np.array(data)
random.shuffle(data)#随机打乱
#取前70%为训练集
allurl_fea = [d[0] for d in data]
train_data=data[:int(0.7*len(allurl_fea))]
#将np.array转为dataframe，并对两列赋列名
#剩余百分之30为测试集
valid_data=data[int(0.7*len(allurl_fea)):]
print(train_data.shape,valid_data.shape)

(28351, 2) (12151, 2)


In [7]:
class CustomModelCheckpoint(keras.callbacks.Callback):
    def __init__(self, model, path):
        self.model = model
        self.path = path
        self.best_loss = np.inf

    def on_epoch_end(self, epoch, logs=None):
        val_loss = logs['val_loss']
        if val_loss < self.best_loss:
            print("\nValidation loss decreased from {} to {}, saving model".format(self.best_loss, val_loss))
            self.model.save_weights(self.path, overwrite=True)
            self.best_loss = val_loss

In [8]:
bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)
for l in bert_model.layers:
    l.trainable = True

x1_in = Input(shape=(None,))
x2_in = Input(shape=(None,))

x = bert_model([x1_in, x2_in])
x = Lambda(lambda x: x[:, 0])(x) # 取出[CLS]对应的向量用来做分类
x = Dropout(0.1)(x)
# x = Dense(100, activation='relu')(x)
# x = Dropout(0.1)(x)
p = Dense(1, activation='sigmoid')(x)

model = Model([x1_in, x2_in], p)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-5), # 用足够小的学习率
    metrics=['accuracy'] 
)
model.summary()

W1223 23:19:44.185786 13216 deprecation_wrapper.py:119] From C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1223 23:19:44.191645 13216 deprecation_wrapper.py:119] From C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1223 23:19:44.267188 13216 deprecation_wrapper.py:119] From C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:131: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1223 23:19:44.268164 13216 deprecation_wrapper.py:119] From C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W1223 23:19:44.278989 13216 deprecation.py:50

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
model_2 (Model)                 (None, None, 768)    101677056   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 768)          0           model_2[1][0]                    
__________

In [9]:
checkpointer = CustomModelCheckpoint(model, 'bert_chgx.h5')
early_stopping = EarlyStopping(monitor='val_loss', patience=2, verbose=2)
callbacks_list = [checkpointer,early_stopping]

In [10]:
token_dict = {}

with codecs.open(dict_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)

In [11]:
class OurTokenizer(Tokenizer):
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]') # space类用未经训练的[unused1]表示
            else:
                R.append('[UNK]') # 剩余的字符是[UNK]
        return R

tokenizer = OurTokenizer(token_dict)

In [12]:
tokenizer.tokenize(u'四川电信反馈: 光缆中断，已修复，请集团确认')
#tokenizer.encode(u'四川电信反馈: 光缆中断，已修复，请集团确认')

['[CLS]',
 '四',
 '川',
 '电',
 '信',
 '反',
 '馈',
 ':',
 '[unused1]',
 '光',
 '缆',
 '中',
 '断',
 '，',
 '已',
 '修',
 '复',
 '，',
 '请',
 '集',
 '团',
 '确',
 '认',
 '[SEP]']

In [13]:
def seq_padding(X, padding=0):
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X
    ])

In [14]:
class data_generator:
    def __init__(self, data, batch_size=16):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1
    def __len__(self):
        return self.steps
    def __iter__(self):
        while True:
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            X1, X2, Y = [], [], []
            for i in idxs:
                d = self.data[i]
                text = d[0][0:maxlen]
                x1, x2 = tokenizer.encode(first=text)
                y = d[1]
                X1.append(x1)
                X2.append(x2)
                Y.append([y])
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    #Y = keras.utils.to_categorical(Y, num_classes=8)
                    yield [X1, X2], Y
                    [X1, X2, Y] = [], [], []

In [15]:
train_D = data_generator(train_data,batch_size=16)
valid_D = data_generator(valid_data,batch_size=16)

model.fit_generator(
    train_D.__iter__(),
    steps_per_epoch=len(train_D),
    epochs=10,
    validation_data=valid_D.__iter__(),
    validation_steps=len(valid_D),
    callbacks=callbacks_list
)

Epoch 1/10

Validation loss decreased from inf to 0.022375850473121416, saving model
Epoch 2/10

Validation loss decreased from 0.022375850473121416 to 0.019513645287816468, saving model
Epoch 3/10
Epoch 4/10
 154/1772 [=>............................] - ETA: 9:51 - loss: 8.0564e-06 - acc: 1.0000

KeyboardInterrupt: 

In [16]:
#读取模型
model.load_weights('bert_chgx.h5')
X_test0 = [i[0][:maxlen] for i in valid_data]
y_test = [i[1] for i in valid_data]
X_test = [tokenizer.encode(first=i) for i in X_test0]
X1_test = [i[0] for i in X_test]
X2_test = [i[1] for i in X_test]
from keras.preprocessing import sequence
X1_test = sequence.pad_sequences(X1_test, maxlen=maxlen, padding='post')
X2_test = sequence.pad_sequences(X2_test, maxlen=maxlen, padding='post')

In [17]:
out = model.predict([X1_test,X2_test])
auc = metrics.roc_auc_score(y_test,out)
print(auc)
#out = np.argmax(out,axis=1)
out = [1 if i >0.5 else 0 for i in out ]
#out = out.apply(lambda x: toNormal(x))
report=metrics.classification_report(y_test, out)
print(report)

0.9990424710799566
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      7292
           1       0.99      1.00      0.99      4859

    accuracy                           1.00     12151
   macro avg       0.99      1.00      1.00     12151
weighted avg       1.00      1.00      1.00     12151



In [25]:
strl = ['综合设备维护班岗chgdisp feedbackfault 初次故障定位是否准确：是 revert 跟换尾纤恢复正常',' ']
pred_X = np.array(strl)
X_valid = [i[:maxlen] for i in pred_X]
X_valid = [tokenizer.encode(first=i) for i in X_valid]
X1_valid = [i[0] for i in X_valid]
X2_valid = [i[1] for i in X_valid]
X1_valid = sequence.pad_sequences(X1_valid, maxlen=maxlen, padding='post')
X2_valid = sequence.pad_sequences(X2_valid, maxlen=maxlen, padding='post')
out = model.predict([X1_valid,X2_valid])
print(out)

[[0.9978168]
 [0.671964 ]]
