In [1]:
import pandas as pd

# Read the tab-separated sentence-pair corpus. The file has no header row, so the
# column names are supplied explicitly; "utf-8-sig" drops a leading BOM if present.
# NOTE(review): the recorded preview shows an index starting at 1, which suggests
# the file carries a leading id column that pandas uses as the index — confirm
# against the raw file.
column_names = ["sent1", "sent2", "label"]
data_df = pd.read_csv(
    "atec_nlp_sim_train_all.csv",
    sep="\t",
    header=None,
    names=column_names,
    encoding="utf-8-sig",
)
data_df.head(10)

Unnamed: 0,sent1,sent2,label
1,怎么更改花呗手机号码,我的花呗是以前的手机号码，怎么更改成现在的支付宝的号码手机号,1
2,也开不了花呗，就这样了？完事了,真的嘛？就是花呗付款,0
3,花呗冻结以后还能开通吗,我的条件可以开通花呗借款吗,0
4,如何得知关闭借呗,想永久关闭借呗,0
5,花呗扫码付钱,二维码扫描可以用花呗吗,0
6,花呗逾期后不能分期吗,我这个 逾期后还完了 最低还款 后 能分期吗,0
7,花呗分期清空,花呗分期查询,0
8,借呗逾期短信通知,如何购买花呗短信通知,0
9,借呗即将到期要还的账单还能分期吗,借呗要分期还，是吗,0
10,花呗为什么不能支付手机交易,花呗透支了为什么不可以继续用了,0


In [2]:
import jieba

# Register the custom dictionary so jieba keeps the words listed in it as single
# tokens (the recorded output below shows '花呗' kept whole).
jieba.load_userdict("atec_dict.txt")
# Quick segmentation smoke test on one sample sentence.
seg_words = jieba.lcut("怎么更改花呗手机号码")
print(seg_words)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.131 seconds.
Prefix dict has been built successfully.


['怎么', '更改', '花呗', '手机号码']


In [3]:
def _clean_and_segment(text):
    """Strip surrounding CR/tab/space characters, replace each "***" with "*", then segment with jieba."""
    cleaned = text.strip("\r\t ").replace("***", "*")
    return jieba.lcut(cleaned)


# Tokenize both sentence columns in place: each cell goes from a raw string to
# a list of word strings.
# NOTE: this overwrites the raw text, so the cell cannot be re-run without
# reloading data_df first.
for column in ("sent1", "sent2"):
    data_df[column] = data_df[column].apply(_clean_and_segment)
data_df.head(10)

Unnamed: 0,sent1,sent2,label
1,"[怎么, 更改, 花呗, 手机号码]","[我, 的, 花呗, 是, 以前, 的, 手机号码, ，, 怎么, 更, 改成, 现在, 的...",1
2,"[也, 开, 不了, 花呗, ，, 就, 这样, 了, ？, 完事, 了]","[真的, 嘛, ？, 就是, 花呗, 付款]",0
3,"[花呗, 冻结, 以后, 还, 能, 开通, 吗]","[我, 的, 条件, 可以, 开通, 花呗, 借款, 吗]",0
4,"[如何, 得知, 关闭, 借呗]","[想, 永久, 关闭, 借呗]",0
5,"[花呗, 扫码, 付钱]","[二维码, 扫描, 可以, 用, 花呗, 吗]",0
6,"[花呗, 逾期, 后, 不能, 分期, 吗]","[我, 这个, , 逾期, 后, 还, 完, 了, , 最低, 还款, , 后, ,...",0
7,"[花呗, 分期, 清空]","[花呗, 分期, 查询]",0
8,"[借呗, 逾期, 短信, 通知]","[如何, 购买, 花呗, 短信, 通知]",0
9,"[借呗, 即将, 到期, 要, 还, 的, 账单, 还, 能, 分期, 吗]","[借呗, 要, 分期, 还, ，, 是, 吗]",0
10,"[花呗, 为什么, 不能, 支付, 手机, 交易]","[花呗, 透支, 了, 为什么, 不, 可以, 继续, 用, 了]",0


In [4]:
from collections import Counter

# Adding the two object arrays concatenates the token lists row by row, so each
# element of sent_data holds all tokens of one sentence pair.
sent_data = data_df["sent1"].values + data_df["sent2"].values

# Count every token over the whole corpus.
c = Counter(token for tokens in sent_data for token in tokens)

# (word, count) pairs, most frequent first; ties keep first-seen order.
word_counts = c.most_common()

print(word_counts[:10])

[('花呗', 141731), ('我', 61743), ('借呗', 61340), ('的', 60189), ('了', 47468), ('，', 46908), ('吗', 42196), ('还', 35076), ('怎么', 33715), ('还款', 29525)]


In [5]:
# Build the vocabulary. Ids 0 and 1 are reserved for the padding and
# unknown-word tokens; corpus words follow in descending frequency order.
# The count is unpacked into `_` rather than `c`, so the Counter named `c`
# from the counting cell is no longer overwritten by an int.
vocab_words = ["<PAD>", "<UNK>"]
vocab_words.extend(word for word, _ in word_counts)

# Forward (word -> id) and reverse (id -> word) lookup tables.
vocab2id = {w: i for i, w in enumerate(vocab_words)}
id2vocab = dict(enumerate(vocab_words))

print("vocab size: ", len(vocab2id))
print(list(vocab2id.items())[:5])
print(list(id2vocab.items())[:5])

vocab size:  13262
[('<PAD>', 0), ('<UNK>', 1), ('花呗', 2), ('我', 3), ('借呗', 4)]
[(0, '<PAD>'), (1, '<UNK>'), (2, '花呗'), (3, '我'), (4, '借呗')]


In [6]:
# Persist the vocabulary, one word per line. Iterating the dict yields words in
# insertion order, i.e. in the order their ids were assigned.
with open("vocab.txt", "w", encoding="utf8") as f:
    f.writelines(w + "\n" for w in vocab2id)

In [7]:
def sent2index(vocab2id, words):
    """Map a sequence of words to their vocabulary ids.

    :param vocab2id: mapping from word to integer id; must contain "<UNK>" if
        any word in ``words`` is missing from it.
    :param words: iterable of words (tokens).
    :return: list of ids, one per word, with unknown words mapped to the
        id of "<UNK>".
    """
    ids = []
    for word in words:
        key = word if word in vocab2id else "<UNK>"
        ids.append(vocab2id[key])
    return ids

# Replace the token lists in both sentence columns with lists of vocabulary ids.
# NOTE: this overwrites the token lists in place; re-running the cell would map
# the ids themselves through the vocabulary, so reload/re-tokenize first.
for column in ("sent1", "sent2"):
    data_df[column] = data_df[column].apply(lambda words: sent2index(vocab2id, words))

data_df.head(10)

Unnamed: 0,sent1,sent2,label
1,"[10, 238, 2, 214]","[3, 5, 2, 17, 150, 5, 214, 7, 10, 1006, 583, 4...",1
2,"[102, 153, 32, 2, 7, 72, 591, 6, 134, 3073, 6]","[829, 132, 134, 211, 2, 33]",0
3,"[2, 110, 181, 9, 23, 19, 8]","[3, 5, 202, 12, 19, 2, 119, 8]",0
4,"[57, 6939, 52, 4]","[68, 570, 52, 4]",0
5,"[2, 314, 584]","[212, 1031, 12, 13, 2, 8]",0
6,"[2, 38, 47, 22, 18, 8]","[3, 53, 28, 38, 47, 9, 91, 6, 28, 98, 11, 28, ...",0
7,"[2, 18, 2285]","[2, 18, 226]",0
8,"[4, 38, 216, 402]","[57, 271, 2, 216, 402]",0
9,"[4, 2886, 196, 54, 9, 5, 63, 9, 23, 18, 8]","[4, 54, 18, 9, 7, 17, 8]",0
10,"[2, 14, 22, 34, 97, 232]","[2, 377, 6, 14, 26, 12, 327, 13, 6]",0


In [8]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K

# Show the installed TensorFlow version (the recorded run printed 2.2.0).
print(tf.__version__)

class BaseTextCNN(keras.Model):
    """Multi-kernel 1-D CNN encoder.

    For each kernel size, a ReLU Conv1D with "same" padding is applied to the
    input and reduced by global max pooling. The pooled features of all kernel
    sizes are concatenated and projected by a tanh Dense layer.

    NOTE(review): this class is not referenced by the other visible cells —
    confirm whether it is still needed.
    """

    def __init__(self, filters, kernel_sizes, output_dim, name):
        """
        :param filters: number of filters in every Conv1D layer.
        :param kernel_sizes: iterable of convolution kernel sizes; one
            Conv1D + GlobalMaxPool1D pair is created per entry.
        :param output_dim: number of units in the final Dense layer.
        :param name: Keras model name.
        """
        super().__init__(name=name)
        self.kernel_sizes = kernel_sizes
        self.conv_layers = []
        self.max_poolings = []
        for width in kernel_sizes:
            conv = keras.layers.Conv1D(
                filters=filters,
                kernel_size=width,
                activation='relu',
                padding="same",
            )
            self.conv_layers.append(conv)
            self.max_poolings.append(keras.layers.GlobalMaxPool1D())
        self.concatenate = keras.layers.Concatenate()
        self.dense = keras.layers.Dense(output_dim, activation='tanh')

    def call(self, inputs):
        """Run every conv/pool branch on ``inputs`` and return the Dense projection of the concatenated features."""
        pooled = [
            pool(conv(inputs))
            for conv, pool in zip(self.conv_layers, self.max_poolings)
        ]
        merged = self.concatenate(pooled)
        return self.dense(merged)

2.2.0


In [9]:
# Model / preprocessing hyperparameters.
# Sentence length in tokens: used as the Input shape and as the pad_sequences maxlen.
max_len = 15
# Number of vocabulary entries (including <PAD> and <UNK>); passed to the Embedding layer.
vocab_size = len(vocab2id)
# Width of each word-embedding vector.
embedding_size = 128
# The next three mirror BaseTextCNN's constructor arguments.
# NOTE(review): they are not referenced by the model built in the visible
# cells — confirm whether they are still needed.
filters = 200
kernel_sizes = [3,4,5]
output_dim = 100

In [10]:
from tensorflow.keras.layers import Input,Embedding,Dot,Dense
from tensorflow.keras.models import Model

In [11]:
from tensorflow.keras.layers import Dense,Conv1D,GlobalMaxPool1D,concatenate

In [13]:
from tensorflow.keras.layers import Bidirectional,LSTM

In [14]:
from tensorflow.keras.layers import Attention

In [15]:
# NOTE(review): none of the values in this cell are referenced by the visible
# cells that follow; they look like leftovers from an earlier experiment —
# confirm before removing.
# EMBEDDING_SIZE = 100
hidden_size= 64
attention_size = 50
# hidden_size=64
class_nums=2

In [16]:
from tensorflow.keras.layers import Convolution1D,Activation,MaxPool1D,GRU
from tensorflow.keras.layers import Bidirectional,LSTM,Dense

In [17]:
def _build_sentence_encoder(embedded):
    """Encode one embedded sentence into a fixed-size vector.

    Stack: Conv1D(256, kernel 3, "same") -> ReLU -> MaxPool1D(2) -> GRU(256,
    returning sequences) -> GRU(256, returning the last state only).

    Fresh layers are created on every call, so the two sentences are encoded by
    towers with separate weights (only the embedding is shared), exactly as in
    the original hand-unrolled version.

    :param embedded: output tensor of the shared Embedding layer.
    :return: tensor holding the final GRU state for each example.
    """
    x = Convolution1D(256, 3, padding='same', strides=1)(embedded)
    x = Activation('relu')(x)
    x = MaxPool1D(pool_size=2)(x)
    x = GRU(256, dropout=0.2, recurrent_dropout=0.1, return_sequences=True)(x)
    return GRU(256, dropout=0.2, recurrent_dropout=0.1)(x)


# Two padded sequences of word ids, each of length max_len.
input1 = Input(name='sent1', shape=(max_len,))
input2 = Input(name='sent2', shape=(max_len,))

# A single embedding table shared by both sentences.
embedding = Embedding(vocab_size, embedding_size)
sent1_embed = embedding(input1)
sent2_embed = embedding(input2)

output_sent1 = _build_sentence_encoder(sent1_embed)
output_sent2 = _build_sentence_encoder(sent2_embed)

# Cosine similarity of the two sentence vectors (normalize=True L2-normalizes
# each vector before the dot product).
cosine_output = Dot(axes=[1, 1], normalize=True)([output_sent1, output_sent2])

# The model is trained with binary cross-entropy, which expects a probability
# in [0, 1]. A sigmoid output guarantees that; the previous linear activation
# could emit arbitrary real values.
outputs = Dense(1, activation='sigmoid', name="output")(cosine_output)
model = Model(inputs=[input1, input2], outputs=outputs)

In [18]:
# Print the layer-by-layer architecture and parameter counts of the model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sent1 (InputLayer)              [(None, 15)]         0                                            
__________________________________________________________________________________________________
sent2 (InputLayer)              [(None, 15)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 15, 128)      1697536     sent1[0][0]                      
                                                                 sent2[0][0]                      
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 15, 256)      98560       embedding[0][0]              

In [19]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def _as_object_array(sequences):
    """Return a 1-D object array whose i-th element is ``sequences[i]``.

    Filling element by element keeps variable-length sequences as opaque
    objects, instead of relying on ``np.array`` to accept ragged nested lists.
    """
    out = np.empty(len(sequences), dtype=object)
    for i, seq in enumerate(sequences):
        out[i] = seq
    return out


def batch_generator(all_data, batch_size, maxlen, shuffle=True):
    """
    Endlessly yield padded mini-batches of sentence pairs with their labels.

    :param all_data: the whole data set as three aligned collections
        ``[sent1, sent2, labels]``. ``sent1`` / ``sent2`` hold one sequence of
        word ids per example (lengths may differ); ``labels`` holds one value
        per example.
    :param batch_size: number of examples per batch; must be positive.
    :param maxlen: length every sentence is padded to. Longer sentences are
        truncated by ``pad_sequences`` using its default truncation side.
    :param shuffle: whether to shuffle the examples before the first batch and
        again each time a pass over the data completes.
    :return: generator yielding ``([sent1_pad, sent2_pad], batch_label)``.
    :raises ValueError: on first iteration, if ``batch_size`` is not positive,
        the three collections differ in length, or the data set is empty.

    The trailing examples that do not fill a whole batch are skipped in each
    pass, unless the data set is smaller than ``batch_size``, in which case
    every batch is the whole (short) data set.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    sent1, sent2, labels = all_data
    data_size = len(labels)
    if not (len(sent1) == len(sent2) == data_size):
        raise ValueError("sent1, sent2 and labels must contain the same number of examples")
    if data_size == 0:
        raise ValueError("all_data is empty; no batch can be produced")

    # Arrays allow fancy indexing with a permutation. The sentences are ragged,
    # so they are stored as object arrays.
    arrays = [_as_object_array(sent1), _as_object_array(sent2), np.array(labels)]

    if shuffle:
        # Draw a random order and apply it to all three arrays alike.
        p = np.random.permutation(data_size)
        arrays = [a[p] for a in arrays]

    batch_count = 0
    while True:
        # One pass over the data (epoch) is complete: start over and reshuffle.
        if batch_count * batch_size + batch_size > data_size:
            batch_count = 0
            if shuffle:
                p = np.random.permutation(data_size)
                arrays = [a[p] for a in arrays]
        start = batch_count * batch_size
        end = start + batch_size
        batch_count += 1
        batch_sent1, batch_sent2, batch_label = (a[start:end] for a in arrays)

        # Pad with the caller-supplied length (previously the global `max_len`
        # was used and the `maxlen` argument was silently ignored).
        batch_sent1_pad = pad_sequences(batch_sent1, maxlen=maxlen, padding='post')
        batch_sent2_pad = pad_sequences(batch_sent2, maxlen=maxlen, padding='post')

        yield [batch_sent1_pad, batch_sent2_pad], batch_label

In [20]:
# Pull the id sequences and labels out of the frame as plain Python lists.
sent1_datas = data_df["sent1"].tolist()
sent2_datas = data_df["sent2"].tolist()
labels = data_df["label"].tolist()

In [21]:
# 划分训练 测试数据集
# Split into train / validation / test sets: the first 80% of the rows, the
# next 10% and the remaining rows, in their existing order (no shuffling here).
count = len(labels)
idx1, idx2 = int(count*0.8), int(count*0.9)


def _three_way_split(items):
    """Cut ``items`` at idx1 and idx2 into (train, val, test) slices."""
    return items[:idx1], items[idx1:idx2], items[idx2:]


sent1_train, sent1_val, sent1_test = _three_way_split(sent1_datas)
sent2_train, sent2_val, sent2_test = _three_way_split(sent2_datas)
train_labels, val_labels, test_labels = _three_way_split(labels)

print("train data: ", *map(len, (sent1_train, sent2_train, train_labels)))
print("val data: ", *map(len, (sent1_val, sent2_val, val_labels)))
print("test data: ", *map(len, (sent1_test, sent2_test, test_labels)))

train data:  81981 81981 81981
val data:  10248 10248 10248
test data:  10248 10248 10248


In [22]:
# Batch generators for the three data splits.
batch_size = 64
# Single source of truth for the padded length: reuse max_len (which also fixes
# the model's Input shape) instead of repeating the literal, so the two can
# never drift apart.
maxlen = max_len
# Number of full batches in one pass over the training set.
batch_count = len(train_labels) // batch_size
batch_gen_train = batch_generator([sent1_train, sent2_train, train_labels], batch_size, maxlen)
batch_gen_val = batch_generator([sent1_val, sent2_val, val_labels], batch_size, maxlen)
batch_gen_test = batch_generator([sent1_test, sent2_test, test_labels], batch_size, maxlen)

In [None]:
# Number of training epochs. The recorded run trained for 2 epochs; previously
# this variable said 10 but was ignored in favour of a hard-coded epochs=2.
epochs = 2

# Cover each held-out split exactly once per evaluation instead of a fixed
# number of batches, so the reported metrics are not computed on a random
# subset. max(1, ...) guards against splits smaller than one batch.
val_steps = max(1, len(val_labels) // batch_size)
test_steps = max(1, len(test_labels) // batch_size)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(batch_gen_train,
          verbose=1,
          validation_data=batch_gen_val,
          validation_steps=val_steps,
          steps_per_epoch=batch_count,
          epochs=epochs)

# Evaluate on the test split. `score` is the loss value and `acc` the accuracy
# metric configured in compile(). The max_queue_size / use_multiprocessing
# arguments were dropped: they only repeated the defaults.
score, acc = model.evaluate(batch_gen_test, steps=test_steps)
print('score:', score, 'accuracy:', acc)

# Save the trained model
# model.save("output/cnndssm_semantic_match.h5")
# model.save_weights("output/match_model_weight.h5")

  if sys.path[0] == '':


Epoch 1/2

In [None]:
# NOTE(review): sent2index duplicates the definition from the earlier indexing
# cell. It is kept so this cell stays self-contained, but the two must be kept
# in sync (or this copy deleted).
def sent2index(vocab2id, words):
    """Map words to vocabulary ids, using the "<UNK>" id for unknown words."""
    return [vocab2id[w] if w in vocab2id else vocab2id["<UNK>"] for w in words]


def _clean_and_cut(sentence):
    """Segment a raw sentence after the same cleanup that was applied to the
    training data (strip CR/tab/space at both ends, replace "***" with "*"),
    so inference inputs are tokenized consistently with training inputs."""
    return jieba.lcut(sentence.strip("\r\t ").replace("***", "*"))


sent1 = "怎么更改花呗手机号码？"
sent2 = "怎么更改成现在的支付宝的号码手机号？"

sent1_ids = sent2index(vocab2id, _clean_and_cut(sent1))
sent2_ids = sent2index(vocab2id, _clean_and_cut(sent2))

# Pad each single-sentence batch to the model's input length.
sent1_pad = pad_sequences([sent1_ids], maxlen=max_len, padding='post')
sent2_pad = pad_sequences([sent2_ids], maxlen=max_len, padding='post')

# model.load_weights("output/match_model_weight.h5")

preds = model.predict([sent1_pad, sent2_pad])

print("sent1: %s" % sent1)
print("sent2: %s" % sent2)
print("score: %s" % preds[0])