In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import Dense, Flatten, Embedding, LSTM, SpatialDropout1D, Input, Bidirectional,Dropout, Activation, GRU
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical

In [5]:
# Load the training articles and their labels (joined on `id`) plus the test set.
train_data = pd.read_csv('Train_DataSet.csv')
train_label = pd.read_csv('Train_DataSet_Label.csv')
train = pd.merge(train_data, train_label, how='left', on='id')
# Keep only rows that have both a label and content. `.copy()` turns the
# filtered result into an independent frame, so the column assignments below
# do not write into a slice of the merged frame (SettingWithCopyWarning).
train = train[(train.label.notnull()) & (train.content.notnull())].copy()
test = pd.read_csv('Test_DataSet.csv')

# Replace missing text with empty strings so later string handling never sees NaN.
train['title'] = train['title'].fillna('')
train['content'] = train['content'].fillna('')
test['title'] = test['title'].fillna('')
test['content'] = test['content'].fillna('')

In [6]:
import re
def filter(text):
    """Strip markup, Latin letters, digits and punctuation from ``text``.

    NOTE: this name shadows the built-in ``filter``; it is kept because other
    cells call the function by this name.

    Steps, in order:
      1. replace HTML tags (``<...>``) with a space;
      2. delete ASCII letters, digits, spaces and a set of punctuation marks;
      3. delete the literal word '图片' and non-breaking spaces;
      4. delete bracketed 【...】 / 《...》 spans and remaining punctuation;
      5. trim surrounding whitespace.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # Tags must be removed first: step 2 deletes '<', '>' and '/', after which
    # a tag can no longer be recognised and its attribute text would survive.
    text = re.sub(r'<.*?>', ' ', text)
    # Raw string, same pattern value as before. Inside the class, the run
    # " -----" is parsed by `re` as the range U+0020..U+002D (space through
    # '-') followed by a literal '-', so quotes, '$', '&', '(', ')', '*', '+'
    # and ',' are deleted here as well.
    text = re.sub(r"[A-Za-z0-9\!\=\？\%\[\]\,\（\）\>\<:&lt;\/#\. -----\_]", "", text)
    text = text.replace('图片', '')
    text = text.replace('\xa0', '')  # drop non-breaking spaces
    # Raw-string form of the original pattern (identical value).
    # NOTE(review): '#' is already deleted by step 2, so the "#...#" alternative
    # below can never match — confirm whether hashtag topics should be dropped.
    r1 = r"\【.*?】+|\《.*?》+|\#.*?#+|[.!/_,$&%^*()<>+'?@|:~{}#]+|[——！\\，。=？、：“”‘’￥……（）《》【】]"
    text = re.sub(r1, '', text)
    return text.strip()

In [8]:
def clean_text(data):
    """Run ``filter`` over the 'title' and 'content' columns of ``data``.

    The frame is modified in place and the same object is returned.
    """
    for column in ('title', 'content'):
        data[column] = data[column].apply(filter)
    return data
# Clean both frames. clean_text returns the frame it was given, so each name
# is rebound to the same (now cleaned) object.
train = clean_text(train)
test = clean_text(test)

In [9]:
# Read 'stop.txt' without a header row and keep its first column as a Python list.
# NOTE(review): presumably one stop word per line — confirm the file format.
stop_words = pd.read_table('stop.txt', header=None)[0].tolist()

In [15]:
import jieba
jieba.setLogLevel(jieba.logging.INFO)
import string
# Translation table that deletes ASCII punctuation. In the code shown here it
# is referenced only by the disabled step inside cut_text.
table = str.maketrans("","",string.punctuation)
# Hashed snapshot of the stop words: membership is tested once per token, and
# looking a token up in the plain list would scan the whole list every time.
# Re-run this cell if `stop_words` is reloaded.
_stop_word_set = frozenset(stop_words)
def cut_text(sentence):
    """Segment ``sentence`` with jieba and drop stop words.

    Args:
        sentence: the text to segment.

    Returns:
        A list of the tokens produced by ``jieba.cut`` that are not stop words.
    """
    # Disabled step: strip English punctuation from each token.
    #     tokens = [w.translate(table) for w in tokens]
    return [token for token in jieba.cut(sentence) if token not in _stop_word_set]

In [16]:
# Tokenise every title and content string; each result is a list of token lists.
train_title = list(map(cut_text, train.title.values))
train_content = list(map(cut_text, train.content.values))
test_title = list(map(cut_text, test.title.values))
test_content = list(map(cut_text, test.content.values))

In [17]:
# Pool the tokenised titles and contents of both train and test into one corpus.
all_doc = [*train_title, *train_content, *test_title, *test_content]

In [19]:
import gensim
import time
class EpochSaver(gensim.models.callbacks.CallbackAny2Vec):
    '''Training callback that prints per-epoch loss and time and saves the best model.

    After every epoch the loss of that epoch is derived from the cumulative
    loss reported by the model; whenever it is lower than any epoch loss seen
    so far, the model is saved to ``save_path``.
    '''
    def __init__(self, save_path):
        self.save_path = save_path
        self.epoch = 0
        self.pre_loss = 0
        # No upper bound: the first epoch always counts as an improvement,
        # however large its loss is.
        self.best_loss = float('inf')
        self.since = time.time()

    def on_train_begin(self, model):
        # Start timing and the cumulative-loss baseline at the beginning of the
        # run rather than at construction time. Per the gensim Word2Vec.train
        # implementation, the running loss restarts from zero on each train() call.
        self.pre_loss = 0
        self.since = time.time()

    def on_epoch_end(self, model):
        self.epoch += 1
        # The model reports the loss accumulated since the first epoch, so the
        # loss of this epoch is the difference from the previous reading.
        cum_loss = model.get_latest_training_loss()
        epoch_loss = cum_loss - self.pre_loss
        time_taken = time.time() - self.since
        print("Epoch %d, loss: %.2f, time: %dmin %ds" %
              (self.epoch, epoch_loss, time_taken//60, time_taken%60))
        if epoch_loss < self.best_loss:
            self.best_loss = epoch_loss
            print("Better model. Best loss: %.2f" % self.best_loss)
            model.save(self.save_path)
            print("Model %s save done!" % self.save_path)

        self.pre_loss = cum_loss
        self.since = time.time()
# model_word2vec = gensim.models.Word2Vec.load('final_word2vec_model')

In [21]:
# Create an untrained Word2Vec model with 256-dimensional vectors. No corpus
# is passed here; the vocabulary is built explicitly below.
model_word2vec = gensim.models.Word2Vec(min_count=1, 
                                        window=5, 
                                        vector_size=256,
                                        workers=4,
                                        batch_words=1000)
since = time.time()
# Build the vocabulary from the pooled train/test token lists.
model_word2vec.build_vocab(all_doc, progress_per=2000)
time_elapsed = time.time() - since
print('Time to build vocab: {:.0f}min {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

Time to build vocab: 0min 6s


In [22]:
since = time.time()
# Train for 5 epochs with loss tracking enabled. EpochSaver prints the loss of
# each epoch and saves the model to './final_word2vec_model' when it improves.
model_word2vec.train(all_doc, total_examples=model_word2vec.corpus_count, 
                        epochs=5, compute_loss=True, report_delay=60*10,
                        callbacks=[EpochSaver('./final_word2vec_model')])
time_elapsed = time.time() - since
print('Time to train: {:.0f}min {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

Epoch 1, loss: 4080627.25, time: 0min 9s
Better model. Best loss: 4080627.25
Model ./final_word2vec_model save done!
Epoch 2, loss: 2651312.75, time: 0min 9s
Better model. Best loss: 2651312.75
Model ./final_word2vec_model save done!
Epoch 3, loss: 2272688.00, time: 0min 9s
Better model. Best loss: 2272688.00
Model ./final_word2vec_model save done!
Epoch 4, loss: 1891435.00, time: 0min 9s
Better model. Best loss: 1891435.00
Model ./final_word2vec_model save done!
Epoch 5, loss: 1837068.00, time: 0min 9s
Better model. Best loss: 1837068.00
Model ./final_word2vec_model save done!
Time to train: 0min 50s


In [23]:
# Fit the Keras tokenizer on the tokenised titles only (train + test).
# The content-based variant on the last line is disabled.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_title + test_title)
# tokenizer.fit_on_texts(train_content + test_content)

In [41]:
# Build the embedding matrix from the newly trained word2vec model.
# Row i holds the vector of the word with tokenizer index i; row 0 and the
# rows of words missing from word2vec stay all-zero.
vocab_size = len(tokenizer.word_index)
embedding_matrix = np.zeros((vocab_size + 1, 256))
error_count = 0  # number of tokenizer words without a word2vec vector
for word, i in tqdm(tokenizer.word_index.items()):
    if word in model_word2vec.wv:
        embedding_matrix[i] = model_word2vec.wv[word]
        continue
    error_count += 1

100%|██████████| 32058/32058 [00:00<00:00, 306496.64it/s]


In [42]:
# Convert the tokenised titles to index sequences and pad them with maxlen=30.
# `sequence` is a scratch name reused for train and test.
sequence = tokenizer.texts_to_sequences(train_title)
traintitle = pad_sequences(sequence, maxlen=30)
sequence = tokenizer.texts_to_sequences(test_title)
testtitle = pad_sequences(sequence, maxlen=30)
# Disabled: the same conversion for the content text with maxlen=512.
# sequence = tokenizer.texts_to_sequences(train_content)
# traincontent = pad_sequences(sequence, maxlen=512)
# sequence = tokenizer.texts_to_sequences(test_content)
# testcontent = pad_sequences(sequence, maxlen=512)

In [43]:
import tensorflow as tf
def metric_F1score(y_true,y_pred):
    """F1 score of the rounded predictions, safe against empty denominators.

    Predictions are rounded to 0/1 and the true-positive, false-positive and
    false-negative counts are summed over every element of the batch (all
    samples and all output columns together).

    The score is computed as ``2*TP / (2*TP + FP + FN)``, which is
    algebraically the same as ``2*P*R / (P + R)`` but needs only one division.
    When the denominator is zero (no positives predicted and none present)
    the result is 0 instead of NaN.

    Args:
        y_true: ground-truth tensor — assumed to contain 0/1 values, e.g.
            one-hot labels; TODO confirm against the caller.
        y_pred: predicted scores with the same shape as ``y_true``.

    Returns:
        A scalar tensor holding the F1 score.
    """
    # Labels may arrive as integers; match the prediction dtype before mixing them.
    y_true = tf.cast(y_true, y_pred.dtype)
    y_hat = tf.round(y_pred)
    TP = tf.reduce_sum(y_true * y_hat)
    FP = tf.reduce_sum((1 - y_true) * y_hat)
    FN = tf.reduce_sum(y_true * (1 - y_hat))
    return tf.math.divide_no_nan(2 * TP, 2 * TP + FP + FN)




In [44]:
from keras import Input, Model
from keras.layers import Embedding, Dense, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout
class TextCNN(object):
    """Factory for a convolutional text classifier built with Keras layers.

    Architecture produced by ``get_model``: embedding -> three parallel Conv1D
    branches (kernel sizes 3, 4 and 5, 128 filters each, ReLU) each followed
    by global max pooling -> concatenation -> dense output layer.

    Args:
        maxlen: length of each input sequence.
        max_features: number of rows of the embedding table.
        embedding_dims: width of each embedding vector.
        class_num: number of units of the output layer.
        last_activation: activation of the output layer.
    """

    def __init__(self, maxlen, max_features, embedding_dims,
                 class_num=1,
                 last_activation='sigmoid'):
        self.maxlen, self.max_features, self.embedding_dims = maxlen, max_features, embedding_dims
        self.class_num, self.last_activation = class_num, last_activation

    def get_model(self):
        """Assemble and return the (uncompiled) Keras ``Model``.

        The embedding layer is initialised from the module-level
        ``embedding_matrix``, which must exist before this method is called.
        """
        token_ids = Input((self.maxlen,))

        # The embedding part could use multiple channels as in the original paper.
        embedded = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen,
                             weights=[embedding_matrix])(token_ids)

        # One convolution + pooling branch per kernel width.
        pooled = []
        for width in (3, 4, 5):
            feature_map = Conv1D(128, width, activation='relu')(embedded)
            pooled.append(GlobalMaxPooling1D()(feature_map))
        merged = Concatenate()(pooled)

        scores = Dense(self.class_num, activation=self.last_activation)(merged)
        return Model(inputs=token_ids, outputs=scores)
    
# Build a 3-class softmax TextCNN over 30-token inputs. max_features is
# len(word_index) + 1, the same row count embedding_matrix was created with.
model = TextCNN(maxlen=30, max_features=len(tokenizer.word_index) + 1,
                    embedding_dims=256, class_num=3, last_activation='softmax').get_model()
# Adam optimiser with categorical cross-entropy; report accuracy and the custom F1 metric.
model.compile('adam', 'categorical_crossentropy', metrics=['accuracy',metric_F1score])

In [45]:
# Show the layer-by-layer summary of the compiled model.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 30)]         0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 30, 256)      8207104     ['input_2[0][0]']                
                                                                                                  
 conv1d_3 (Conv1D)              (None, 28, 128)      98432       ['embedding_1[0][0]']            
                                                                                                  
 conv1d_4 (Conv1D)              (None, 27, 128)      131200      ['embedding_1[0][0]']            
                                                                                            

In [46]:
# Integer class labels taken from the 'label' column of the training frame.
label = train['label'].astype(int)
# labels = to_categorical(label) 
# train_X, val_X, train_Y, val_Y = train_test_split(traintitle, label, shuffle=True, test_size=0.2,
#                                                     random_state=2019)
# Hold out 20% of the padded titles for validation (shuffled, fixed random_state).
train_X, val_X, train_Y, val_Y = train_test_split(traintitle, label, shuffle=True, test_size=0.2,
                                                    random_state=2019)
# Only the training labels are one-hot encoded; val_Y keeps the integer class ids.
train_Y = to_categorical(train_Y)

In [47]:
# Train on the training split for 10 epochs with batch size 128.
# No validation data is passed to fit; the held-out split is scored separately.
model.fit(train_X,
          train_Y,
          batch_size=128,
          epochs=10)
# Disabled: training on all titles for 3 epochs.
# model.fit(traintitle,
#           labels,
#           batch_size=128,
#           epochs=3,
#           shuffle=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x16a6808e788>

In [48]:
from sklearn.metrics import f1_score
# Predict the validation split and score the arg-max class of each row
# against the integer labels with macro-averaged F1.
pred_val = model.predict(val_X)
print(f1_score(val_Y, np.argmax(pred_val, axis=1), average='macro'))

0.636989761999734


In [49]:
# Predict the test titles and store the arg-max class index as the 'label' column.
preds = np.argmax(model.predict(testtitle), axis=1)
test['label'] = preds

In [50]:
# Count how many test rows were assigned to each predicted class.
test['label'].value_counts()

1    4042
2    2936
0     378
Name: label, dtype: int64

In [51]:
# Inspect the test rows predicted as class 0.
test[test.label==0]

Unnamed: 0,id,title,content,label
44,016120c239f547ea8881ab632ddd03bb,沙湾职中代表队斩获全市中学生防震减灾知识竞赛高中组头筹,沙湾新闻网讯雷小军月日在市教育局和市防震减灾局共同主办的全市中学生防震减灾知识竞赛中沙湾职中...,0
69,021b50be5c714739a4d5ac46567c03f2,兢兢业业的排头兵蒙牛集团一线员工风采录,随着消费升级节奏的加快消费者对乳品的需求也越来越来对于蒙牛牛奶的研发部门而言研究出更多高品质...,0
77,026928bb2e5f4e1391fe7965ce949ebf,久治县组织干部观摩学习海南州贵德县民族团结进步创建工作先进经验,为进一步提升久治县创建民族团结进步先进县工作水平久治县委主动与海南州贵德县委共和县委联系对接...,0
88,02af201cde4d4c62b7b28e3f54bb8d17,安康汉阴警方帮助农民工追回拖欠工资万元,下载次数下载附件保存到相册分钟前上传月日上午汉阴县公安局铁佛派出所民警在户籍大厅当场为十余名...,0
92,02ee82f4e9bb49a0b87361dcff6fb1d0,九寨地震平安归来心有余悸为九寨祈福,晚上八点半左右我们一行四人达到了九寨沟沟口预订的酒店放下行李从酒店出来找到一家餐馆准备随便吃...,0
...,...,...,...,...
7199,fa1cb9f75bfa4bf38846e569d74711d0,中国大使与印尼穆斯林共同开斋并启动便民卫生项目,月日中国驻印尼大使肖千右来到位于雅加达南区的阿斯沙克法经学院与印尼最大穆斯林组织伊斯兰教士联...,0
7201,fa27b649587a498eb75780387b26b1ce,正能量交警推车助人群众纷纷点赞,今天一张女交警奋力推车的照片在微信朋友圈火热传播网友纷纷转发点赞并写下评论说推车的交警同志真...,0
7237,fb9abb737fc942ee8909a9268c3fb88e,定向挑战黎园奔跑材化学院社区文化节活动激情开赛,点击蓝字关注关注我们定向挑战黎园奔跑材化学院社区文化节活动激情开赛黎小材比赛即将开始我将全程...,0
7316,fe9bbce72c194aea90016898e7ef4c82,福建警方摧毁一个网络水军团伙炒作舆情余起,新华社福州月日电记者王成据福建省公安厅消息莆田警方侦破一起公安部挂牌督办的网络水军案件摧毁网...,0


In [39]:
# Write the 'id' and 'label' columns to 'baseline4.csv' without the index column.
test[['id', 'label']].to_csv('baseline4.csv', index=False)