In [1]:
"""
这个是参照 https://www.kaggle.com/yekenot/pooled-gru-fasttext/output 的方法，对豆瓣的数据进行多分类训练。
原作者在英文的数据集上的结果如下：

Using TensorFlow backend.
203.0s

Train on 151592 samples, validate on 7979 samples
Epoch 1/2
2366.0s

 - 2163s - loss: 0.0496 - acc: 0.9820 - val_loss: 0.0458 - val_acc: 0.9824  2382.1s
ROC-AUC - epoch: 1 - score: 0.986507 

Epoch 2/2
4440.9s

 - 2059s - loss: 0.0376 - acc: 0.9853 - val_loss: 0.0455 - val_acc: 0.9826 4455.1s
 ROC-AUC - epoch: 2 - score: 0.986944 
 
我实现的代码，将dropout从0.2调整到0.3， ROC-AUC-score: 0.668627 -> ROC-AUC-score: 0.669534 ,

结合豆瓣的数据分析，我觉得接下来改进的思路是句向量的过程中，能够去掉信息量低的部分，SIF的句向量方法可以试下。
"""


import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback


Using TensorFlow backend.


In [2]:
import warnings
warnings.filterwarnings('ignore')
import os
os.environ['OMP_NUM_THREADS'] = '4'

In [3]:
EMBEDDING_FILE = './res/crawl-300d-2M.vec'

In [34]:
import pickle
X_train = []
# 将豆瓣的评论进行分词，然后用空格分隔
with open('../l10/res/cut_words.txt', encoding='utf-8') as fd:
    line = fd.readline()
    while line:
        X_train.append(line.strip())
        line = fd.readline()
X_train.pop()
with open('../l10/res/label_onehot.pickle', 'rb') as fl:
    y_train = pickle.load(fl)

In [37]:
max_features = 30000
maxlen = 200
embed_size = 300

tokenizer = text.Tokenizer(num_words=max_features)
# tokenizer.fit_on_texts(list(X_train) + list(X_test))
tokenizer.fit_on_texts(list(X_train))
X_train = tokenizer.texts_to_sequences(X_train)
# X_test = tokenizer.texts_to_sequences(X_test)

In [39]:
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
# x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [41]:
def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')


embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE, encoding='utf-8'))

In [42]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [43]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [44]:
embedding_matrix.shape

(30000, 300)

In [45]:
maxlen = 200
def get_model():
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(GRU(80, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(5, activation="sigmoid")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [46]:
model = get_model()

In [31]:
# 这里是移花接木的地方，换上了自己豆瓣的数据了。
# import pickle
# with open('../l10/res/dataset.pickle', 'rb') as f:
#     x_train = pickle.load(f)

# with open('../l10/res/label_onehot.pickle', 'rb') as f:
#     y_train = pickle.load(f)   

In [50]:
x_train = np.asarray(x_train).reshape(-1,200)

In [55]:
batch_size = 32
epochs = 2

X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.8, random_state=233)

RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc], verbose=2)


# y_pred = model.predict(x_test, batch_size=1024)
# submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
# submission.to_csv('submission.csv', index=False)

Train on 209195 samples, validate on 52299 samples
Epoch 1/2
 - 1729s - loss: 0.4575 - acc: 0.7998 - val_loss: 0.4511 - val_acc: 0.8000

 ROC-AUC - epoch: 1 - score: 0.662795 

Epoch 2/2
 - 1569s - loss: 0.4322 - acc: 0.8051 - val_loss: 0.4522 - val_acc: 0.7978

 ROC-AUC - epoch: 2 - score: 0.669534 

