# A basic DPCNN module

In [41]:
from keras import Model
from keras.layers import Dense, Embedding, Input, Flatten, Concatenate,\
    SpatialDropout1D, Conv1D, MaxPooling1D, Add, Reshape, ZeroPadding1D
from keras.preprocessing.sequence import pad_sequences
from keras_callbacks import *
import numpy as np
from metrics import f1
import tensorflow as tf
import keras.backend as K
from keras.backend.tensorflow_backend import set_session
from keras.losses import categorical_crossentropy
from sklearn.metrics import f1_score
# Reset Keras/TensorFlow state *before* installing the configured session.
# K.clear_session() destroys the current graph and drops the backend's session,
# so calling it after set_session() (as this cell used to) threw away the GPU
# options configured below.
K.clear_session()

# GPU memory policy for the TensorFlow session used by Keras in this notebook.
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.9  # use at most 90% of GPU memory
config.gpu_options.allow_growth = True  # grow allocations on demand
set_session(tf.Session(config=config))

In [2]:
def dp_block(input):
    """Build one downsampling residual block on top of ``input``.

    The tensor is padded with one zero step at the end, max-pooled with a
    window of 3 and a stride of 2, passed through two width-3 convolutions of
    250 filters each, and the convolution output is added back onto the pooled
    tensor.

    Args:
        input: Keras tensor to downsample. The final ``Add`` combines the
            250-filter convolution output with the pooled input, so the input
            is presumably (batch, steps, 250) -- verify against the caller.

    Returns:
        The Keras tensor produced by the ``Add`` layer.
    """
    conv_kwargs = dict(filters=250, strides=1, kernel_size=3,
                       padding='same', activation='relu')

    padded = ZeroPadding1D((0, 1))(input)
    downsampled = MaxPooling1D(pool_size=3, strides=2, padding='valid')(padded)

    # 'same' padding keeps the step dimension unchanged through both convolutions.
    features = Conv1D(**conv_kwargs)(downsampled)
    features = Conv1D(use_bias=False, **conv_kwargs)(features)

    return Add()([features, downsampled])

In [3]:
def build_model(embedding_layer, maxlen, targets):
    """Assemble a multi-output classifier with one 4-way softmax head per target.

    Args:
        embedding_layer: layer applied to the integer token-id input.
        maxlen: length of every input sequence.
        targets: iterable of target names (strings). Each name is appended to
            'flatten' / 'output' to name that target's layers.

    Returns:
        A keras ``Model`` mapping the int32 input of shape ``(maxlen,)`` to a
        list of outputs, one per entry of ``targets`` and in the same order.
    """
    token_ids = Input((maxlen,), dtype='int32')
    embedded = embedding_layer(token_ids)

    # Region embedding: a width-3 convolution over the embedded tokens.
    region = Conv1D(filters=250, strides=1, kernel_size=3,
                    padding='same', activation='relu')(embedded)

    # NOTE: the repeated dp_block downsampling stage is currently disabled:
    #     while region.shape[1] > 2:
    #         region = dp_block(region)

    def make_head(target):
        # Each target gets its own Flatten + Dense pair on the shared features.
        flat = Flatten(name='flatten' + target)(region)
        return Dense(4, name='output' + target, activation='softmax')(flat)

    return Model(token_ids, [make_head(target) for target in targets])

# Load Data

In [4]:
from torch_data import UserCommentDataset, calculate_labels, save_predictions
from train_config import data_path_config, targets
from gensim.models import KeyedVectors
from utils import *



In [42]:
# Fixed sequence length: inputs are padded/truncated to this many tokens and the
# model's Input layer is built with the same value.
maxlen = 256

In [43]:
# Options shared by all three splits.
_dataset_options = dict(content='jieba_seg', transformers=None)

train_dataset = UserCommentDataset(data_path_config['train_data_path'],
                                   targets=targets, **_dataset_options)
validate_dataset = UserCommentDataset(data_path_config['valid_data_path'],
                                      targets=targets, **_dataset_options)
# The test split is built without targets.
test_dataset = UserCommentDataset(data_path_config['test_data_path'],
                                  targets=None, **_dataset_options)

In [44]:
# Each dataset is iterated as 2-tuples; zip(*...) transposes them into one
# tuple of sentences and one tuple of per-sample targets.
train_sentences, train_targets = zip(*train_dataset)
validate_sentences, validate_targets = zip(*validate_dataset)
# The test split was built with targets=None, so its second component is discarded.
test_sentences, _ = zip(*test_dataset)

In [45]:
def decompose_targets(data, target_names=None):
    """Regroup per-sample label tuples into one stacked array per target.

    Args:
        data: sequence of samples, where ``data[i][j]`` is the label vector of
            sample ``i`` for target ``j``.
        target_names: optional iterable with one entry per target to extract.
            Only its length matters. Defaults to the notebook-level
            ``targets`` so existing calls keep working, but passing it
            explicitly removes the dependency on global state.

    Returns:
        A list with one array per target; entry ``j`` is ``np.vstack`` of
        ``sample[j]`` over every sample in ``data``.
    """
    if target_names is None:
        target_names = targets
    return [np.vstack([sample[idx] for sample in data])
            for idx, _ in enumerate(target_names)]

# One stacked label array per target, in the order of `targets`.
Y_train = decompose_targets(train_targets)
Y_valid = decompose_targets(validate_targets)

In [46]:
# Load the word2vec vectors from the binary file at the configured path,
# ignoring undecodable bytes in the vocabulary.
w2v = KeyedVectors.load_word2vec_format(data_path_config['embedding_path'],
                                        binary=True, unicode_errors='ignore')
# presumably a token -> integer index mapping; build_tok2idx is not defined in this notebook
tok2idx = build_tok2idx(w2v)

In [47]:
# Embedding matrix built from the loaded vectors; its shape later sizes the Embedding layer.
w2v_matrix = build_embedding_matrix(tok2idx, w2v, data_path_config['embedding_dim'])

In [48]:
# Convert each split's sentences to sequences using tok2idx
# (presumably lists of token indices; texts_to_sequences is not defined in this notebook).
X_train_seq = texts_to_sequences(train_sentences, tok2idx)
X_valid_seq = texts_to_sequences(validate_sentences, tok2idx)
X_test_seq = texts_to_sequences(test_sentences, tok2idx)

In [49]:
# Pad/truncate every sequence to exactly `maxlen` entries (pad_sequences defaults
# are used, i.e. per the Keras docs zeros are added / tokens dropped at the front).
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_valid_pad = pad_sequences(X_valid_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)

# Modeling

In [50]:
# Training callbacks for this run. The generate_* helpers are not defined in
# this notebook (presumably they come from the star imports).
model_name = 'DPCNN'
# NOTE(review): the recorded training log reports the scheduler setting the
# learning rate to 0.1 in the first epoch, and the recorded validation
# predictions all fall on a single class. 0.1 is very high for the 'adam'
# optimizer used at compile time -- confirm the meaning and order of these
# arguments.
lr_schedule = generate_learning_rate_schedule(0.001, 0.1, 20, 0)
checkpoint = generate_check_point(model_name)
early_stopping = generate_early_stopping()
tensorboard = generate_tensorboard(model_name, 'ALL')
callbacks=[lr_schedule, checkpoint, early_stopping, tensorboard]

In [51]:
# Initialise the embedding from the pretrained word2vec matrix built earlier.
# This cell used to draw unseeded np.random.randn noise of the same shape, so
# the pretrained vectors were never used and runs were not reproducible.
embedding_matrix = np.array(w2v_matrix, dtype='float32')  # copy; w2v_matrix stays untouched
# Keep row 0 all-zero, as before (0 is the default fill value of pad_sequences).
embedding_matrix[0, :] = 0


# The weights remain trainable, so the vectors are fine-tuned during training.
embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            trainable=True)

In [52]:
# Build the multi-output model: one softmax head per entry in `targets`.
model = build_model(embedding_layer, maxlen, targets)

In [53]:
# Print the layer-by-layer architecture and parameter counts.
# NOTE(review): the recorded output lists zero-padding/pooling layers that the
# current build_model does not create -- it appears to be from an earlier run.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 256)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 256, 200)     13966000    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 256, 250)     150250      embedding_1[0][0]                
__________________________________________________________________________________________________
zero_padding1d_1 (ZeroPadding1D (None, 257, 250)     0           conv1d_1[0][0]                   
__________________________________________________________________________________________________
max_poolin

In [54]:
# A single loss/metric specification is given for the whole multi-output model;
# per the Keras docs it is applied to every output head.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [55]:
# Train for a single epoch. Y_train / Y_valid are lists holding one label array
# per target, matching the model's list of outputs.
model.fit(X_train_pad, 
          Y_train, batch_size=64,
          callbacks=callbacks,
          epochs=1,
          validation_data=(X_valid_pad, Y_valid))

Train on 105000 samples, validate on 15000 samples
Epoch 1/1

Epoch 00001: LearningRateScheduler reducing learning rate to 0.1.


<keras.callbacks.History at 0x1da8b5765f8>

In [37]:
# Debug: validation-set predictions, keeping only the first element of the
# result (the first output head when the model has several outputs).
xxx = model.predict(X_valid_pad)[0]

In [39]:
# Debug: column totals of the predictions, i.e. predicted mass per class.
np.sum(xxx, axis=0)

array([15000.,     0.,     0.,     0.], dtype=float32)

In [None]:
# Debug: shape of the validation labels for the first target.
Y_valid[0].shape

In [27]:
# Debug: shape of the first-head predictions, for comparison with the labels.
xxx.shape

(15000, 4)

In [29]:
# Debug: column totals of the predictions (same check as in an earlier cell).
np.sum(xxx, axis=0)

array([15000.,     0.,     0.,     0.], dtype=float32)

In [30]:
# Debug: column totals of the validation labels for the first target, to
# compare the true class distribution against the predicted one.
np.sum(Y_valid[0], axis=0)

array([11757,   182,   136,  2925])