# A basic DPCNN (Deep Pyramid CNN) module

In [1]:
from keras import Model
from keras.layers import Dense, Embedding, Input, Flatten, Concatenate,\
    SpatialDropout1D, Conv1D, MaxPooling1D, Add, Reshape, ZeroPadding1D
from keras.preprocessing.sequence import pad_sequences
from keras_callbacks import *
import numpy as np
from metrics import f1
import tensorflow as tf
import keras.backend as K
from keras.backend.tensorflow_backend import set_session
from keras.losses import categorical_crossentropy
from sklearn.metrics import f1_score
# Reset Keras/TensorFlow global state FIRST. K.clear_session() drops the
# session Keras currently holds, so calling it after set_session() (as this
# cell used to) discarded the configured session and the GPU options below
# never took effect.
K.clear_session()

config = tf.ConfigProto()
# Upper bound on the fraction of each visible GPU's memory this process may use.
config.gpu_options.per_process_gpu_memory_fraction = 0.9
# Allocate GPU memory on demand instead of reserving it all up front.
config.gpu_options.allow_growth = True
# Install the configured session as the one Keras uses from here on.
set_session(tf.Session(config=config))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def dp_block(input, activation='relu'):
    """Build one DPCNN downsampling block on top of ``input``.

    The block max-pools the sequence axis with stride 2, runs two width-3
    convolutions on the pooled tensor, and adds the pooled tensor back as an
    identity shortcut.

    A non-linearity is applied *before* each convolution (pre-activation).
    Without it the two stacked convolutions are a single linear map and the
    block adds depth but no representational power.

    The number of convolution filters is read from the input's last axis
    rather than hard-coded, because ``Add()`` requires the convolution output
    and the shortcut to have the same channel count.

    Args:
        input: 3-D Keras tensor whose last axis has a static size (the
            notebook passes ``(batch, steps, 250)``). The builtin-shadowing
            parameter name is kept so existing callers keep working.
        activation: Name of the pre-activation applied before each
            convolution. Pass ``None`` to build the purely linear block
            this function produced before the non-linearity was added.

    Returns:
        Keras tensor with the sequence axis roughly halved and the same
        number of channels as ``input``.
    """
    # Local import: the notebook's import cell does not bring Activation into scope.
    from keras.layers import Activation

    channels = K.int_shape(input)[-1]

    # One step of right padding makes size-3 / stride-2 'valid' pooling map an
    # even length n to exactly n / 2.
    pool = ZeroPadding1D((0, 1))(input)
    pool = MaxPooling1D(pool_size=3, strides=2, padding='valid')(pool)

    conv = pool
    # First convolution keeps its bias, the second one does not (as before).
    for use_bias in (True, False):
        if activation is not None:
            conv = Activation(activation)(conv)
        # padding='same' keeps the sequence length so the shortcut can be added.
        conv = Conv1D(filters=channels, strides=1, kernel_size=3,
                      padding='same', use_bias=use_bias)(conv)

    shortcut = Add()([conv, pool])
    return shortcut

In [3]:
def build_model(embedding_layer, maxlen, targets, num_classes=4):
    """Assemble the multi-output DPCNN classifier.

    Args:
        embedding_layer: Keras layer applied to the integer token-id input.
        maxlen: Length of the (padded) input sequences.
        targets: Iterable of target-name strings. One head is created per
            entry, with layers named ``'flatten' + target`` and
            ``'output' + target``.
        num_classes: Number of units in every softmax head. Defaults to 4,
            the value that used to be hard-coded.

    Returns:
        A ``keras.Model`` mapping the token-id input to a list of softmax
        outputs, one per target and in the order of ``targets``.
    """
    sent_input = Input((maxlen,), dtype='int32')
    sent_embedded = embedding_layer(sent_input)

    # region embedding
    conv = Conv1D(filters=250, strides=1, kernel_size=3, padding='same')(sent_embedded)

    # Every dp_block pools the sequence axis with stride 2; keep stacking
    # blocks until at most two steps are left.
    while conv.shape[1] > 2:
        conv = dp_block(conv)

    # All heads read the same pyramid output; each gets its own Flatten/Dense
    # pair so layer names stay unique per target.
    output_list = []
    for target in targets:
        flat_target = Flatten(name='flatten' + target)(conv)
        output_target = Dense(num_classes, name='output' + target,
                              activation='softmax')(flat_target)
        output_list.append(output_target)

    model = Model(sent_input, output_list)
    return model

# Load Data

In [4]:
from torch_data import UserCommentDataset, calculate_labels, save_predictions
from train_config import data_path_config, targets
from gensim.models import KeyedVectors
from utils import *



In [5]:
# Sequence length handed to pad_sequences and used as the model's Input width.
maxlen = 1024

In [6]:
# Keyword arguments that are identical for all three splits.
_dataset_kwargs = dict(content='jieba_seg', transformers=None)

train_dataset = UserCommentDataset(
    data_path_config['train_data_path'], targets=targets, **_dataset_kwargs)
validate_dataset = UserCommentDataset(
    data_path_config['valid_data_path'], targets=targets, **_dataset_kwargs)
# The test split is built with targets=None.
test_dataset = UserCommentDataset(
    data_path_config['test_data_path'], targets=None, **_dataset_kwargs)

In [7]:
# Each dataset item is unpacked as a 2-element pair; zip(*...) turns the
# sequence of pairs into two parallel tuples (presumably sentence and labels).
train_sentences, train_targets = zip(*train_dataset)
validate_sentences, validate_targets = zip(*validate_dataset)
# The second element of every test item is discarded.
test_sentences, _ = zip(*test_dataset)

In [8]:
def decompose_targets(data):
    """Regroup per-sample label collections into one stacked array per target.

    ``data`` is a sequence of samples, each indexable by target position.
    For every position in the module-level ``targets`` the entries of all
    samples are stacked row-wise with ``np.vstack``.

    Returns:
        A list of arrays, one per target, in the order of ``targets``.
    """
    return [
        np.vstack([sample[position] for sample in data])
        for position, _name in enumerate(targets)
    ]

# One stacked label array per target, in the order of `targets`.
Y_train = decompose_targets(train_targets)
Y_valid = decompose_targets(validate_targets)

In [9]:
# Load the pretrained vectors from a binary word2vec file; bytes that cannot
# be decoded are ignored instead of raising.
_embedding_path = data_path_config['embedding_path']
w2v = KeyedVectors.load_word2vec_format(
    _embedding_path,
    binary=True,
    unicode_errors='ignore',
)
# Token -> integer index mapping built from the loaded vectors.
tok2idx = build_tok2idx(w2v)

In [10]:
# Weight matrix later passed to the Embedding layer as its initial weights.
w2v_matrix = build_embedding_matrix(tok2idx, w2v, data_path_config['embedding_dim'])

In [11]:
# Convert every split to index sequences with the same token -> index mapping.
X_train_seq, X_valid_seq, X_test_seq = [
    texts_to_sequences(split_sentences, tok2idx)
    for split_sentences in (train_sentences, validate_sentences, test_sentences)
]

In [12]:
# Bring every split to the common length `maxlen`.
X_train_pad, X_valid_pad, X_test_pad = [
    pad_sequences(split_seq, maxlen=maxlen)
    for split_seq in (X_train_seq, X_valid_seq, X_test_seq)
]

# Modeling

In [13]:
# Name handed to the checkpoint and TensorBoard helper factories below.
model_name = 'DPCNN'
# NOTE(review): the meaning of these four positional arguments is defined by
# the helper (presumably in keras_callbacks, not visible here). The recorded
# training log reports the scheduler setting the learning rate to 0.5 in
# epoch 1 - confirm that is the intended starting rate.
lr_schedule = generate_learning_rate_schedule(0.001, 0.5, 20, 0)
checkpoint = generate_check_point(model_name)
early_stopping = generate_early_stopping()
tensorboard = generate_tensorboard(model_name, 'ALL')
# Callback list passed to model.fit.
callbacks=[lr_schedule, checkpoint, early_stopping, tensorboard]

In [14]:
# Frozen embedding initialised from the pretrained matrix: rows give the
# vocabulary size, columns the vector width.
_vocab_size, _vector_size = w2v_matrix.shape
embedding_layer = Embedding(
    _vocab_size,
    _vector_size,
    weights=[w2v_matrix],
    trainable=False,
)

In [15]:
# Build the network with one softmax head per entry in `targets`.
model = build_model(embedding_layer, maxlen, targets)

In [16]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1024)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1024, 200)    13966000    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 1024, 250)    150250      embedding_1[0][0]                
__________________________________________________________________________________________________
zero_padding1d_1 (ZeroPadding1D (None, 1025, 250)    0           conv1d_1[0][0]                   
__________________________________________________________________________________________________
max_poolin

In [None]:
# A single loss string is applied to every output head; 'acc' is tracked per head.
# NOTE(review): `f1` and `categorical_crossentropy` are imported at the top of
# the notebook but are not used here - confirm whether that is intentional.
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['acc'])

In [None]:
# Train on the padded training split and evaluate on the validation split
# after every epoch; Y_train / Y_valid hold one label array per output head.
# NOTE(review): the recorded log shows every per-head loss at ~1.19e-07 early
# in epoch 2, which looks like diverged training rather than a real fit -
# check the learning-rate schedule before trusting this run.
model.fit(
    x=X_train_pad,
    y=Y_train,
    batch_size=64,
    epochs=100,
    callbacks=callbacks,
    validation_data=(X_valid_pad, Y_valid),
)

Train on 105000 samples, validate on 15000 samples
Epoch 1/100

Epoch 00001: LearningRateScheduler reducing learning rate to 0.5.
Epoch 2/100

Epoch 00002: LearningRateScheduler reducing learning rate to 0.49829566201032827.
 10560/105000 [==>...........................] - ETA: 1:36 - loss: 2.3842e-06 - outputlocation_traffic_convenience_loss: 1.1921e-07 - outputlocation_distance_from_business_district_loss: 1.1921e-07 - outputlocation_easy_to_find_loss: 1.1921e-07 - outputservice_wait_time_loss: 1.1921e-07 - outputservice_waiters_attitude_loss: 1.1921e-07 - outputservice_parking_convenience_loss: 1.1921e-07 - outputservice_serving_speed_loss: 1.1921e-07 - outputprice_level_loss: 1.1921e-07 - outputprice_cost_effective_loss: 1.1921e-07 - outputprice_discount_loss: 1.1921e-07 - outputenvironment_decoration_loss: 1.1921e-07 - outputenvironment_noise_loss: 1.1921e-07 - outputenvironment_space_loss: 1.1921e-07 - outputenvironment_cleaness_loss: 1.1921e-07 - outputdish_portion_loss: 1.1921e