In [0]:
import os
import string

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, Dropout, SeparableConv1D, MaxPooling1D, GlobalAveragePooling1D, Dense, Input, Conv1D
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing import text

In [0]:
def sepr(elem, start=1):
    """Insert a space at every boundary between "latin" and other characters.

    A character counts as "latin" when it is a lowercase ASCII letter, an
    ASCII digit or ``$``; every other character (uppercase letters,
    punctuation, whitespace, non-ASCII script) is "other". Wherever two
    adjacent characters belong to different groups a single space is inserted
    between them, except that nothing is inserted in front of a ``' '``.

    The scan is one iterative pass. The previous recursive form made one
    nested call and one full string copy per inserted space, so a text with
    more boundaries than the interpreter's recursion limit raised
    ``RecursionError``.

    # Arguments
        elem: str, text to process.
        start: int (non-negative), index of the first character that is
            compared with its predecessor; everything before it is copied
            unchanged. Kept for backward compatibility.

    # Returns
        str, `elem` with the separating spaces inserted.
    """
    latin = frozenset(string.ascii_lowercase + string.digits + '$')
    if start >= len(elem):
        # Nothing to compare (empty / single-character text).
        return elem

    pieces = [elem[:start]]
    for ind in range(start, len(elem)):
        char = elem[ind]
        # A space is never preceded by an inserted space, but it still acts
        # as an "other" character for the comparison made at the next index.
        if char != ' ' and (elem[ind - 1] in latin) != (char in latin):
            pieces.append(' ')
        pieces.append(char)
    return ''.join(pieces)

def sequence_vectorize(train_texts, val_texts, TOP_K=20000, MAX_SEQUENCE_LENGTH=500):
    """Turns raw texts into fixed-length integer sequences.

    The vocabulary is learned from the training texts only. Every text then
    becomes one sequence of word indices, and all sequences (training and
    validation) are brought to one common length.

    # Arguments
        train_texts: list, training text strings.
        val_texts: list, validation text strings.
        TOP_K: int, handed to the tokenizer as `num_words`.
        MAX_SEQUENCE_LENGTH: int, upper bound for the common sequence length.

    # Returns
        x_train, x_val, word_index: padded training and validation
            sequences and the tokenizer's word index dictionary.
    """
    # The tokenizer only ever sees the training texts when building its
    # vocabulary.
    vocab = text.Tokenizer(num_words=TOP_K)
    vocab.fit_on_texts(train_texts)

    train_seqs = vocab.texts_to_sequences(train_texts)
    val_seqs = vocab.texts_to_sequences(val_texts)

    # Common length: the longest training sequence, capped at
    # MAX_SEQUENCE_LENGTH.
    target_len = min(max(len(seq) for seq in train_seqs), MAX_SEQUENCE_LENGTH)

    # pad_sequences is used with its default modes; shorter sequences are
    # padded and longer ones truncated to `target_len`.
    padded = [sequence.pad_sequences(seqs, maxlen=target_len)
              for seqs in (train_seqs, val_seqs)]
    return padded[0], padded[1], vocab.word_index
    
    
def _data_generator(x_t, x_d, y, batch_size):
    """Generates batches of vectorized texts for training/validation.

    # Arguments
        x_t: np.matrix, feature(title) matrix.
        x_d: np.matrix, feature(description) matrix.
        y: np.ndarray, labels.
        batch_size: int, number of samples per batch.

    # Returns
        Yields feature and label data in batches.
    """
    num_samples = x_t.shape[0]
    num_batches = num_samples // batch_size
    if num_samples % batch_size:
        num_batches += 1

    while 1:
        for i in range(num_batches):
            start_idx = i * batch_size
            end_idx = (i + 1) * batch_size
            if end_idx > num_samples:
                end_idx = num_samples
            x_t_batch = x_t[start_idx:end_idx]
            x_d_batch = x_d[start_idx:end_idx]
            y_batch = y[start_idx:end_idx]
            yield [x_t_batch, x_d_batch], y_batch


In [0]:
# Load only the columns used below: three category levels plus the two
# free-text fields.
data = pd.read_csv('divar_posts_dataset.csv',
                   usecols=['cat1', 'cat2', 'cat3', 'title', 'desc'])

# Missing category levels become the literal 'na' so the three levels can be
# joined into one label; missing texts become empty strings.
data = data.fillna({'cat1': 'na', 'cat2': 'na', 'cat3': 'na',
                    'desc': '', 'title': ''})
data['cats'] = data['cat1'] + '_' + data['cat2'] + '_' + data['cat3']

# Drop the rows carrying one of these three combined labels.
excluded_cats = ['na_mobile_200000', 'na_mobile_270000', 'na_mobile_8000']
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments below are not chained assignments on a slice (which
# pandas reports with SettingWithCopyWarning).
data = data[~data['cats'].isin(excluded_cats)].copy()

# Insert spaces at latin / non-latin character boundaries in both text fields.
data['desc'] = data['desc'].map(sepr)
data['title'] = data['title'].map(sepr)

In [0]:
# Hold out 20% of the rows for validation; descriptions, titles and labels
# are split together so the rows stay aligned. The seed is fixed so the split
# is repeatable.
(X_desc_train, X_desc_val,
 X_title_train, X_title_val,
 y_train, y_val) = train_test_split(
    data['desc'], data['title'], data['cats'],
    test_size=0.2,
    random_state=63,
)

# Encode the combined category strings as integers. The encoder is fitted on
# the training labels only.
# NOTE(review): a category that occurs only in the validation split is unknown
# to the encoder - confirm how `transform` behaves for that case.
le = LabelEncoder()
y_train = le.fit_transform(y_train.values)
y_val = le.transform(y_val.values)

In [0]:
# Replace the raw text columns with padded index matrices. Descriptions and
# titles each get their own tokenizer and vocabulary cap.
# Caution: the text variables are overwritten in place, so re-running this
# cell without re-running the split would pass matrices to the tokenizer.
X_desc_train, X_desc_val, word_index_desc = sequence_vectorize(
    train_texts=X_desc_train,
    val_texts=X_desc_val,
    TOP_K=55000,
)
X_title_train, X_title_val, word_index_title = sequence_vectorize(
    train_texts=X_title_train,
    val_texts=X_title_val,
    TOP_K=20000,
)

In [0]:
# Sanity check - one line per pair: "<train shape> <validation shape>" for
# descriptions, titles and labels.
for train_part, val_part in ((X_desc_train, X_desc_val),
                             (X_title_train, X_title_val),
                             (y_train, y_val)):
    print(train_part.shape, val_part.shape)

(796405, 322) (199102, 322)
(796405, 19) (199102, 19)
(796405,) (199102,)


In [0]:
# Embedding vocabulary sizes. The caps (20000 / 55000) mirror the TOP_K values
# passed to sequence_vectorize for titles and descriptions.
# NOTE(review): the "+ 1" presumably accounts for index 0 being used as the
# padding value - confirm against the tokenizer's indexing.
num_features_title = min(len(word_index_title) + 1, 20000)
num_features_desc = min(len(word_index_desc) + 1, 55000)
embedding_dim_title = 50
embedding_dim_desc = 100
kernel_size = 3
filters = 32
pool_size = 3
dropout_rate = 0.5
# Labels are consecutive integers starting at 0, so the class count is the
# largest label + 1. ndarray.max() avoids iterating the array element by
# element with the built-in max(), and int() yields a plain Python int.
num_classes = int(y_train.max()) + 1


def _build_text_tower(seq_len, vocab_size, embedding_dim, filters,
                      kernel_size, pool_size, dropout_rate):
    """Builds one text branch of the two-input classifier.

    Layer order: Input -> Embedding -> Dropout -> 2 x Conv1D(filters)
    -> MaxPooling1D -> 2 x Conv1D(2 * filters) -> GlobalAveragePooling1D.

    # Arguments
        seq_len: int, length of the padded input sequences.
        vocab_size: int, `input_dim` of the embedding layer.
        embedding_dim: int, `output_dim` of the embedding layer.
        filters: int, filter count of the first two convolutions; the last
            two use twice as many.
        kernel_size: int, kernel size shared by all convolutions.
        pool_size: int, pool size of the max-pooling layer.
        dropout_rate: float, rate of the dropout applied after the embedding.

    # Returns
        (tower_input, tower_output): the branch's `Input` tensor and the
            tensor produced by its global average pooling layer.
    """
    tower_input = Input(shape=(seq_len,))
    tower = Embedding(input_dim=vocab_size,
                      output_dim=embedding_dim,
                      input_length=seq_len)(tower_input)
    tower = Dropout(rate=dropout_rate)(tower)
    for n_filters in (filters, filters):
        tower = Conv1D(filters=n_filters,
                       kernel_size=kernel_size,
                       activation='relu',
                       bias_initializer='random_uniform',
                       padding='same')(tower)
    tower = MaxPooling1D(pool_size=pool_size)(tower)
    for n_filters in (filters * 2, filters * 2):
        tower = Conv1D(filters=n_filters,
                       kernel_size=kernel_size,
                       activation='relu',
                       bias_initializer='random_uniform',
                       padding='same')(tower)
    tower = GlobalAveragePooling1D()(tower)
    return tower_input, tower


# Title branch first, description branch second (same creation order as
# before, so automatically generated layer names are unchanged).
ts_input, ts_model = _build_text_tower(
    seq_len=X_title_train.shape[1],
    vocab_size=num_features_title,
    embedding_dim=embedding_dim_title,
    filters=filters,
    kernel_size=kernel_size,
    pool_size=pool_size,
    dropout_rate=dropout_rate)

ds_input, ds_model = _build_text_tower(
    seq_len=X_desc_train.shape[1],
    vocab_size=num_features_desc,
    embedding_dim=embedding_dim_desc,
    filters=filters,
    kernel_size=kernel_size,
    pool_size=pool_size,
    dropout_rate=dropout_rate)

# Join both branches and classify over all categories.
merged = tf.keras.layers.Concatenate()([ts_model, ds_model])
merged = Dropout(rate=dropout_rate)(merged)
merged = Dense(num_classes, activation='softmax')(merged)
newModel = tf.keras.models.Model([ts_input, ds_input], merged)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [0]:
learning_rate = 1e-3
batch_size = 64
checkpoint_path = 'output/sepCNN_total.hdf5'

optimizer = tf.keras.optimizers.Adam(lr=learning_rate)
newModel.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['acc'])

# Dummy value (the original note marks it as a placeholder); nothing in this
# cell reads it. Kept in case a later cell refers to the name.
num_features = min(len(word_index_desc) + 1, 20000)

# Both generators yield ([title_batch, desc_batch], label_batch) endlessly.
training_generator = _data_generator(
    X_title_train, X_desc_train, y_train, batch_size)
validation_generator = _data_generator(
    X_title_val, X_desc_val, y_val, batch_size)

# Number of batches needed to cover every sample once (ceiling division).
steps_per_epoch = (X_title_train.shape[0] + batch_size - 1) // batch_size

# Validation steps must be derived from the validation set. Using the training
# set size here made every validation pass run as many steps as a whole
# training epoch, i.e. it cycled through the validation data several times.
validation_steps = (X_title_val.shape[0] + batch_size - 1) // batch_size

# Make sure the checkpoint directory exists before training starts, so the
# first checkpoint write cannot fail on a missing folder.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

callbacks = [
    # Stop once val_loss has not improved for 6 consecutive epochs.
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=6),
    # Keep only the best full model (by val_loss) on disk.
    tf.keras.callbacks.ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=False, mode='auto', period=1),
]

In [None]:
# Train from the endless batch generators. `callbacks` (built in the previous
# cell) contains an EarlyStopping entry, so `epochs` acts as an upper bound on
# the number of epochs.
history = newModel.fit_generator(
    generator=training_generator,
    steps_per_epoch=steps_per_epoch,
    epochs=1000,
    verbose=1,
    callbacks=callbacks,
    validation_data=validation_generator,
    validation_steps=validation_steps,
)