In [None]:
# Better Char CNN, based on the code used in the paper by Ruder et al.

In [None]:
# imports
import logging
import time
import pandas as pd
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
plt.style.use("fivethirtyeight")

from utils import *

%matplotlib inline

In [None]:
# Seed numpy's global random number generator so numpy-based randomness is repeatable
np.random.seed(123)

# Load the dataset and measure how long the import takes
start = time.time()
data = import_dataset()
elapsed_seconds = time.time() - start

print('Loading Twitter dataset took %d seconds.' % elapsed_seconds)

In [None]:
# Settings for our network

# -- model architecture --
filter_lengths = [6, 7, 8]  # kernel sizes; one convolution branch is built per entry
nb_filters = 100            # number of filters in each convolution
num_layers = 1              # repeat count of the outer convolution-building loop
hidden_size = 250           # units of the dense layer that follows the convolutions
dropout_rate = 0.5          # dropout rate applied after that dense layer

# -- input --
max_len_char = 140          # characters kept per document; anything beyond is cut off

# -- training --
epochs = 15
batch_size = 50
optimizer = 'adadelta'

# -- only recorded in the parameter string below; not otherwise used in the visible cells --
num_hidden = 0
l2 = 0
batch_mode = 'off'
chars = 'no_numeric_upper'

# -- not referenced anywhere else in the visible cells --
embedding_size = 200

# Human-readable, single-line record of the run configuration.
# Each entry is (name, printf-style format, value); the order defines the output order.
parameter_fields = [
    ('num_hidden', '%d', num_hidden),
    ('num_layers', '%d', num_layers),
    ('max_len_char', '%d', max_len_char),
    ('batch_mode', '%s', batch_mode),
    ('hidden_size', '%d', hidden_size),
    ('chars', '%s', chars),
    ('l2', '%f', l2),
    ('dropout_rate', '%f', dropout_rate),
    ('filter_lengths', '%s', str(filter_lengths)),
    ('nb_filters', '%d', nb_filters),
    ('epochs', '%d', epochs),
    ('batch_size', '%d', batch_size),
    ('optimizer', '%s', optimizer),
]
parameters = ', '.join(('%s=' + fmt) % (name, value) for name, fmt, value in parameter_fields)
print(parameters)

In [None]:
# load the character set
# NOTE(review): load_charset is not defined in this notebook; presumably it comes in via
# `from utils import *`. A later cell calls .items() on the result and compares each value
# against a count threshold, so it is presumably a dict of character -> frequency — confirm.
chars_set = load_charset()

For every document, we have a feature matrix of **shape max_len_char x number of chars** (the number of features), over which we convolve.


For all documents together, we thus have a 3D tensor.


If we have a lot of documents, this tensor becomes too big (numpy arrays can be max 2GB in size, no matter the memory).
We first try to create this array; if this fails, we default to batch_mode, where we create the tensor only
for the current mini-batch. This takes longer, but at least we can process an arbitrarily large number of documents.

In [None]:
# Restrict the dataset to the 10 authors that occur most often.
# value_counts() orders authors by descending frequency, so the first 10 index entries
# are the most frequent ones.
author_counts = data.author.value_counts()
top10_authors = np.array(author_counts.index[:10])

# Boolean row mask: True where the row's author is one of the selected authors
is_top10_author = data.author.isin(top10_authors)
top10_authors_data = data[is_top10_author]

print("Number of Tweets: {}".format(len(top10_authors_data)))
top10_authors_data.head()

In [None]:
# Drop rare characters: keep only the entries of chars_set whose value (the character's
# count in the corpus) is at least 100.
small_chars_set = {char: count for char, count in chars_set.items() if count >= 100}

# Give every remaining character a position 0..n-1, following the dict's iteration order;
# this position is the character's column in the one-hot encoding.
small_char_indices = {char: index for index, char in enumerate(small_chars_set)}

In [None]:
# All-False one-hot tensor of shape (documents, max_len_char, retained characters).
# The builtin `bool` is used as dtype on purpose: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where accessing it raises an AttributeError.
X_char = np.zeros((len(top10_authors_data), max_len_char, len(small_chars_set)), dtype=bool)

If we only use characters as the input channel, we apply the convolutions directly on the one-hot character matrix, without an embedding layer.

In [None]:
# Fill X_char with the one-hot encoding of every text.
# Entries that are never set stay 0, which covers both padding (texts shorter than
# max_len_char) and characters that are missing from small_char_indices.
for doc_num, doc in enumerate(top10_authors_data.text):
    # texts longer than max_len_char are truncated: only the first max_len_char characters are encoded
    for char_num, char in enumerate(doc[:max_len_char]):
        char_index = small_char_indices.get(char)
        if char_index is not None:
            X_char[doc_num, char_num, char_index] = 1

In [None]:
# building Y
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

ohe = OneHotEncoder()
le = LabelEncoder()

# author name -> integer class id
author_ids = le.fit_transform(top10_authors_data.author.values)

# integer class id -> one-hot row. OneHotEncoder expects a 2D column, hence the reshape.
# .toarray() is used instead of .todense(): todense() returns an np.matrix, a class that
# NumPy no longer recommends, whereas toarray() yields a plain 2D ndarray with the same content.
Y = ohe.fit_transform(author_ids.reshape(-1, 1)).toarray()

In [None]:
# Train / validation / test split
#   test:       10% of the full data set
#   validation: 10% of the remaining 90%
#   train:      everything else

from sklearn.model_selection import train_test_split

# Step 1: hold out the test set
X_trainval_char, X_test_char, Y_trainval, Y_test = train_test_split(
    X_char, Y, test_size=0.10, random_state=42
)

# Step 2: split the remainder into train and validation
X_train_char, X_val_char, Y_train, Y_val = train_test_split(
    X_trainval_char, Y_trainval, test_size=0.10, random_state=42
)

# The intermediate arrays are not needed any more; drop the names so they can be freed
del X_trainval_char, Y_trainval

In [None]:
# Report how many sequences ended up in each split
for template, split in (
    ('%d train char sequences', X_train_char),
    ('%d test char sequences', X_test_char),
    ('%d validation char sequences', X_val_char),
):
    print(template % len(split))

In [None]:
# Drop the references to the full, unsplit arrays (presumably so their memory can be
# reclaimed); the cells below only work with the train / validation / test splits.
X_char = Y = None

## Model

In [None]:
import os

# `Input` is required for the input tensor below; the original import list only had
# `InputLayer`, so the cell failed with a NameError. `InputLayer` is kept in case it is
# used elsewhere.
from keras.layers import Input, InputLayer, Convolution1D, MaxPooling1D, Concatenate, Flatten, Dense, Dropout
from keras.models import Model
from keras.callbacks import ModelCheckpoint

# Input: one one-hot character matrix (max_len_char x number of retained characters) per document
input_layer = Input(name='input', shape=(max_len_char, len(small_chars_set)))

# One convolution + pooling branch per filter length; every branch reads the input directly.
convs = []
for i in range(num_layers):
    for j in filter_lengths:
        conv = Convolution1D(filters=nb_filters, kernel_size=j, padding="valid", activation="relu",
                             strides=1, name='conv%d_%d' % (i, j))(input_layer)
        # A "valid" convolution of width j over max_len_char positions yields
        # max_len_char - j + 1 outputs; pooling over all of them keeps one maximum per filter.
        pool = MaxPooling1D(pool_size=max_len_char - j + 1, name='pool%d_%d' % (i, j))(conv)
        convs.append(pool)

# Concatenate needs at least two inputs; with a single branch there is nothing to merge.
concat = Concatenate()(convs) if len(convs) > 1 else convs[0]
flatten = Flatten()(concat)

hidden = Dense(hidden_size, activation="relu")(flatten)
dropout = Dropout(rate=dropout_rate)(hidden)

# One softmax unit per class; the class count is read from the one-hot label matrix
# instead of being hard-coded.
num_classes = Y_train.shape[1]
output = Dense(num_classes, activation='softmax')(dropout)

model = Model(inputs=input_layer, outputs=output)
# The metric is requested as 'acc' so that it is logged as 'acc' / 'val_acc', which is what
# the checkpoint below monitors. Newer Keras versions report a metric under exactly the
# name passed here, so 'accuracy' would be logged as 'val_accuracy' and the checkpoint
# would never find 'val_acc'.
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])


# Save the weights whenever the validation accuracy improves
filepath = "../models/weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
# make sure the target directory exists before the first checkpoint is written
os.makedirs(os.path.dirname(filepath), exist_ok=True)
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

hist = model.fit(X_train_char, Y_train, batch_size=batch_size, epochs=epochs,
                 validation_data=(X_val_char, Y_val), callbacks=callbacks_list)

In [None]:
# NOTE(review): disabled weight loading. The file name below does not match the checkpoint
# pattern used by the training cell ("../models/weights-improvement-..."), so confirm the
# path before re-enabling it.
#model.load_weights("weights.best.hdf5")

# Print the layer-by-layer summary of the model
model.summary()