In [1]:
# imports
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
plt.style.use("fivethirtyeight")

from utils import *

%matplotlib inline

Using TensorFlow backend.


In [2]:
# Hyperparameters and run settings for the network.
# Note: embedding_size is defined here but is not part of the summary string
# assembled at the bottom of this cell.
embedding_size = 200
num_hidden = 0
num_layers = 1
hidden_size = 250
l2 = 0
dropout_rate = 0.5
filter_lengths = [6, 7, 8]
nb_filters = 100
max_len_char = 140
epochs = 15
batch_mode = 'off'
optimizer = 'adadelta'
chars = 'no_numeric_upper'
batch_size = 50

# One-line description of the run, assembled field by field so that every
# value sits right next to its label and its format code.
parameters = ', '.join([
    'num_hidden=%d' % num_hidden,
    'num_layers=%d' % num_layers,
    'max_len_char=%d' % max_len_char,
    'batch_mode=%s' % batch_mode,
    'hidden_size=%d' % hidden_size,
    'chars=%s' % chars,
    'l2=%f' % l2,
    'dropout_rate=%f' % dropout_rate,
    'filter_lengths=%s' % str(filter_lengths),
    'nb_filters=%d' % nb_filters,
    'epochs=%d' % epochs,
    'batch_size=%d' % batch_size,
    'optimizer=%s' % optimizer,
])
print(parameters)

num_hidden=0, num_layers=1, max_len_char=140, batch_mode=off, hidden_size=250, chars=no_numeric_upper, l2=0.000000, dropout_rate=0.500000, filter_lengths=[6, 7, 8], nb_filters=100, epochs=15, batch_size=50, optimizer=adadelta


In [3]:
from utils import *

# Load the pre-split dataset; `data` is indexed by split name below.
data = load_10_people()

# Labels are taken from the loader unchanged.
Y_train, Y_val, Y_test = data['Y_train'], data['Y_val'], data['Y_test']

# Collapse the last axis of every input array to the index of its maximum.
# Presumably this turns one-hot character vectors into integer character ids,
# which is what an Embedding layer consumes -- confirm against load_10_people.
X_train, X_val, X_test = (
    np.argmax(data[split], -1) for split in ('X_train', 'X_val', 'X_test')
)

Loading 1004399 tweets from 4391 unique users.
Loading Twitter dataset took 2 seconds.
Number of Tweets: 97728
Only keeping characters that appear at least 100 times in the corpus
Character set consists of 246 characters
Building X...
Building Y...
Splitting Data...
79159 train char sequences
9773 test char sequences
8796 validation char sequences


In [4]:
# model with dynamic embeddings
from keras.layers import InputLayer, Convolution1D, MaxPooling1D, Concatenate, Flatten, Dense, Dropout, Input
from keras.layers import Embedding
from keras.models import Model
from keras.callbacks import ModelCheckpoint

In [5]:
# dynamic embeddings and more n-grams
#
# Architecture: character embedding -> parallel 1-D convolutions (one per
# n-gram width) -> max pooling over the whole sequence -> dense + dropout
# -> softmax over the classes.

# Values that used to be inline literals, named in one place.
vocab_size = 246                # "Character set consists of 246 characters" (loader output)
num_classes = 10                # presumably one class per person from load_10_people -- confirm
kernel_sizes = [2, 3, 4, 5, 6]  # n-gram widths of the parallel convolutions

# NOTE(review): the settings cell defines embedding_size=200 and
# filter_lengths=[6, 7, 8], but this model uses output_dim=140 and
# kernel_sizes above, so the printed `parameters` string does not describe
# this network. Values are left untouched here to keep results and saved
# checkpoints comparable; reconcile the two before the next experiment.

input_layer = Input(name='input', shape=(max_len_char,))

# Dynamic embeddings, learned together with the rest of the network.
# Output shape is (batch, max_len_char, output_dim) = (None, 140, 140).
embed = Embedding(input_dim=vocab_size, output_dim=140)(input_layer)

convs = []
# Every branch is attached directly to `embed`, so num_layers > 1 adds more
# parallel branches rather than stacking convolutions.
for i in range(num_layers):
    for ksize in kernel_sizes:
        conv = Convolution1D(filters=nb_filters, kernel_size=ksize, padding="valid",
                             activation="relu", strides=1,
                             name='conv%d_%d' % (i, ksize))(embed)
        # With "valid" padding and stride 1 the conv output has
        # max_len_char - ksize + 1 steps; pooling over exactly that many steps
        # keeps a single maximum per filter.
        pool = MaxPooling1D(pool_size=max_len_char - ksize + 1,
                            name='pool%d_%d' % (i, ksize))(conv)
        convs.append(pool)

# Join all pooled branches along the feature axis and drop the length-1 time axis.
concat = Concatenate()(convs)
flatten = Flatten()(concat)

hidden = Dense(hidden_size, activation="relu")(flatten)
dropout = Dropout(rate=dropout_rate)(hidden)

output = Dense(num_classes, activation='softmax')(dropout)

model = Model(inputs=input_layer, outputs=output)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [6]:
# Checkpoint file name template; Keras fills in the epoch number and the
# monitored validation accuracy when it saves.
filepath="../models/dynamic_m2-{epoch:02d}-{val_acc:.2f}.hdf5"

# Make sure the target directory exists up front: otherwise the first
# checkpoint save fails only after a full epoch of training has been spent.
os.makedirs(os.path.dirname(filepath), exist_ok=True)

# Keep only checkpoints that improve validation accuracy.
# NOTE(review): 'val_acc' is the metric key used by the Keras version this
# notebook was run with; newer Keras releases report it as 'val_accuracy'.
# Confirm against the installed version before re-running.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

# validation_data is documented as a tuple (x_val, y_val); passing a list is
# not reliably unpacked as (inputs, targets) across Keras versions.
hist = model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs,
                 validation_data=(X_val, Y_val), callbacks=callbacks_list)

Train on 79159 samples, validate on 8796 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
