In [32]:
import fastText
import math
import numpy as np 
from numpy import random
from keras.models import Sequential, Model
from keras.layers import LSTM, Dense, Conv1D, Conv2D, Dropout, MaxPooling1D, GlobalMaxPooling1D, Bidirectional, Input, Masking, Flatten, Concatenate
from keras import regularizers
import re

In [2]:
# Fixed number of token slots per sentence: process_features left-pads with
# zero rows or truncates so that every sample has exactly this many rows.
nb_sequence_length = 75

# Pre-trained fastText model that supplies the word vectors.
ft = fastText.load_model("../../fastText/wiki.de.bin")
# Width of one word vector, as reported by the loaded model.
nb_embedding_dims = ft.get_dimension()

In [3]:
# words = ["mein", "dein", "unser", "mein"]
# for w in words:
#     if w in word_vectors:
#         print("I know " + w)
#     else:
#         wv = ft.get_word_vector(w)
#         print(wv)
#         word_vectors[w] = wv

In [14]:
def my_generator(features, labels, batch_size):
    """Yield an endless stream of randomly sampled mini-batches.

    Every batch is assembled by drawing ``batch_size`` indices uniformly at
    random (with replacement) from ``features`` and converting each selected
    text line with ``process_features``.

    Args:
        features: Indexable collection of raw text lines.
        labels: Indexable collection aligned with ``features``; each entry
            must be assignable to a row of length 2 (e.g. a one-hot pair).
        batch_size: Number of samples in every yielded batch.

    Yields:
        Tuple ``(batch_features, batch_labels)`` with shapes
        ``(batch_size, nb_sequence_length, nb_embedding_dims)`` and
        ``(batch_size, 2)``.
    """
    while True:
        # Allocate fresh buffers for every batch. Reusing one pair of arrays
        # across yields means all batches waiting in the consumer's prefetch
        # queue alias the same memory and get overwritten while the next
        # batch is being filled.
        batch_features = np.zeros((batch_size, nb_sequence_length, nb_embedding_dims))
        batch_labels = np.zeros((batch_size, 2))

        for i in range(batch_size):
            index = random.choice(len(features), 1)[0]
            batch_features[i] = process_features(features[index], nb_sequence_length, nb_embedding_dims)
            batch_labels[i] = labels[index]
        yield batch_features, batch_labels

In [24]:
# Cache of word vectors that were already looked up, keyed by lower-cased token.
word_vectors = {}

# Tokenizer: a token is either a run of word characters/hyphens or a run of
# non-word characters. Compiled once here instead of on every call, and written
# as a raw string so that "\w" / "\W" are not treated as string escapes.
_TOKEN_PATTERN = re.compile(r'[\w-]+|[\W ]+', re.UNICODE)


def process_features(textline, nb_sequence_length, nb_embedding_dims):
    """Convert one text line into a fixed-size matrix of word vectors.

    The line is tokenized, whitespace-only tokens are discarded, and at most
    ``nb_sequence_length`` tokens are kept. The vectors are written into the
    *last* rows of the result, so shorter inputs are left-padded with zeros.

    Args:
        textline: Raw text of one sample.
        nb_sequence_length: Number of rows in the returned matrix.
        nb_embedding_dims: Number of columns in the returned matrix.

    Returns:
        ``numpy.ndarray`` of shape ``(nb_sequence_length, nb_embedding_dims)``.
    """
    stripped = (tok.strip() for tok in _TOKEN_PATTERN.findall(textline.strip()))
    tokens = [tok for tok in stripped if tok != ''][:nb_sequence_length]

    features = np.zeros((nb_sequence_length, nb_embedding_dims))
    # Start row chosen so that the last token lands in the last row.
    first_row = nb_sequence_length - len(tokens)
    for row, token in enumerate(tokens, start=first_row):
        # Vectors are looked up for the lower-cased token, so the cache uses
        # the same key; "Haus" and "haus" share one entry and one lookup.
        key = token.lower()
        if key in word_vectors:
            wv = word_vectors[key]
        else:
            wv = ft.get_word_vector(key)
            word_vectors[key] = wv
        features[row] = wv
    return features

In [25]:
def _read_tsv_lines(path):
    """Read a UTF-8 text file and split every non-blank line on tabs.

    The file is opened in a ``with`` block so the handle is closed as soon as
    reading finishes. Blank lines are skipped because they would produce a
    single-element row with no label column.

    Args:
        path: Path of the tab-separated text file.

    Returns:
        List of rows, each a list of the tab-separated fields of one line.
    """
    with open(path, encoding = "UTF-8") as handle:
        return [line.strip().split("\t") for line in handle if line.strip()]


train_lines = _read_tsv_lines('../../Resources/OffLang/sample_train.txt')
dev_lines = _read_tsv_lines('../../Resources/OffLang/sample_dev.txt')

In [26]:
from keras.utils import to_categorical

# Column 0 of every row is the sentence text, column 1 its label string.
# Labels are binarised ("OTHER" -> 0, anything else -> 1) and then one-hot
# encoded with to_categorical.
train_sentences = [row[0] for row in train_lines]
train_labels = to_categorical([int(row[1] != "OTHER") for row in train_lines])

dev_sentences = [row[0] for row in dev_lines]
dev_labels = to_categorical([int(row[1] != "OTHER") for row in dev_lines])

In [20]:
# print(train_labels)

In [None]:
# Variant 1: single LSTM layer over the padded word-vector sequence, followed
# by a small dense head with a two-way softmax output.
model = Sequential([
    LSTM(64, recurrent_dropout = 0.5, dropout = 0.5, activation = 'relu', input_shape=(nb_sequence_length, nb_embedding_dims)),
    Dense(32, activation = 'relu'),
    Dropout(0.2),
    Dense(2, activation = 'softmax')
])
# The labels are one-hot encoded (to_categorical) and my_generator yields
# (batch_size, 2) targets, so the loss must be 'categorical_crossentropy';
# the sparse variant expects integer class indices instead.
model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics = ['accuracy'])

In [None]:
# model = Sequential([
#     Conv1D(128, kernel_size = 3, padding = 'valid', input_shape=(nb_sequence_length, nb_embedding_dims), activation = 'relu'),
#     MaxPooling1D(5),
#     Flatten(),
#     Dense(64, activation = 'relu'),
#     Dropout(0.2),
#     Dense(2, activation = 'softmax')
# ])
# model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics = ['accuracy'])

In [45]:
# Variant 3: multi-branch 1D CNN. One convolution branch per kernel width,
# each reduced by global max pooling, concatenated and fed to a dense head.
filter_sizes = (3, 4, 5)

model_input = Input(shape = (nb_sequence_length, nb_embedding_dims))
# NOTE(review): Dropout takes the fraction of units to drop, so 0.8 discards
# most of the input - confirm this is intended.
model_layers = Dropout(0.8)(model_input)

# Every branch sees the same dropped-out input; only the kernel size differs.
conv_blocks = [
    GlobalMaxPooling1D()(
        Conv1D(
            filters = 100,
            kernel_size = width,
            padding = "valid",
            activation = "relu",
            strides = 1,
            kernel_regularizer = regularizers.l2(0.0001),
        )(model_layers)
    )
    for width in filter_sizes
]

model_concatenated = Concatenate()(conv_blocks)
model_concatenated = Dropout(0.8)(model_concatenated)
model_concatenated = Dense(64, activation = "relu")(model_concatenated)
model_output = Dense(2, activation = "softmax")(model_concatenated)

model = Model(model_input, model_output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])

In [46]:
# Training schedule.
epochs = 50
batch_size = 32

# One epoch draws as many random batches as are needed to cover the size of
# the training set once (rounded up to whole batches).
samples_per_epoch = len(train_sentences)
steps_per_epoch = math.ceil(samples_per_epoch / batch_size)

In [47]:
# Train on random batches from the training set and validate on random batches
# from the dev set. `epochs` replaces the legacy Keras 1 keyword `nb_epoch`.
model.fit_generator(
    my_generator(train_sentences, train_labels, batch_size), 
    steps_per_epoch=steps_per_epoch, epochs=epochs,
    validation_data = my_generator(dev_sentences, dev_labels, batch_size),
    validation_steps = math.ceil(len(dev_sentences) / batch_size)
)

  """
  """


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
 13/132 [=>............................] - ETA: 5s - loss: 0.8392 - acc: 0.6442

KeyboardInterrupt: 