In [1]:
#Model: LSTM + Dropout
#Word Embedding: Pre-Trained (https://devmount.github.io/GermanWordEmbeddings/)
#Dataset: 3
#based on https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py

In [2]:
from __future__ import print_function

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant


Using TensorFlow backend.


In [3]:
BASE_DIR = ''
GLOVE_DIR = BASE_DIR
TEXT_DATA_DIR = './dataset3/'
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.2

In [4]:
print('Indexing word vectors.')

embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glovegerman.txt')) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 608130 word vectors.


In [5]:
# second, prepare text samples and their labels
print('Processing text dataset')

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            if fname==fname:
                fpath = os.path.join(path, fname)
                print (fname)
                args = {} if sys.version_info < (3,) else {'encoding': 'utf-8'}
                with open(fpath, **args) as f:
                    t = f.read()
                    i = t.find('\n\n')  # skip header
                    if 0 < i:
                        t = t[i:]
                    texts.append(t)
                labels.append(label_id)

print('Found %s texts.' % len(texts))


Processing text dataset
000b816ed90ce07f567458b517a6ed92.txt
000bb94253f32888336e055fbe278441.txt
000c5dca59b26b762772262e42b08003.txt
00b3084e0eadc369cc46614cec48956b.txt
00b548c46a3c1cc8a423a10e619fa2b8.txt
00b576b7ff74c2c28c42cbede1d9e2e6.txt
00b59296d49dc72abe6843d08fe07fe8.txt
00b71bffea10ae5426fb24ed49b6d503.txt
00b90f5dd8e2a7adb8918dccce5c494c.txt
00b92fe26f908cd772aeb238b4bf074c.txt
00b963e041c94e92ea6b2b7dcb525120.txt
00ba940eca84eac01dba2e694661f8e8.txt
00bd6dea1a7035d66844b3f80fc4c8a6.txt
00bdb0c218a32378a49fd871ea75bfa8.txt
00be0d3c56cc26ab34ce97afed13e6af.txt
00bf392fe5a3676925a6f7341c6417ec.txt
00bf3b5eb19112cbbcfb970257a0c1cd.txt
00c0c5c1b86f75270dc49e5bf0303d8d.txt
00c17f686e4139669ff62fe601045774.txt
00c1e5b67e77791adacf5a3b2b63faf5.txt
00c28b5f5ba37b0e5e7298b5e43a349d.txt
00c316d6cddc4aa026149b4b3e17ab48.txt
00c317a0523dcb6e68df850b2e30ef80.txt
00c62aca6488f98af9e307aecb8a88fa.txt
00c642f62dd3284028c809f1e8667d97.txt
00c687d22048a15ecc0eaa0a52e84b1d.txt
00c7aef163cd60

0d1c4ca9f5a90174187d3e56d15ed9b2.txt
0d1d8851189297d1be8f4aef41306b52.txt
0d1f3d70deca4c6c94b5906fb22d9b0f.txt
0d1fc3a9f780cb480681abdf778bee0b.txt
0d204f42ad0f6c77be26aafc15e160a4.txt
0d22bf43caca5c307890e2fdcae3691a.txt
0d23052f84728fe257df682ae59ec73f.txt
0d23900f45e73f26c03486b63d42cfeb.txt
0d23e88023ad72c41467e3c3822712e5.txt
0d24595fa095d216d18f115da72ab461.txt
0d24645b44c07fb6a2f3cddcb6f703dd.txt
0d260d37cc773157afc303c3c298461b.txt
0d28a5c27a910598cc5c9588af07ec1c.txt
0d29b648fd23bbee6f302b4d743df829.txt
0d2ac2723a0786508705addc49245552.txt
0d2aff2b264c39716a12560043366428.txt
0d2b599f28845d9bf5e3a6a5c01a3630.txt
0d2bb5ac55d9a2c50ca5e2aaa5e831bf.txt
0d2be7d58637d3937041beea8960dd68.txt
0d2c92ad1638e8ce76659fb329bc5009.txt
0d2d03aa81ea6e587a2e2d14d68b35f5.txt
0d2d87b5c61a52d1aa004ead1cd859ce.txt
0d2fabd5f4c6ed745efbd9f875fd3142.txt
0d32f10b0238cf74f0c42dc3c54aeaff.txt
0d33b2cfd430fe3fb245186ef77891c4.txt
0d33b8012acf31fd9b928cd465399d2b.txt
0d340f7b8be62956bc3cae72ac522a62.txt
0

5a009e80-4c77-11e9-8a61-1bce99029578.txt
5a30c120-4c75-11e9-8a61-1bce99029578.txt
5a71cc50-4c76-11e9-8a61-1bce99029578.txt
5aaa6880-4c76-11e9-8a61-1bce99029578.txt
5b4e5430-4c77-11e9-8a61-1bce99029578.txt
5b5ce510-4c75-11e9-8a61-1bce99029578.txt
5ba5fc00-4c75-11e9-8a61-1bce99029578.txt
5bf83ab0-4c75-11e9-8a61-1bce99029578.txt
5c128400-4c76-11e9-8a61-1bce99029578.txt
5c28cb20-4c76-11e9-8a61-1bce99029578.txt
5c693a10-4c77-11e9-8a61-1bce99029578.txt
5c9c1bd0-4c75-11e9-8a61-1bce99029578.txt
5cd72900-4c75-11e9-8a61-1bce99029578.txt
5d095c90-4c75-11e9-8a61-1bce99029578.txt
5d2ed400-4c77-11e9-8a61-1bce99029578.txt
5d4055a0-4c76-11e9-8a61-1bce99029578.txt
5d42e320-4c75-11e9-8a61-1bce99029578.txt
5d4b0400-4c76-11e9-8a61-1bce99029578.txt
5d63e330-4c76-11e9-8a61-1bce99029578.txt
5d6a6850-4c75-11e9-8a61-1bce99029578.txt
5d9e70a0-4c75-11e9-8a61-1bce99029578.txt
5db07200-4c75-11e9-8a61-1bce99029578.txt
5e3b4ce0-4c75-11e9-8a61-1bce99029578.txt
5e61d5c0-4c77-11e9-8a61-1bce99029578.txt
5e970c00-4c76-11

e7ab0eb0-4c76-11e9-8a61-1bce99029578.txt
e7c4ff50-4c76-11e9-8a61-1bce99029578.txt
e7cbf9a0-4c75-11e9-8a61-1bce99029578.txt
e7edb270-4c75-11e9-8a61-1bce99029578.txt
e7fb2a80-4c76-11e9-8a61-1bce99029578.txt
e8026100-4c77-11e9-8a61-1bce99029578.txt
e805ce50-4c75-11e9-8a61-1bce99029578.txt
e8d41ad0-4c75-11e9-8a61-1bce99029578.txt
e8ef6b00-4c75-11e9-8a61-1bce99029578.txt
e93ae100-4c77-11e9-8a61-1bce99029578.txt
e95933e0-4c76-11e9-8a61-1bce99029578.txt
e9634600-4c76-11e9-8a61-1bce99029578.txt
e96fc920-4c76-11e9-8a61-1bce99029578.txt
e9945d90-4c75-11e9-8a61-1bce99029578.txt
e9ab8f10-4c75-11e9-8a61-1bce99029578.txt
ea1610b0-4c75-11e9-8a61-1bce99029578.txt
ea326060-4c77-11e9-8a61-1bce99029578.txt
ea3e3cb0-4c76-11e9-8a61-1bce99029578.txt
ea6a07a0-4c76-11e9-8a61-1bce99029578.txt
ea9e2c70-4c75-11e9-8a61-1bce99029578.txt
eb023380-4c77-11e9-8a61-1bce99029578.txt
eb1ea710-4c75-11e9-8a61-1bce99029578.txt
eb7731e0-4c76-11e9-8a61-1bce99029578.txt
ebbec230-4c76-11e9-8a61-1bce99029578.txt
ebcbfe10-4c75-11

In [7]:
# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Found 115454 unique tokens.
Shape of data tensor: (3242, 1000)
Shape of label tensor: (3242, 2)


In [8]:
# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

In [9]:
print('Preparing embedding matrix.')
# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

Preparing embedding matrix.


In [10]:
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)


In [11]:
print('Training model.')
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, LSTM, Flatten, Dropout
import keras 

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = LSTM(100)(embedded_sequences)
x = Dropout(0.25)(x)
preds = Dense(len(labels_index), activation='sigmoid')(x)

Training model.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [12]:
model = Model(sequence_input, preds)
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 300)         6000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 202       
Total params: 6,160,602
Trainable params: 160,602
Non-trainable params: 6,000,000
_________________________________________________________________
None


In [13]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=64,
          epochs=10,
          validation_data=(x_val, y_val))

Instructions for updating:
Use tf.cast instead.
Train on 2594 samples, validate on 648 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f15a3509748>