In [1]:
#Model: GRU + Dropout
#Word Embedding: Pre-Trained (https://devmount.github.io/GermanWordEmbeddings/)
#Dataset: 3
#based on https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py

In [2]:
from __future__ import print_function

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant


Using TensorFlow backend.


In [3]:
BASE_DIR = ''
GLOVE_DIR = BASE_DIR
TEXT_DATA_DIR = './dataset3/'
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.2

In [4]:
print('Indexing word vectors.')

embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glovegerman.txt')) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 608130 word vectors.


In [5]:
# second, prepare text samples and their labels
print('Processing text dataset')

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            if fname==fname:
                fpath = os.path.join(path, fname)
                print (fname)
                args = {} if sys.version_info < (3,) else {'encoding': 'utf-8'}
                with open(fpath, **args) as f:
                    t = f.read()
                    i = t.find('\n\n')  # skip header
                    if 0 < i:
                        t = t[i:]
                    texts.append(t)
                labels.append(label_id)

print('Found %s texts.' % len(texts))


Processing text dataset
000b0b153d53c16926e4573830f3a1e9.txt
000b816ed90ce07f567458b517a6ed92.txt
000bb94253f32888336e055fbe278441.txt
000c5dca59b26b762772262e42b08003.txt
00a18190bd1472e9bca9ab3a186dd848.txt
00a26a688c64e459b51748da5e042259.txt
00a2b02fde45c7fef1adff930ecc3c1c.txt
00a33bdd9f43358bfcedb5d9b337b344.txt
00a4849991b17e324661b8f7c265b160.txt
00a4e8b24f6453a45874875d1b1fde12.txt
00a510c7e16d48d7a1e34f1d3584ceeb.txt
00a5698ddc80e04cace4404e71ab07e6.txt
00a61b74c148561abb4e5901b27f9e9e.txt
00a64ad9d90607fdc561adc4472e996f.txt
00a6e5f24277a5aa23c1c8845a3a79b9.txt
00a88f4f578214d23b32399f1a5fafbc.txt
00aaf8c74df45f0423ae5ce242a005b6.txt
00aba2e4aa2df50af567e5b869bbdd89.txt
00aba961655165b115715c92db735d5d.txt
00acebe53e3e63ddc5c6d1cc9d0e41cf.txt
00ad4d3d5cbdf96a81552f825372f661.txt
00ad95d25449b3953802fa9210b16be7.txt
00aebe4ae047b9214fa68fa0578a6e77.txt
00aef63d3ef8b9187f5a33e0a1aacd32.txt
00b2e16853f91043f1cfd56a8ec1125d.txt
00b3084e0eadc369cc46614cec48956b.txt
00b548c46a3c1c

0aab194ab3a3bc21eec0c1a37d0140c3.txt
0aae2c21a3852ce6058b444f82a53689.txt
0aaeddd2c256c38cb2c9fc839f255d73.txt
0aaf444ec002d4e7d10f6170075f6bc0.txt
0aaf7c8ad94f7cfac7d43ca37ab45fd9.txt
0aafaf0c9715b7f7adf2692e1fdfc2f9.txt
0aaff5d2a3133169753075f0a7d5334a.txt
0ab1a9859754dbbb748811d24c8e183b.txt
0ab1adf253f5d1397de07d769aa1f760.txt
0ab2988e91d9e38a031fdc692556ab34.txt
0ab2fe778904f49310c54e324924af1f.txt
0ab37564aa05b40bdb4c61e6bb96d70a.txt
0ab3f34edc225823a6cc0355822ff1d1.txt
0ab4fb40aed6e1c81e0d5f73d96b1051.txt
0ab5edc962556100c2f9bfbf846a926a.txt
0ab76d32994303c5f7709d2198e8c600.txt
0ab834a3549e43d3a436fca240a25458.txt
0ab913a8e8592dc9f137a6bfaeeb9052.txt
0ab9f1cda2cd5872b0eaa67a647925b0.txt
0ab9f2a3791f9b382380e0b10a545061.txt
0abad9926d9e40a30bacb3358d0780da.txt
0abc1355eac4deba9da1d7720f24a547.txt
0abc5842d7016fbcd2e40791de7ef657.txt
0abd6c25255849a78a1e4872175a6129.txt
0abe3087005873274a1057199a9727ef.txt
0abe537ba08c6d6ee8208d9237f0e43c.txt
0abf253d1c14c9f9684e61b809915510.txt
0

0c3104a6265525aaa1971c1217677bda.txt
0c3174745a835664e8c865932068e885.txt
0c322f25e3fcb2aa05701960310832d7.txt
0c325e563e8ddd918257c4717df67d94.txt
0c325e6ef747a354a4201711dd350cdf.txt
0c32770883490580dc8abc675be2d631.txt
0c327a2d89bd2cd5f27c535b0e310ef6.txt
0c328910ef89f98093fc2c3c330a11ba.txt
0c32c0a5d3bc8d264f685c09f1496c77.txt
0c3319744bde297e0e7b97382e116fbb.txt
0c3405efdf5888b15077e202809ba5b6.txt
0c34a156c214db329e3468217962443d.txt
0c34d4c7295833411b4b691bb48127b0.txt
0c35b82f82fe6338e9d6509c17e41be5.txt
0c360a87d7a5f65c78f463082b670b16.txt
0c363e2dd1dcc0968d7ceb37e8d4ff94.txt
0c36b05ca2f217c378f7d2df61830bcf.txt
0c3811e6396ee73d82c1287ecdb4e547.txt
0c38f55310edd9d21e10135696d23515.txt
0c38fe2b8f3919545b8c496d0d8c88c0.txt
0c39ddc187402513eb4186736ea1b7f2.txt
0c3acbfc69e76704bab85af3633c1284.txt
0c3bb2a588fca064cd853af741cc73dd.txt
0c3bde84efa612cb4f8c434f4afcce8c.txt
0c3cc7f08a1e2772dd3c5cee9c7b2d97.txt
0c3de901103f1b53966ec9fe869be53d.txt
0c3e1ade6792b2349ffebe291d394274.txt
0

0d47138bb9e5a1cdfed9bcdf5e25632d.txt
0d47a5f32d854527fd5b43822f9dacef.txt
0d493c955083871fb93d87b806c3ba76.txt
0d496fd5e1ea4651a4690e5fdcc7b5b9.txt
0d49d919f65660212aa4300bb7e5d6dc.txt
0d4a038008232749bc761bebfeb297b8.txt
0d4ada4674e6dfa1c8fb088cbf31ac9f.txt
0d4b649f3012e15be85cb36a8093c9e2.txt
0d4bc7a34ae480d8053b81e48a91264c.txt
0d4bca803df4db9f9f8da785b8f40009.txt
0d4c7a0386f4229ee47a93e729ca1d31.txt
0d4db34ad1dff50ccebba8be1a7332fb.txt
0d4eb682a3dc472eb31665a0d9ee5d58.txt
0d4fd5267f0aa498bb1abf181bd3f02b.txt
0d51809b90256ce99a7f37d038358708.txt
0d51b0d26380437c9b06e41d0216fc7f.txt
0d51dfd1cbacf79e3ef4b4f4cb0fefb6.txt
0d52a34238e2740d236c6fdfc97bab99.txt
0d52c35cf09d68654a24f6f84027d593.txt
0d5339d0a4a68dbb40e08810c62262c4.txt
0d55ff7291fcfeab707c3a77ea036bef.txt
0d57569b761cc814c1c2c04ca9bf09ba.txt
0d580d2f5f2b5090eda4436a840df666.txt
0d59718e58730276a3870b6e25251d6c.txt
0d598a68bfba0f61ee5d1091861ca540.txt
0d5a3cb53825e96e1f12fb5576e4545d.txt
0d5b215bbfbf960c929450183fb9d1d5.txt
0

7281bae0-4c75-11e9-8a61-1bce99029578.txt
728e3e00-4c75-11e9-8a61-1bce99029578.txt
729aa4a0-4c76-11e9-8a61-1bce99029578.txt
72a0c9b0-4c77-11e9-8a61-1bce99029578.txt
7305df00-4c75-11e9-8a61-1bce99029578.txt
73954dc0-4c75-11e9-8a61-1bce99029578.txt
739d15f0-4c75-11e9-8a61-1bce99029578.txt
73ae2cf0-4c75-11e9-8a61-1bce99029578.txt
73cc5160-4c77-11e9-8a61-1bce99029578.txt
73cdcd70-4c76-11e9-8a61-1bce99029578.txt
73d0b3a0-4c76-11e9-8a61-1bce99029578.txt
73eb35f0-4c75-11e9-8a61-1bce99029578.txt
74f0fb40-4c77-11e9-8a61-1bce99029578.txt
7538c490-4c75-11e9-8a61-1bce99029578.txt
75850fd0-4c75-11e9-8a61-1bce99029578.txt
75a52580-4c76-11e9-8a61-1bce99029578.txt
75abb530-4c76-11e9-8a61-1bce99029578.txt
761c0dc0-4c77-11e9-8a61-1bce99029578.txt
7684a580-4c75-11e9-8a61-1bce99029578.txt
769db650-4c76-11e9-8a61-1bce99029578.txt
7703cb20-4c76-11e9-8a61-1bce99029578.txt
7711fbf0-4c76-11e9-8a61-1bce99029578.txt
774fd2d0-4c77-11e9-8a61-1bce99029578.txt
77a25510-4c76-11e9-8a61-1bce99029578.txt
77d1e600-4c75-11

baa486e0-4c75-11e9-8a61-1bce99029578.txt
baf633e0-4c76-11e9-8a61-1bce99029578.txt
bb1b2b90-4c77-11e9-8a61-1bce99029578.txt
bb347560-4c76-11e9-8a61-1bce99029578.txt
bba140f0-4c76-11e9-8a61-1bce99029578.txt
bbb2f430-4c76-11e9-8a61-1bce99029578.txt
bbe08950-4c75-11e9-8a61-1bce99029578.txt
bbf43860-4c75-11e9-8a61-1bce99029578.txt
bc0eda60-4c77-11e9-8a61-1bce99029578.txt
bc1c6460-4c76-11e9-8a61-1bce99029578.txt
bc2f7730-4c76-11e9-8a61-1bce99029578.txt
bc51a530-4c76-11e9-8a61-1bce99029578.txt
bc9421e0-4c75-11e9-8a61-1bce99029578.txt
bd03e8c0-4c77-11e9-8a61-1bce99029578.txt
bd12c7c0-4c75-11e9-8a61-1bce99029578.txt
bd29ab20-4c75-11e9-8a61-1bce99029578.txt
bd2f9e90-4c75-11e9-8a61-1bce99029578.txt
bd35c3a0-4c76-11e9-8a61-1bce99029578.txt
be1ea790-4c77-11e9-8a61-1bce99029578.txt
be30c570-4c76-11e9-8a61-1bce99029578.txt
be46b3e0-4c75-11e9-8a61-1bce99029578.txt
be7a9520-4c75-11e9-8a61-1bce99029578.txt
bf08a450-4c75-11e9-8a61-1bce99029578.txt
bf349650-4c75-11e9-8a61-1bce99029578.txt
bf5d85a0-4c76-11

factiva-20190512-1705.txt_36.txt.txt_0.txt
factiva-20190512-1705.txt_41.txt.txt_0.txt
factiva-20190512-1705.txt_47.txt.txt_0.txt
factiva-20190512-1705.txt_74.txt.txt_0.txt
factiva-20190512-1705.txt_91.txt.txt_0.txt
factiva-20190512-1707.txt_44.txt.txt_0.txt
factiva-20190512-1707.txt_59.txt.txt_0.txt
factiva-20190512-1707.txt_68.txt.txt_0.txt
factiva-20190512-1709.txt_11.txt.txt_0.txt
factiva-20190512-1710.txt_33.txt.txt_0.txt
factiva-20190512-1710.txt_6.txt.txt_0.txt
factiva-20190512-1711.txt_81.txt.txt_0.txt
factiva-20190512-1712.txt_0.txt.txt_0.txt
factiva-20190512-1712.txt_61.txt.txt_0.txt
factiva-20190512-1712.txt_63.txt.txt_0.txt
factiva-20190512-1712.txt_79.txt.txt_0.txt
factiva-20190512-1712.txt_85.txt.txt_0.txt
factiva-20190512-1712.txt_89.txt.txt_0.txt
factiva-20190512-1713.txt_74.txt.txt_0.txt
factiva-20190512-1735.txt_3.txt.txt_0.txt
factiva-20190512-1735.txt_5.txt.txt_0.txt
factiva-20190512-1735.txt_65.txt.txt_0.txt
factiva-20190512-1735.txt_75.txt.txt_0.txt
fb58e630-4c76-1

In [7]:
# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Found 79661 unique tokens.
Shape of data tensor: (2770, 1000)
Shape of label tensor: (2770, 2)


In [8]:
# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

In [9]:
print('Preparing embedding matrix.')
# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

Preparing embedding matrix.


In [10]:
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)


In [11]:
print('Training model.')
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, GRU, Dropout
import keras 

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = GRU(100)(embedded_sequences)
x = Dropout(0.25)(x)
preds = Dense(2, activation='sigmoid')(x)

Training model.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [12]:
model = Model(sequence_input, preds)
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 300)         6000000   
_________________________________________________________________
gru_1 (GRU)                  (None, 100)               120300    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 202       
Total params: 6,120,502
Trainable params: 120,502
Non-trainable params: 6,000,000
_________________________________________________________________
None


In [13]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=64,
          epochs=10,
          validation_data=(x_val, y_val))

Instructions for updating:
Use tf.cast instead.
Train on 2216 samples, validate on 554 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2a60b74ac8>