# Morphological Encoder-Decoder (MED)
Kann and Schütze, 2016

In [1]:
import keras
from keras.layers import Bidirectional, Concatenate, Dot, Input, GRU, Embedding
from keras.layers import RepeatVector, Dense, Activation
from keras.layers import MaxoutDense
from keras.models import Model
from keras.optimizers import Adadelta
from keras import backend as K
import numpy as np
from utils import *

Using TensorFlow backend.


## Load data

In [2]:
# Read the training file and the character / tag vocabularies returned alongside it.
dataset, char_vocab, tag_vocab = load_dataset("data/german-task2-train")

# Quick sanity report: number of rows, both vocabularies and the first example.
# print() applies str() to each argument; sep="" keeps the text exactly as concatenation would.
print("Dataset size: ", dataset.shape[0], sep="")
print("Character vocabulary:\n", char_vocab, sep="")
print("\nTag vocabulary:\n", tag_vocab, sep="")
print("\n", dataset[0], sep="")

Dataset size: 12800
Character vocabulary:
['<pad>', '<unk>', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ß', 'ä', 'ö', 'ü']

Tag vocabulary:
['<unk>', 'IN=alt=LGSPEC1', 'IN=aspect=PFV', 'IN=case=ACC', 'IN=case=DAT', 'IN=case=GEN', 'IN=case=NOM', 'IN=comp=CMPR', 'IN=comp=SPRL', 'IN=finite=NFIN', 'IN=gen=FEM', 'IN=gen=MASC', 'IN=gen=NEUT', 'IN=mood=IND', 'IN=mood={OPT/SBJV}', 'IN=mood={SBJV/COND}', 'IN=num=PL', 'IN=num=SG', 'IN=per=1', 'IN=per=2', 'IN=per=3', 'IN=pos=ADJ', 'IN=pos=N', 'IN=pos=V', 'IN=tense=PRS', 'IN=tense=PST', 'OUT=alt=LGSPEC1', 'OUT=aspect=PFV', 'OUT=case=ACC', 'OUT=case=DAT', 'OUT=case=GEN', 'OUT=case=NOM', 'OUT=comp=CMPR', 'OUT=comp=SPRL', 'OUT=finite=NFIN', 'OUT=gen=FEM', 'OUT=gen=MASC', 'OUT=gen=NEUT', 'OUT=mood=IND', 'OUT=mood={OPT/SBJV}', 'OUT=mood={SBJV/COND}', 'OUT=num=PL', 'OUT=num=SG', 'OUT=per=1', 'OUT=per=2', 'OUT=per=3', 'OUT=pos=ADJ', 'OUT=pos=N', 'OUT=pos=V', 'OUT=tens

In [3]:
# Sequence lengths handed to preprocess_data. They were bare literals (50, 40) that had to be
# kept in sync by hand with the Tx / Ty used to build the model, so they are named here.
# NOTE(review): presumably the 4th/5th arguments are the padded input/output lengths — the
# encoded target printed by this cell is padded to 40 ids; confirm against utils.preprocess_data.
Tx = 50  # input sequence length
Ty = 40  # output sequence length
X, Y, input_vocab, output_vocab = preprocess_data(dataset, char_vocab, tag_vocab, Tx, Ty)

aerodynamischstes
[2, 6, 19, 16, 5, 26, 15, 2, 14, 10, 20, 4, 9, 20, 21, 6, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


## Initialization

In [9]:
Tx = 50  # input sequence length (time steps seen by the encoder)
Ty = 40  # number of decoding steps
in_vocab_size = len(input_vocab)
out_vocab_size = len(output_vocab)


def _softmax_over_axis1(x):
    """Softmax of ``x`` normalised along axis 1 (the time axis of the attention energies).

    The built-in "softmax" activation normalises over the last axis. The attention
    energies produced by ``densor2`` have a last axis of size 1, so the built-in
    version would turn every weight into 1.0 and the attention would degenerate
    into an unweighted sum of the encoder states.
    """
    # Subtract the per-sequence maximum before exponentiating for numerical stability.
    exps = K.exp(x - K.max(x, axis=1, keepdims=True))
    return exps / K.sum(exps, axis=1, keepdims=True)


# initialize layers as global variables so that every decoding step shares the same weights
repeator = RepeatVector(Tx)
concatenator = Concatenate(axis=-1)
densor1 = Dense(100, activation = "tanh")
densor2 = Dense(1, activation = "relu")
# Attention weights must sum to 1 across the Tx encoder positions, hence softmax over axis 1.
activator = Activation(_softmax_over_axis1, name="attention_weights")
dotor = Dot(axes = 1)

## Attention model
Bahdanau et al., 2014

In [10]:
def attention_step(a, s_prev):
    """Compute one attention context vector from the encoder states.

    Relies on the module-level shared layers ``repeator``, ``concatenator``,
    ``densor1``, ``densor2``, ``activator`` and ``dotor``.

    Args:
        a: encoder output sequence (one vector per input position).
        s_prev: previous decoder state.

    Returns:
        The context tensor produced by ``dotor`` from the attention weights and ``a``.
    """
    # Copy the decoder state once per encoder position so it can be paired with each state.
    tiled_state = repeator(s_prev)
    paired = concatenator([a, tiled_state])
    # Two dense layers score every position; `activator` turns the scores into weights.
    energies = densor2(densor1(paired))
    weights = activator(energies)
    # Weighted combination of the encoder states along axis 1.
    return dotor([weights, a])

## Encoder-Decoder model

In [11]:
# Model hyper-parameters.
embed_size = 300  # width of the input embeddings (read as a global by encoder_decoder)
enc_size = 100    # GRU units per direction in the bidirectional encoder
dec_size = 100    # GRU units in the decoder

def encoder_decoder(Tx, Ty, enc_size, dec_size, in_vocab_size, out_vocab_size):
    """Build the attention-based encoder-decoder as a Keras ``Model`` named "MED".

    Args:
        Tx: length of the input id sequences.
        Ty: number of decoding steps; the model has one output tensor per step.
        enc_size: GRU units per direction of the bidirectional encoder.
        dec_size: GRU units of the decoder (also the width of the ``s0`` input).
        in_vocab_size: number of distinct input symbols (embedding rows).
        out_vocab_size: number of output classes predicted at every step.

    Returns:
        A ``Model`` taking ``[X, s0]`` and returning a list of ``Ty`` tensors, each a
        probability distribution over ``out_vocab_size`` classes.

    Notes:
        The embedding width is read from the module-level ``embed_size``.
        ``attention_step`` uses module-level layers; its ``repeator`` was built with the
        module-level ``Tx``, so the ``Tx`` passed here has to match that value.
    """
    X = Input(shape=(Tx,))
    # s0 seeds the decoder GRU, so it must have dec_size units. It was previously sized with
    # enc_size, which only worked because both sizes happened to be equal.
    s0 = Input(shape=(dec_size,), name='s0')

    # Layers are created once, outside the loop, so all decoding steps share their weights.
    encoder = Bidirectional(GRU(enc_size, return_sequences=True), name="encoder")
    decoder = GRU(dec_size, return_state=True, name="decoder")
    maxout = MaxoutDense(out_vocab_size)
    # categorical_crossentropy expects probabilities; the maxout layer alone emits
    # unnormalised (possibly negative) scores, so normalise them with a softmax.
    to_probs = Activation("softmax", name="output_softmax")
    embeddings = Embedding(in_vocab_size, embed_size, input_length=Tx)(X)

    s = s0
    outputs = []
    a = encoder(embeddings)
    for _step in range(Ty):
        # Attend over the encoder states using the current decoder state.
        context = attention_step(a, s)
        # With return_state=True the GRU returns (output, state); only the output is kept
        # and carried forward as the next step's state.
        s, _ = decoder(context, initial_state=s)
        outputs.append(to_probs(maxout(s)))

    model = Model(inputs=[X, s0], outputs=outputs, name="MED")

    return model

In [13]:
# Instantiate the network with the sizes configured in the cells above.
model = encoder_decoder(
    Tx=Tx,
    Ty=Ty,
    enc_size=enc_size,
    dec_size=dec_size,
    in_vocab_size=in_vocab_size,
    out_vocab_size=out_vocab_size,
)



In [14]:
# Print the layer-by-layer overview (output shapes, parameter counts, connections).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 50, 300)      24600       input_2[0][0]                    
__________________________________________________________________________________________________
s0 (InputLayer)                 (None, 100)          0                                            
__________________________________________________________________________________________________
encoder (Bidirectional)         (None, 50, 200)      240600      embedding_2[0][0]                
__________________________________________________________________________________________________
repeat_vec

In [9]:
# compile() configures the model in place and returns None, so its result is not stored.
# Adadelta is already imported at the top of the notebook.
model.compile(optimizer=Adadelta(), metrics=['accuracy'], loss='categorical_crossentropy')