# Aufgabenstellung
Übertragen sie die Berechnung von Wortvektoren als Embeddings auf die deutsche Sprache. Verwenden sie zuerst einen sehr kleinen Korpus, auch wenn damit keine guten Ergebnisse zu erzielen sind. Es geht ums Prinzip der Berechnung.

# Text einlesen

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from string import punctuation

In [46]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('german')

def normalize_document(doc):
    remove_terms = punctuation + '0123456789'
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-ZäöüÄÖÜ\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)
text_corpus = []

with open("./amrain.txt", "r", encoding="utf-8") as f:
    text_amrain = f.readlines()
    
with open("./pankraz.txt", "r", encoding="utf-8") as f:
    text_pankraz = f.readlines()

text_corpus = text_amrain + text_pankraz

remove_terms = punctuation + '0123456789'
norm_text = [[word.lower() for word in sent if word not in remove_terms] for sent in text_corpus]
norm_text = [''.join(sent) for sent in norm_text]


norm_text = filter(None, normalize_corpus(norm_text))
norm_text = [tok_sent for tok_sent in norm_text if len(tok_sent.split()) > 2]
print('Total lines:', len(text_corpus))
print('\nSample line:', text_corpus[0])
print('\nProcessed line:', norm_text[0])


Total lines: 318

Sample line: Regula Amrain war die Frau eines abwesenden Seldwylers; dieser hatte einen großen Steinbruch hinter dem Städtchen besessen und seine Zeitlang ausgebeutet und zwar auf Seldwyler Art. Das ganze Nest war beinahe aus dem guten Sandstein gebaut, aus welchem der Berg bestand; aber das Schuldenwesen, das auf den Häusern ruhte, hatte von jeher recht eigentlich schon mit den Steinen begonnen, aus denen sie gebaut waren; denn nichts schien den Seldwylern so wohl geeignet als Stoff und Gegenstand eines muntern Verkehrs als ein solcher Steinbruch, und derselbe glich einer in Felsen gehauenen römischen Schaubühne, über welche die Besitzer emsig hinwegliefen, einer den andern jagend.


Processed line: regula amrain frau abwesenden seldwylers groen steinbruch städtchen besessen zeitlang ausgebeutet seldwyler art ganze nest beinahe guten sandstein gebaut berg bestand schuldenwesen häusern ruhte jeher recht eigentlich schon steinen begonnen denen gebaut schien seldwylern 

# Vokabular erstellen

In [48]:
from keras.preprocessing import text
from keras import utils
from keras.preprocessing import sequence

tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(norm_text)
word2id = tokenizer.word_index

word2id['PAD'] = 0
id2word = {v: k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in norm_text]

vocab_size = len(word2id)
embed_size = 100
window_size = 2

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

Vocabulary Size: 6909
Vocabulary Sample: [('mehr', 1), ('mutter', 2), ('wurde', 3), ('ganz', 4), ('schon', 5), ('sah', 6), ('wohl', 7), ('ging', 8), ('sagte', 9), ('frau', 10)]


# Kontextbezug herstellen

In [49]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size * 2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word = []
            start = index - window_size
            end = index + window_size + 1

            context_words.append([words[i]
                                  for i in range(start, end)
                                  if 0 <= i < sentence_length
                                  and i != index])
            label_word.append(word)

            x = sequence.pad_sequences(context_words, maxlen=context_length)
            y = utils.to_categorical(label_word, vocab_size)
            yield (x, y)


i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
    if 0 not in x[0]:
        print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])

        if i == 10:
            break
        i += 1

Context (X): ['regula', 'amrain', 'abwesenden', 'seldwylers'] -> Target (Y): frau
Context (X): ['amrain', 'frau', 'seldwylers', 'groen'] -> Target (Y): abwesenden
Context (X): ['frau', 'abwesenden', 'groen', 'steinbruch'] -> Target (Y): seldwylers
Context (X): ['abwesenden', 'seldwylers', 'steinbruch', 'städtchen'] -> Target (Y): groen
Context (X): ['seldwylers', 'groen', 'städtchen', 'besessen'] -> Target (Y): steinbruch
Context (X): ['groen', 'steinbruch', 'besessen', 'zeitlang'] -> Target (Y): städtchen
Context (X): ['steinbruch', 'städtchen', 'zeitlang', 'ausgebeutet'] -> Target (Y): besessen
Context (X): ['städtchen', 'besessen', 'ausgebeutet', 'seldwyler'] -> Target (Y): zeitlang
Context (X): ['besessen', 'zeitlang', 'seldwyler', 'art'] -> Target (Y): ausgebeutet
Context (X): ['zeitlang', 'ausgebeutet', 'art', 'ganze'] -> Target (Y): seldwyler
Context (X): ['ausgebeutet', 'seldwyler', 'ganze', 'nest'] -> Target (Y): art


# Neuronales Netzwerk erstellen

In [50]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size * 2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))

cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')
print(cbow.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 4, 100)            690900    
                                                                 
 lambda_1 (Lambda)           (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 6909)              697809    
                                                                 
Total params: 1388709 (5.30 MB)
Trainable params: 1388709 (5.30 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


# Modell für 5 Epochen trainieren

In [51]:
for epoch in range(1, 6):
    loss = 0.
    i = 0
    for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)
    print()

Epoch: 1 	Loss: 138485.29325437546

Epoch: 2 	Loss: 138084.50889873505

Epoch: 3 	Loss: 137780.97097921371

Epoch: 4 	Loss: 137383.44759082794

Epoch: 5 	Loss: 136825.6836619377



# Word Embeddings

In [52]:
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

(6908, 100)


In [53]:
pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
mutter,0.900185,0.436805,-0.590087,0.424985,0.191628,0.309301,-0.219675,0.216454,-0.657697,0.710062,...,0.486133,0.118618,0.477306,0.223108,0.535116,-0.388917,0.377261,-0.454641,-1.067472,-0.44536
wurde,0.917882,0.055035,-0.232964,0.447332,-0.280784,0.824357,-0.641831,0.53987,-0.850077,0.359259,...,0.506576,-0.200272,-0.078323,0.374256,0.512242,0.454328,1.032714,0.355244,-0.513173,0.344595
ganz,0.468482,0.695844,-0.121286,0.839853,0.423197,-0.353781,0.380722,0.449927,-0.488304,0.412985,...,-0.084821,-0.057305,0.495286,-0.174868,0.143616,-0.109467,-0.104221,-0.095787,0.075746,0.171558
schon,0.603444,0.301495,0.435678,0.584432,-0.219162,-0.321656,0.360808,0.581558,-0.058781,0.406662,...,-0.180848,-0.163308,0.084813,0.575552,0.446059,-0.074478,-0.434575,-0.251001,-0.755862,-0.622512
sah,0.64232,-0.585364,0.461161,-0.645764,0.501583,-0.560788,-0.104652,0.420738,0.223776,-0.286944,...,0.808237,-0.149937,-0.142692,-0.296029,0.487739,0.114529,0.8677,0.034719,0.2517,-0.319899


# Distanzmatrix bilden und ähnliche Wörter finden

In [54]:
from sklearn.metrics.pairwise import euclidean_distances

# compute pairwise distance matrix
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

(6908, 6908)


In [55]:
# view contextually similar words
similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term] - 1].argsort()[1:6] + 1]
                 for search_term in ['amrain', 'städtchen', 'ausgebeutet', 'zeitlang', 'abwesenden', 'regula']}

similar_words

{'amrain': ['regula', 'regel', 'nachdem', 'etwa', 'daher'],
 'städtchen': ['nachzufahren', 'ganzes', 'tatzen', 'wüster', 'begibt'],
 'ausgebeutet': ['handlichen',
  'republikaner',
  'kleinlichsten',
  'klammernd',
  'einrichtungen'],
 'zeitlang': ['schleunigst',
  'zärtlichkeit',
  'anblick',
  'gebaut',
  'anzukünden'],
 'abwesenden': ['suchten',
  'scheidung',
  'vierzehnten',
  'beschaffenheit',
  'nahegelegt'],
 'regula': ['regel', 'nachdem', 'deswegen', 'steckte', 'wohlbegütert']}