In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt

pd.options.display.max_colwidth = 200
%matplotlib inline

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

## Load up sample corpus - Bible

In [3]:
from nltk.corpus import gutenberg
from string import punctuation

bible = gutenberg.sents('bible-kjv.txt')
remove_terms = punctuation + '0123456789'

norm_bible = [[word.lower() for word in sent if word not in remove_terms] for sent in bible]
norm_bible = [' '.join(tok_sent) for tok_sent in norm_bible]
norm_bible = filter(None, normalize_corpus(norm_bible))
norm_bible = [tok_sent for tok_sent in norm_bible if len(tok_sent.split()) > 2]

print('Total lines:', len(bible))
print('\nSample line:', bible[10])
print('\nProcessed line:', norm_bible[10])

Total lines: 30103

Sample line: ['1', ':', '6', 'And', 'God', 'said', ',', 'Let', 'there', 'be', 'a', 'firmament', 'in', 'the', 'midst', 'of', 'the', 'waters', ',', 'and', 'let', 'it', 'divide', 'the', 'waters', 'from', 'the', 'waters', '.']

Processed line: god said let firmament midst waters let divide waters waters


In [4]:
# wir kürzen "etwas", damit die Berechnung schneller geht...
#  gerne später rückgängig machen

norm_bible = norm_bible[:5000]

## Implementing a word2vec model using a CBOW (Continuous Bag of Words) neural network architecture

### Build Vocabulary

In [None]:
from keras.preprocessing import text
from keras import utils
from keras.preprocessing import sequence

tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)
word2id = tokenizer.word_index

word2id['PAD'] = 0
id2word = {v: k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in norm_bible]

vocab_size = len(word2id)
embed_size = 100
window_size = 2

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

### Build (context_words, target_word) pair generator

In [None]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size * 2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word = []
            start = index - window_size
            end = index + window_size + 1

            context_words.append([words[i]
                                  for i in range(start, end)
                                  if 0 <= i < sentence_length
                                  and i != index])
            label_word.append(word)

            x = sequence.pad_sequences(context_words, maxlen=context_length)
            y = utils.to_categorical(label_word, vocab_size)
            yield (x, y)


i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
    if 0 not in x[0]:
        print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])

        if i == 10:
            break
        i += 1

### Build CBOW Deep Network Model

In [22]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size * 2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))

cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')
print(cbow.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 4, 100)            462200    
                                                                 
 lambda_2 (Lambda)           (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 4622)              466822    
                                                                 
Total params: 929022 (3.54 MB)
Trainable params: 929022 (3.54 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


### Train model for 5 epochs

Achtung: dauert etwas...

In [24]:
for epoch in range(1, 6):
    loss = 0.
    i = 0
    for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)
    print()

Epoch: 1 	Loss: 573281.9398591736
Epoch: 2 	Loss: 573792.0676976203
Epoch: 3 	Loss: 574732.1212097561
Epoch: 4 	Loss: 573669.8267420736
Epoch: 5 	Loss: 572883.4556642018


### Get word embeddings

In [25]:
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)


(4621, 100)


In [26]:
pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
unto,1.233148,0.71638,0.795285,0.687711,0.717798,-0.608559,0.849003,1.028589,0.206659,-0.528851,...,-0.981629,0.626677,-1.185195,-0.984692,0.810447,-1.052438,-0.872827,-0.956356,1.36029,-1.319813
lord,0.730807,-0.523129,-0.187607,0.017499,1.060174,-0.403305,-0.145262,0.806336,0.480354,0.532435,...,0.106211,0.620304,0.531243,0.392222,-0.090168,-0.739844,0.007493,0.156161,-0.408205,0.901727
thou,-0.270326,-0.285472,-0.041295,-0.023545,0.403086,-0.002234,-0.364126,0.373829,0.698925,0.686178,...,0.388882,0.675534,0.415061,0.413768,0.820637,0.304831,-0.860124,0.312828,-0.102423,-1.264907
thy,1.529649,1.392774,0.077602,1.194607,0.088102,1.163991,-0.041349,0.251497,-1.067705,-0.505953,...,-1.301913,1.258092,0.79997,-1.415297,-0.405153,-1.372715,-0.580238,1.005748,1.309335,-0.701609
thee,-0.047566,0.246222,-0.100607,0.052973,0.710275,0.39817,0.379861,0.238995,0.174828,-0.272554,...,-0.434646,0.322128,-0.369394,-0.326891,0.548898,-0.304507,0.124086,-0.547234,0.49408,-0.517652


### Build a distance matrix to view the most similar words (contextually)

In [27]:
from sklearn.metrics.pairwise import euclidean_distances

# compute pairwise distance matrix
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)


(4621, 4621)


In [31]:
# view contextually similar words
similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term] - 1].argsort()[1:6] + 1]
                 for search_term in ['god', 'noah', 'egypt', 'water', 'moses', 'famine']}

similar_words

{'god': ['hath', 'face', 'blessed', 'taken', 'behold'],
 'noah': ['shem', 'flood', 'lamech', 'ham', 'sixty'],
 'egypt': ['canaan', 'pharaoh', 'joseph', 'servants', 'dwell'],
 'water': ['clothes', 'bathe', 'flesh', 'clean', 'unclean'],
 'moses': ['aaron', 'daughter', 'pharaoh', 'balaam', 'joseph'],
 'famine': ['phallu', 'prison', 'elder', 'shaken', 'fourscore']}