# Imports

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import os
from joblib import Parallel, delayed
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from gensim.models.phrases import Phraser, Phrases
import codecs
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from tabulate import tabulate

In [2]:
# Fetch the NLTK "punkt" sentence-tokenizer data, then load the German Punkt model
nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')

[nltk_data] Downloading package punkt to /home/iuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load the fan-fiction dataset "df_fanfiction.csv"
ff = pd.read_csv("/home/iuser/DH/DH_2020_2021/word_embeddings_HP/Datensatz/df_fanfiction.csv", encoding='utf-8')

# Structure of "ff" and its first 10 entries
#ff.shape #(9811, 5)
#ff.head(10)

In [None]:
# Because the corpora were heavily imbalanced, the original corpus was shortened
# to match the length of the "Harry Potter" dataset - see "Sophia.ipynb".
ff_short = ff[:21] 
ff_short

In [None]:
# From "Word2vec_HP_1_7_ohne Stoppwörter"
# The result is bound to the name `stopwords`, which sentence_to_wordlist reads.
# Importing the corpus reader under a private alias keeps this cell re-runnable:
# without it the reader itself is overwritten and a second run raises
# AttributeError. A set gives O(1) membership tests while filtering tokens.
from nltk.corpus import stopwords as _stopwords_corpus
stopwords = set(_stopwords_corpus.words('german'))

In [None]:
# Helper for cleaning and tokenizing a sentence

def sentence_to_wordlist(raw:str):
    """Clean a raw sentence and return its non-stopword tokens.

    Every character that is not an ASCII letter, an underscore or a German
    umlaut/eszett is replaced by a space. The result is split on whitespace
    and tokens contained in the module-level ``stopwords`` are dropped.
    """
    cleaned = re.sub('[^A-Za-z_äÄöÖüÜß]',' ', raw)
    kept_tokens = []
    for token in cleaned.split():
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [None]:
# Helper for extracting sentences
# returns a list of tokenized sentences
# also takes bigrams into account

def prepare_text(raw_text):
    """Turn a raw text into a list of tokenized sentences with bigrams applied.

    The text is lower-cased and split into sentences with the module-level
    ``tokenizer``. Each sentence is cleaned by ``sentence_to_wordlist`` through
    joblib (``n_jobs=-1``). A gensim ``Phrases`` model is then fitted on the
    token lists and applied to them via ``Phraser``.
    """
    lowered = raw_text.lower()
    cleaning_jobs = (
        delayed(sentence_to_wordlist)(sentence)
        for sentence in tokenizer.tokenize(lowered)
    )
    token_lists = Parallel(n_jobs=-1)(cleaning_jobs)

    bigram_model = Phraser(Phrases(token_lists))
    return list(bigram_model[token_lists])

In [None]:
# To be refined

# Only the first seven texts are used. The individual names text1..text7 are
# kept in case a later cell refers to them.
texts = [ff_short.Text[i] for i in range(7)]
text1, text2, text3, text4, text5, text6, text7 = texts

# Join with a newline instead of bare `+`: without a separator the last word of
# one text is glued to the first word of the next one.
books = '\n'.join(texts)

In [None]:
# Tokenize the combined texts held in `books`
sentences = prepare_text(books)

# `sentences` is a list of tokenized sentences, for example:
print(sentences[1000])

In [None]:
# NOTE(review): this cell rebinds `sentences` to the sentences of a single text
# (ff_short.Text[2]); cells run afterwards that read `sentences` will see only
# that text - confirm this is intended.
text = ff_short.Text[2]

sentences = prepare_text(text)

# `sentences` is a list of tokenized sentences, for example:
print(sentences[1000])

<h3>Training von Word2Vec (mit Gensim)</h3>

In [None]:
# Set parameters (both are passed to Word2Vec in the training cell)
workers = 4
seed = 42 #just because

In [None]:
# Create the folder in which trained models are stored.
# exist_ok=True makes this a no-op when the folder already exists and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs('trained_models', exist_ok=True)

In [None]:
# Training
# NOTE(review): gensim's documentation says a fixed seed is only fully
# reproducible with a single worker thread - confirm whether that matters
# with workers=4.
w2v_ff_short = Word2Vec(sentences=sentences, 
                   vector_size=300,  # formerly the "size" parameter; it had to be renamed
                   window=7, 
                   min_count=3,
                   workers=workers,
                   sg=1,
                   seed=seed)

# Save the trained model
w2v_ff_short.save(os.path.join('trained_models', 'w2v_ff_short.bin'))

In [None]:
# Load the trained model from the file written by the training cell
w2v_ff_short = Word2Vec.load(os.path.join('trained_models', 'w2v_ff_short.bin'))

<h3>Exploration des Word2Vec-Modells</h3>

<h4>Wörter als Vektoren</h4>

In [None]:
# Compares two word vectors (cosine similarity)
# NOTE(review): both arguments are empty-string placeholders - fill in two vocabulary words.

w2v_ff_short.wv.similarity('', '')  # "similarity" method deprecated (original note); called via .wv here

In [None]:
# Similarity of a second word pair (both arguments are still empty-string placeholders)
w2v_ff_short.wv.similarity('', '')

In [None]:
# Word vectors as a DataFrame

word_vectors = w2v_ff_short.wv.vectors  # deprecated `syn0`
tsne = TSNE(n_components=2, random_state=seed)
word_vectors_2d = tsne.fit_transform(word_vectors)

# One row per vocabulary word: the word plus the two t-SNE coordinates found at
# that word's index in the vector matrix.
points = pd.DataFrame(
    [
        (word, *word_vectors_2d[index])  # 'model.wv.vocab["word"].index' deprecated
        for word, index in w2v_ff_short.wv.key_to_index.items()
    ],
    columns=['word', 'x', 'y'])

# Show 5 random words and their coordinates
points.sample(5)

In [None]:
# Plot all word vectors

sns.set_context('poster')

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later removed the old names; use the new name when this matplotlib has it.
whitegrid_style = 'seaborn-whitegrid'
if 'seaborn-v0_8-whitegrid' in plt.style.available:
    whitegrid_style = 'seaborn-v0_8-whitegrid'
plt.style.use(whitegrid_style)
points.plot.scatter('x', 'y', s=10, figsize=(20, 12))

In [None]:
# Helper to navigate the plot (zoom in)

def plot_region(x_bounds, y_bounds, padding=0.005, fontsize=11):
    """Scatter-plot and label the words inside a rectangular window.

    Rows of the module-level ``points`` DataFrame whose ``x`` lies within
    ``x_bounds`` and whose ``y`` lies within ``y_bounds`` (both inclusive) are
    plotted. Each point is annotated with its word, offset by *padding* on
    both axes and drawn with the given *fontsize*.
    """
    x_low, x_high = x_bounds[0], x_bounds[1]
    y_low, y_high = y_bounds[0], y_bounds[1]

    inside_x = (x_low <= points.x) & (points.x <= x_high)
    inside_y = (y_low <= points.y) & (points.y <= y_high)
    region = points[inside_x & inside_y]

    axes = region.plot.scatter('x', 'y', s=35, figsize=(9, 5))
    for _, row in region.iterrows():
        axes.text(row.x + padding, row.y + padding, row.word, fontsize=fontsize)
    

In [None]:
# Zoom in
x_bounds = (15, 20)       # range on the x-axis
y_bounds = (30, 35)       # range on the y-axis

# Points inside the selected window (the same inclusive filter plot_region
# applies). This used to be commented out, which left `myslice` undefined and
# made the len() call below raise NameError.
myslice = points[
    (x_bounds[0] <= points.x) &
    (points.x <= x_bounds[1]) &
    (y_bounds[0] <= points.y) &
    (points.y <= y_bounds[1])]

# Number of words in the window; printed explicitly because only the last
# expression of a cell is displayed.
print(len(myslice))

plot_region(x_bounds=x_bounds, y_bounds=y_bounds)


In [None]:
# Plot only selected word vectors

words =  ['']  # placeholder - fill in the vocabulary words to plot

vectors = [w2v_ff_short.wv[word] for word in words]  # deprecated `__getitem__`, self.wv.__getitem__() instead
plotted_vectors = np.array(vectors[:200])  # at most 200 words are projected

# Recent scikit-learn releases reject perplexity >= n_samples, so the default
# of 30 is lowered when 30 or fewer words are plotted.
tsne = TSNE(n_components=2, random_state=seed,
            perplexity=min(30, len(plotted_vectors) - 1))

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later removed the old names; use the new name when this matplotlib has it.
whitegrid_style = 'seaborn-whitegrid'
if 'seaborn-v0_8-whitegrid' in plt.style.available:
    whitegrid_style = 'seaborn-v0_8-whitegrid'
plt.style.use(whitegrid_style)  # set before creating the figure so it applies to it
plt.figure(figsize=[30,25])

Y = tsne.fit_transform(plotted_vectors)
plt.scatter(Y[:, 0], Y[:, 1])
# zip stops at the shorter input, so labels beyond the 200 projected words are skipped
for label, x, y in zip(words, Y[:, 0], Y[:, 1]):
    plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
plt.show()

<h4>Ähnliche Wörter</h4>

In [None]:
# Top 5 most similar words for a query word (the query is still an empty-string placeholder)
w2v_ff_short.wv.most_similar(positive=[''], topn=5)

In [None]:
# Top 5 most similar words for a second query word (empty-string placeholder)
w2v_ff_short.wv.most_similar(positive=[''], topn=5)

In [None]:
# Top 5 most similar words to 'hexe'
w2v_ff_short.wv.most_similar(positive=['hexe'], topn=5)

In [None]:
# Helper to plot the most similar words of a word in vector space

def plot_closest_words(model, word, plot_style):
    """Plot *word* and its most similar words in a 2-D t-SNE projection.

    Args:
        model: Keyed word vectors (the callers in this file pass
            ``w2v_ff_short.wv``) offering ``similar_by_word`` and item access
            by word.
        word: Query word; it is looked up in *model*.
        plot_style: Matplotlib style handed to ``plt.style.use``. A legacy
            ``'seaborn...'`` name is mapped to its ``'seaborn-v0_8...'``
            replacement when only the latter is available.

    The function shows the figure and returns ``None``.
    """
    closest_words = model.similar_by_word(word)
    word_labels = [word] + [neighbour[0] for neighbour in closest_words]

    # Stack all vectors in one step; the width comes from the vectors
    # themselves instead of a hard-coded 300, and no array is re-copied per word.
    arr = np.array([model[label] for label in word_labels], dtype='f')

    # matplotlib 3.6 renamed the bundled seaborn styles and later removed the
    # old names, so fall back to the renamed style when necessary.
    if isinstance(plot_style, str) and plot_style not in plt.style.available:
        renamed_style = plot_style.replace('seaborn', 'seaborn-v0_8', 1)
        if renamed_style in plt.style.available:
            plot_style = renamed_style

    # Apply the style before the figure exists so the figure picks it up too.
    plt.style.use(plot_style)
    plt.figure(figsize=[12,6])

    # Recent scikit-learn releases reject perplexity >= n_samples; with the
    # handful of neighbours plotted here the default of 30 would be too large.
    tsne = TSNE(n_components=2, random_state=42,
                perplexity=min(30, len(word_labels) - 1))
    # Kept from the original: changes how NumPy prints floats for the session.
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)

    x_coords = Y[:, 0]
    y_coords = Y[:, 1]

    plt.scatter(x_coords, y_coords)

    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')

    # Pad outwards on both sides; adding the margin to the minimum (as before)
    # moved the lower limit past the smallest point.
    margin = 0.00005
    plt.xlim(x_coords.min() - margin, x_coords.max() + margin)
    plt.ylim(y_coords.min() - margin, y_coords.max() + margin)
    plt.show()

In [None]:
# Plot the closest words of a query word (the query is still an empty-string placeholder)
plot_closest_words(w2v_ff_short.wv, '', 'seaborn-whitegrid')

In [None]:
# Plot the closest words of a second query word (empty-string placeholder)
plot_closest_words(w2v_ff_short.wv, '', 'seaborn-whitegrid')

<h3>Verrechnung der Vektoren</h3>

In [None]:
# Helper for building analogies

def analogy(model, word1, word2, word3):
    """Complete the analogy "word1 is to word2 as ? is to word3".

    Calls ``model.most_similar`` with *word1* and *word3* as positive and
    *word2* as negative terms, prints the resulting sentence and returns the
    first word of the top-ranked result.
    """
    ranked = model.most_similar(negative=[word2], positive=[word1, word3])
    best_match = ranked[0]
    word4 = best_match[0]
    print(f'{word1} steht in Beziehung zu {word2}, wie {word4} zu {word3}')
    return word4

In [None]:
# Analogy query (all three words are still empty-string placeholders)
analogy(w2v_ff_short.wv, '', '', '')

In [None]:
# Second analogy query (empty-string placeholders)
analogy(w2v_ff_short.wv, '', '', '')