In [1]:
# -*- coding: utf-8 -*-''
from twec.twec import TWEC
from gensim.models.word2vec import Word2Vec, PathLineSentences
from gensim.models import KeyedVectors
from gensim import utils
from pathlib import Path
import numpy as np
import pickle
import os
import glob
import itertools
import importlib

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
data_folder = Path('../obj')

In [4]:
def load_corpus(filepath):
    with open(str(data_folder / filepath), 'rb') as f:
        corpus = pickle.load(f)
        return corpus

#### Create slices for TWEC - slices for both training and testing

In [5]:
# Load preprocessed slices
slice_1_spell = load_corpus('slice_1_lem_spell.pkl')
slice_2_spell = load_corpus('slice_2_lem_spell.pkl')
slice_3_spell = load_corpus('slice_3_lem_spell.pkl')
slice_4_spell = load_corpus('slice_4_lem_spell.pkl')
slice_5_spell = load_corpus('slice_5_lem_spell.pkl')

In [10]:
slice_1_spell = list(itertools.chain(*slice_1_spell))
slice_2_spell = list(itertools.chain(*slice_2_spell))
slice_3_spell = list(itertools.chain(*slice_3_spell))
slice_4_spell = list(itertools.chain(*slice_4_spell))
slice_5_spell = list(itertools.chain(*slice_5_spell))

In [13]:
# Save slices in LineSentence format
utils.save_as_line_sentence(slice_1_spell, os.path.join(Path('.\\examples\\training\\slice_1.txt')))
utils.save_as_line_sentence(slice_2_spell, os.path.join(Path('.\\examples\\training\\slice_2.txt')))
utils.save_as_line_sentence(slice_3_spell, os.path.join(Path('.\\examples\\training\\slice_3.txt')))
utils.save_as_line_sentence(slice_4_spell, os.path.join(Path('.\\examples\\training\\slice_4.txt')))
utils.save_as_line_sentence(slice_5_spell, os.path.join(Path('.\\examples\\training\\slice_5.txt')))

In [18]:
full_corpus = slice_1_spell + slice_2_spell + slice_3_spell + slice_4_spell + slice_5_spell

In [25]:
utils.save_as_line_sentence(full_corpus,'.\\examples\\training\\compass.txt') 

In [4]:
# standard params gensim.Word2Vec iter=5, ns=10 
aligner = TWEC(size=200, siter=5, diter=5, workers=1, ns=10)
aligner.train_compass("examples/training/compass", overwrite=True)

Training the compass.
Compass will be overwritten after training
New train_model


In [21]:
# This param setting is needed to evaluate the temporal embedding spaces --> no negative samples and hierarchical softmax applied 
# aligner = TWEC(size=200, siter=10, diter=10, workers=4, ns=0, hs=1, test='.\\examples\\testing')

TypeError: __init__() got an unexpected keyword argument 'hs'

In [5]:
slice_1 = aligner.train_slice("examples/training/compass/slice_1.txt", save=True)
slice_2 = aligner.train_slice("examples/training/compass/slice_2.txt", save=True)
slice_3 = aligner.train_slice("examples/training/compass/slice_3.txt", save=True)
slice_4 = aligner.train_slice("examples/training/compass/slice_4.txt", save=True)
slice_5 = aligner.train_slice("examples/training/compass/slice_5.txt", save=True)

Training temporal embeddings: slice examples/training/compass/slice_1.txt.
New train_model
Initializing temporal embeddings from the atemporal compass.
Training temporal embeddings: slice examples/training/compass/slice_2.txt.
New train_model
Initializing temporal embeddings from the atemporal compass.
Training temporal embeddings: slice examples/training/compass/slice_3.txt.
New train_model
Initializing temporal embeddings from the atemporal compass.
Training temporal embeddings: slice examples/training/compass/slice_4.txt.
New train_model
Initializing temporal embeddings from the atemporal compass.
Training temporal embeddings: slice examples/training/compass/slice_5.txt.
New train_model
Initializing temporal embeddings from the atemporal compass.


In [5]:
compass = Word2Vec.load("model/compass.model") 
model1 = Word2Vec.load("model/slice_1.model")
model2 = Word2Vec.load("model/slice_2.model")
model3 = Word2Vec.load("model/slice_3.model")
model4 = Word2Vec.load("model/slice_4.model")
model5 = Word2Vec.load("model/slice_5.model")

In [6]:
compass = compass.wv
model_1 = model1.wv
model_2 = model2.wv
model_3 = model3.wv
model_4 = model4.wv
model_5 = model5.wv

In [7]:
print(len(compass.vocab))
print(len(model_1.vocab))
print(len(model_2.vocab))
print(len(model_3.vocab))
print(len(model_4.vocab))
print(len(model_5.vocab))

160889
52598
51592
60557
68109
62447


In [8]:
compass.save_word2vec_format('model/compass.txt', binary=True)
model_1.save_word2vec_format('model/slice_1.txt', binary=True)
model_2.save_word2vec_format('model/slice_2.txt', binary=True)
model_3.save_word2vec_format('model/slice_3.txt', binary=True)
model_4.save_word2vec_format('model/slice_4.txt', binary=True)
model_5.save_word2vec_format('model/slice_5.txt', binary=True)

#### Create slices for TWEC - slices for both training and testing

In [45]:
def generate_train_and_test_sets(train, split=0.05):
    test = []
    for i in range(0,round(split * len(train))):
        random_number = np.random.randint(0,len(train))
        # Remove doc at given index and add to test set
        test.append(train.pop(random_number))
    return train,test

In [49]:
slice_1_train, slice_1_test = generate_train_and_test_sets(slice_1_spell)
slice_2_train, slice_2_test = generate_train_and_test_sets(slice_2_spell)
slice_3_train, slice_3_test = generate_train_and_test_sets(slice_3_spell)
slice_4_train, slice_4_test = generate_train_and_test_sets(slice_4_spell)
slice_5_train, slice_5_test = generate_train_and_test_sets(slice_5_spell)

In [None]:
# Save testing slices
gensim.utils.save_as_line_sentence(slice_1_test, os.path.join(Path('.\\twec\\examples\\testing\\slice_1_test.txt')))
gensim.utils.save_as_line_sentence(slice_2_test, os.path.join(Path('.\\twec\\examples\\testing\\slice_2_test.txt')))
gensim.utils.save_as_line_sentence(slice_3_test, os.path.join(Path('.\\twec\\examples\\testing\\slice_3_test.txt')))
gensim.utils.save_as_line_sentence(slice_4_test, os.path.join(Path('.\\twec\\examples\\testing\\slice_4_test.txt')))
gensim.utils.save_as_line_sentence(slice_5_test, os.path.join(Path('.\\twec\\examples\\testing\\slice_5_test.txt')))

In [15]:
model_1.most_similar('haß')
model_2.most_similar('haß')
model_3.most_similar('haß')
model_4.most_similar('haß')
model_5.most_similar('haß')

[('feindschaft', 0.7777882218360901),
 ('antipathie', 0.7710212469100952),
 ('widerwillen', 0.7268883585929871),
 ('verachtung', 0.7198671698570251),
 ('feindseligkeit', 0.7176477909088135),
 ('abneigung', 0.6986110806465149),
 ('groll', 0.693888783454895),
 ('animosität', 0.6788550615310669),
 ('verbitterung', 0.6756240725517273),
 ('unbilden', 0.6729105710983276)]

[('feindschaft', 0.7855952382087708),
 ('mißgunst', 0.7646018266677856),
 ('feindseligkeit', 0.7410603761672974),
 ('gehässigkeit', 0.7298420667648315),
 ('neid', 0.7282276153564453),
 ('abneigung', 0.7243658304214478),
 ('mißtrauen', 0.6989201307296753),
 ('antipathie', 0.6822444200515747),
 ('wut', 0.6798548698425293),
 ('verbitterung', 0.6706445217132568)]

[('feindschaft', 0.7284114360809326),
 ('verachtung', 0.7158603072166443),
 ('neid', 0.705698549747467),
 ('groll', 0.6960954666137695),
 ('antipathie', 0.6945077776908875),
 ('mißtrauen', 0.6902238726615906),
 ('hetze', 0.6819382309913635),
 ('abneigung', 0.6748616695404053),
 ('verbitterung', 0.6705626249313354),
 ('wut', 0.6676822900772095)]

[('feindschaft', 0.7295164465904236),
 ('mißgunst', 0.7236109972000122),
 ('neid', 0.6853048801422119),
 ('groll', 0.672009289264679),
 ('abneigung', 0.6471599340438843),
 ('vorurteil', 0.6460909843444824),
 ('mißtrauen', 0.6455237865447998),
 ('hetze', 0.6418379545211792),
 ('wut', 0.6321764588356018),
 ('verachtung', 0.6308550238609314)]

[('wut', 0.7203575968742371),
 ('feindschaft', 0.7080402374267578),
 ('neid', 0.65813148021698),
 ('leidenschaft', 0.6579920053482056),
 ('vorurteil', 0.6500828266143799),
 ('verachtung', 0.6427229642868042),
 ('aufbäumen', 0.636103630065918),
 ('gift', 0.6285501718521118),
 ('hetze', 0.6277855634689331),
 ('abscheu', 0.6212161779403687)]

In [33]:
jude_1 = model_1['jude']
jude_2 = model_2['jude']
jude_3 = model_3['jude']
jude_4 = model_4['jude']
jude_5 = model_5['jude']
christ_1 = model_1['christ']
christ_2 = model_2['christ']
christ_3 = model_3['christ']
christ_4 = model_4['christ']
christ_5 = model_5['christ']

print('Jude:')
print(np.dot(jude_1, jude_2) / (np.linalg.norm(jude_1) * np.linalg.norm(jude_2)))
print(np.dot(jude_1, jude_3) / (np.linalg.norm(jude_1) * np.linalg.norm(jude_3)))
print(np.dot(jude_1, jude_4) / (np.linalg.norm(jude_1) * np.linalg.norm(jude_4)))
print(np.dot(jude_1, jude_5) / (np.linalg.norm(jude_1) * np.linalg.norm(jude_5)))

print('\n')

print('Christ:')
print(np.dot(jude_1, christ_2) / (np.linalg.norm(jude_1) * np.linalg.norm(christ_2)))
print(np.dot(jude_1, christ_3) / (np.linalg.norm(jude_1) * np.linalg.norm(christ_3)))
print(np.dot(jude_1, christ_4) / (np.linalg.norm(jude_1) * np.linalg.norm(christ_4)))
print(np.dot(jude_1, christ_5) / (np.linalg.norm(jude_1) * np.linalg.norm(christ_5)))

Jude:
0.5276708
0.5594604
0.5391677
0.5198425


Christ:
0.47117013
0.41780522
0.39033422
0.45575985
