# modules

## setup

### imports

In [None]:
import os
import pickle
import random
import time
from functools import partial
from typing import TypeAlias

import hnswlib
import hunspell
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import spacy
from gensim.models import Word2Vec
from scipy.linalg import orthogonal_procrustes
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

### config

In [None]:
# Toggle for the inline smoke tests that follow most function definitions.
ENABLE_TEST = True

# Input and cache locations. Paths are built by plain string concatenation,
# so the trailing slash on each folder is required.
MODELS_WORD2VEC_FOLDER = "/veld/input/models/"
TEXTS_FOLDER = "/veld/input/texts/"
CACHE_FOLDER = "/veld/storage/cache/"

# Build parameters handed to hnswlib's init_index (ef_construction and M).
INDEX_EF_CONSTRUCTION = 100
INDEX_M = 16

# NOTE(review): not referenced in the visible part of the file; presumably a
# pause in seconds between plots — confirm.
PLOT_SLEEP = 2

# Shared resources: spaCy pipeline used for lemmatization and the hunspell
# dictionary used by is_word.
nlp = spacy.load("de_core_news_sm")
hunspell_check = hunspell.HunSpell("/usr/share/hunspell/de_DE.dic", "/usr/share/hunspell/de_DE.aff")
# Fixed seed for Python's random module.
random.seed(42)
# Select plotly's "iframe" renderer as the default.
pio.renderers.default = "iframe"

## helpers

### pickle_save

In [None]:
def pickle_save(data, file):
    """Pickle ``data`` into the cache folder as ``<file>.pkl`` and print the target path.

    Args:
        data: Any picklable object.
        file: Cache entry name without folder or extension.
    """
    target_path = CACHE_FOLDER + file + ".pkl"
    with open(target_path, "wb") as handle:
        pickle.dump(data, handle)
        print("pickle_save: persisted into cache at:", target_path)

### pickle_load

In [None]:
def pickle_load(file):
    """Return the object stored in the cache folder as ``<file>.pkl`` and print the source path.

    Only use this on cache files written by this program: unpickling data from
    an untrusted source can execute arbitrary code.

    Args:
        file: Cache entry name without folder or extension.
    """
    source_path = CACHE_FOLDER + file + ".pkl"
    with open(source_path, "rb") as handle:
        restored = pickle.load(handle)
        print("pickle_load: loaded from cache at:", source_path)
    return restored

### load_cache_or_run

In [None]:
def load_cache_or_run(func):
    """Return the cached result of ``func`` if present, otherwise run it and cache the result.

    The cache entry is named after ``func.__name__``, so two functions with the
    same name share one cache file.

    Args:
        func: Zero-argument callable producing a picklable result.
    """
    cache_name = func.__name__
    cache_path = CACHE_FOLDER + cache_name + ".pkl"
    if not os.path.exists(cache_path):
        computed = func()
        pickle_save(computed, cache_name)
        return computed
    return pickle_load(cache_name)

### is_word

In [None]:
def is_word(word):
    """Return whether ``word`` counts as a real word.

    Anything that parses as an integer and any single-character string is
    rejected; everything else is decided by the hunspell spell check.

    Args:
        word: Candidate token.

    Returns:
        False for integers and single characters, otherwise the result of
        ``hunspell_check.spell(word)``.
    """
    try:
        int(word)
    except ValueError:
        # Not a number: fall through to the length and dictionary checks.
        # Only ValueError is caught so that KeyboardInterrupt and friends are
        # no longer swallowed as they were by the former bare ``except``.
        pass
    else:
        return False
    if len(word) == 1:
        return False
    return hunspell_check.spell(word)

## data structures

### type aliases

In [None]:
# Scalar aliases: builtin types given domain names so that signatures document themselves.
Lemma: TypeAlias = str
Word: TypeAlias = str
Decade: TypeAlias = int  # parsed from model file names such as "174.bin"; presumably year // 10 — confirm
LineNumber: TypeAlias = int
WordNumber: TypeAlias = int
OccurrenceCount: TypeAlias = int
Diff: TypeAlias = float
Trajectory: TypeAlias = float
CosSim: TypeAlias = float
DecadesStr: TypeAlias = str  # spans three decades, e.g. "174-176"
Embedding: TypeAlias = np.ndarray
Model: TypeAlias = Word2Vec
DecadeList: TypeAlias = list[Decade]

# lemma data structures
LemmaDiffDict: TypeAlias = dict[Lemma, Diff]
LemmaTrajectoryDict: TypeAlias = dict[Lemma, Trajectory]
# line number -> positions of the words on that line that belong to one lemma
LineNumberDict: TypeAlias = dict[LineNumber, list[WordNumber]]
LemmaOccurrencePositionDict: TypeAlias = dict[Lemma, LineNumberDict]
LemmaOccurrenceCountDict: TypeAlias = dict[Lemma, OccurrenceCount]
LemmaWordDict: TypeAlias = dict[Lemma, list[Word]]
WordLemmaDict: TypeAlias = dict[Word, Lemma]

# index data structure: the hnswlib index plus the two lemma <-> id lookup tables
IdToLemmaDict: TypeAlias = dict[int, Lemma]
LemmaToIdDict: TypeAlias = dict[Lemma, int]
Index: TypeAlias = tuple[hnswlib.Index, LemmaToIdDict, IdToLemmaDict]

# NOTE(review): list[...] with six parameters is not a meaningful generic; a fixed-size
# record is normally spelled tuple[...]. Confirm how DecadeData is constructed before changing it.
DecadeData: TypeAlias = list[Index, WordLemmaDict, LemmaWordDict, LemmaOccurrenceCountDict, LemmaOccurrencePositionDict, Decade]
DecadeDict: TypeAlias = dict[Decade, DecadeData]
LemmaDecadeDiffDict: TypeAlias = dict[Lemma, dict[DecadesStr, Diff]]

### create_decades_list

In [None]:
def create_decades_list(decade_start: int = 155, decade_end: int = 191) -> DecadeList:
    """List the decades that have a ``.bin`` model file, restricted to a start/end window.

    Decades are read from the file names in the model folder and sorted
    ascending. A bound only takes effect when it matches an existing decade
    exactly; otherwise the list is left open on that side.

    Args:
        decade_start: First decade to keep (inclusive).
        decade_end: Last decade to keep (inclusive).

    Returns:
        The sorted, windowed list of decades.
    """
    decade_list = sorted(
        int(file_name.split(".bin")[0]) for file_name in os.listdir(MODELS_WORD2VEC_FOLDER) if file_name.endswith(".bin")
    )
    first, stop = 0, len(decade_list)
    for position, decade in enumerate(decade_list):
        if decade_start and decade == decade_start:
            first = position
        if decade_end and decade == decade_end:
            stop = position + 1
    decade_list = decade_list[first:stop]
    print("create_decades_list: decade_list:", decade_list)
    return decade_list


def create_decades_list_test():
    """Smoke test: build the decade list for the window 165 to 175."""
    return create_decades_list(decade_start=165, decade_end=175)


# Result is cached by load_cache_or_run under the test function's name.
if ENABLE_TEST:
    decade_list = load_cache_or_run(create_decades_list_test)

### load_model

In [None]:
def load_model(decade: Decade) -> Model:
    """Load the Word2Vec model stored as ``<decade>.bin`` in the model folder.

    Args:
        decade: Decade whose model file should be loaded.

    Returns:
        The loaded Word2Vec model.
    """
    model_path = f"{MODELS_WORD2VEC_FOLDER}{decade!s}.bin"
    loaded = Word2Vec.load(model_path)
    print("load_model_word2vec: model_path:", model_path)
    print("load_model_word2vec: model:", loaded)
    return loaded


def load_model_test():
    """Smoke test: load the models of decades 174, 175 and 176."""
    model_174 = load_model(174)
    model_175 = load_model(175)
    model_176 = load_model(176)
    return model_174, model_175, model_176


# NOTE(review): unlike some other smoke tests, this call is not guarded by ENABLE_TEST — confirm that is intended.
model_174, model_175, model_176 = load_cache_or_run(load_model_test)

### create_lemma_word_dicts

In [None]:
def create_lemma_word_dicts(model: Model) -> tuple[LemmaWordDict, WordLemmaDict]:
    """Map every vocabulary word of ``model`` to its lemma, and every lemma to its words.

    Each word is lemmatized on its own by running it through the spaCy
    pipeline and taking the lemma of the first token.

    Args:
        model: Word2Vec model whose vocabulary is processed.

    Returns:
        ``(lemma_word_dict, word_lemma_dict)``.
    """
    lemma_word_dict = {}
    word_lemma_dict = {}
    for vocab_word in model.wv.index_to_key:
        vocab_lemma = nlp(vocab_word)[0].lemma_
        lemma_word_dict.setdefault(vocab_lemma, []).append(vocab_word)
        word_lemma_dict[vocab_word] = vocab_lemma
    print("create_lemma_word_dicts: len(lemma_word_dict):", len(lemma_word_dict))
    return lemma_word_dict, word_lemma_dict


def create_lemma_word_dicts_test():
    """Smoke test: build the lemma/word dictionaries for the three test models."""
    lemma_word_dict_174, word_lemma_dict_174 = create_lemma_word_dicts(model_174)
    lemma_word_dict_175, word_lemma_dict_175 = create_lemma_word_dicts(model_175)
    lemma_word_dict_176, word_lemma_dict_176 = create_lemma_word_dicts(model_176)
    return lemma_word_dict_174, word_lemma_dict_174, lemma_word_dict_175, word_lemma_dict_175, lemma_word_dict_176, word_lemma_dict_176


# Runs regardless of ENABLE_TEST; the result is cached under the test function's name.
lemma_word_dict_174, word_lemma_dict_174, lemma_word_dict_175, word_lemma_dict_175, lemma_word_dict_176, word_lemma_dict_176 = (
    load_cache_or_run(create_lemma_word_dicts_test)
)

### create_index

In [None]:
def create_index(lemma_word_dict: LemmaWordDict, model: Model) -> Index:
    """Build a cosine-space hnswlib index holding one unit-length embedding per lemma.

    A lemma's embedding is the mean of the model vectors of all its words,
    scaled to unit length. Lemma ids are assigned in the iteration order of
    ``lemma_word_dict``, starting at 0.

    Args:
        lemma_word_dict: Lemma to list of its vocabulary words.
        model: Word2Vec model providing the word vectors.

    Returns:
        ``(hnsw_index, lemma_to_id_dict, id_to_lemma_dict)``.
    """
    id_to_lemma_dict: IdToLemmaDict = dict(enumerate(lemma_word_dict))
    lemma_to_id_dict: LemmaToIdDict = {lemma: lemma_id for lemma_id, lemma in id_to_lemma_dict.items()}

    unit_vectors = []
    for word_list in lemma_word_dict.values():
        mean_vector = np.mean(np.array([model.wv[word] for word in word_list]), axis=0)
        unit_vectors.append(mean_vector / np.linalg.norm(mean_vector))
    embedding_array = np.array(unit_vectors)

    hnsw_index = hnswlib.Index(space="cosine", dim=embedding_array[0].shape[0])
    hnsw_index.init_index(max_elements=len(embedding_array), ef_construction=INDEX_EF_CONSTRUCTION, M=INDEX_M)
    hnsw_index.add_items(embedding_array, ids=list(id_to_lemma_dict))
    print("create_lemma_dict_and_index: hnsw_index.get_current_count:", hnsw_index.get_current_count())
    return (hnsw_index, lemma_to_id_dict, id_to_lemma_dict)


def create_index_test():
    """Smoke test: build one lemma index per test decade."""
    index_174 = create_index(lemma_word_dict_174, model_174)
    index_175 = create_index(lemma_word_dict_175, model_175)
    index_176 = create_index(lemma_word_dict_176, model_176)
    return index_174, index_175, index_176


# Runs regardless of ENABLE_TEST; the result is cached under the test function's name.
index_174, index_175, index_176 = load_cache_or_run(create_index_test)

### create_occurrence_dicts

In [None]:
def create_occurrence_dicts(decade: Decade, word_lemma_dict: WordLemmaDict) -> tuple[LemmaOccurrencePositionDict, LemmaOccurrenceCountDict]:
    """Scan the text file of ``decade`` and record where and how often each lemma occurs.

    Every line is split on single spaces; a word counts as an occurrence when
    ``word_lemma_dict`` maps it to a non-empty lemma. Summary statistics are
    printed as a side effect.

    Args:
        decade: Decade whose ``<decade>.txt`` file is read from the texts folder.
        word_lemma_dict: Word to lemma mapping used to recognise occurrences.

    Returns:
        ``(lemma_occurrence_position_dict, lemma_occurrence_count_dict)`` where
        the first maps lemma -> line number -> word positions on that line and
        the second maps lemma -> total number of occurrences.
    """
    lemma_occurrence_position_dict = {}
    lemma_occurrence_count_dict = {}
    total_occurrence_count = 0
    with open(TEXTS_FOLDER + str(decade) + ".txt", "r") as f:
        for line_number, line in enumerate(f):
            for word_number, word in enumerate(line.rstrip("\n").split(" ")):
                lemma = word_lemma_dict.get(word)
                if lemma:
                    line_number_dict: LineNumberDict = lemma_occurrence_position_dict.setdefault(lemma, {})
                    line_number_dict.setdefault(line_number, []).append(word_number)
                    lemma_occurrence_count_dict[lemma] = lemma_occurrence_count_dict.get(lemma, 0) + 1
                    total_occurrence_count += 1
    lemma_count = len(lemma_occurrence_count_dict)
    if lemma_count:
        occurrence_avg = total_occurrence_count / lemma_count
        # The counts must be sorted first: the middle element of the dict's
        # insertion order (as used previously) is not a median.
        occurrence_median = sorted(lemma_occurrence_count_dict.values())[lemma_count // 2]
    else:
        # No lemma was found at all; report zeros instead of dividing by zero.
        occurrence_avg = 0
        occurrence_median = 0
    print("create_occurrence_dicts: lemma_count:", lemma_count)
    print("create_occurrence_dicts: total_occurrence_count:", total_occurrence_count)
    print("create_occurrence_dicts: occurrence_avg:", occurrence_avg)
    print("create_occurrence_dicts: occurrence_median:", occurrence_median)
    return lemma_occurrence_position_dict, lemma_occurrence_count_dict


def create_occurrence_dicts_test():
    """Smoke test: build position and count dictionaries for the three test decades."""
    lemma_occurrence_position_dict_174, lemma_occurrence_count_dict_174 = create_occurrence_dicts(174, word_lemma_dict_174)
    lemma_occurrence_position_dict_175, lemma_occurrence_count_dict_175 = create_occurrence_dicts(175, word_lemma_dict_175)
    lemma_occurrence_position_dict_176, lemma_occurrence_count_dict_176 = create_occurrence_dicts(176, word_lemma_dict_176)
    # Results are returned interleaved: position dict, then count dict, per decade.
    return (
        lemma_occurrence_position_dict_174,
        lemma_occurrence_count_dict_174,
        lemma_occurrence_position_dict_175,
        lemma_occurrence_count_dict_175,
        lemma_occurrence_position_dict_176,
        lemma_occurrence_count_dict_176,
    )


# The unpacking order below must match the return order of the test function above.
if ENABLE_TEST:
    (
        lemma_occurrence_position_dict_174,
        lemma_occurrence_count_dict_174,
        lemma_occurrence_position_dict_175,
        lemma_occurrence_count_dict_175,
        lemma_occurrence_position_dict_176,
        lemma_occurrence_count_dict_176,
    ) = load_cache_or_run(create_occurrence_dicts_test)

### sort_lemma_dict_by_value_to_list

In [None]:
def sort_lemma_dict_by_value_to_list(lemma_dict: dict[Lemma, int | float], desc=True) -> list[Lemma]:
    """Return the keys of ``lemma_dict`` ordered by their values.

    The sort is stable, so lemmas with equal values keep the order they have
    in ``lemma_dict``.

    Args:
        lemma_dict: Lemma to numeric value.
        desc: Largest value first when truthy, smallest first otherwise.

    Returns:
        The lemmas in sorted order.
    """
    sign = -1 if desc else 1
    lemma_list_sorted = sorted(lemma_dict, key=lambda lemma: sign * lemma_dict[lemma])
    print("sort_lemma_occurrence_count_dict: len(lemma_list_sorted):", len(lemma_list_sorted))
    return lemma_list_sorted


def sort_lemma_dict_by_value_to_list_test():
    """Smoke test: list the lemmas of each test decade by descending occurrence count."""
    lemma_list_sorted_174 = sort_lemma_dict_by_value_to_list(lemma_occurrence_count_dict_174)
    lemma_list_sorted_175 = sort_lemma_dict_by_value_to_list(lemma_occurrence_count_dict_175)
    lemma_list_sorted_176 = sort_lemma_dict_by_value_to_list(lemma_occurrence_count_dict_176)
    return lemma_list_sorted_174, lemma_list_sorted_175, lemma_list_sorted_176


# Runs regardless of ENABLE_TEST; the result is cached under the test function's name.
lemma_list_sorted_174, lemma_list_sorted_175, lemma_list_sorted_176 = load_cache_or_run(sort_lemma_dict_by_value_to_list_test)

### sort_lemma_dict_by_value_to_dict

In [None]:
def sort_lemma_dict_by_value_to_dict(lemma_dict: dict[Lemma, int | float], desc=True) -> dict:
    """Return a copy of ``lemma_dict`` whose insertion order follows its values.

    Args:
        lemma_dict: Lemma to numeric value.
        desc: Largest value first when truthy, smallest first otherwise.

    Returns:
        A new dict with the same items in sorted order.
    """
    ordered_lemmas = sort_lemma_dict_by_value_to_list(lemma_dict, desc)
    return {lemma: lemma_dict[lemma] for lemma in ordered_lemmas}


def sort_lemma_dict_by_value_to_dict_test():
    """Smoke test: rebind the module-level count dicts to copies sorted by descending count."""
    global lemma_occurrence_count_dict_174
    global lemma_occurrence_count_dict_175
    global lemma_occurrence_count_dict_176
    lemma_occurrence_count_dict_174 = sort_lemma_dict_by_value_to_dict(lemma_occurrence_count_dict_174)
    lemma_occurrence_count_dict_175 = sort_lemma_dict_by_value_to_dict(lemma_occurrence_count_dict_175)
    lemma_occurrence_count_dict_176 = sort_lemma_dict_by_value_to_dict(lemma_occurrence_count_dict_176)
    return lemma_occurrence_count_dict_174, lemma_occurrence_count_dict_175, lemma_occurrence_count_dict_176


# On a cache hit the test function is not executed; the globals are then rebound by this assignment alone.
lemma_occurrence_count_dict_174, lemma_occurrence_count_dict_175, lemma_occurrence_count_dict_176 = load_cache_or_run(
    sort_lemma_dict_by_value_to_dict_test
)

### sort_by_occurrence

In [None]:
def sort_by_occurrence(lemma_list_sorted: list[Lemma], sortable_lemma_dict: dict) -> dict:
    """Rebuild ``sortable_lemma_dict`` so its keys follow the order of ``lemma_list_sorted``.

    Only lemmas named in the list are kept.

    Args:
        lemma_list_sorted: Lemmas in the desired order.
        sortable_lemma_dict: Dictionary keyed by lemma.

    Returns:
        A new dict with one entry per listed lemma, in list order.

    Raises:
        KeyError: If a listed lemma is missing from ``sortable_lemma_dict``.
    """
    reordered = {lemma: sortable_lemma_dict[lemma] for lemma in lemma_list_sorted}
    print("sort_by_occurrence: len(sortable_lemma_dict_new):", len(reordered))
    return reordered


def sort_by_occurrence_test():
    """Smoke test: reorder the position dicts and lemma/word dicts by the sorted lemma lists."""
    global lemma_occurrence_position_dict_174
    global lemma_occurrence_position_dict_175
    global lemma_occurrence_position_dict_176
    global lemma_word_dict_174
    global lemma_word_dict_175
    global lemma_word_dict_176
    lemma_occurrence_position_dict_174 = sort_by_occurrence(lemma_list_sorted_174, lemma_occurrence_position_dict_174)
    lemma_occurrence_position_dict_175 = sort_by_occurrence(lemma_list_sorted_175, lemma_occurrence_position_dict_175)
    lemma_occurrence_position_dict_176 = sort_by_occurrence(lemma_list_sorted_176, lemma_occurrence_position_dict_176)
    # sort_by_occurrence keeps only listed lemmas, so lemmas absent from the sorted lists are dropped here.
    lemma_word_dict_174 = sort_by_occurrence(lemma_list_sorted_174, lemma_word_dict_174)
    lemma_word_dict_175 = sort_by_occurrence(lemma_list_sorted_175, lemma_word_dict_175)
    lemma_word_dict_176 = sort_by_occurrence(lemma_list_sorted_176, lemma_word_dict_176)
    return (
        lemma_occurrence_position_dict_174,
        lemma_occurrence_position_dict_175,
        lemma_occurrence_position_dict_176,
        lemma_word_dict_174,
        lemma_word_dict_175,
        lemma_word_dict_176,
    )


# The unpacking order below must match the return order of the test function above.
(
    lemma_occurrence_position_dict_174,
    lemma_occurrence_position_dict_175,
    lemma_occurrence_position_dict_176,
    lemma_word_dict_174,
    lemma_word_dict_175,
    lemma_word_dict_176,
) = load_cache_or_run(sort_by_occurrence_test)

### get_common_lemma

In [None]:
def get_common_lemma(*lemma_dict_list):
    """Return the set of keys that occur in every given dictionary.

    Args:
        *lemma_dict_list: One or more dictionaries keyed by lemma.

    Returns:
        The intersection of all key sets.
    """
    key_sets = [set(lemma_dict.keys()) for lemma_dict in lemma_dict_list]
    return set.intersection(*key_sets)


# Smoke test: "a" is the only key present in all three dicts, so this prints {'a'}.
if ENABLE_TEST:
    print(get_common_lemma({"a": 1, "b": 2}, {"a": 1, "c": 3}, {"a": 1, "b": 2, "c": 3}))

### get_occurrences

In [None]:
def get_occurrences(
    decade: Decade,
    line_number_dict: LineNumberDict,
    max_elem: int = None,
    highlight_lemma: bool = True,
    keep_lemma: bool = True,
) -> list[str]:
    """Collect the text lines of ``decade`` that contain a lemma's occurrences.

    Each returned text is rebuilt word by word, every word prefixed with one
    space. Words at the recorded positions are wrapped in ``###`` markers,
    kept as-is, or left out, depending on the two flags.

    Args:
        decade: Decade whose ``<decade>.txt`` file is read from the texts folder.
        line_number_dict: Line number to word positions of the lemma on that line.
        max_elem: Stop after this many lines; falsy means no limit.
        highlight_lemma: Wrap the lemma's words in ``###`` (only if they are kept).
        keep_lemma: Keep the lemma's words in the text at all.

    Returns:
        One rebuilt text per matching line, in file order.
    """
    text_list = []
    with open(TEXTS_FOLDER + str(decade) + ".txt", "r") as f:
        for line_number, line in enumerate(f):
            word_number_list = line_number_dict.get(line_number)
            if not word_number_list:
                continue
            hit_positions = set(word_number_list)
            pieces = []
            for word_number, word in enumerate(line.rstrip("\n").split(" ")):
                if word_number not in hit_positions:
                    pieces.append(" " + word)
                elif not keep_lemma:
                    # the lemma's own word is dropped from the text
                    continue
                elif highlight_lemma:
                    pieces.append(" ### " + word + " ###")
                else:
                    pieces.append(" " + word)
            text_list.append("".join(pieces))
            if max_elem and len(text_list) == max_elem:
                break
    return text_list


# Smoke test on the lemma "gehen" in decade 175: highlighted, plain, and with the lemma removed,
# followed by the number of lines returned when no limit is set.
if ENABLE_TEST:
    print(get_occurrences(175, lemma_occurrence_position_dict_175["gehen"], max_elem=1))
    print(get_occurrences(175, lemma_occurrence_position_dict_175["gehen"], max_elem=1, highlight_lemma=False))
    print(get_occurrences(175, lemma_occurrence_position_dict_175["gehen"], max_elem=1, keep_lemma=False))
    print(len(get_occurrences(175, lemma_occurrence_position_dict_175["gehen"], max_elem=None, highlight_lemma=False)))

### merge_count_occurrences_dict

In [None]:
def merge_count_occurrences_dict(lemma_occurrence_count_dict_a, lemma_occurrence_count_dict_b):
    """Add up the occurrence counts of the lemmas present in both dictionaries.

    Lemmas that occur in only one of the two inputs are left out.

    Args:
        lemma_occurrence_count_dict_a: Lemma to count, first source.
        lemma_occurrence_count_dict_b: Lemma to count, second source.

    Returns:
        Lemma to summed count for the shared lemmas.
    """
    shared_lemmas = get_common_lemma(lemma_occurrence_count_dict_a, lemma_occurrence_count_dict_b)
    merged = {lemma: lemma_occurrence_count_dict_a[lemma] + lemma_occurrence_count_dict_b[lemma] for lemma in shared_lemmas}
    print("merge_count_occurrences_dict: len(lemma_occurrence_count_dict_merged):", len(merged))
    return merged


# Smoke test: merged counts of decades 174 and 175; reused by the weight_and_filter_lemmas smoke test.
if ENABLE_TEST:
    lemma_occurrence_count_dict_174_175_merged = merge_count_occurrences_dict(
        lemma_occurrence_count_dict_174, lemma_occurrence_count_dict_175
    )

### filter_on_values

In [None]:
def filter_on_values(key_value_dict, limit_min=None, limit_max=None):
    """Keep the entries whose value lies within the given inclusive limits.

    A limit of ``None`` means "unbounded on that side". Limits are compared
    against ``None`` explicitly, so a limit of ``0`` is honoured, and calling
    without any limit returns every entry.

    Args:
        key_value_dict: Dictionary with comparable values.
        limit_min: Smallest value to keep, or ``None`` for no lower bound.
        limit_max: Largest value to keep, or ``None`` for no upper bound.

    Returns:
        A new dict with the entries that pass, in their original order.
    """
    return {
        key: value
        for key, value in key_value_dict.items()
        if (limit_min is None or limit_min <= value) and (limit_max is None or value <= limit_max)
    }


# Smoke test: only "wandern" (0.7) lies within [0.6, 0.8].
if ENABLE_TEST:
    print(filter_on_values({"gehen": 0.9, "laufen": 0.5, "wandern": 0.7}, limit_min=0.6, limit_max=0.8))

## vector and index functions

### calculate_cos_sim

In [None]:
def calculate_cos_sim(embedding_a: np.ndarray, embedding_b: np.ndarray) -> float:
    """Return the cosine similarity of two vectors.

    Args:
        embedding_a: First vector.
        embedding_b: Second vector.

    Returns:
        The dot product divided by the product of the two Euclidean norms.
    """
    norm_product = np.linalg.norm(embedding_a) * np.linalg.norm(embedding_b)
    return np.dot(embedding_a, embedding_b) / norm_product

### calculate_cos_distance

In [None]:
def calculate_cos_distance(embedding_a: np.ndarray, embedding_b: np.ndarray) -> float:
    """Return the cosine distance of two vectors, i.e. one minus their cosine similarity.

    Args:
        embedding_a: First vector.
        embedding_b: Second vector.
    """
    similarity = calculate_cos_sim(embedding_a, embedding_b)
    return 1 - similarity

### query_embedding

In [None]:
def query_embedding(index: Index, lemma: Lemma) -> Embedding | None:
    """Look up the stored embedding of ``lemma`` in ``index``.

    Args:
        index: ``(hnsw_index, lemma_to_id_dict, id_to_lemma_dict)`` triple.
        lemma: Lemma to look up.

    Returns:
        The item hnswlib stores for the lemma's id, or ``None`` when the lemma
        is not part of the index.
    """
    lemma_id = index[1].get(lemma)
    # id 0 is a valid id, so compare against None rather than testing truthiness
    if lemma_id is None:
        return None
    return index[0].get_items([lemma_id])[0]


# Smoke test: a known lemma yields an embedding, an unknown string yields None (prints True).
if ENABLE_TEST:
    embedding = query_embedding(index_175, "gehen")
    print(embedding.shape)
    embedding = query_embedding(index_175, "kljwklerjas")
    print(embedding is None)

### query_related

In [None]:
def query_related(
    index: Index,
    lemma: Lemma,
    n: int = 10,
    return_as_dict: bool = True,
    keep_search: bool = False,
) -> dict[str, float] | list[str] | None:
    """Find the lemmas closest to ``lemma`` in ``index``.

    Args:
        index: ``(hnsw_index, lemma_to_id_dict, id_to_lemma_dict)`` triple.
        lemma: Lemma whose neighbours are requested.
        n: Number of results to return.
        return_as_dict: Return ``{lemma: distance}`` when true, otherwise a
            plain list of lemmas; both are ordered as hnswlib returns them.
        keep_search: Keep the query lemma itself in the results when true.

    Returns:
        The related lemmas, or ``None`` (after printing ``not found``) when
        ``lemma`` is not part of the index.
    """
    try:
        id_embedding = index[1][lemma]
    except KeyError:
        print("not found")
        return None

    embedding = index[0].get_items([id_embedding])[0]
    # Ask for one extra hit when the query lemma is going to be removed again.
    k = n if keep_search else n + 1
    ids, distances = index[0].knn_query(embedding.reshape(1, -1), k=k)
    neighbours = list(zip(ids[0], distances[0]))
    if not keep_search:
        # Remove the query lemma by id instead of assuming it is the first hit:
        # with tied distances another lemma may be listed before it.
        neighbours = [(id_other, distance) for id_other, distance in neighbours if id_other != id_embedding][:n]

    if return_as_dict:
        return {index[2][id_other]: distance for id_other, distance in neighbours}
    return [index[2][id_other] for id_other, _ in neighbours]


# Smoke test: neighbours of "gehen" in two decades, then as a plain list with the query lemma kept.
if ENABLE_TEST:
    print(query_related(index_175, "gehen", n=10))
    print(query_related(index_176, "gehen", n=10))
    print(query_related(index_176, "gehen", n=10, return_as_dict=False, keep_search=True))

### calculate_average_sentence_embedding_from_sentence

In [None]:
def calculate_average_sentence_embedding_from_sentence(
    sentence: str,
    index: Index,
    word_lemma_dict: WordLemmaDict,
    show_exception: bool = False,
) -> np.ndarray:
    """Average the lemma embeddings of the words of ``sentence``.

    The sentence is split on single spaces. Words that cannot be resolved
    (any exception during lookup, or a lemma missing from the index) are
    skipped on a best-effort basis.

    Args:
        sentence: Space-separated words.
        index: Index providing the lemma embeddings.
        word_lemma_dict: Word to lemma mapping.
        show_exception: Print the exception and the word for every failed lookup.

    Returns:
        The element-wise mean of the collected embeddings.
    """
    collected = []
    for token in sentence.split(" "):
        try:
            vector = query_embedding(index, word_lemma_dict[token])
        except Exception as ex:
            if show_exception:
                print(ex, token)
            continue
        if vector is not None:
            collected.append(vector)
    return np.mean(np.array(collected), axis=0)


# Smoke test: two similar sentences should score higher against each other than against an unrelated one.
# All three sentences are resolved with decade 175's word/lemma mapping so that
# it matches index_175 (v1 previously used the mapping of decade 174).
if ENABLE_TEST:
    v1 = calculate_average_sentence_embedding_from_sentence(
        "der mensch ist in dem haus", index_175, word_lemma_dict_175, show_exception=True
    )
    v2 = calculate_average_sentence_embedding_from_sentence(
        "der mann ist in der hütte", index_175, word_lemma_dict_175, show_exception=True
    )
    v3 = calculate_average_sentence_embedding_from_sentence(
        "die ziege ist auf dem feld", index_175, word_lemma_dict_175, show_exception=True
    )
    print(calculate_cos_sim(v1, v2))
    print(calculate_cos_sim(v2, v3))

### calculate_average_sentence_embedding_from_lemma

In [None]:
def calculate_average_sentence_embedding_from_lemma(
    decade: Decade,
    line_number_dict: LineNumberDict,
    index: Index,
    word_lemma_dict: WordLemmaDict,
    max_sentences: int = 2,
) -> dict[str, np.ndarray]:
    """Compute an averaged embedding for each text line in which a lemma occurs.

    Args:
        decade: Decade whose text file is read.
        line_number_dict: Line number to word positions of the lemma.
        index: Index providing the lemma embeddings.
        word_lemma_dict: Word to lemma mapping.
        max_sentences: Maximum number of lines to fetch.

    Returns:
        A dict mapping each fetched line (without highlighting) to its averaged
        embedding. Identical lines share one entry because the line text is
        the key.
    """
    sentence_list = get_occurrences(decade, line_number_dict, max_elem=max_sentences, highlight_lemma=False)
    return {
        sentence: calculate_average_sentence_embedding_from_sentence(sentence, index, word_lemma_dict)
        for sentence in sentence_list
    }


# Smoke test: embed the first line of decade 175 containing "gehen" and show its text and embedding shape.
if ENABLE_TEST:
    sentence_embedding_dict = calculate_average_sentence_embedding_from_lemma(
        175,
        lemma_occurrence_position_dict_175["gehen"],
        index_175,
        word_lemma_dict_175,
        max_sentences=1,
    )
    for sentence, embedding in sentence_embedding_dict.items():
        print(sentence)
        print(embedding.shape)

### create_tsne

In [None]:
def create_tsne(embeddings, perplexity=None):
    """Project ``embeddings`` to two dimensions with t-SNE.

    Args:
        embeddings: Sequence of equally sized vectors.
        perplexity: t-SNE perplexity; 5 is used when ``None``.

    Returns:
        The result of ``TSNE.fit_transform`` with ``n_components=2`` and a
        fixed ``random_state`` of 42.
    """
    effective_perplexity = 5 if perplexity is None else perplexity
    reducer = TSNE(n_components=2, perplexity=effective_perplexity, random_state=42)
    return reducer.fit_transform(np.array(embeddings))


# Smoke test: reduce two embeddings; perplexity is lowered to 1 for this two-sample input.
if ENABLE_TEST:
    embeddings_reduced = create_tsne([query_embedding(index_175, "gehen"), query_embedding(index_175, "laufen")], perplexity=1)
    print(embeddings_reduced)

## difference analysis functions

### create_procrustes_alignment

In [None]:
def create_procrustes_alignment(
    index_a: Index,
    index_b: Index,
    lemma_occurrence_count_dict_a: LemmaOccurrenceCountDict,
    lemma_occurrence_count_dict_b: LemmaOccurrenceCountDict,
):
    """Rotate the embeddings of ``index_b`` into the space of ``index_a``.

    The rotation is the orthogonal Procrustes solution fitted on the lemmas
    present in both count dictionaries, each weighted by the square root of
    its mean occurrence count. It is then applied to every embedding of
    ``index_b``; the rotated rows are re-normalized and stored in a fresh
    hnswlib index that reuses the lookup tables of ``index_b``.

    Args:
        index_a: Reference index (left unchanged).
        index_b: Index to be aligned.
        lemma_occurrence_count_dict_a: Lemma counts belonging to ``index_a``.
        lemma_occurrence_count_dict_b: Lemma counts belonging to ``index_b``.

    Returns:
        ``(aligned_hnsw_index, lemma_to_id_dict_b, id_to_lemma_dict_b)``.
    """
    # anchor matrices: one weighted row per shared lemma, same row order in both
    common_lemma = get_common_lemma(lemma_occurrence_count_dict_a, lemma_occurrence_count_dict_b)
    print("create_procrustes_alignment: len(common_lemma):", len(common_lemma))
    anchor_rows_a = []
    anchor_rows_b = []
    for lemma in common_lemma:
        weight = np.sqrt((lemma_occurrence_count_dict_a[lemma] + lemma_occurrence_count_dict_b[lemma]) / 2)
        anchor_rows_a.append(query_embedding(index_a, lemma) * weight)
        anchor_rows_b.append(query_embedding(index_b, lemma) * weight)
    overlap_matrix_a = np.stack(anchor_rows_a)
    overlap_matrix_b = np.stack(anchor_rows_b)

    # rotation that maps the b anchors as closely as possible onto the a anchors
    rotation, _ = orthogonal_procrustes(overlap_matrix_b, overlap_matrix_a)

    # apply the rotation to every embedding stored in index_b
    hnsw_b, lemma_to_id_b, id_to_lemma_b = index_b
    ids_b = list(id_to_lemma_b)
    matrix_b = np.stack([hnsw_b.get_items([lemma_id])[0] for lemma_id in ids_b])
    matrix_b_aligned = matrix_b @ rotation
    row_norms = np.linalg.norm(matrix_b_aligned, axis=1, keepdims=True)
    matrix_b_aligned_normalized = matrix_b_aligned / row_norms
    print("create_procrustes_alignment: matrix_b_aligned.shape:", matrix_b_aligned.shape)

    # new index with the same ids, dimension and capacity as index_b
    index_b_aligned = hnswlib.Index(space="cosine", dim=hnsw_b.dim)
    index_b_aligned.init_index(max_elements=hnsw_b.get_max_elements(), ef_construction=INDEX_EF_CONSTRUCTION, M=INDEX_M)
    index_b_aligned.add_items(matrix_b_aligned_normalized, ids_b)

    return (index_b_aligned, lemma_to_id_b, id_to_lemma_b)


def create_procrustes_alignment_test():
    """Smoke test: align decade 175 onto 174, then decade 176 onto the aligned 175."""
    index_aligned_175 = create_procrustes_alignment(
        index_174,
        index_175,
        lemma_occurrence_count_dict_174,
        lemma_occurrence_count_dict_175,
    )
    # chained alignment: 176 is fitted against the already aligned 175, not the raw one
    index_aligned_176 = create_procrustes_alignment(
        index_aligned_175,
        index_176,
        lemma_occurrence_count_dict_175,
        lemma_occurrence_count_dict_176,
    )
    return index_aligned_175, index_aligned_176


# The alignment itself runs regardless of ENABLE_TEST; only the printed comparison below is guarded.
index_aligned_175, index_aligned_176 = load_cache_or_run(create_procrustes_alignment_test)
if ENABLE_TEST:
    # For a few sample lemmas, print cosine similarities between decades before and after alignment.
    for lemma in ["gehen", "und", "wohnen", "brauen", "Fürst"]:
        print("create_procrustes_alignment: lemma:", lemma)
        embedding_174 = query_embedding(index_174, lemma)
        embedding_175 = query_embedding(index_175, lemma)
        embedding_176 = query_embedding(index_176, lemma)
        embedding_aligned_175 = query_embedding(index_aligned_175, lemma)
        embedding_aligned_176 = query_embedding(index_aligned_176, lemma)
        cos_sim_174_175 = calculate_cos_sim(embedding_174, embedding_175)
        cos_sim_175_176 = calculate_cos_sim(embedding_175, embedding_176)
        cos_sim_aligned_174_175 = calculate_cos_sim(embedding_174, embedding_aligned_175)
        cos_sim_aligned_175_176 = calculate_cos_sim(embedding_aligned_175, embedding_aligned_176)
        print("create_procrustes_alignment:", "cos_sim_174_175:", cos_sim_174_175)
        print("create_procrustes_alignment:", "cos_sim_175_176:", cos_sim_175_176)
        print("create_procrustes_alignment:", "cos_sim_aligned_174_175:", cos_sim_aligned_174_175)
        print("create_procrustes_alignment:", "cos_sim_aligned_175_176:", cos_sim_aligned_175_176)

### calculate_cos_sim_between_indices

In [None]:
def calculate_cos_sim_between_indices(index_a, index_b):
    """Compute, for every lemma present in both indices, the cosine similarity of its two embeddings.

    Args:
        index_a: First ``(hnsw_index, lemma_to_id_dict, id_to_lemma_dict)`` triple.
        index_b: Second triple.

    Returns:
        Lemma to cosine similarity, ordered from most to least similar.
    """
    shared_lemmas = get_common_lemma(index_a[1], index_b[1])
    unsorted_similarities = {
        lemma: calculate_cos_sim(query_embedding(index_a, lemma), query_embedding(index_b, lemma)) for lemma in shared_lemmas
    }
    lemma_cos_sim_dict = sort_lemma_dict_by_value_to_dict(unsorted_similarities)
    print("calculcate_cos_sim_between_indices: len(lemma_cos_sim_dict):", len(lemma_cos_sim_dict))
    return lemma_cos_sim_dict


def calculate_cos_sim_between_indices_test():
    """Smoke test: per-lemma cosine similarity between decade 174 and the aligned decade 175."""
    lemma_cos_sim_dict_174_175 = calculate_cos_sim_between_indices(index_174, index_aligned_175)
    return lemma_cos_sim_dict_174_175


if ENABLE_TEST:
    lemma_cos_sim_dict_174_175 = load_cache_or_run(calculate_cos_sim_between_indices_test)
    # print the similarity of a few sample lemmas
    for lemma in ["gehen", "und", "wohnen", "brauen", "Fürst"]:
        print("calculcate_cos_sim_between_indices: lemma:", lemma)
        print(lemma_cos_sim_dict_174_175[lemma])

### weight_and_filter_lemmas

In [None]:
def weight_and_filter_lemmas(lemma_value_dict, lemma_occurrence_count_dict, filter_threshold=100):
    """Drop rare lemmas and scale the remaining values by an occurrence-based weight.

    Each kept value is multiplied by ``log1p(sqrt(count))``.

    Args:
        lemma_value_dict: Lemma to value.
        lemma_occurrence_count_dict: Lemma to occurrence count.
        filter_threshold: Minimum count a lemma needs to be kept. A falsy
            value (``None`` or ``0``) disables the filter; previously it
            silently discarded every lemma.

    Returns:
        Lemma to weighted value, ordered from largest to smallest.

    Raises:
        KeyError: If a lemma has no entry in ``lemma_occurrence_count_dict``.
    """
    lemma_value_dict_weighted = {}
    for lemma, value in lemma_value_dict.items():
        occurrence_count = lemma_occurrence_count_dict[lemma]
        if not filter_threshold or occurrence_count >= filter_threshold:
            lemma_value_dict_weighted[lemma] = value * np.log1p(np.sqrt(occurrence_count))
    lemma_value_dict_weighted = sort_lemma_dict_by_value_to_dict(lemma_value_dict_weighted)
    print("weight_and_filter_lemmas: len(lemma_value_dict_weighted)", len(lemma_value_dict_weighted))
    return lemma_value_dict_weighted


# Smoke test: weight the 174/175 cosine similarities by the merged occurrence counts (default threshold).
if ENABLE_TEST:
    lemma_cos_sim_dict_174_175_weighted = weight_and_filter_lemmas(lemma_cos_sim_dict_174_175, lemma_occurrence_count_dict_174_175_merged)

### calculate_trajectory_from_lemma

In [None]:
def calculate_trajectory_from_lemma(index_a: Index, index_b: Index, index_c: Index, lemma: Lemma) -> Trajectory:
    """Return the dot product of a lemma's two consecutive displacement vectors.

    With ``a``, ``b`` and ``c`` being the lemma's embeddings in the three
    indices, the result is ``dot(a - b, b - c)``.

    Args:
        index_a: First index.
        index_b: Second index.
        index_c: Third index.
        lemma: Lemma to evaluate; it must be present in all three indices.
    """
    vec_a, vec_b, vec_c = (query_embedding(index, lemma) for index in (index_a, index_b, index_c))
    return np.dot(vec_a - vec_b, vec_b - vec_c)


# Smoke test: trajectory of a few sample lemmas across decade 174 and the aligned decades 175 and 176.
if ENABLE_TEST:
    for lemma in ["gehen", "und", "wohnen", "brauen", "Fürst"]:
        ab_bc_trajectory = calculate_trajectory_from_lemma(index_174, index_aligned_175, index_aligned_176, lemma)
        print("calculate_trajectory_from_diff_per_lemma:", lemma, ab_bc_trajectory)

### calculate_trajectory_dict_from_index

In [None]:
def calculate_trajectory_dict_from_index(index_a: Index, index_b: Index, index_c: Index) -> LemmaTrajectoryDict:
    """Compute the trajectory value of every lemma present in all three indices.

    Args:
        index_a: First index.
        index_b: Second index.
        index_c: Third index.

    Returns:
        Lemma to trajectory value, ordered from largest to smallest.
    """
    shared_lemmas = get_common_lemma(index_a[1], index_b[1], index_c[1])
    scored = [(lemma, calculate_trajectory_from_lemma(index_a, index_b, index_c, lemma)) for lemma in shared_lemmas]
    scored.sort(key=lambda pair: -pair[1])
    return dict(scored)


# Smoke test: trajectories for all lemmas shared by decade 174 and the aligned decades 175 and 176.
if ENABLE_TEST:
    lemma_trajectory_dict_174_175_176 = calculate_trajectory_dict_from_index(index_174, index_aligned_175, index_aligned_176)
    print("len(lemma_trajectory_diff_dict_174_175_176)", len(lemma_trajectory_dict_174_175_176))

### calculate_relative_diff_from_lemma

In [None]:
def calculate_relative_diff_from_lemma(
    index_a: Index,
    index_b: Index,
    lemma_occurrence_count_dict_a: LemmaOccurrenceCountDict,
    lemma_occurrence_count_dict_b: LemmaOccurrenceCountDict,
    lemma: Lemma,
) -> float | None:
    """Measure how much the neighbourhood of ``lemma`` differs between two indices.

    The 100 nearest lemmas are fetched in each index. For every neighbour that
    exists in both indices, the lemma-to-neighbour distance is taken in each
    space separately and the absolute difference of the two distances is
    weighted by the neighbour's mean occurrence count. The result is the
    average of these weighted differences. Because only distances within one
    space are compared, the two indices do not need to be aligned.

    Args:
        index_a: First index.
        index_b: Second index.
        lemma_occurrence_count_dict_a: Lemma counts belonging to ``index_a``.
        lemma_occurrence_count_dict_b: Lemma counts belonging to ``index_b``.
        lemma: Lemma to evaluate.

    Returns:
        The averaged weighted difference, or ``None`` when the lemma is
        missing from either index or no neighbour exists in both.

    Raises:
        KeyError: If a neighbour has no entry in one of the count dictionaries.
    """
    embedding_index_a_lemma = query_embedding(index_a, lemma)
    embedding_index_b_lemma = query_embedding(index_b, lemma)
    if embedding_index_a_lemma is None or embedding_index_b_lemma is None:
        # Checked before the neighbour searches so no query runs for a lemma that cannot be compared.
        return None

    # key and dict synchronization
    distances_a_dict = query_related(index_a, lemma, n=100)
    distances_b_dict = query_related(index_b, lemma, n=100)
    distances_a_lemma_set = set(distances_a_dict)
    distances_b_lemma_set = set(distances_b_dict)
    lemma_all = set()
    for lemma_a in distances_a_lemma_set:
        if lemma_a not in distances_b_lemma_set:
            embedding_index_b_lemma_a = query_embedding(index_b, lemma_a)
            if embedding_index_b_lemma_a is None:
                continue
            # Both vectors must come from index_b: a distance across two unaligned spaces is meaningless.
            distances_b_dict[lemma_a] = calculate_cos_distance(embedding_index_b_lemma, embedding_index_b_lemma_a)
        lemma_all.add(lemma_a)
    for lemma_b in distances_b_lemma_set:
        if lemma_b not in distances_a_lemma_set:
            embedding_index_a_lemma_b = query_embedding(index_a, lemma_b)
            if embedding_index_a_lemma_b is None:
                continue
            # Likewise, both vectors must come from index_a.
            distances_a_dict[lemma_b] = calculate_cos_distance(embedding_index_a_lemma, embedding_index_a_lemma_b)
        lemma_all.add(lemma_b)

    if not lemma_all:
        # Nothing comparable; avoid dividing by zero below.
        return None

    # difference calculation
    diff = 0
    for lemma_related in lemma_all:
        distance_a = distances_a_dict[lemma_related]
        distance_b = distances_b_dict[lemma_related]
        occurrence_count_avg = (lemma_occurrence_count_dict_a[lemma_related] + lemma_occurrence_count_dict_b[lemma_related]) / 2
        diff += (abs(distance_a - distance_b) / np.sqrt(occurrence_count_avg)) * occurrence_count_avg
    diff /= len(lemma_all)

    return diff


# Smoke test: relative neighbourhood difference of two sample lemmas between consecutive, unaligned decades.
if ENABLE_TEST:
    print(
        calculate_relative_diff_from_lemma(index_174, index_175, lemma_occurrence_count_dict_174, lemma_occurrence_count_dict_175, "gehen")
    )
    print(
        calculate_relative_diff_from_lemma(index_175, index_176, lemma_occurrence_count_dict_175, lemma_occurrence_count_dict_176, "brauen")
    )

### calculate_relative_diff_dict_from_index

In [None]:
def calculate_relative_diff_dict_from_index(
    index_a: Index,
    index_b: Index,
    lemma_occurrence_count_dict_a: LemmaOccurrenceCountDict,
    lemma_occurrence_count_dict_b: LemmaOccurrenceCountDict,
    min_occurrence: int | None = None,
    max_occurrence: int | None = None,
) -> LemmaDiffDict:
    """Calculate the relative diff of every lemma shared by two indices.

    The shared lemmas are determined with ``get_common_lemma`` on the second
    element of each index. A lemma is only evaluated if its occurrence count
    lies within the optional bounds in *both* count dicts.

    Args:
        index_a: first index.
        index_b: second index.
        lemma_occurrence_count_dict_a: occurrence counts belonging to ``index_a``.
        lemma_occurrence_count_dict_b: occurrence counts belonging to ``index_b``.
        min_occurrence: inclusive lower bound on both counts; ``None`` disables it.
        max_occurrence: inclusive upper bound on both counts; ``None`` disables it.

    Returns:
        Mapping of lemma to relative diff, ordered by
        ``sort_lemma_dict_by_value_to_dict(..., desc=False)``.
    """
    lemma_diff_dict = {}
    for lemma in get_common_lemma(index_a[1], index_b[1]):
        count_a = lemma_occurrence_count_dict_a[lemma]
        count_b = lemma_occurrence_count_dict_b[lemma]
        # guard clauses: skip lemmas whose counts fall outside the requested window
        if min_occurrence is not None and not (count_a >= min_occurrence and count_b >= min_occurrence):
            continue
        if max_occurrence is not None and not (count_a <= max_occurrence and count_b <= max_occurrence):
            continue
        lemma_diff_dict[lemma] = calculate_relative_diff_from_lemma(
            index_a, index_b, lemma_occurrence_count_dict_a, lemma_occurrence_count_dict_b, lemma
        )
    lemma_diff_dict = sort_lemma_dict_by_value_to_dict(lemma_diff_dict, desc=False)
    # the log prefix now matches the function name, like the other helpers in this file
    print("calculate_relative_diff_dict_from_index: len(lemma_diff_dict):", len(lemma_diff_dict))
    return lemma_diff_dict


def create_relative_diff_dict_from_index_test():
    """Build the relative-diff dicts for the index pairs 174/175 and 175/176 without occurrence limits.

    Returns:
        Tuple ``(diff dict 174/175, diff dict 175/176)``.
    """
    lemma_diff_174_175_dict = calculate_relative_diff_dict_from_index(
        index_174,
        index_175,
        lemma_occurrence_count_dict_174,
        lemma_occurrence_count_dict_175,
    )
    lemma_diff_175_176_dict = calculate_relative_diff_dict_from_index(
        index_175,
        index_176,
        lemma_occurrence_count_dict_175,
        lemma_occurrence_count_dict_176,
    )
    return lemma_diff_174_175_dict, lemma_diff_175_176_dict


if ENABLE_TEST:
    # load_cache_or_run names the cache file after the function's __name__,
    # so renaming the test function above invalidates the existing cache.
    lemma_relative_diff_174_175_dict, lemma_diff_175_176_dict = load_cache_or_run(create_relative_diff_dict_from_index_test)

### normalize_lemma_value_dict

In [None]:
def normalize_lemma_value_dict(lemma_value_dict, enable_inversion=False):
    """Rescale the values of a lemma dict linearly into the range [-1, 1].

    The smallest value is mapped to -1 and the largest to 1. With
    ``enable_inversion`` the sign of every normalized value is flipped, so the
    smallest input value ends up at 1 and the largest at -1.

    Degenerate inputs are handled explicitly instead of raising:
    an empty dict yields an empty dict, and a dict whose values are all
    identical (zero spread, which previously caused a division by zero) maps
    every lemma to the midpoint 0.0.

    Args:
        lemma_value_dict: mapping of lemma to numeric value.
        enable_inversion: multiply every normalized value by -1.

    Returns:
        New dict with the same keys in the same order and normalized values.
    """
    if not lemma_value_dict:
        lemma_value_dict_normalized = {}
    else:
        value_min = min(lemma_value_dict.values())
        value_span = max(lemma_value_dict.values()) - value_min
        if value_span == 0:
            # no spread: there is no meaningful scale, place everything at the midpoint
            lemma_value_dict_normalized = {lemma: 0.0 for lemma in lemma_value_dict}
        else:
            scale = 2 / value_span
            # shift to start at 0, stretch to [0, 2], then centre on 0
            lemma_value_dict_normalized = {lemma: (value - value_min) * scale - 1 for lemma, value in lemma_value_dict.items()}
            if enable_inversion:
                lemma_value_dict_normalized = {lemma: value * -1 for lemma, value in lemma_value_dict_normalized.items()}
    print("normalize_lemma_value_dict: len(lemma_value_dict_normalized):", len(lemma_value_dict_normalized))
    return lemma_value_dict_normalized


if ENABLE_TEST:
    # Normalize the three test measures (defined in earlier cells) so that they can be compared.
    # Only the relative diff is inverted.
    # NOTE(review): presumably because a high relative diff signals change whereas a high
    # value of the other two measures signals stability - confirm.
    lemma_cos_sim_dict_174_175_normalized = normalize_lemma_value_dict(lemma_cos_sim_dict_174_175)
    lemma_trajectory_dict_174_175_176_normalized = normalize_lemma_value_dict(lemma_trajectory_dict_174_175_176)
    lemma_relative_diff_174_175_dict_normalized = normalize_lemma_value_dict(lemma_relative_diff_174_175_dict, enable_inversion=True)

### create_random_lemma_value_dict

In [None]:
def create_random_lemma_value_dict(lemma_value_dict, range_min=-1, range_max=1):
    """Create a dict with the keys of ``lemma_value_dict`` and random values.

    Each value is drawn uniformly from ``[range_min, range_max]`` on a grid of
    1/1000 steps using the module-level ``random`` generator, so results are
    reproducible for a given seed.

    Args:
        lemma_value_dict: dict whose keys (and key order) are reused; its values are ignored.
        range_min: lower bound (inclusive).
        range_max: upper bound (inclusive).

    Returns:
        New dict mapping every lemma to a random float.
    """
    # randint needs integers; rounding also makes fractional bounds such as 0.5 usable
    # (previously they were passed on as floats like 500.0)
    step_min = int(round(range_min * 1000))
    step_max = int(round(range_max * 1000))
    return {lemma: random.randint(step_min, step_max) / 1000 for lemma in lemma_value_dict}


if ENABLE_TEST:
    # Random baselines: each one reuses the key set of the real measure it stands in for.
    # The cos sim baseline previously took its keys from the relative diff dict
    # (copy-paste slip); it now uses the normalized cos sim dict.
    lemma_random_cos_sim_dict = create_random_lemma_value_dict(lemma_cos_sim_dict_174_175_normalized)
    lemma_random_trajectory_dict = create_random_lemma_value_dict(lemma_trajectory_dict_174_175_176_normalized)
    random_relative_diff_dict = create_random_lemma_value_dict(lemma_relative_diff_174_175_dict_normalized)

### create_lemma_diff_diff_dicts

In [None]:
def create_lemma_diff_diff_dicts(*lemma_value_dict_list):
    """Sum up the pairwise absolute differences between several lemma value dicts.

    For every lemma returned by ``get_common_lemma`` for the given dicts, the
    absolute differences of its values are accumulated over all unordered pairs
    of dicts. The average and the (upper) median of the resulting sums are printed.

    Args:
        *lemma_value_dict_list: two or more dicts mapping lemma to a numeric value.

    Returns:
        Dict mapping lemma to the summed absolute difference, ordered by
        ``sort_lemma_dict_by_value_to_dict(..., desc=False)``. Empty if there is
        no common lemma or fewer than two dicts were given.
    """
    common_lemma = get_common_lemma(*lemma_value_dict_list)
    dict_count = len(lemma_value_dict_list)
    lemma_diff_dict = {}
    for lemma in common_lemma:
        for i_a in range(dict_count):
            value_a = lemma_value_dict_list[i_a][lemma]
            for i_b in range(i_a + 1, dict_count):
                value_b = lemma_value_dict_list[i_b][lemma]
                lemma_diff_dict[lemma] = lemma_diff_dict.get(lemma, 0) + abs(value_a - value_b)
    lemma_diff_dict = sort_lemma_dict_by_value_to_dict(lemma_diff_dict, desc=False)
    if lemma_diff_dict:
        diff_list = list(lemma_diff_dict.values())
        diff_avg = sum(diff_list) / len(diff_list)
        # relies on the dict being sorted by value: the middle element is the median
        diff_median = diff_list[len(diff_list) // 2]
        print("create_lemma_diff_diff_dicts: avg_diff:", diff_avg)
        print("create_lemma_diff_diff_dicts: diff_median:", diff_median)
    else:
        # nothing to compare; previously this ended in a ZeroDivisionError
        print("create_lemma_diff_diff_dicts: no common lemma to compare")
    return lemma_diff_dict


if ENABLE_TEST:
    # Pairwise comparison of the three normalized measures ...
    lemma_diff_cos_sim_trajectory_dict_174_176 = create_lemma_diff_diff_dicts(
        lemma_cos_sim_dict_174_175_normalized,
        lemma_trajectory_dict_174_175_176_normalized,
    )
    lemma_diff_cos_sim_relative_diff_dict_174_176 = create_lemma_diff_diff_dicts(
        lemma_cos_sim_dict_174_175_normalized,
        lemma_relative_diff_174_175_dict_normalized,
    )
    lemma_diff_trajectory_relative_diff_dict_174_176 = create_lemma_diff_diff_dicts(
        lemma_trajectory_dict_174_175_176_normalized,
        lemma_relative_diff_174_175_dict_normalized,
    )
    # ... and the same three comparisons on the random baselines for reference.
    lemma_diff_random_cos_sim_trajectory_dict = create_lemma_diff_diff_dicts(
        lemma_random_cos_sim_dict,
        lemma_random_trajectory_dict,
    )
    random_diff_cos_sim_relative_diff_dict = create_lemma_diff_diff_dicts(
        lemma_random_cos_sim_dict,
        random_relative_diff_dict,
    )
    random_diff_trajectory_relative_diff_dict = create_lemma_diff_diff_dicts(
        lemma_random_trajectory_dict,
        random_relative_diff_dict,
    )

### merge_lemma_diff_diff_dict

In [None]:
def merge_lemma_diff_diff_dict(lemma_diff_diff_dict, lemma_diff_dict_a, lemma_diff_dict_b):
    """Interleave the values of two lemma dicts, following the key order of a third.

    For every lemma in ``lemma_diff_diff_dict`` two entries are emitted back to
    back: ``"1:<lemma>"`` with the value from ``lemma_diff_dict_a`` and
    ``"2:<lemma>"`` with the value from ``lemma_diff_dict_b``. The values of
    ``lemma_diff_diff_dict`` itself are not used, only its keys and their order.

    Returns:
        New dict with twice as many entries as ``lemma_diff_diff_dict``.
    """
    prefixed_sources = (("1:", lemma_diff_dict_a), ("2:", lemma_diff_dict_b))
    merged = {}
    for lemma in lemma_diff_diff_dict:
        for prefix, source in prefixed_sources:
            merged[prefix + lemma] = source[lemma]
    print("merge_lemma_diff_diff_dict: len(lemma_diff_merged_dict):", len(merged))
    return merged


if ENABLE_TEST:
    # For each comparison from the previous cell, interleave the two compared
    # measures in the key order of their difference dict.
    lemma_merged_cos_sim_trajectory_dict_174_176 = merge_lemma_diff_diff_dict(
        lemma_diff_cos_sim_trajectory_dict_174_176,
        lemma_cos_sim_dict_174_175_normalized,
        lemma_trajectory_dict_174_175_176_normalized,
    )
    lemma_merged_cos_sim_relative_diff_dict_174_176 = merge_lemma_diff_diff_dict(
        lemma_diff_cos_sim_relative_diff_dict_174_176,
        lemma_cos_sim_dict_174_175_normalized,
        lemma_relative_diff_174_175_dict_normalized,
    )
    lemma_merged_trajectory_relative_diff_dict_174_176 = merge_lemma_diff_diff_dict(
        lemma_diff_trajectory_relative_diff_dict_174_176,
        lemma_trajectory_dict_174_175_176_normalized,
        lemma_relative_diff_174_175_dict_normalized,
    )
    # random baseline counterpart
    lemma_merged_random_diff_dict = merge_lemma_diff_diff_dict(
        lemma_diff_random_cos_sim_trajectory_dict,
        lemma_random_cos_sim_dict,
        lemma_random_trajectory_dict,
    )

### calculate_average_diff

In [None]:
def calculate_average_diff(global_diff_dict):
    """Average the per-lemma values of every decade key across all lemmas.

    Args:
        global_diff_dict: dict mapping lemma to a dict of ``decade key -> value``.

    Returns:
        Dict mapping each decade key to the arithmetic mean of all values found
        under that key, in order of first appearance of the key.
    """
    values_per_decade = {}
    for decade_value_dict in global_diff_dict.values():
        for decade, value in decade_value_dict.items():
            # group the values by decade key, regardless of the lemma they belong to
            values_per_decade.setdefault(decade, []).append(value)
    return {decade: sum(values) / len(values) for decade, values in values_per_decade.items()}

## aggregate functions

### create_decade_data

In [None]:
def create_decade_data(decade: Decade) -> DecadeData:
    """Assemble all per-decade artefacts into one list.

    Loads the model of the decade, derives the lemma/word mappings and the
    index from it, collects the occurrence dicts, and re-orders the three
    lemma-keyed dicts with ``sort_by_occurrence`` using the lemma list obtained
    from ``sort_lemma_dict_by_value_to_list`` on the occurrence counts.

    Returns:
        A list (mutable on purpose, position 0 is replaced during alignment) with
        ``[index, word_lemma_dict, lemma_word_dict, lemma_occurrence_count_dict,
        lemma_occurrence_position_dict, decade]``.
    """
    model = load_model(decade)
    lemma_word_dict, word_lemma_dict = create_lemma_word_dicts(model)
    index = create_index(lemma_word_dict, model)
    position_dict, count_dict = create_occurrence_dicts(decade, word_lemma_dict)
    lemma_order = sort_lemma_dict_by_value_to_list(count_dict)
    # apply the same lemma order to all three lemma-keyed dicts
    count_dict, position_dict, lemma_word_dict = [
        sort_by_occurrence(lemma_order, unsorted_dict) for unsorted_dict in (count_dict, position_dict, lemma_word_dict)
    ]
    return [index, word_lemma_dict, lemma_word_dict, count_dict, position_dict, decade]


def create_decade_data_test():
    """Create the decade data for the decades 174, 175 and 176."""
    return create_decade_data(174), create_decade_data(175), create_decade_data(176)


if ENABLE_TEST:
    # cached under the function name by load_cache_or_run
    decade_data_174, decade_data_175, decade_data_176 = load_cache_or_run(create_decade_data_test)

### align_decade_data

In [None]:
def align_decade_data(decade_data_a: DecadeData, decade_data_b: DecadeData) -> DecadeData:
    """Align the index of ``decade_data_b`` to the index of ``decade_data_a``.

    Calls ``create_procrustes_alignment`` with both indices (position 0) and both
    lemma occurrence count dicts (position 3) and stores the resulting index back
    into position 0 of ``decade_data_b``.

    Note that ``decade_data_b`` is modified in place; the returned object is the
    very same list.
    """
    decade_data_b[0] = create_procrustes_alignment(
        decade_data_a[0],
        decade_data_b[0],
        decade_data_a[3],
        decade_data_b[3],
    )
    return decade_data_b


def align_decade_data_test():
    """Align decade 175 to 174 and then 176 to the already aligned 175.

    Rebinds the notebook globals ``decade_data_175`` and ``decade_data_176``
    (align_decade_data also mutates the passed lists in place).
    """
    global decade_data_175
    global decade_data_176
    decade_data_175 = align_decade_data(decade_data_174, decade_data_175)
    # chained alignment: 176 is aligned against the aligned 175, not against 174
    decade_data_176 = align_decade_data(decade_data_175, decade_data_176)
    return decade_data_175, decade_data_176


if ENABLE_TEST:
    # when the cache file exists, the aligned data is loaded from it instead of being recomputed
    decade_data_175, decade_data_176 = load_cache_or_run(align_decade_data_test)

### preprocess_and_persist_decade_data

In [None]:
def preprocess_and_persist_decade_data(decade_list):
    """Create, align and pickle the decade data for every decade that is not cached yet.

    The decades are processed in the given order. The first decade is stored
    unaligned; every further decade is aligned to its direct predecessor in
    ``decade_list`` before being stored. Decades whose cache file already
    exists are skipped.

    If a decade has to be computed while its predecessor was skipped because it
    is already cached, the predecessor is loaded from the cache. (Previously the
    predecessor variable was undefined when the first decade was cached, and
    stale when a decade in the middle was cached.)

    Args:
        decade_list: ordered decades; an empty list is a no-op.
    """

    def cache_name(decade):
        # file stem shared with load_persisted_decade_data
        return f"preprocess_and_persist_decade_data__{decade}"

    def does_exist(decade):
        return os.path.exists(f"{CACHE_FOLDER}{cache_name(decade)}.pkl")

    decade_previous = None
    decade_data_previous = None  # None means: not in memory, load from cache on demand
    for decade in decade_list:
        if does_exist(decade):
            decade_data_current = None
        else:
            decade_data_current = create_decade_data(decade)
            if decade_previous is not None:
                if decade_data_previous is None:
                    decade_data_previous = pickle_load(cache_name(decade_previous))
                decade_data_current = align_decade_data(decade_data_previous, decade_data_current)
            pickle_save(decade_data_current, cache_name(decade))
        decade_previous = decade
        decade_data_previous = decade_data_current


if ENABLE_TEST:
    # decade_list becomes a notebook global here; later test functions and
    # plot_lemma_and_decade_from_lemma_list read it.
    decade_list = create_decades_list(decade_start=165, decade_end=175)
    preprocess_and_persist_decade_data(decade_list)

### load_persisted_decade_data

In [None]:
def load_persisted_decade_data(decade):
    """Load the pickled decade data written by ``preprocess_and_persist_decade_data`` for one decade."""
    cache_name = f"preprocess_and_persist_decade_data__{decade}"
    return pickle_load(cache_name)


if ENABLE_TEST:
    # rebinds the global decade_data_175 to the persisted version of decade 175
    decade_data_175 = load_persisted_decade_data(175)

### load_embeddings_from_persisted_decade_data

In [None]:
def load_embeddings_from_persisted_decade_data(decade_list, lemma_list):
    """Collect the embeddings of the given lemmas from the persisted data of each decade.

    Args:
        decade_list: decades whose persisted data is loaded, one after the other.
        lemma_list: lemmas to look up in the index (position 0) of each decade's data.

    Returns:
        Dict ``lemma -> {decade -> embedding}``. Lemma/decade combinations for
        which ``query_embedding`` returns ``None`` are left out, so a lemma that is
        never found does not appear at all.
    """
    lemma_decade_embdding_dict = {}
    for decade in decade_list:
        decade_data = load_persisted_decade_data(decade)
        for lemma in lemma_list:
            embedding = query_embedding(decade_data[0], lemma)
            if embedding is None:
                continue
            lemma_decade_embdding_dict.setdefault(lemma, {})[decade] = embedding
    print("load_embeddings_from_persisted_decade_data: len(lemma_decade_embdding_dict):", len(lemma_decade_embdding_dict))
    return lemma_decade_embdding_dict


def load_embeddings_from_persisted_decade_data_test():
    """Load the embeddings of a few sample lemmas for the global ``decade_list``."""
    # the last entry is not a real word; presumably it exercises the "lemma not found" path
    lemma_decade_embdding_dict = load_embeddings_from_persisted_decade_data(
        decade_list, ["gehen", "laufen", "Spiritus", "Prätention", "jkljasdjal"]
    )
    return lemma_decade_embdding_dict


if ENABLE_TEST:
    lemma_decade_embdding_dict = load_cache_or_run(load_embeddings_from_persisted_decade_data_test)

### merge_global_lemma_dict

In [None]:
def merge_global_lemma_dict(global_lemma_decade_dict: LemmaDecadeDiffDict, enable_avg=True) -> dict[Lemma, float]:
    """Collapse the per-decade values of every lemma into a single number.

    Args:
        global_lemma_decade_dict: dict ``lemma -> {decade key -> value}``.
        enable_avg: if true, the values of a lemma are averaged; otherwise summed.

    Returns:
        Dict ``lemma -> merged value``, ordered by ``sort_lemma_dict_by_value_to_dict``.
    """
    merged = {}
    for lemma, decades_dict in global_lemma_decade_dict.items():
        total = sum(decades_dict.values())
        merged[lemma] = total / len(decades_dict) if enable_avg else total
    merged = sort_lemma_dict_by_value_to_dict(merged)
    print("merge_global_lemma_dict: len(global_lemma_decade_dict_merged):", len(merged))
    return merged


if ENABLE_TEST:
    # tiny hand-written fixture: two lemmas with three decade values each
    lemma_decade_dict_test = {
        "gehen": {
            174: 0.2,
            175: 0.3,
            176: 0.1,
        },
        "laufen": {
            174: 0.4,
            175: 0.6,
            176: 0.1,
        },
    }
    # averaged per lemma, then summed per lemma
    print(merge_global_lemma_dict(lemma_decade_dict_test))
    print(merge_global_lemma_dict(lemma_decade_dict_test, enable_avg=False))

### calculate_cos_sim_between_decades

In [None]:
def calculate_cos_sim_between_decades(decade_list, threshold: int | None = 1000) -> LemmaDecadeDiffDict:
    """Calculate the cosine similarity of lemmas between each pair of consecutive decades.

    For every consecutive pair in ``decade_list`` the persisted decade data is
    loaded, the cosine similarities between both indices are computed, and the
    result is restricted to lemmas whose merged occurrence count (as returned by
    ``merge_count_occurrences_dict``) reaches ``threshold``.

    Args:
        decade_list: ordered decades; needs at least one entry.
        threshold: minimum merged occurrence count (inclusive). ``None`` or ``0``
            disables the filter. (Previously a falsy threshold dropped every
            lemma, which contradicted its apparent purpose.)

    Returns:
        Dict ``lemma -> {"<decade a>-<decade b>" -> cosine similarity}``.
    """
    decade_data_a = load_persisted_decade_data(decade_list[0])
    global_lemma_cos_sim_dict = {}
    for decade in decade_list[1:]:
        decade_data_b = load_persisted_decade_data(decade)
        # decade data layout: [0] index, [3] lemma occurrence counts, [5] decade
        cos_sim_decade_str = str(decade_data_a[5]) + "-" + str(decade_data_b[5])
        lemma_cos_sim_dict = calculate_cos_sim_between_indices(decade_data_a[0], decade_data_b[0])
        lemma_occurrence_count_merged = merge_count_occurrences_dict(decade_data_a[3], decade_data_b[3])
        for lemma, occurrence_count in lemma_occurrence_count_merged.items():
            if not threshold or occurrence_count >= threshold:
                # assumes every lemma of the merged counts is present in lemma_cos_sim_dict,
                # as the original code did - TODO confirm
                global_lemma_cos_sim_dict.setdefault(lemma, {})[cos_sim_decade_str] = lemma_cos_sim_dict[lemma]
        decade_data_a = decade_data_b
    print("calculate_cos_sim_between_decades: len(global_lemma_cos_sim_dict):", len(global_lemma_cos_sim_dict))
    return global_lemma_cos_sim_dict


def calculate_cos_sim_between_decades_test():
    """Run calculate_cos_sim_between_decades on the global ``decade_list`` with the default threshold."""
    return calculate_cos_sim_between_decades(decade_list)


if ENABLE_TEST:
    print(decade_list)
    # NOTE: the cache is keyed on the function name only, so a cached result is
    # reused even if decade_list has changed in the meantime.
    global_lemma_cos_sim_dict = load_cache_or_run(calculate_cos_sim_between_decades_test)
    global_lemma_cos_sim_dict_merged = merge_global_lemma_dict(global_lemma_cos_sim_dict)

### calculate_trajectories_between_decades

In [None]:
def calculate_trajectories_between_decades(decade_list: DecadeList) -> LemmaDecadeDiffDict:
    """Calculate lemma trajectories over every window of three consecutive decades.

    A window (a, b, c) slides over ``decade_list``; for each position the indices
    of the three persisted decade data sets are passed to
    ``calculate_trajectory_dict_from_index``.

    Args:
        decade_list: ordered decades; at least two entries are required, and a
            result is only produced from three entries onwards.

    Returns:
        Dict ``lemma -> {"<decade a>-<decade c>" -> trajectory}``.
    """
    global_lemma_trajectory_dict = {}
    decade_data_a = load_persisted_decade_data(decade_list[0])
    decade_data_b = load_persisted_decade_data(decade_list[1])
    for decade in decade_list[2:]:
        decade_data_c = load_persisted_decade_data(decade)
        # position 5 holds the decade, position 0 the index
        label_a, label_c = decade_data_a[5], decade_data_c[5]
        print("calculate_trajectories_between_decades: calculating trajectories between", label_a, "and", label_c)
        lemma_trajectory_dict_abc = calculate_trajectory_dict_from_index(decade_data_a[0], decade_data_b[0], decade_data_c[0])
        trajectory_decades_str = str(label_a) + "-" + str(label_c)
        for lemma, trajectory in lemma_trajectory_dict_abc.items():
            global_lemma_trajectory_dict.setdefault(lemma, {})[trajectory_decades_str] = trajectory
        # slide the window by one decade
        decade_data_a, decade_data_b = decade_data_b, decade_data_c
    return global_lemma_trajectory_dict


def calculate_trajectories_between_decades_test():
    """Calculate the trajectories for the decades created from the range 165 to 175."""
    # local decade_list; the global of the same name is not touched
    decade_list = create_decades_list(decade_start=165, decade_end=175)
    return calculate_trajectories_between_decades(decade_list)


if ENABLE_TEST:
    global_lemma_trajectory_dict = load_cache_or_run(calculate_trajectories_between_decades_test)
    global_lemma_trajectory_dict_merged = merge_global_lemma_dict(global_lemma_trajectory_dict)

### calculate_relative_diff_between_decades

In [None]:
def calculate_relative_diff_between_decades(decade_list: DecadeList) -> LemmaDecadeDiffDict:
    """Calculate the relative diff of lemmas between each pair of consecutive decades.

    For every consecutive pair the relative diff dict is computed from the two
    indices and occurrence count dicts. The values are then picked up in the key
    order of ``merge_count_occurrences_dict``; no occurrence threshold is applied.

    Args:
        decade_list: ordered decades; needs at least one entry.

    Returns:
        Dict ``lemma -> {"<decade a>-<decade b>" -> relative diff}``.
    """
    global_lemma_relative_diff_dict = {}
    decade_data_a = load_persisted_decade_data(decade_list[0])
    for decade in decade_list[1:]:
        decade_data_b = load_persisted_decade_data(decade)
        decade_str = str(decade_data_a[5]) + "-" + str(decade_data_b[5])
        # decade data layout: [0] index, [3] lemma occurrence counts
        index_a, count_dict_a = decade_data_a[0], decade_data_a[3]
        index_b, count_dict_b = decade_data_b[0], decade_data_b[3]
        relative_diff_dict = calculate_relative_diff_dict_from_index(index_a, index_b, count_dict_a, count_dict_b)
        merged_count_dict = merge_count_occurrences_dict(count_dict_a, count_dict_b)
        # every lemma of the merged counts must exist in relative_diff_dict, otherwise KeyError
        for lemma, _occurrence_count in merged_count_dict.items():
            global_lemma_relative_diff_dict.setdefault(lemma, {})[decade_str] = relative_diff_dict[lemma]
        decade_data_a = decade_data_b
    print("calculate_relative_diff_between_decades: len(global_lemma_relative_diff_dict):", len(global_lemma_relative_diff_dict))
    return global_lemma_relative_diff_dict


if ENABLE_TEST:
    # unlike the cos sim and trajectory tests, this one is recomputed on every run (no load_cache_or_run)
    decade_list = create_decades_list(decade_start=165, decade_end=175)
    global_lemma_relative_diff_dict = calculate_relative_diff_between_decades(decade_list)
    global_lemma_relative_diff_dict_merged = merge_global_lemma_dict(global_lemma_relative_diff_dict)

## plotting

### plot_tsne_from_labels_embeddings

In [None]:
def plot_tsne_from_labels_embeddings(
    labels: list[str],
    embeddings: list[np.ndarray],
    title: str = None,
    height: int = None,
    width: int = None,
    rotation_degree: int = None,
    perplexity: int = None,
):
    """Reduce embeddings with ``create_tsne`` and show them as a labelled 2D scatter plot.

    Args:
        labels: text shown below each point, one per embedding.
        embeddings: vectors to be reduced.
        title: plot title.
        height: plot height; 800 if ``None``.
        width: plot width; 800 if ``None``.
        rotation_degree: if set (and non-zero), the reduced points are rotated by
            the negated angle before plotting.
        perplexity: passed through to ``create_tsne``.
    """
    points = create_tsne(embeddings, perplexity)

    if rotation_degree:
        theta = np.deg2rad(-rotation_degree)
        cos_theta, sin_theta = np.cos(theta), np.sin(theta)
        rotation = np.array([[cos_theta, -sin_theta], [sin_theta, cos_theta]])
        points = points @ rotation.T

    fig = px.scatter(
        x=points[:, 0],
        y=points[:, 1],
        text=labels,
        height=800 if height is None else height,
        width=800 if width is None else width,
        title=title,
    )
    # the axes carry no meaning after t-SNE, so hide titles and ticks
    hidden_axis = {"title": None, "showticklabels": False}
    fig.update_layout(xaxis=dict(hidden_axis), yaxis=dict(hidden_axis))
    fig.update_traces(
        marker={"size": 10},
        textposition="bottom center",
        textfont={"size": 12},
    )
    fig.show()
    time.sleep(PLOT_SLEEP)


if ENABLE_TEST:
    # plot the lemmas related to "gehen" in index_175 (query_related with n=20)
    lemma_list = query_related(index_175, "gehen", keep_search=True, n=20, return_as_dict=False)
    embedding_list = []
    for lemma in lemma_list:
        embedding_list.append(query_embedding(index_175, lemma))
    plot_tsne_from_labels_embeddings(lemma_list, embedding_list)

### plot_tsne_from_lemma_and_related

In [None]:
def plot_tsne_from_lemma_and_related(
    index: Index,
    lemma: str = None,
    n: int = 100,
    title: str = None,
    height: int = None,
    width: int = None,
    rotation_degree: int = None,
):
    """Plot a lemma together with its related lemmas as a t-SNE scatter plot.

    The related lemmas are obtained with ``query_related`` and their embeddings
    with ``query_embedding``, both from the given ``index``. (Previously the
    embeddings were always read from the global ``index_175`` and the layout
    arguments were silently ignored.)

    Args:
        index: index to query lemmas and embeddings from.
        lemma: lemma whose neighbourhood is plotted.
        n: passed to ``query_related`` as the number of related lemmas.
        title: plot title.
        height: plot height, see ``plot_tsne_from_labels_embeddings``.
        width: plot width, see ``plot_tsne_from_labels_embeddings``.
        rotation_degree: rotation of the plot, see ``plot_tsne_from_labels_embeddings``.
    """
    lemma_list = query_related(index, lemma, n=n, keep_search=True, return_as_dict=False)
    embedding_list = [query_embedding(index, lemma_related) for lemma_related in lemma_list]
    plot_tsne_from_labels_embeddings(
        lemma_list,
        embedding_list,
        title=title,
        height=height,
        width=width,
        rotation_degree=rotation_degree,
    )


if ENABLE_TEST:
    # neighbourhood of "gehen" in index_175, requesting 200 related lemmas
    plot_tsne_from_lemma_and_related(index_175, lemma="gehen", n=200)

### plot_2d_scatter

In [None]:
def plot_2d_scatter(data: dict | list[list], title: str = None, draw_line=False):
    """Show key/value pairs as a 2D scatter plot (keys on x, values on y).

    Args:
        data: either a dict (any dict subclass is accepted) or an iterable of
            ``(key, value)`` pairs.
        title: plot title.
        draw_line: additionally connect the points with a line.
    """
    # isinstance instead of an exact type check, so that dict subclasses such as
    # OrderedDict or defaultdict are iterated via items() as well
    pairs = data.items() if isinstance(data, dict) else data
    key_list = []
    value_list = []
    for key, value in pairs:
        key_list.append(key)
        value_list.append(value)
    fig = px.scatter(x=key_list, y=value_list, title=title)
    if draw_line:
        fig.update_traces(mode="lines+markers")
    fig.update_layout(xaxis_title=None, yaxis_title=None)
    fig.show()
    time.sleep(PLOT_SLEEP)


if ENABLE_TEST:
    # Plot every intermediate test result created in the cells above; all
    # plotted dicts are notebook globals.

    # general lemma occurrence count
    plot_2d_scatter(lemma_occurrence_count_dict_174, "lemma_occurrence_count_dict_174")

    # base differences
    plot_2d_scatter(lemma_cos_sim_dict_174_175, "lemma_cos_sim_dict_174_175")
    plot_2d_scatter(lemma_trajectory_dict_174_175_176, "lemma_trajectory_dict_174_175_176")
    plot_2d_scatter(lemma_relative_diff_174_175_dict, "lemma_relative_diff_174_175_dict")

    # normalized differences
    plot_2d_scatter(lemma_cos_sim_dict_174_175_normalized, "lemma_cos_sim_dict_174_175_normalized")
    plot_2d_scatter(lemma_trajectory_dict_174_175_176_normalized, "lemma_trajectory_dict_174_175_176_normalized")
    plot_2d_scatter(lemma_relative_diff_174_175_dict_normalized, "lemma_relative_diff_174_175_dict_normalized")

    # differences of differences
    plot_2d_scatter(lemma_diff_cos_sim_trajectory_dict_174_176, "lemma_diff_cos_sim_trajectory_dict_174_176")
    plot_2d_scatter(lemma_diff_cos_sim_relative_diff_dict_174_176, "lemma_diff_cos_sim_relative_diff_dict_174_176")
    plot_2d_scatter(lemma_diff_trajectory_relative_diff_dict_174_176, "lemma_diff_trajectory_relative_diff_dict_174_176")
    plot_2d_scatter(lemma_diff_random_cos_sim_trajectory_dict, "lemma_diff_random_cos_sim_trajectory_dict")

    # merged differences of differences
    plot_2d_scatter(lemma_merged_cos_sim_trajectory_dict_174_176, "lemma_merged_cos_sim_trajectory_dict_174_176")
    plot_2d_scatter(lemma_merged_cos_sim_relative_diff_dict_174_176, "lemma_merged_cos_sim_relative_diff_dict_174_176")
    plot_2d_scatter(lemma_merged_trajectory_relative_diff_dict_174_176, "lemma_merged_trajectory_relative_diff_dict_174_176")
    plot_2d_scatter(lemma_merged_random_diff_dict, "lemma_merged_random_diff_dict")

### plot_lemma_and_decade

In [None]:
def plot_lemma_and_decade(lemma_decade_embdding_dict: dict[Lemma, dict[Decade, Embedding]], perplexity=None, title=None):
    """Plot the embeddings of lemmas across decades, with one connecting line per lemma.

    All embeddings are reduced together with ``create_tsne``. Every point is
    labelled ``"<decade>:<lemma>"`` and the points of one lemma are joined by a
    line in the iteration order of its decade dict.

    Args:
        lemma_decade_embdding_dict: dict ``lemma -> {decade -> embedding}``.
        perplexity: perplexity passed to ``create_tsne``. If ``None`` and there are
            between 2 and 5 embeddings, it is set to the number of embeddings
            minus one; otherwise ``None`` is passed on. (Previously an explicitly
            given value was always overwritten.)
        title: plot title; an empty title is used if ``None``.
    """

    # prepare data
    global_labels_list = []
    global_embeddings_list = []
    group_end_position_list = []
    for lemma, decade_embedding_dict in lemma_decade_embdding_dict.items():
        for decade, embedding in decade_embedding_dict.items():
            global_labels_list.append(str(decade) + ":" + lemma)
            global_embeddings_list.append(embedding)
        # remember where the points of this lemma end in the flat list
        group_end_position_list.append(len(global_embeddings_list))
    if perplexity is None and 1 < len(global_embeddings_list) < 6:
        # NOTE(review): presumably needed because t-SNE requires the perplexity to be
        # smaller than the number of samples - confirm against create_tsne's default
        perplexity = len(global_embeddings_list) - 1
    lemma_embeddings_reduced_array = create_tsne(global_embeddings_list, perplexity=perplexity)

    # create plot
    fig = go.Figure()
    group_start_position = 0
    for group_end_position in group_end_position_list:
        lemma_respective_embeddings = lemma_embeddings_reduced_array[group_start_position:group_end_position]
        fig.add_trace(
            go.Scatter(
                x=lemma_respective_embeddings[:, 0],
                y=lemma_respective_embeddings[:, 1],
                mode="lines",
            )
        )
        group_start_position = group_end_position
    # all points with their labels go on top as one single trace
    fig.add_trace(
        go.Scatter(
            x=lemma_embeddings_reduced_array[:, 0],
            y=lemma_embeddings_reduced_array[:, 1],
            mode="markers+text",
            text=global_labels_list,
            textposition="top center",
        )
    )
    fig.update_layout(
        title=title if title is not None else "",
        showlegend=False,
        width=800,
        height=800,
    )
    fig.show()
    time.sleep(PLOT_SLEEP)


if ENABLE_TEST:
    # Print the trajectory value of three sample lemmas and plot their embeddings
    # in index_174 and the aligned indices of 175 and 176 (globals from earlier cells).
    # Note: this rebinds the globals lemma_list and lemma_decade_embdding_dict.
    lemma_list = ["der", "gefunden", "Nachmittag"]
    lemma_decade_embdding_dict = {}
    print("trajectories:")
    for lemma in lemma_list:
        print(lemma, lemma_trajectory_dict_174_175_176[lemma])
        lemma_decade_embdding_dict[lemma] = {
            174: query_embedding(index_174, lemma),
            175: query_embedding(index_aligned_175, lemma),
            176: query_embedding(index_aligned_176, lemma),
        }
    plot_lemma_and_decade(lemma_decade_embdding_dict)

### plot_lemma_and_decade_from_lemma_list

In [None]:
def plot_lemma_and_decade_from_lemma_list(lemma_list, title=None):
    """Load the persisted embeddings of the given lemmas and plot them per decade.

    Reads the decades from the module-level ``decade_list``, so the plot covers
    whatever that global currently holds.
    """
    plot_lemma_and_decade(
        load_embeddings_from_persisted_decade_data(decade_list, lemma_list),
        title=title,
    )

# Analysis

## prepare_all

In [None]:
def prepare_all():
    """Run the whole preprocessing and measurement pipeline for the analysis.

    Creates the decade list for the range 180 to 185, persists the (aligned)
    decade data, and computes the three measures (cosine similarity, trajectory,
    relative diff) between decades, each as a raw per-decade dict and as a merged
    per-lemma dict.

    Returns:
        Tuple ``(decade_list, cos_sim, cos_sim_merged, trajectory,
        trajectory_merged, relative_diff, relative_diff_merged)``.
    """
    decade_list = create_decades_list(decade_start=180, decade_end=185)
    preprocess_and_persist_decade_data(decade_list)

    cos_sim = calculate_cos_sim_between_decades(decade_list)
    cos_sim_merged = merge_global_lemma_dict(cos_sim)

    trajectory = calculate_trajectories_between_decades(decade_list)
    trajectory_merged = merge_global_lemma_dict(trajectory)

    relative_diff = calculate_relative_diff_between_decades(decade_list)
    relative_diff_merged = merge_global_lemma_dict(relative_diff)

    return (
        decade_list,
        cos_sim,
        cos_sim_merged,
        trajectory,
        trajectory_merged,
        relative_diff,
        relative_diff_merged,
    )


# Runs unconditionally (not guarded by ENABLE_TEST) and rebinds the globals used by
# all analysis cells below, including decade_list. load_cache_or_run stores the
# result under the name "prepare_all"; an existing cache file is reused as is.
(
    decade_list,
    global_lemma_cos_sim_dict,
    global_lemma_cos_sim_dict_merged,
    global_lemma_trajectory_dict,
    global_lemma_trajectory_dict_merged,
    global_lemma_relative_diff_dict,
    global_lemma_relative_diff_dict_merged,
) = load_cache_or_run(prepare_all)

## global changes analysis

### global cosine similarity changes

In [None]:
# one point per lemma: its cosine similarity averaged over all decade pairs
plot_2d_scatter(global_lemma_cos_sim_dict_merged, "global average change between decades regarding cosine similarity")

### global trajectories

In [None]:
# one point per lemma: its trajectory value averaged over all decade windows
plot_2d_scatter(global_lemma_trajectory_dict_merged, "global average change between decades regarding trajectory")

In [None]:
# one point per lemma: its relative diff averaged over all decade pairs
plot_2d_scatter(global_lemma_relative_diff_dict_merged, "global average change between decades regarding relative differences")

### filter_on_decades

In [None]:
def filter_on_decades(global_lemma_dict, num_decades=25):
    """Keep only the lemmas that have more than ``num_decades`` decade entries.

    Args:
        global_lemma_dict: dict ``lemma -> {decade key -> value}``.
        num_decades: exclusive lower bound on the number of decade entries.

    Returns:
        New dict with the surviving lemmas; the inner dicts are the same objects
        as in the input, not copies.
    """
    return {lemma: decade_dict for lemma, decade_dict in global_lemma_dict.items() if len(decade_dict) > num_decades}

In [None]:
# NOTE(review): the default num_decades=25 keeps only lemmas with more than 25 entries,
# while prepare_all builds its decade list from the range 180 to 185 - presumably far
# fewer entries per lemma; confirm that this does not filter out everything.
global_lemma_cos_sim_dict_filtered = filter_on_decades(global_lemma_cos_sim_dict)
global_lemma_cos_sim_dict_filtered_merged = merge_global_lemma_dict(global_lemma_cos_sim_dict_filtered)
plot_2d_scatter(global_lemma_cos_sim_dict_filtered_merged, "global filtered average change between decades regarding cosine similarity")

In [None]:
# NOTE(review): uses the default num_decades=25 as well; confirm it fits the number of
# decade windows produced by prepare_all.
global_lemma_trajectory_dict_filtered = filter_on_decades(global_lemma_trajectory_dict)
global_lemma_trajectory_dict_filtered_merged = merge_global_lemma_dict(global_lemma_trajectory_dict_filtered)
plot_2d_scatter(global_lemma_trajectory_dict_filtered_merged, "global filtered average change between decades regarding trajectory")

In [None]:
# here the filter is relaxed: lemmas need more than 3 decade entries
global_lemma_relative_diff_dict_filtered = filter_on_decades(global_lemma_relative_diff_dict, num_decades=3)
global_lemma_relative_diff_dict_filtered_merged = merge_global_lemma_dict(global_lemma_relative_diff_dict_filtered)
plot_2d_scatter(
    global_lemma_relative_diff_dict_filtered_merged, "global filtered average change between decades regarding relative differences"
)

### compare cos sim and trajectories differences

### normalize global diffs

In [None]:
# rescale the filtered, merged cosine similarities to [-1, 1]
global_lemma_cos_sim_dict_filtered_merged_normalized = normalize_lemma_value_dict(global_lemma_cos_sim_dict_filtered_merged)
plot_2d_scatter(
    global_lemma_cos_sim_dict_filtered_merged_normalized,
    "global normalized filtered average change between decades regarding cosine similarity",
)

In [None]:
# rescale the filtered, merged trajectory values to [-1, 1]
global_lemma_trajectory_dict_filtered_merged_normalized = normalize_lemma_value_dict(global_lemma_trajectory_dict_filtered_merged)
plot_2d_scatter(
    global_lemma_trajectory_dict_filtered_merged_normalized,
    "global normalized filtered average change between decades regarding trajectory",
)

In [None]:
# rescale the filtered, merged relative diffs to [-1, 1]; inverted like in the test cell above
global_lemma_relative_diff_dict_filtered_merged_normalized = normalize_lemma_value_dict(
    global_lemma_relative_diff_dict_filtered_merged,
    enable_inversion=True,
)
# the title previously said "trajectory" (copied from the cell above) although
# the plotted data is the relative difference
plot_2d_scatter(
    global_lemma_relative_diff_dict_filtered_merged_normalized,
    "global normalized filtered average change between decades regarding relative differences",
)

In [None]:
# per lemma: absolute difference between normalized cos sim and normalized trajectory
global_lemma_diffs_compared = create_lemma_diff_diff_dicts(
    global_lemma_cos_sim_dict_filtered_merged_normalized, global_lemma_trajectory_dict_filtered_merged_normalized
)
plot_2d_scatter(global_lemma_diffs_compared, "difference between normalized cos sim and trajectories")

In [None]:
# interleave both normalized measures ("1:" cos sim, "2:" trajectory) in the key
# order of their difference dict and plot them together
global_lemma_diffs_compared_merged = merge_lemma_diff_diff_dict(
    global_lemma_diffs_compared,
    global_lemma_cos_sim_dict_filtered_merged_normalized,
    global_lemma_trajectory_dict_filtered_merged_normalized,
)
plot_2d_scatter(global_lemma_diffs_compared_merged)

In [None]:
# random baseline: the same comparison with random values on the same key sets
random_dict_a = create_random_lemma_value_dict(global_lemma_cos_sim_dict_filtered_merged_normalized)
random_dict_b = create_random_lemma_value_dict(global_lemma_trajectory_dict_filtered_merged_normalized)
random_lemma_diff_diff_dict = create_lemma_diff_diff_dicts(random_dict_a, random_dict_b)
global_lemma_diffs_compared_merged_random = merge_lemma_diff_diff_dict(random_lemma_diff_diff_dict, random_dict_a, random_dict_b)
plot_2d_scatter(global_lemma_diffs_compared_merged_random)

### compare cos sim and relative diff differences

### average differences per decade

In [None]:
# average over all lemmas per decade pair, using the unfiltered cos sim dict
average_cos_sim_per_decade_dict = calculate_average_diff(global_lemma_cos_sim_dict)
plot_2d_scatter(average_cos_sim_per_decade_dict, draw_line=True)

In [None]:
# average over all lemmas per decade window, using the unfiltered trajectory dict
average_trajectory_per_decade_dict = calculate_average_diff(global_lemma_trajectory_dict)
plot_2d_scatter(average_trajectory_per_decade_dict, draw_line=True)

In [None]:
# average over all lemmas per decade pair, using the unfiltered relative diff dict
average_relative_diff_per_decade_dict = calculate_average_diff(global_lemma_relative_diff_dict)
plot_2d_scatter(average_relative_diff_per_decade_dict, draw_line=True)

## sample analysis

### get_lemma_by_position

In [None]:
def get_lemma_by_position(lemma_dict, index_start=0, n=10, direction_forward=True, index_start_is_percent=False):
    """Return a window of entries from ``lemma_dict`` after dropping non-word keys.

    Entries whose key is rejected by ``is_word`` are discarded first; positions
    are then counted over the surviving entries only.

    Args:
        lemma_dict: mapping whose iteration order defines the positions.
        index_start: zero-based position of the first entry to keep, or a
            percentage of the surviving entries when ``index_start_is_percent``
            is true.
        n: maximum number of entries to keep.
        direction_forward: walk the mapping in its own order when true,
            in reversed order otherwise.
        index_start_is_percent: treat ``index_start`` as a percentage.

    Returns:
        A new dict holding the selected key/value pairs in walking order.
    """
    pairs = list(lemma_dict.items())
    if not direction_forward:
        pairs.reverse()

    # Positions refer to the word-only sequence, so filter before windowing.
    word_pairs = [(lemma, value) for lemma, value in pairs if is_word(lemma)]

    if index_start_is_percent:
        index_start = int((len(word_pairs) / 100) * index_start)
    index_end = index_start + n

    return {
        lemma: value
        for position, (lemma, value) in enumerate(word_pairs)
        if index_start <= position < index_end
    }

### sample_and_plot_lemmas

In [None]:
def sample_and_plot_lemmas(
    global_lemma_cos_sim_dict,
    global_lemma_cos_sim_dict_merged,
    global_lemma_trajectory_dict,
    global_lemma_trajectory_dict_merged,
    index_start,
    n=10,
    direction_forward=True,
    index_start_is_percent=False,
    plot_limit=3,
):
    """Sample lemmas by position from both merged dicts and plot them.

    For the cosine-similarity pair and then the trajectory pair, a window of
    lemmas is selected from the merged dict via ``get_lemma_by_position``, the
    selection is printed, and the first ``plot_limit`` lemmas are plotted
    individually from the matching non-merged dict. Finally all plotted lemmas
    from both measures are handed to ``plot_lemma_and_decade_from_lemma_list``.

    Args:
        global_lemma_cos_sim_dict: per-lemma data, indexed by lemma, plotted
            for each sampled cosine-similarity lemma.
        global_lemma_cos_sim_dict_merged: dict the cosine-similarity sample is
            drawn from.
        global_lemma_trajectory_dict: per-lemma data, indexed by lemma, plotted
            for each sampled trajectory lemma.
        global_lemma_trajectory_dict_merged: dict the trajectory sample is
            drawn from.
        index_start: forwarded to ``get_lemma_by_position``.
        n: forwarded to ``get_lemma_by_position``.
        direction_forward: forwarded to ``get_lemma_by_position``.
        index_start_is_percent: forwarded to ``get_lemma_by_position``.
        plot_limit: number of sampled lemmas per measure that get plotted.

    Returns:
        None. The function only prints and plots.
    """

    def sample_and_plot_lemmas_internal(
        lemma_diff_dict,
        lemma_diff_dict_merged,
        title,
    ):
        """Sample from the merged dict, plot the first few lemmas, return their keys."""
        lemma_diff_dict_sampled = get_lemma_by_position(
            lemma_diff_dict_merged,
            index_start=index_start,
            n=n,
            direction_forward=direction_forward,
            index_start_is_percent=index_start_is_percent,
        )
        print(lemma_diff_dict_sampled)
        lemma_diff_list_sampled = list(lemma_diff_dict_sampled.keys())[:plot_limit]
        for lemma in lemma_diff_list_sampled:
            plot_2d_scatter(lemma_diff_dict[lemma], draw_line=True, title=title + ": " + lemma)
        return lemma_diff_list_sampled

    lemma_diff_list_sampled = sample_and_plot_lemmas_internal(
        global_lemma_cos_sim_dict, global_lemma_cos_sim_dict_merged, "cosine similarity"
    )
    lemma_diff_list_sampled += sample_and_plot_lemmas_internal(
        global_lemma_trajectory_dict, global_lemma_trajectory_dict_merged, "trajectory"
    )
    # De-duplicate while keeping first-seen order. A set would make the order
    # depend on string hash randomization and therefore vary between runs.
    lemma_diff_list_sampled = list(dict.fromkeys(lemma_diff_list_sampled))
    plot_lemma_and_decade_from_lemma_list(lemma_diff_list_sampled, "embeddings samples regarding cosine similarity and trajectory")

### top lemmas

In [None]:
# Sample from the start of both filtered merged dicts: 10 lemmas each, 3 plotted.
sample_and_plot_lemmas(
    global_lemma_cos_sim_dict=global_lemma_cos_sim_dict,
    global_lemma_cos_sim_dict_merged=global_lemma_cos_sim_dict_filtered_merged,
    global_lemma_trajectory_dict=global_lemma_trajectory_dict,
    global_lemma_trajectory_dict_merged=global_lemma_trajectory_dict_filtered_merged,
    index_start=0,
    n=10,
    plot_limit=3,
)

### middle lemmas

In [None]:
# Sample starting at the 50% position of both filtered merged dicts:
# 10 lemmas each, 3 plotted.
sample_and_plot_lemmas(
    global_lemma_cos_sim_dict=global_lemma_cos_sim_dict,
    global_lemma_cos_sim_dict_merged=global_lemma_cos_sim_dict_filtered_merged,
    global_lemma_trajectory_dict=global_lemma_trajectory_dict,
    global_lemma_trajectory_dict_merged=global_lemma_trajectory_dict_filtered_merged,
    index_start=50,
    index_start_is_percent=True,
    n=10,
    plot_limit=3,
)

### bottom lemmas

In [None]:
# Sample from the end of both filtered merged dicts (reverse walk):
# 10 lemmas each, 3 plotted.
sample_and_plot_lemmas(
    global_lemma_cos_sim_dict=global_lemma_cos_sim_dict,
    global_lemma_cos_sim_dict_merged=global_lemma_cos_sim_dict_filtered_merged,
    global_lemma_trajectory_dict=global_lemma_trajectory_dict,
    global_lemma_trajectory_dict_merged=global_lemma_trajectory_dict_filtered_merged,
    index_start=0,
    direction_forward=False,
    n=10,
    plot_limit=3,
)

### together

In [None]:
# Each step keeps the raw selection in `sample_diff_dict` (rebound per step) and
# stores only the selected lemma keys in the matching global_*_cos_sim list.

# top: two lemmas from the start
sample_diff_dict = get_lemma_by_position(
    lemma_dict=global_lemma_cos_sim_dict_filtered_merged,
    index_start=0,
    n=2,
)
global_top_cos_sim = list(sample_diff_dict)

# middle: two lemmas starting at the 50% position
sample_diff_dict = get_lemma_by_position(
    lemma_dict=global_lemma_cos_sim_dict_filtered_merged,
    index_start=50,
    index_start_is_percent=True,
    n=2,
)
global_middle_cos_sim = list(sample_diff_dict)

# bottom: two lemmas from the end (reverse walk)
sample_diff_dict = get_lemma_by_position(
    lemma_dict=global_lemma_cos_sim_dict_filtered_merged,
    index_start=0,
    direction_forward=False,
    n=2,
)
global_bottom_cos_sim = list(sample_diff_dict)

# together: one combined plot for all six sampled lemmas
global_together_cos_sim = [*global_top_cos_sim, *global_middle_cos_sim, *global_bottom_cos_sim]
plot_lemma_and_decade_from_lemma_list(
    global_together_cos_sim,
    "sampled via cosine similarity: " + str(global_together_cos_sim),
)

In [None]:
# Each step keeps the raw selection in `sample_diff_dict` (rebound per step) and
# stores only the selected lemma keys in the matching global_*_trajectory list.

# top: two lemmas from the start
sample_diff_dict = get_lemma_by_position(
    lemma_dict=global_lemma_trajectory_dict_filtered_merged,
    index_start=0,
    n=2,
)
global_top_trajectory = list(sample_diff_dict)

# middle: two lemmas starting at the 50% position
sample_diff_dict = get_lemma_by_position(
    lemma_dict=global_lemma_trajectory_dict_filtered_merged,
    index_start=50,
    index_start_is_percent=True,
    n=2,
)
global_middle_trajectory = list(sample_diff_dict)

# bottom: two lemmas from the end (reverse walk)
sample_diff_dict = get_lemma_by_position(
    lemma_dict=global_lemma_trajectory_dict_filtered_merged,
    index_start=0,
    direction_forward=False,
    n=2,
)
global_bottom_trajectory = list(sample_diff_dict)

# together: one combined plot for all six sampled lemmas
global_together_trajectory = [*global_top_trajectory, *global_middle_trajectory, *global_bottom_trajectory]
plot_lemma_and_decade_from_lemma_list(
    global_together_trajectory,
    "sampled via trajectory: " + str(global_together_trajectory),
)