# modules

## setup

### imports

In [None]:
import os
import pickle
import random
import time
from functools import partial
from typing import TypeAlias

import hnswlib
import hunspell
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import psycopg
import spacy
from gensim.models import Word2Vec
# from joblib import Memory
from pgvector.psycopg import register_vector
from psycopg.sql import SQL, Identifier, Placeholder
from scipy.linalg import orthogonal_procrustes
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

### config

In [None]:
# TEST gates the `if TEST:` smoke-test statements that follow most definitions.
TEST = True
# RESET_DB gates the reset_db() call that drops and recreates the tables.
RESET_DB = True

# Input folders; one file per decade is read from the word2vec and texts folders.
MODELS_WORD2VEC_FOLDER = "/veld/input/models/word2vec/"
MODELS_FASTTEXT_FOLDER = "/veld/input/models/fasttext/"
MODELS_GLOVE_FOLDER = "/veld/input/models/glove/"
TEXTS_FOLDER = "/veld/input/texts/"
# CACHE_FOLDER = "/veld/storage/cache/"

# Construction parameters passed to hnswlib's init_index.
INDEX_EF_CONSTRUCTION = 100
INDEX_M = 16

# [first, last] decade id; used as the default bounds of create_decades_list.
# NOTE(review): ids such as 183 presumably denote the 1830s — confirm.
AVAILABLE_DECADES = [147, 196]

# Seconds slept before each plot is built.
PLOT_SLEEP = 2

# memory = Memory(location="/veld/storage/cache/", verbose=0)
nlp = spacy.load("de_core_news_sm")
# Spell checker used by is_word to decide whether a token is kept.
hunspell_check = hunspell.HunSpell("/usr/share/hunspell/de_DE.dic", "/usr/share/hunspell/de_DE.aff")
random.seed(42)
pio.renderers.default = "notebook"

# Connection settings consumed by connect_db.
# NOTE(review): credentials are hard-coded here — consider reading them from the environment.
DB_NAME = "postgres_db"
DB_USER = "postgres_user"
DB_PASSWORD = "postgres_password"
DB_HOST = "veld_step_7_run_embeddings_sql_server"
DB_PORT = "5432"

# Column sets of the embedding tables and of the diff tables.
COLS_EMBEDDING = ["lemma", "occurrence_count", "embedding"]
COLS_DIFF = ["lemma", "occurrence_count", "diff"]

# Fixed lemma samples used by the smoke tests, sorted case-insensitively.
TEST_LEMMA_RANGED_LIST = sorted(["d", "und", "gehen", "wohnen", "Fürst"], key=str.lower)
TEST_LEMMA_CLOSE_LIST = sorted(["gehen", "laufen", "wandern", "wohnen", "trinken"], key=str.lower)

## helpers

### is_word

In [None]:
def is_word(word):
    """Return whether ``word`` should be kept as a real word.

    Strings that parse as an integer are rejected, as are single characters
    other than the explicitly allowed ``"d"``. Everything else is decided by
    the hunspell dictionary check.

    Args:
        word: Token to test.

    Returns:
        ``False`` for numbers and disallowed single characters, otherwise the
        result of ``hunspell_check.spell(word)``.
    """
    try:
        int(word)
    except ValueError:
        # Not a number: fall through to the length and dictionary checks.
        # Only ValueError is caught so that KeyboardInterrupt and real errors propagate.
        pass
    else:
        return False
    if len(word) == 1 and word != "d":
        return False
    return hunspell_check.spell(word)

## data structures

### create_decades_list

In [None]:
def create_decades_list(decade_start: int = AVAILABLE_DECADES[0], decade_end: int = AVAILABLE_DECADES[1]):
    """Return every decade id from ``decade_start`` to ``decade_end``, both ends included."""
    return list(range(decade_start, decade_end + 1))


# Smoke test: build the short decade range that later test statements reuse.
if TEST:
    decade_list_test = create_decades_list(183, 187)
    print(decade_list_test)

### type aliases

In [None]:
Lemma: TypeAlias = str
Decade: TypeAlias = int
LineNumber: TypeAlias = int
LemmaNumber: TypeAlias = int
OccurrenceCount: TypeAlias = int
RelativeDiff: TypeAlias = float
Trajectory: TypeAlias = float
CosSim: TypeAlias = float
Diff: TypeAlias = CosSim | Trajectory | RelativeDiff
DecadesStr: TypeAlias = str  # spans decades, e.g. "183-184"
Embedding: TypeAlias = np.ndarray

# lemma data structures
LemmaDiffDict: TypeAlias = dict[Lemma, Diff]
LineNumberDict: TypeAlias = dict[LineNumber, list[LemmaNumber]]
LemmaOccurrencePositionDict: TypeAlias = dict[Lemma, LineNumberDict]
LemmaOccurrenceCountDict: TypeAlias = dict[Lemma, OccurrenceCount]

# index data structure
IdToLemmaDict: TypeAlias = dict[int, Lemma]
LemmaToIdDict: TypeAlias = dict[Lemma, int]
Index: TypeAlias = tuple[hnswlib.Index, LemmaToIdDict, IdToLemmaDict]
Word2VecIndex: TypeAlias = Index
FastTextIndex: TypeAlias = Index
GloVeIndex: TypeAlias = Index

# Decade data
DecadeList: TypeAlias = list[Decade]
DecadeData: TypeAlias = list[Word2VecIndex, FastTextIndex, GloVeIndex, LemmaOccurrenceCountDict, LemmaOccurrencePositionDict]
DecadeDict: TypeAlias = dict[Decade, DecadeData]
DecadeLemmaDiffDict: TypeAlias = dict[DecadesStr, LemmaDiffDict]

### create_occurrence_dicts

In [None]:
def create_occurrence_dicts(decade) -> dict:
    """Count how often each lemma occurs in the text file of one decade.

    Reads ``TEXTS_FOLDER + str(decade) + ".txt"``, splits every line on
    whitespace and tallies the resulting tokens. Summary statistics are
    printed for inspection only.

    Args:
        decade: Decade identifier used to build the file name.

    Returns:
        Dict mapping each lemma to its occurrence count, in first-seen order.
    """
    print("create_occurrence_dicts: start: decade:", decade)
    lemma_occurrence_count_dict = {}
    total_occurrence_count = 0
    # NOTE(review): no explicit encoding is passed, so the platform default applies — confirm the corpus encoding.
    with open(TEXTS_FOLDER + str(decade) + ".txt", "r") as f:
        for line in f:
            # split() without arguments already discards the trailing newline.
            for lemma in line.split():
                lemma_occurrence_count_dict[lemma] = lemma_occurrence_count_dict.get(lemma, 0) + 1
                total_occurrence_count += 1
    lemma_count = len(lemma_occurrence_count_dict)
    if lemma_count:
        occurrence_avg = total_occurrence_count / lemma_count
        # The median requires sorted counts; picking the middle of the insertion order is arbitrary.
        occurrence_median = sorted(lemma_occurrence_count_dict.values())[lemma_count // 2]
    else:
        # Empty file: report zeros instead of dividing by zero.
        occurrence_avg = 0
        occurrence_median = 0
    print("create_occurrence_dicts: uniqe lemma_count:", lemma_count)
    print("create_occurrence_dicts: total_occurrence_count:", total_occurrence_count)
    print("create_occurrence_dicts: occurrence_avg:", occurrence_avg)
    print("create_occurrence_dicts: occurrence_median:", occurrence_median)
    return lemma_occurrence_count_dict


# Smoke test: count lemma occurrences for every test decade; the result is reused when loading word2vec models.
if TEST:
    decade_lemma_occurrence_count_dict = {}
    for decade in decade_list_test:
        decade_lemma_occurrence_count_dict[decade] = create_occurrence_dicts(decade)

### sort_dict_by_value

In [None]:
def sort_dict_by_value(key_value_dict: dict, desc=True) -> dict:
    """Return a new dict holding the items of ``key_value_dict`` ordered by value.

    Args:
        key_value_dict: Mapping whose values are mutually comparable.
        desc: Sort from largest to smallest when true, smallest to largest otherwise.

    Returns:
        A new dict; items with equal values keep their original relative order.
    """
    # `reverse=` works for any orderable value; multiplying by -1 only makes sense for numbers
    # (and silently turns strings into "" instead of failing).
    return dict(sorted(key_value_dict.items(), key=lambda item: item[1], reverse=bool(desc)))


# Smoke test: ascending sort of a three-item dict.
if TEST:
    print(
        sort_dict_by_value(
            {
                "x": 3,
                "y": 2,
                "z": 4,
            },
            desc=False,
        )
    )

## DB functions

### connect_db

In [None]:
def connect_db(create_cursor=True):
    """Open an autocommit connection using the ``DB_*`` settings.

    Args:
        create_cursor: When true, also open a cursor and print the server version.

    Returns:
        Tuple ``(conn, cursor)``; ``cursor`` is ``None`` if ``create_cursor`` is false.
    """
    connection = psycopg.connect(
        host=DB_HOST,
        port=DB_PORT,
        dbname=DB_NAME,
        user=DB_USER,
        password=DB_PASSWORD,
    )
    connection.autocommit = True
    if not create_cursor:
        return connection, None
    db_cursor = connection.cursor()
    # Round trip to the server as a connectivity check.
    db_cursor.execute("SELECT version();")
    print("connected to:", db_cursor.fetchone())
    return connection, db_cursor


# Module-level connection and cursor used by the DB helper functions.
conn, cursor = connect_db()

### drop_table

In [None]:
def drop_table(table_name):
    """Drop ``table_name`` (and dependent objects) if it exists.

    Args:
        table_name: Name of the table; it is quoted as an SQL identifier.
    """
    print("drop_table: table_name:", table_name)
    # Identifier() quotes and escapes the name, so embedded quotes cannot break the statement.
    cursor.execute(SQL("DROP TABLE IF EXISTS {} CASCADE;").format(Identifier(table_name)))


# Smoke test: make sure no stale "test" table is left over.
if TEST:
    drop_table("test")

### create_embeddings_table

In [None]:
def create_embeddings_table(table_name, embedding_dim=300):
    """Create an embeddings table if it does not exist yet.

    The table has the columns ``lemma`` (primary key), ``occurrence_count``
    and a non-null pgvector ``embedding``.

    Args:
        table_name: Name of the table; quoted as an SQL identifier, matching
            how insert_to_db, drop_table and query_generic address tables.
        embedding_dim: Dimension of the ``VECTOR`` column.
    """
    print("create_embeddings_table: table_name:", table_name)
    query = SQL(
        "CREATE TABLE IF NOT EXISTS {table_name} ("
        "lemma TEXT PRIMARY KEY, "
        "occurrence_count INTEGER, "
        "embedding VECTOR({embedding_dim}) not null"
        ");"
    ).format(
        table_name=Identifier(table_name),
        # A type modifier cannot be a bound parameter; int() guarantees only digits are inlined.
        embedding_dim=SQL(str(int(embedding_dim))),
    )
    cursor.execute(query)


# Smoke test: create the "test" embeddings table.
if TEST:
    create_embeddings_table("test")

### create_diff_table

In [None]:
def create_diff_table(table_name):
    """Create a diff table if it does not exist yet.

    The table has the columns ``lemma`` (primary key), ``occurrence_count``
    and ``diff``.

    Args:
        table_name: Name of the table; quoted as an SQL identifier, matching
            how insert_to_db, drop_table and query_generic address tables.
    """
    print("create_diff_table: table_name:", table_name)
    query = SQL(
        "CREATE TABLE IF NOT EXISTS {table_name} ("
        "lemma TEXT PRIMARY KEY, "
        "occurrence_count INTEGER, "
        "diff REAL"
        ");"
    ).format(table_name=Identifier(table_name))
    cursor.execute(query)


# Smoke test: create the "test_diff" table.
if TEST:
    create_diff_table("test_diff")

### insert_to_db

In [None]:
def insert_to_db(table_name, table_data, cols=COLS_EMBEDDING):
    """Bulk-insert rows into ``table_name``, skipping lemmas that already exist.

    Args:
        table_name: Target table.
        table_data: Sequence of rows; each row supplies one value per entry of ``cols``.
        cols: Column names, in the order of the values inside each row.
    """
    print("insert_to_db: table_name:", table_name, "len(table_data):", len(table_data))
    column_identifiers = [Identifier(column) for column in cols]
    value_placeholders = [Placeholder() for _ in column_identifiers]
    insert_statement = SQL("INSERT INTO {table_name} ({cols}) VALUES ({values}) ON CONFLICT(lemma) DO NOTHING").format(
        table_name=Identifier(table_name),
        cols=SQL(", ").join(column_identifiers),
        values=SQL(", ").join(value_placeholders),
    )
    print(insert_statement.as_string())
    cursor.executemany(insert_statement, table_data)


# Smoke test: insert two rows whose 300-element embeddings are built by repeating three numbers.
if TEST:
    table_data = [
        ["gehen", 25, [4.3, 1.2, 0.3] * 100],
        ["laufen", 17, [3.2, 1.7, 2.5] * 100],
    ]
    insert_to_db("test", table_data)

### reset_db

In [None]:
def reset_db(decade_list):
    """Drop every table in the public schema and recreate empty embeddings tables.

    One table named ``<model>__<decade>`` is created per model and decade.

    Args:
        decade_list: Decades for which tables are created.
    """
    cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;")
    cursor.execute("SELECT tablename FROM pg_tables WHERE schemaname = 'public';")
    existing_table_names = [row[0] for row in cursor.fetchall()]
    for existing_table_name in existing_table_names:
        drop_table(existing_table_name)
    model_names = ("word2vec", "fasttext", "glove")
    for decade in decade_list:
        for model_name in model_names:
            create_embeddings_table(f"{model_name}__{decade}")


# Destructive: wipes the public schema and recreates the tables for all available decades.
if RESET_DB:
    decade_list_all = create_decades_list()
    reset_db(decade_list_all)

### register_vector_conn

In [None]:
def register_vector_conn():
    """Replace the module-level connection with one that has the vector type registered.

    Closes the current global cursor and connection, reconnects, calls
    ``register_vector`` on the new connection and opens a fresh cursor.

    Returns:
        Tuple ``(conn, cursor)`` of the new global connection and cursor.
    """
    global conn, cursor
    cursor.close()
    conn.close()
    # Reconnect without a cursor so the cursor is created only after registration.
    conn = connect_db(create_cursor=False)[0]
    register_vector(conn)
    cursor = conn.cursor()
    return conn, cursor


# Swap the module-level connection for one on which register_vector has been called.
conn, cursor = register_vector_conn()

### load_word2vec_to_db

In [None]:
def load_word2vec_to_db(decade, lemma_occurrence_count_dict):
    """Load a decade's word2vec model and store its word embeddings in the DB.

    Only vocabulary entries accepted by ``is_word`` are stored. Each embedding
    is L2-normalized before insertion into ``word2vec__<decade>``.

    Args:
        decade: Decade identifier; selects the model file and the target table.
        lemma_occurrence_count_dict: Occurrence count per lemma; must contain
            every stored lemma (a missing lemma raises ``KeyError``).
    """
    print("load_word2vec_to_db: start: decade:", decade)
    model_path = MODELS_WORD2VEC_FOLDER + str(decade) + ".bin"
    model = Word2Vec.load(model_path)
    db_insertion_list = []
    for lemma in model.wv.index_to_key:
        if not is_word(lemma):
            continue
        embedding = model.wv[lemma]
        embedding_norm = np.linalg.norm(embedding)
        if embedding_norm == 0:
            # A zero vector cannot be normalized; dividing would fill the row with NaN.
            continue
        db_insertion_list.append((lemma, lemma_occurrence_count_dict[lemma], embedding / embedding_norm))
    insert_to_db(f"word2vec__{decade}", db_insertion_list)


# Smoke test: load the word2vec model of every test decade into its table.
if TEST:
    for decade, lemma_occurrence_count_dict in decade_lemma_occurrence_count_dict.items():
        load_word2vec_to_db(decade, lemma_occurrence_count_dict)

### query_generic

In [None]:
def query_generic(table_name, lemma_list=None, select_cols=COLS_EMBEDDING, order_by=None, order_desc=False, print_query=True):
    """Select columns from a table, optionally filtered by lemma and ordered.

    Args:
        table_name: Table to read from.
        lemma_list: ``None`` selects all rows; otherwise only rows whose lemma
            is contained in the list. An empty list selects nothing.
        select_cols: Columns to return.
        order_by: Optional column to order by.
        order_desc: Order descending instead of ascending.
        print_query: Print the composed SQL before running it.

    Returns:
        List of row tuples, or a flat list of values when exactly one column
        is selected.
    """
    if lemma_list is not None and len(lemma_list) == 0:
        # "No lemma requested" must not degrade into "no filter", which would return the whole table.
        return []
    query = SQL("SELECT {select_cols} FROM {table_name}").format(
        select_cols=SQL(", ").join([Identifier(c) for c in select_cols]),
        table_name=Identifier(table_name),
    )
    params = {}
    if lemma_list is not None:
        query += SQL(" WHERE lemma = ANY({lemma_list})").format(lemma_list=Placeholder("lemma_list"))
        params["lemma_list"] = lemma_list
    if order_by:
        query += SQL(" ORDER BY {order_col}").format(order_col=Identifier(order_by))
        if order_desc:
            query += SQL(" DESC")
    if print_query:
        print(query.as_string())
    cursor.execute(query=query, params=params)
    result = cursor.fetchall()
    if len(select_cols) == 1:
        # Unwrap single-column rows so callers get plain values.
        result = [r[0] for r in result]
    return result


# Smoke test: unfiltered, single-lemma, two-lemma and single-column ordered queries.
if TEST:
    print(len(query_generic("word2vec__185")))
    print(len(query_generic("word2vec__185", ["gehen"])))
    print(len(query_generic("word2vec__185", ["gehen", "laufen"])))
    print(len(query_generic("word2vec__185", ["gehen"], ["lemma"], "lemma", order_desc=True)))

### query_related

In [None]:
def query_related(table_name, lemma, n=10, select_cols=["lemma", "cos_sim"]):
    """Return the ``n`` lemmas most similar to ``lemma`` within one table.

    Similarity is ``1 - (a.embedding <=> b.embedding)``, computed in the
    database against every other row of the table.

    Args:
        table_name: Embeddings table to search.
        lemma: Reference lemma; it is excluded from the result.
        n: Maximum number of rows returned.
        select_cols: Columns to return, chosen from ``lemma``, ``embedding``
            and ``cos_sim``. The default list is never mutated here.

    Returns:
        List of row tuples ordered by descending similarity.
    """
    query = SQL(
        """
        WITH similarities AS (
            SELECT a.lemma, a.embedding, 1 - (a.embedding <=> b.embedding) AS cos_sim
            FROM {table_name} a
            CROSS JOIN (
                SELECT embedding FROM {table_name}
                WHERE lemma = {lemma}
            ) AS b
            WHERE a.lemma != {lemma}
        )
        SELECT {select_cols}
        FROM similarities
        ORDER BY cos_sim DESC
        LIMIT {n};
        """
    ).format(
        table_name=Identifier(table_name),
        # Values are bound as parameters so lemmas containing quotes cannot break the statement.
        lemma=Placeholder("lemma"),
        select_cols=SQL(", ").join([Identifier(c) for c in select_cols]),
        n=Placeholder("n"),
    )
    cursor.execute(query, {"lemma": lemma, "n": n})
    return cursor.fetchall()


# Smoke test: default call, custom n, and selecting only the embedding column.
if TEST:
    print(query_related("word2vec__185", "gehen"))
    print(query_related("word2vec__185", "Frau", n=5))
    print(query_related("word2vec__185", "Frau", n=1, select_cols=["embedding"])[0][0].shape)

### query_mutual_lemmas

In [None]:
def query_mutual_lemmas(table_list, include_count=True, print_query=True):
    """Return the lemmas that occur in every table of ``table_list``.

    The tables are inner-joined on ``lemma``.

    Args:
        table_list: Non-empty list of table names.
        include_count: Also return the sum of ``occurrence_count`` over all tables.
        print_query: Print the composed SQL before running it.

    Returns:
        List of ``(lemma, summed_occurrence_count)`` tuples when
        ``include_count`` is true, otherwise a flat list of lemmas.
    """
    select_parts = [SQL("lemma")]
    if include_count:
        select_parts.append(SQL(" + ").join([Identifier(t, "occurrence_count") for t in table_list]))
    # Identifiers are quoted consistently with the other query helpers.
    join_part = Identifier(table_list[0])
    for table in table_list[1:]:
        join_part += SQL(" INNER JOIN {} USING (lemma)").format(Identifier(table))
    query = SQL("SELECT {select_part} FROM {join_part};").format(
        select_part=SQL(", ").join(select_parts),
        join_part=join_part,
    )
    if print_query:
        print("query_mutual_lemmas: query:", query.as_string(cursor))
    cursor.execute(query)
    result = cursor.fetchall()
    if not include_count:
        result = [r[0] for r in result]
    return result


# Smoke test: one, two and three tables, with and without the occurrence count.
if TEST:
    table_list_list = [
        [["word2vec__183"], True],
        [["word2vec__183", "word2vec__184"], True],
        [["word2vec__183", "word2vec__184", "word2vec__185"], False],
    ]
    for table_list, include_count in table_list_list:
        r = query_mutual_lemmas(table_list, include_count)
        print(len(r))
        print(r[0:50])

### query_average_occurrence_count

In [None]:
def query_average_occurrence_count(table_prefix, decade_list):
    """Average each lemma's occurrence count over the decades in which it appears.

    Args:
        table_prefix: Table name prefix; tables are named ``<prefix>__<decade>``.
        decade_list: Decades whose tables are read.

    Returns:
        Dict mapping lemma to its truncated integer average, sorted descending.
    """
    counts_per_lemma = {}
    for decade in decade_list:
        rows = query_generic(f"{table_prefix}__{decade}", select_cols=["lemma", "occurrence_count"])
        for lemma, occurrence_count in rows:
            counts_per_lemma.setdefault(lemma, []).append(occurrence_count)
    averages = {}
    for lemma, counts in counts_per_lemma.items():
        averages[lemma] = int(sum(counts) / len(counts))
    return sort_dict_by_value(averages)


# Smoke test: show the ten lemmas with the highest average occurrence count.
if TEST:
    lemma_count_average_dict = query_average_occurrence_count("word2vec", decade_list_test)
    print(list(lemma_count_average_dict.items())[:10])

### query_over_mutual

In [None]:
def query_over_mutual(table_name_list, select_cols=COLS_EMBEDDING):
    """Iterate in lockstep over the rows of lemmas shared by all given tables.

    Every table is queried for the shared lemmas, ordered by lemma, so the
    rows at the same position belong to the same lemma.

    Args:
        table_name_list: Tables to read.
        select_cols: Columns returned for each row.

    Returns:
        A ``zip`` iterator yielding one tuple per shared lemma, holding that
        lemma's row from each table in the order of ``table_name_list``.
    """
    print("query_over_mutual: start: table_name_list:", table_name_list)
    common_lemma = query_mutual_lemmas(table_name_list, include_count=False)
    if not common_lemma:
        # Nothing is shared: return an empty iterator instead of pairing unrelated rows.
        return zip()
    embeddings_table_list = [
        query_generic(table_name, common_lemma, select_cols=select_cols, order_by="lemma")
        for table_name in table_name_list
    ]
    return zip(*embeddings_table_list)


# Smoke test: print lemma, count and embedding shape of the first eleven shared rows.
if TEST:
    for count, x in enumerate(query_over_mutual(["word2vec__183", "word2vec__184"])):
        a = x[0]
        b = x[1]
        print(a[0:2], a[2].shape, b[0:2], b[2].shape)
        if count == 10:
            break

## difference analysis functions

### calculate_cos_sim

In [None]:
def calculate_cos_sim(embedding_a: np.ndarray, embedding_b: np.ndarray) -> float:
    """Return the cosine similarity of two vectors (dot product over the product of their norms)."""
    dot_product = np.dot(embedding_a, embedding_b)
    norm_product = np.linalg.norm(embedding_a) * np.linalg.norm(embedding_b)
    return dot_product / norm_product

### calculate_cos_distance

In [None]:
def calculate_cos_distance(embedding_a: np.ndarray, embedding_b: np.ndarray) -> float:
    """Return the cosine distance of two vectors, i.e. one minus their cosine similarity."""
    similarity = calculate_cos_sim(embedding_a, embedding_b)
    return 1 - similarity

### calculate_tsne

In [None]:
def calculate_tsne(embeddings, perplexity=None):
    """Reduce embeddings to two dimensions with t-SNE.

    Args:
        embeddings: Sequence of at least two equally sized vectors.
        perplexity: t-SNE perplexity. When ``None`` it defaults to 5, lowered
            to ``len(embeddings) - 1`` for fewer than six samples.

    Returns:
        The array produced by ``TSNE.fit_transform`` with ``n_components=2``.

    Raises:
        ValueError: If fewer than two embeddings are given.
    """
    sample_count = len(embeddings)
    if sample_count < 2:
        # The default perplexity would become 0 or negative, which TSNE rejects with a less helpful message.
        raise ValueError(f"calculate_tsne: need at least 2 embeddings, got {sample_count}")
    if perplexity is None:
        perplexity = min(5, sample_count - 1)
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
    embeddings_reduced = tsne.fit_transform(np.array(embeddings))
    return embeddings_reduced


# Smoke test: reduce the embeddings of the close-lemma sample to 2D.
if TEST:
    embeddings_reduced = calculate_tsne(query_generic("word2vec__185", TEST_LEMMA_CLOSE_LIST, select_cols=["embedding"]))
    print(embeddings_reduced)

### calculate_trajectory_between_vectors

In [None]:
def calculate_trajectory_between_vectors(vector_a, vector_b, vector_c):
    """Return the dot product of the two consecutive difference vectors ``a - b`` and ``b - c``.

    The result is positive when both steps point in a similar direction,
    zero when one step is null or they are orthogonal, and negative when the
    second step reverses the first.
    """
    return np.dot(vector_a - vector_b, vector_b - vector_c)


# Smoke test: a continuing, a stalled and a reversed second step.
if TEST:
    vector_list_list = [
        (np.array([1, 2]), np.array([2, 3]), np.array([2, 5])),
        (np.array([1, 2]), np.array([2, 3]), np.array([2, 3])),
        (np.array([1, 2]), np.array([2, 3]), np.array([1, 2])),
    ]
    for vector_list in vector_list_list:
        print(calculate_trajectory_between_vectors(*vector_list))

### calculate_procrustes_alignment

In [None]:
def calculate_procrustes_alignment(table_name_a, table_name_b, table_aligned_name_b):
    """Rotate the embeddings of table B into the space of table A and store them.

    The rotation is fitted on the lemmas shared by both tables, each weighted
    by ``log1p`` of its mean occurrence count. It is then applied to every
    row of table B; the rotated vectors are re-normalized and written to
    ``table_aligned_name_b`` (dropped and recreated first).

    Args:
        table_name_a: Reference embeddings table.
        table_name_b: Embeddings table to align.
        table_aligned_name_b: Table that receives the aligned embeddings.
    """
    print("calculate_procrustes_alignment: start")
    weighted_rows_a = []
    weighted_rows_b = []
    # Rows are (lemma, occurrence_count, embedding).
    for row_a, row_b in query_over_mutual([table_name_a, table_name_b]):
        weight = np.log1p((row_a[1] + row_b[1]) / 2)
        weighted_rows_a.append(row_a[2] * weight)
        weighted_rows_b.append(row_b[2] * weight)
    anchor_matrix_a = np.stack(weighted_rows_a)
    anchor_matrix_b = np.stack(weighted_rows_b)

    # Fit the orthogonal map from B onto A, then apply it to all of B.
    rotation, _ = orthogonal_procrustes(anchor_matrix_b, anchor_matrix_a)
    rows_b = query_generic(table_name_b, select_cols=COLS_EMBEDDING, order_by="lemma")
    aligned = np.stack([row[2] for row in rows_b]) @ rotation
    aligned_normalized = aligned / np.linalg.norm(aligned, axis=1, keepdims=True)
    print("calculate_procrustes_alignment: matrix_b_aligned.shape:", aligned.shape)
    db_insertion_data = [[row[0], row[1], aligned_vector] for row, aligned_vector in zip(rows_b, aligned_normalized)]
    drop_table(table_aligned_name_b)
    create_embeddings_table(table_aligned_name_b)
    insert_to_db(table_aligned_name_b, db_insertion_data)


# Smoke test: chain-align 185 onto 184 and 186 onto the aligned 185, then compare
# cosine similarities of the sample lemmas before and after alignment.
if TEST:
    calculate_procrustes_alignment("word2vec__184", "word2vec__185", "word2vec__185__aligned")
    calculate_procrustes_alignment("word2vec__185__aligned", "word2vec__186", "word2vec__186__aligned")
    _query_test_lemmas = partial(
        query_generic,
        lemma_list=TEST_LEMMA_RANGED_LIST,
        select_cols=["lemma", "embedding"],
        order_by="lemma",
    )
    embeddings_table_184 = _query_test_lemmas("word2vec__184")
    embeddings_table_185 = _query_test_lemmas("word2vec__185")
    embeddings_table_186 = _query_test_lemmas("word2vec__186")
    embeddings_table_185_aligned = _query_test_lemmas("word2vec__185__aligned")
    embeddings_table_186_aligned = _query_test_lemmas("word2vec__186__aligned")
    # Look embeddings up by lemma: zipping the result lists positionally would pair
    # different lemmas as soon as one lemma is missing from one of the tables.
    lemma_embedding_dict_list = [
        dict(embeddings_table)
        for embeddings_table in (
            embeddings_table_184,
            embeddings_table_185,
            embeddings_table_186,
            embeddings_table_185_aligned,
            embeddings_table_186_aligned,
        )
    ]
    for lemma in TEST_LEMMA_RANGED_LIST:
        if not all(lemma in lemma_embedding_dict for lemma_embedding_dict in lemma_embedding_dict_list):
            print(lemma, "skipped: not present in every table")
            continue
        e_184, e_185, e_186, e_185_aligned, e_186_aligned = (
            lemma_embedding_dict[lemma] for lemma_embedding_dict in lemma_embedding_dict_list
        )
        print(lemma)
        print("184-185:", calculate_cos_sim(e_184, e_185))
        print("185-186:", calculate_cos_sim(e_185, e_186))
        print("184-185_aligned:", calculate_cos_sim(e_184, e_185_aligned))
        print("185_aligned-186_aligned:", calculate_cos_sim(e_185_aligned, e_186_aligned))

### calculate_cos_sim_between_tables

In [None]:
def calculate_cos_sim_between_tables(table_name_a, table_name_b, table_name_diff):
    """Store the cosine similarity between two tables for every shared lemma.

    Each output row holds the lemma, the truncated mean of both occurrence
    counts and the cosine similarity of the two embeddings. The diff table is
    dropped and recreated before insertion.

    Args:
        table_name_a: First embeddings table.
        table_name_b: Second embeddings table.
        table_name_diff: Diff table that receives the result.
    """
    print(
        "calculate_cos_sim_between_tables: start: (table_name_a, table_name_b, table_name_diff)",
        (table_name_a, table_name_b, table_name_diff),
    )
    # Rows are (lemma, occurrence_count, embedding).
    db_insertion_data = [
        (row_a[0], int((row_a[1] + row_b[1]) / 2), calculate_cos_sim(row_a[2], row_b[2]))
        for row_a, row_b in query_over_mutual([table_name_a, table_name_b])
    ]
    print("calculate_cos_sim_between_tables: len(db_insertion_data):", len(db_insertion_data))
    drop_table(table_name_diff)
    create_diff_table(table_name_diff)
    insert_to_db(table_name_diff, db_insertion_data, cols=["lemma", "occurrence_count", "diff"])


# Smoke test: cosine similarity between decade 184 and the aligned decade 185.
if TEST:
    calculate_cos_sim_between_tables("word2vec__184", "word2vec__185__aligned", "word2vec__diff__cos_sim__184_185")

### calculate_trajectory_between_tables

In [None]:
def calculate_trajectory_between_tables(table_name_a, table_name_b, table_name_c, table_name_diff):
    """Store the trajectory value across three tables for every shared lemma.

    Each output row holds the lemma, the truncated mean of the three
    occurrence counts and the result of
    ``calculate_trajectory_between_vectors`` on the three embeddings. The
    diff table is dropped and recreated before insertion.

    Args:
        table_name_a: First embeddings table.
        table_name_b: Second embeddings table.
        table_name_c: Third embeddings table.
        table_name_diff: Diff table that receives the result.
    """
    print(
        "calculate_trajectory_between_tables: start: (table_name_a, table_name_b, table_name_c, table_name_diff):",
        (table_name_a, table_name_b, table_name_c, table_name_diff),
    )
    # Rows are (lemma, occurrence_count, embedding).
    db_insertion_data = [
        (
            row_a[0],
            int((row_a[1] + row_b[1] + row_c[1]) / 3),
            calculate_trajectory_between_vectors(row_a[2], row_b[2], row_c[2]),
        )
        for row_a, row_b, row_c in query_over_mutual([table_name_a, table_name_b, table_name_c])
    ]
    # The log label now names this function (it previously claimed to be calculate_cos_sim_between_tables).
    print("calculate_trajectory_between_tables: len(db_insertion_data):", len(db_insertion_data))
    drop_table(table_name_diff)
    create_diff_table(table_name_diff)
    insert_to_db(table_name_diff, db_insertion_data, cols=["lemma", "occurrence_count", "diff"])


# Smoke test: trajectory over decade 184 and the aligned decades 185 and 186.
if TEST:
    calculate_trajectory_between_tables(
        "word2vec__184", "word2vec__185__aligned", "word2vec__186__aligned", "word2vec__diff__trajectory__184_186"
    )

### calculate_relative_diff_between_tables

In [None]:
def calculate_relative_diff_between_tables(table_name_a, table_name_b, table_name_diff):
    """Store, per shared lemma, how much its neighbourhood similarities differ between two tables.

    For every lemma present in both tables, the 100 most similar lemmas are
    fetched from each table. The diff is the sum of absolute similarity
    differences over the neighbours that occur in both result sets. The diff
    table is dropped and recreated before insertion.

    Args:
        table_name_a: First embeddings table.
        table_name_b: Second embeddings table.
        table_name_diff: Diff table that receives the result.
    """
    print("calculate_relative_diff_between_tables: start: (table_name_a, table_name_b):", (table_name_a, table_name_b))
    db_insertion_data = []
    # NOTE: this issues two similarity queries per shared lemma.
    for lemma, occurrence_count_sum in query_mutual_lemmas([table_name_a, table_name_b], include_count=True):
        similarity_dict_a = dict(query_related(table_name_a, lemma, n=100))
        similarity_dict_b = dict(query_related(table_name_b, lemma, n=100))
        common_related_lemma = similarity_dict_a.keys() & similarity_dict_b.keys()
        diff = sum(abs(similarity_dict_a[related] - similarity_dict_b[related]) for related in common_related_lemma)
        # occurrence_count_sum is the sum over both tables, so halving gives the mean.
        db_insertion_data.append((lemma, int(occurrence_count_sum / 2), diff))
    # The log label now names this function (it previously claimed to be calculate_cos_sim_between_tables).
    print("calculate_relative_diff_between_tables: len(db_insertion_data):", len(db_insertion_data))
    drop_table(table_name_diff)
    create_diff_table(table_name_diff)
    insert_to_db(table_name_diff, db_insertion_data, cols=["lemma", "occurrence_count", "diff"])


# Smoke test: relative diff between the unaligned decades 184 and 185.
if TEST:
    calculate_relative_diff_between_tables("word2vec__184", "word2vec__185", "word2vec__diff__relative__184_185")

### normalize_diff_table

In [None]:
def normalize_diff_table(diff_table_name, diff_table_name_new, invert=False):
    """Rescale the ``diff`` column of a diff table linearly to the range [-1, 1].

    The smallest diff maps to -1 and the largest to 1 (swapped when
    ``invert`` is true). The result is written to ``diff_table_name_new``,
    which is dropped and recreated first.

    Args:
        diff_table_name: Source diff table.
        diff_table_name_new: Target diff table.
        invert: Multiply every normalized value by -1.
    """
    print("normalize_diff_table: start: table_name_diff:", diff_table_name)
    diff_table = query_generic(diff_table_name, select_cols=COLS_DIFF, order_by="diff")
    diff_table_new = []
    if diff_table:
        diff_value_list = [diff_row[2] for diff_row in diff_table]
        min_diff = min(diff_value_list)
        max_diff = max(diff_value_list)
        print("normalize_diff_table: (min_diff, max_diff):", (min_diff, max_diff))
        diff_range = max_diff - min_diff
        # With a single distinct diff there is nothing to scale; place every value at the centre of the range.
        scale = 2 / diff_range if diff_range != 0 else None
        for diff_row in diff_table:
            value = 0.0 if scale is None else ((diff_row[2] - min_diff) * scale) - 1
            if invert:
                value *= -1
            diff_table_new.append((diff_row[0], diff_row[1], value))
    print("normalize_diff_table: len(diff_table_new):", len(diff_table_new))
    drop_table(diff_table_name_new)
    create_diff_table(diff_table_name_new)
    insert_to_db(diff_table_name_new, diff_table_new, cols=COLS_DIFF)


# Smoke test: normalize all three diff tables; only the relative diff is inverted.
if TEST:
    normalize_diff_table("word2vec__diff__cos_sim__184_185", "word2vec__diff__cos_sim__normalized__184_185")
    normalize_diff_table("word2vec__diff__trajectory__184_186", "word2vec__diff__trajectory__normalized__184_186")
    normalize_diff_table("word2vec__diff__relative__184_185", "word2vec__diff__relative__normalized__184_185", invert=True)

## plotting

### plot_tsne_from_labels_embeddings

In [None]:
def plot_tsne_from_labels_embeddings(
    label_embedding_list: list[str, Embedding],
    title: str = None,
    height: int = None,
    width: int = None,
    rotation_degree: int = None,
    perplexity: int = None,
):
    """Show a labelled 2D t-SNE scatter plot of the given embeddings.

    Args:
        label_embedding_list: Iterable of ``(label, embedding)`` pairs.
        title: Plot title.
        height: Plot height in pixels; 800 when ``None``.
        width: Plot width in pixels; 800 when ``None``.
        rotation_degree: Optional angle in degrees by which the reduced points are rotated.
        perplexity: Passed through to ``calculate_tsne``.
    """
    pairs = list(label_embedding_list)
    labels = [label for label, _ in pairs]
    embeddings = [embedding for _, embedding in pairs]
    points = calculate_tsne(embeddings, perplexity)

    if rotation_degree:
        theta = np.deg2rad(-rotation_degree)
        cos_theta, sin_theta = np.cos(theta), np.sin(theta)
        rotation_matrix = np.array([[cos_theta, -sin_theta], [sin_theta, cos_theta]])
        points = points @ rotation_matrix.T

    time.sleep(PLOT_SLEEP)
    fig = px.scatter(
        x=points[:, 0],
        y=points[:, 1],
        text=labels,
        height=800 if height is None else height,
        width=800 if width is None else width,
        title=title,
    )
    # The axes carry no meaning after t-SNE, so titles and tick labels are hidden.
    fig.update_layout(xaxis=dict(title=None, showticklabels=False), yaxis=dict(title=None, showticklabels=False))
    fig.update_traces(
        marker=dict(size=10),
        textposition="bottom center",
        textfont=dict(size=12),
    )
    fig.show()


# Smoke test: plot the close-lemma sample of decade 185.
if TEST:
    lemma_embedding_list = query_generic("word2vec__185", lemma_list=TEST_LEMMA_CLOSE_LIST, select_cols=["lemma", "embedding"])
    plot_tsne_from_labels_embeddings(lemma_embedding_list)

### plot_tsne_from_lemma_and_related

In [None]:
def plot_tsne_from_lemma_and_related(
    table_name,
    lemma: str = None,
    n: int = 100,
    title: str = None,
    height: int = None,
    width: int = None,
    rotation_degree: int = None,
):
    """Show a t-SNE plot of the ``n`` lemmas most similar to ``lemma``.

    Args:
        table_name: Embeddings table to search.
        lemma: Reference lemma whose neighbours are plotted.
        n: Number of neighbours to fetch.
        title: Plot title.
        height: Plot height in pixels.
        width: Plot width in pixels.
        rotation_degree: Optional rotation of the plotted points in degrees.
    """
    # Forward the layout options; they were previously accepted but silently ignored.
    plot_tsne_from_labels_embeddings(
        query_related(table_name, lemma, select_cols=["lemma", "embedding"], n=n),
        title=title,
        height=height,
        width=width,
        rotation_degree=rotation_degree,
    )


# Smoke test: plot the 100 nearest neighbours of "gehen" in decade 185.
if TEST:
    plot_tsne_from_lemma_and_related("word2vec__185", lemma="gehen", n=100)

### plot_2d_scatter

In [None]:
def plot_2d_scatter(data: dict | list[list], title: str = None, draw_line=False):
    """Show a scatter plot of key/value pairs.

    Args:
        data: Either a mapping or an iterable of ``(key, value)`` pairs; keys
            go on the x axis, values on the y axis.
        title: Plot title.
        draw_line: Connect the markers with lines.
    """
    time.sleep(PLOT_SLEEP)
    # isinstance also accepts dict subclasses, which `type(data) is dict` sent down the pair-iteration path.
    pairs = data.items() if isinstance(data, dict) else data
    key_list = []
    value_list = []
    for key, value in pairs:
        key_list.append(key)
        value_list.append(value)
    fig = px.scatter(x=key_list, y=value_list, title=title)
    if draw_line:
        fig.update_traces(mode="lines+markers")
    fig.update_layout(xaxis_title=None, yaxis_title=None)
    fig.show()


# Smoke test: plot the three normalized diff tables, each sorted by descending diff.
# The remaining plots are disabled; several of them reference dicts that are not defined in this file.
if TEST:

    # general lemma occurrence count
    # plot_2d_scatter(query_average_occurrence_count("word2vec", decade_list_test), "lemma occurrence count in 183")

    # base differences
    # word2vec__diff__cos_sim__184_185 = query_generic(
    #     "word2vec__diff__cos_sim__184_185",
    #     select_cols=["lemma", "diff"],
    #     order_by="diff",
    #     order_desc=True,
    # )
    # word2vec__diff__trajectory__184_186 = query_generic(
    #     "word2vec__diff__trajectory__184_186",
    #     select_cols=["lemma", "diff"],
    #     order_by="diff",
    #     order_desc=True,
    # )
    # word2vec__diff__relative__184_185 = query_generic(
    #     "word2vec__diff__relative__184_185",
    #     select_cols=["lemma", "diff"],
    #     order_by="diff",
    #     order_desc=False,
    # )
    # plot_2d_scatter(word2vec__diff__cos_sim__184_185, "word2vec__diff__cos_sim__184_185")
    # plot_2d_scatter(word2vec__diff__trajectory__184_186, "word2vec__diff__trajectory__184_186")
    # plot_2d_scatter(word2vec__diff__relative__184_185, "word2vec__diff__relative__184_185")

    # # normalized differences
    word2vec__diff__cos_sim__normalized__184_185 = query_generic(
        "word2vec__diff__cos_sim__normalized__184_185",
        select_cols=["lemma", "diff"],
        order_by="diff",
        order_desc=True,
    )
    word2vec__diff__trajectory__normalized__184_186 = query_generic(
        "word2vec__diff__trajectory__normalized__184_186",
        select_cols=["lemma", "diff"],
        order_by="diff",
        order_desc=True,
    )
    word2vec__diff__relative__normalized__184_185 = query_generic(
        "word2vec__diff__relative__normalized__184_185",
        select_cols=["lemma", "diff"],
        order_by="diff",
        order_desc=True,
    )
    plot_2d_scatter(word2vec__diff__cos_sim__normalized__184_185, "word2vec__diff__cos_sim__normalized__184_185")
    plot_2d_scatter(word2vec__diff__trajectory__normalized__184_186, "word2vec__diff__trajectory__normalized__184_186")
    plot_2d_scatter(word2vec__diff__relative__normalized__184_185, "word2vec__diff__relative__normalized__184_185")

    # # differences of differences
    # plot_2d_scatter(lemma_diff_cos_sim_trajectory_183_185_dict, "lemma_diff_cos_sim_trajectory_183_185_dict")
    # plot_2d_scatter(lemma_diff_cos_sim_relative_diff_183_185_dict, "lemma_diff_cos_sim_relative_diff_183_185_dict")
    # plot_2d_scatter(lemma_diff_trajectory_relative_diff_183_185_dict, "lemma_diff_trajectory_relative_diff_183_185_dict")
    # plot_2d_scatter(lemma_diff_random_cos_sim_trajectory_dict, "lemma_diff_random_cos_sim_trajectory_dict")

    # # merged differences of differences
    # plot_2d_scatter(lemma_cos_sim_trajectory_183_185_merged_dict, "lemma_cos_sim_trajectory_183_185_merged_dict")
    # plot_2d_scatter(lemma_cos_sim_relative_diff_183_185_merged_dict, "lemma_cos_sim_relative_diff_183_185_merged_dict")
    # plot_2d_scatter(lemma_trajectory_relative_diff_183_185_merged_dict, "lemma_trajectory_relative_diff_183_185_merged_dict")
    # plot_2d_scatter(lemma_merged_random_diff_dict, "lemma_merged_random_diff_dict")

### plot_lemma_and_decade

In [None]:
def plot_lemma_and_decade(lemma_decade_embdding_dict: dict[Lemma, dict[Decade, Embedding]], perplexity=None, title=None):
    """Show a t-SNE plot in which each lemma's embeddings across decades are joined by a line.

    All embeddings are reduced together. One line trace is drawn per lemma,
    connecting its points in the iteration order of its decade dict, and a
    final trace adds markers labelled ``<decade>:<lemma>``.

    Args:
        lemma_decade_embdding_dict: Embedding per decade, per lemma.
        perplexity: t-SNE perplexity; ``None`` lets ``calculate_tsne`` choose.
        title: Plot title; empty when ``None``.
    """
    time.sleep(PLOT_SLEEP)

    # prepare data: flatten to parallel lists and remember where each lemma's group ends
    global_labels_list = []
    global_embeddings_list = []
    group_end_position_list = []
    for lemma, decade_embedding_dict in lemma_decade_embdding_dict.items():
        for decade, embedding in decade_embedding_dict.items():
            global_labels_list.append(str(decade) + ":" + lemma)
            global_embeddings_list.append(embedding)
        group_end_position_list.append(len(global_embeddings_list))
    # Pass the caller's perplexity through instead of overwriting it; calculate_tsne
    # applies the same small-sample default when it is None.
    lemma_embeddings_reduced_array = calculate_tsne(global_embeddings_list, perplexity=perplexity)

    # create plot
    fig = go.Figure()
    group_start_position = 0
    for group_end_position in group_end_position_list:
        lemma_respective_embeddings = lemma_embeddings_reduced_array[group_start_position:group_end_position]
        fig.add_trace(
            go.Scatter(
                x=lemma_respective_embeddings[:, 0],
                y=lemma_respective_embeddings[:, 1],
                mode="lines",
            )
        )
        group_start_position = group_end_position
    fig.add_trace(
        go.Scatter(
            x=lemma_embeddings_reduced_array[:, 0],
            y=lemma_embeddings_reduced_array[:, 1],
            mode="markers+text",
            text=global_labels_list,
            textposition="top center",
        )
    )
    fig.update_layout(
        title=title if title is not None else "",
        showlegend=False,
        width=800,
        height=800,
    )
    fig.show()


# Smoke test: plot three sample lemmas across decade 184 and the aligned decades 185 and 186.
# The in-memory indexes and the trajectory dict this statement used to read are never
# created (their construction is commented out), so the data is read from the DB tables instead.
if TEST:
    lemma_decade_embdding_dict = {}
    decade_table_dict = {
        184: "word2vec__184",
        185: "word2vec__185__aligned",
        186: "word2vec__186__aligned",
    }
    print("trajectories:")
    for lemma in ["d", "wohnen", "Fürst"]:
        print(lemma, query_generic("word2vec__diff__trajectory__184_186", [lemma], select_cols=["diff"], print_query=False))
        decade_embedding_dict = {}
        for decade, table_name in decade_table_dict.items():
            embedding_list = query_generic(table_name, [lemma], select_cols=["embedding"], print_query=False)
            # A lemma may be missing from a decade; only plot the decades that have it.
            if embedding_list:
                decade_embedding_dict[decade] = embedding_list[0]
        lemma_decade_embdding_dict[lemma] = decade_embedding_dict
    plot_lemma_and_decade(lemma_decade_embdding_dict)

### plot_lemma_and_decade_from_lemma_list

In [None]:
def plot_lemma_and_decade_from_lemma_list(lemma_list, title=None):
    """Load the persisted embeddings of the given lemmas and plot them per decade.

    The decades to load are taken from the module-level ``decade_list``.

    Args:
        lemma_list: lemmas whose embeddings should be plotted.
        title: optional plot title, passed through to ``plot_lemma_and_decade``.
    """
    plot_lemma_and_decade(
        load_embeddings_from_persisted_decade_data(decade_list, lemma_list),
        title=title,
    )

# OLD

## OLD: data structures

### OLD: create_index_from_word2vec

In [None]:
def create_index_from_word2vec(decade) -> Word2VecIndex:
    """Build an HNSW cosine index from the word2vec model of one decade.

    Only vocabulary entries accepted by ``is_word`` are indexed; every kept
    embedding is scaled to unit length before insertion. The ids used inside
    the index are the positions of the lemmas in the model vocabulary.

    Args:
        decade: decade identifier; the model is read from
            ``MODELS_WORD2VEC_FOLDER + str(decade) + ".bin"``.

    Returns:
        Tuple ``(hnsw_index, lemma_to_id_dict, id_to_lemma_dict)``.
    """
    print("create_index: start: decade:", decade)
    vectors = Word2Vec.load(MODELS_WORD2VEC_FOLDER + str(decade) + ".bin").wv

    # (vocabulary position, lemma) of every entry that passes the word filter
    kept_entries = [(position, lemma) for position, lemma in enumerate(vectors.index_to_key) if is_word(lemma)]
    id_to_lemma_dict: IdToLemmaDict = dict(kept_entries)
    lemma_to_id_dict: LemmaToIdDict = {lemma: position for position, lemma in kept_entries}

    unit_vectors = []
    for _, lemma in kept_entries:
        raw_vector = vectors[lemma]
        unit_vectors.append(raw_vector / np.linalg.norm(raw_vector))

    hnsw_index = hnswlib.Index(space="cosine", dim=unit_vectors[0].shape[0])
    hnsw_index.init_index(
        max_elements=len(unit_vectors),
        ef_construction=INDEX_EF_CONSTRUCTION,
        M=INDEX_M,
    )
    hnsw_index.add_items(unit_vectors, ids=list(id_to_lemma_dict.keys()))
    print("create_lemma_dict_and_index: hnsw_index.get_current_count:", hnsw_index.get_current_count())
    return (hnsw_index, lemma_to_id_dict, id_to_lemma_dict)


# def create_index_test():
#     word2vec_183_index = create_index_from_word2vec(183)
#     word2vec_184_index = create_index_from_word2vec(184)
#     word2vec_185_index = create_index_from_word2vec(185)
#     return word2vec_183_index, word2vec_184_index, word2vec_185_index


# word2vec_183_index, word2vec_184_index, word2vec_185_index = create_index_test()

### OLD: sort_dict_by_value

In [None]:
def sort_dict_by_value(key_value_dict: dict, desc=True) -> dict:
    """Return a new dict with the items of ``key_value_dict`` ordered by value.

    Args:
        key_value_dict: mapping with numeric values.
        desc: sort from largest to smallest value when true, otherwise from
            smallest to largest.

    Returns:
        A dict whose insertion order follows the sorted values; items with
        equal values keep their original relative order.
    """
    # the sort key is the value multiplied by this sign
    sign = -1 if desc else 1
    ordered_items = sorted(key_value_dict.items(), key=lambda item: sign * item[1])
    return dict(ordered_items)


def sort_dict_by_value_test():
    """Re-sort the three module-level occurrence count dicts by descending value.

    The globals are rebound to the sorted dicts, which are also returned as a
    tuple in the order 183, 184, 185.
    """
    global lemma_occurrence_count_183_dict, lemma_occurrence_count_184_dict, lemma_occurrence_count_185_dict
    lemma_occurrence_count_183_dict = sort_dict_by_value(lemma_occurrence_count_183_dict)
    lemma_occurrence_count_184_dict = sort_dict_by_value(lemma_occurrence_count_184_dict)
    lemma_occurrence_count_185_dict = sort_dict_by_value(lemma_occurrence_count_185_dict)
    return (
        lemma_occurrence_count_183_dict,
        lemma_occurrence_count_184_dict,
        lemma_occurrence_count_185_dict,
    )


# if TEST:
#     lemma_occurrence_count_183_dict, lemma_occurrence_count_184_dict, lemma_occurrence_count_185_dict = load_cache_or_run(
#         sort_dict_by_value_test
#     )

### OLD: get_common_keys

In [None]:
def get_common_keys(*lemma_dict_list: list[dict]) -> set:
    """Return the set of keys that occur in every one of the given dicts.

    At least one dict must be passed; with a single dict the result is simply
    the set of its keys.
    """
    first_dict, *other_dicts = lemma_dict_list[0], *lemma_dict_list[1:]
    return set(first_dict).intersection(*(other.keys() for other in other_dicts))


# if TEST:
#     print(get_common_keys({"a": 1, "b": 2}, {"a": 1, "c": 3}, {"a": 1, "b": 2, "c": 3}))

### OLD: merge_count_occurrences_dict

In [None]:
def merge_count_occurrences_dict(lemma_occurrence_count_a_dict, lemma_occurrence_count_b_dict):
    """Sum the occurrence counts of the lemmas that are present in both dicts.

    Lemmas that occur in only one of the two dicts are dropped.

    Returns:
        Dict mapping each shared lemma to the sum of its two counts.
    """
    print(
        "merge_count_occurrences_dict: start: len(lemma_occurrence_count_a_dict):",
        len(lemma_occurrence_count_a_dict),
        "len(lemma_occurrence_count_b_dict):",
        len(lemma_occurrence_count_b_dict),
    )
    shared_lemmas = get_common_keys(lemma_occurrence_count_a_dict, lemma_occurrence_count_b_dict)
    summed_counts = {
        lemma: lemma_occurrence_count_a_dict[lemma] + lemma_occurrence_count_b_dict[lemma]
        for lemma in shared_lemmas
    }
    print("merge_count_occurrences_dict: len(lemma_occurrence_count_dict_merged):", len(summed_counts))
    return summed_counts


# if TEST:
#     lemma_occurrence_count_merged_183_184_dict = merge_count_occurrences_dict(
#         lemma_occurrence_count_183_dict, lemma_occurrence_count_184_dict
#     )

### OLD: filter_on_values

In [None]:
def filter_on_values(key_value_dict: dict, limit_min: float = None, limit_max: float = None):
    key_value_dict_new = {}
    for lemma, value in key_value_dict.items():
        if limit_min and limit_max:
            if limit_min <= value <= limit_max:
                key_value_dict_new[lemma] = value
        elif limit_min:
            if limit_min <= value:
                key_value_dict_new[lemma] = value
        elif limit_max:
            if value <= limit_max:
                key_value_dict_new[lemma] = value
    return key_value_dict_new


# if TEST:
#     print(filter_on_values({"gehen": 0.9, "laufen": 0.5, "wandern": 0.7}, limit_min=0.6, limit_max=0.8))

## OLD: vector and index functions

### OLD: query_generic

In [None]:
def query_generic(index: Index, lemma: Lemma) -> float:
    """Look up the stored embedding of ``lemma`` in an index tuple.

    Args:
        index: tuple whose element 0 is the HNSW index and whose element 1 maps
            lemmas to their ids in that index.
        lemma: lemma to look up.

    Returns:
        The embedding stored for the lemma, or ``None`` when the lemma is not
        part of the index (note that the annotation says ``float``).
    """
    lemma_id = index[1].get(lemma)
    if lemma_id is None:
        return None
    return index[0].get_items([lemma_id])[0]


# if TEST:
#     embedding = query_embedding(word2vec_183_index, "Haus")
#     print(embedding.shape)
#     assert query_embedding(word2vec_183_index, "kljwklerjas") is None

### OLD: query_related

In [None]:
def query_related(
    index: Index,
    lemma: Lemma,
    n: int = 10,
    return_as_dict: bool = True,
    keep_search: bool = False,
) -> dict[str, float] | list[str]:
    """Return the nearest neighbours of ``lemma`` within one index.

    Args:
        index: tuple ``(hnsw_index, lemma_to_id_dict, id_to_lemma_dict)``.
        lemma: lemma whose neighbours are searched.
        n: number of neighbours to return.
        return_as_dict: return ``{lemma: cosine similarity}`` when true,
            otherwise a plain list of lemmas.
        keep_search: keep the first hit of the k-NN query; when false one more
            neighbour is requested and the first hit is dropped.

    Returns:
        Neighbours in the order delivered by the k-NN query, or ``None`` when
        the lemma is not contained in the index.
    """
    lemma_id = index[1].get(lemma)
    if lemma_id is None:
        return None

    # number of leading hits that are requested additionally and then skipped
    skipped_hits = 0 if keep_search else 1
    query_vector = index[0].get_items([lemma_id])[0]
    ids, distances = index[0].knn_query(query_vector.reshape(1, -1), k=n + skipped_hits)

    # the index reports cosine distances; convert them to similarities
    neighbours = [
        (index[2][neighbour_id], 1 - distance)
        for neighbour_id, distance in list(zip(ids[0], distances[0]))[skipped_hits:]
    ]
    if return_as_dict:
        return dict(neighbours)
    return [neighbour for neighbour, _ in neighbours]


# if TEST:
#     print(query_related(word2vec_184_index, "gehen", n=10))
#     print(query_related(word2vec_185_index, "gehen", n=10, keep_search=True))
#     print(query_related(word2vec_185_index, "gehen", n=10, return_as_dict=False))
#     print(query_related(word2vec_185_index, "aclkjalkc"))

### OLD: calculate_average_sentence_embedding_from_sentence

In [None]:
def calculate_average_sentence_embedding_from_sentence(
    sentence: str,
    index: Index,
    show_exception: bool = False,
) -> np.ndarray:
    """Average the embeddings of all tokens of a sentence that the index knows.

    Args:
        sentence: text whose tokens are separated by single spaces.
        index: index tuple that ``query_generic`` is called with.
        show_exception: print lookup errors together with the offending token
            instead of ignoring them silently.

    Returns:
        The element-wise mean (``np.mean`` over axis 0) of the found embeddings.
    """
    found_embeddings = []
    for token in sentence.split(" "):
        try:
            token_embedding = query_generic(index, token)
        except Exception as ex:
            # best effort: a failing lookup only skips this token
            if show_exception:
                print(ex, token)
            continue
        if token_embedding is not None:
            found_embeddings.append(token_embedding)
    return np.mean(np.array(found_embeddings), axis=0)


# if TEST:
#     v1 = calculate_average_sentence_embedding_from_sentence(
#         "d Mensch sein in d Haus",
#         word2vec_184_index,
#         show_exception=True,
#     )
#     v2 = calculate_average_sentence_embedding_from_sentence(
#         "d Mann sein in d Hütte",
#         word2vec_184_index,
#         show_exception=True,
#     )
#     v3 = calculate_average_sentence_embedding_from_sentence(
#         "d Ziege sein auf d Feld",
#         word2vec_184_index,
#         show_exception=True,
#     )
#     print(calculate_cos_sim(v1, v2))
#     print(calculate_cos_sim(v2, v3))

### OLD: get_occurrences

In [None]:
def get_occurrences(
    decade: Decade,
    line_number_dict: LineNumberDict,
    max_elem: int = None,
    highlight_lemma: bool = True,
    keep_lemma: bool = True,
) -> list[str]:
    """Collect the text lines of a decade in which a lemma occurs.

    Args:
        decade: decade identifier; the text is read from
            ``TEXTS_FOLDER + str(decade) + ".txt"``.
        line_number_dict: maps a zero-based line number to the zero-based word
            positions of the lemma within that line.
        max_elem: stop after this many lines; a falsy value means no limit.
        highlight_lemma: wrap the lemma occurrences in ``###`` markers.
        keep_lemma: when false the lemma occurrences are removed from the text.

    Returns:
        One string per matching line; every kept word is preceded by a space.
    """
    collected_lines = []
    with open(TEXTS_FOLDER + str(decade) + ".txt", "r") as text_file:
        for line_number, line in enumerate(text_file):
            lemma_positions = line_number_dict.get(line_number)
            if not lemma_positions:
                continue
            lemma_positions = set(lemma_positions)
            pieces = []
            for position, word in enumerate(line.rstrip("\n").split(" ")):
                if position not in lemma_positions:
                    pieces.append(" " + word)
                elif not keep_lemma:
                    # the lemma itself is dropped from the output
                    continue
                elif highlight_lemma:
                    pieces.append(" ### " + word + " ###")
                else:
                    pieces.append(" " + word)
            collected_lines.append("".join(pieces))
            if max_elem and len(collected_lines) == max_elem:
                break
    return collected_lines


# if TEST:
#     print(get_occurrences(184, lemma_occurrence_position_184_dict["Haus"], max_elem=1))
#     print(get_occurrences(184, lemma_occurrence_position_184_dict["Haus"], max_elem=1, highlight_lemma=False))
#     print(get_occurrences(184, lemma_occurrence_position_184_dict["Haus"], max_elem=1, keep_lemma=False))
#     print(len(get_occurrences(184, lemma_occurrence_position_184_dict["Haus"], max_elem=None, highlight_lemma=False)))

### OLD: calculate_average_sentence_embedding_from_lemma

In [None]:
def calculate_average_sentence_embedding_from_lemma(
    decade: Decade,
    line_number_dict: LineNumberDict,
    index: Index,
    max_sentences: int = 2,
) -> np.ndarray:
    """Compute the averaged embedding of sentences in which a lemma occurs.

    Args:
        decade: decade whose text file is searched.
        line_number_dict: occurrence positions of the lemma, as expected by
            ``get_occurrences``.
        index: index tuple used for the embedding lookups.
        max_sentences: maximum number of sentences to fetch.

    Returns:
        Dict mapping each sentence (without highlighting) to its averaged
        embedding (note that the annotation says ``np.ndarray``).
    """
    sentences = get_occurrences(decade, line_number_dict, max_elem=max_sentences, highlight_lemma=False)
    return {
        sentence: calculate_average_sentence_embedding_from_sentence(sentence, index)
        for sentence in sentences
    }


# if TEST:
#     sentence_embedding_dict = calculate_average_sentence_embedding_from_lemma(
#         184,
#         lemma_occurrence_position_184_dict["gehen"],
#         word2vec_184_index,
#         max_sentences=1,
#     )
#     for sentence, embedding in sentence_embedding_dict.items():
#         print(sentence)
#         print(embedding.shape)

## difference analysis functions

### DONE: calculate_procrustes_alignment

In [None]:
def calculate_procrustes_alignment(table_a, table_b):
    """Rotate the embeddings of ``table_b`` into the vector space of ``table_a``.

    The rotation is the orthogonal Procrustes solution computed on the lemmas
    both tables share, with every embedding weighted by
    ``sqrt(occurrence_count / 2)``. It is then applied to all embeddings of
    ``table_b`` and the result is scaled to unit length per row.

    Args:
        table_a: reference table, passed to ``query_mutual_lemmas`` and
            ``query_generic``.
        table_b: table to align, additionally passed to ``query_all``.

    Raises:
        ValueError: if no shared lemma has an embedding in both tables.

    TODO: the aligned matrix is currently only computed; it is neither
    persisted nor returned (the function returns ``None``).
    """
    print("calculate_procrustes_alignment: start")

    # create overlap matrices with embeddings weighted by count of occurrence
    common_lemma = query_mutual_lemmas([table_a, table_b])
    overlap_matrix_a = []
    overlap_matrix_b = []
    print("calculate_procrustes_alignment: len(common_lemma):", len(common_lemma))
    for lemma, occurrence_count in common_lemma:
        # each embedding is fetched once and reused for the weighting
        embedding_a = query_generic(table_a, lemma)
        embedding_b = query_generic(table_b, lemma)
        if embedding_a is None or embedding_b is None:
            continue
        occurrence_count_sqrt = np.sqrt(occurrence_count / 2)
        overlap_matrix_a.append(embedding_a * occurrence_count_sqrt)
        overlap_matrix_b.append(embedding_b * occurrence_count_sqrt)
    if not overlap_matrix_a:
        raise ValueError(
            f"calculate_procrustes_alignment: no lemma with embeddings in both {table_a!r} and {table_b!r}"
        )
    overlap_matrix_a = np.stack(overlap_matrix_a)
    overlap_matrix_b = np.stack(overlap_matrix_b)

    # do procrustes transformation
    r, _ = orthogonal_procrustes(overlap_matrix_b, overlap_matrix_a)
    matrix_b = np.stack(query_all(table_b))
    matrix_b_aligned = matrix_b @ r
    matrix_b_aligned_normalized = matrix_b_aligned / np.linalg.norm(matrix_b_aligned, axis=1, keepdims=True)
    print("calculate_procrustes_alignment: matrix_b_aligned.shape:", matrix_b_aligned.shape)


# Smoke test: run the alignment for the tables "word2vec__184" and
# "word2vec__185". The commented-out lines below compared cosine similarities
# before and after alignment via query_embedding and are kept for reference.
if TEST:
    calculate_procrustes_alignment("word2vec__184", "word2vec__185")
    # for lemma in ["d", "und", "gehen", "wohnen", "Fürst"]:
    #     print("calculate_procrustes_alignment: lemma:", lemma)
    #     embedding_183 = query_embedding(word2vec_183_index, lemma)
    #     embedding_184 = query_embedding(word2vec_184_index, lemma)
    #     embedding_185 = query_embedding(word2vec_185_index, lemma)
    #     embedding_aligned_184 = query_embedding(word2vec_184_aligned_index, lemma)
    #     embedding_aligned_185 = query_embedding(word2vec_185_aligned_index, lemma)
    #     cos_sim_183_184 = calculate_cos_sim(embedding_183, embedding_184)
    #     cos_sim_184_185 = calculate_cos_sim(embedding_184, embedding_185)
    #     cos_sim_aligned_183_184 = calculate_cos_sim(embedding_183, embedding_aligned_184)
    #     cos_sim_aligned_184_185 = calculate_cos_sim(embedding_aligned_184, embedding_aligned_185)
    #     print("calculate_procrustes_alignment:", "cos_sim_183_184:", cos_sim_183_184)
    #     print("calculate_procrustes_alignment:", "cos_sim_184_185:", cos_sim_184_185)
    #     print("calculate_procrustes_alignment:", "cos_sim_aligned_183_184:", cos_sim_aligned_183_184)
    #     print("calculate_procrustes_alignment:", "cos_sim_aligned_184_185:", cos_sim_aligned_184_185)

### DONE: calculate_cos_sim_between_indices

In [None]:
def calculate_cos_sim_between_indices(index_a: Index, index_b: Index) -> LemmaDiffDict:
    """Compute, per shared lemma, the cosine similarity of its two embeddings.

    Args:
        index_a: first index tuple; element 1 is its lemma-to-id dict.
        index_b: second index tuple with the same layout.

    Returns:
        Dict of lemma to the value returned by ``calculate_cos_sim``, ordered
        by descending value.
    """
    print("calculate_cos_sim_between_indices: start")
    shared_lemmas = get_common_keys(index_a[1], index_b[1])
    similarity_per_lemma = sort_dict_by_value(
        {
            lemma: calculate_cos_sim(query_generic(index_a, lemma), query_generic(index_b, lemma))
            for lemma in shared_lemmas
        }
    )
    print("calculcate_cos_sim_between_indices: len(lemma_cos_sim_dict):", len(similarity_per_lemma))
    return similarity_per_lemma


def calculate_cos_sim_between_indices_test():
    """Return the cosine similarities between decade 183 and the aligned decade 184."""
    return calculate_cos_sim_between_indices(word2vec_183_index, word2vec_184_aligned_index)


# Smoke test: obtain the 183/184 cosine similarity dict through
# load_cache_or_run and print the value of a few sample lemmas.
# NOTE(review): load_cache_or_run is not defined in this part of the file —
# presumably it caches the result of the passed function; confirm.
if TEST:
    lemma_cos_sim_183_184_dict = load_cache_or_run(calculate_cos_sim_between_indices_test)
    for lemma in ["d", "und", "gehen", "wohnen", "Fürst"]:
        print("calculcate_cos_sim_between_indices: lemma:", lemma)
        print(lemma_cos_sim_183_184_dict[lemma])

### OLD: weight_and_filter_lemmas

In [None]:
def weight_and_filter_lemmas(lemma_value_dict, lemma_occurrence_count_dict, filter_threshold=100):
    """Weight lemma values by their occurrence count and drop rare lemmas.

    Every kept value is multiplied by ``log1p(sqrt(occurrence_count))``.

    Args:
        lemma_value_dict: mapping of lemma to a numeric value.
        lemma_occurrence_count_dict: mapping of lemma to its occurrence count;
            must contain every lemma of ``lemma_value_dict``.
        filter_threshold: minimum occurrence count a lemma needs to be kept.
            ``None`` or ``0`` disables the filter, so every lemma is kept.

    Returns:
        Dict of lemma to weighted value, ordered by descending value.

    Raises:
        KeyError: if a lemma has no entry in ``lemma_occurrence_count_dict``.
    """
    print("weight_and_filter_lemmas: start")
    lemma_value_dict_weighted = {}
    for lemma, value in lemma_value_dict.items():
        occurrence_count = lemma_occurrence_count_dict[lemma]
        # a falsy threshold means "no filtering" rather than "drop everything"
        if not filter_threshold or occurrence_count >= filter_threshold:
            lemma_value_dict_weighted[lemma] = value * np.log1p(np.sqrt(occurrence_count))
    lemma_value_dict_weighted = sort_dict_by_value(lemma_value_dict_weighted)
    print("weight_and_filter_lemmas: len(lemma_value_dict_weighted)", len(lemma_value_dict_weighted))
    return lemma_value_dict_weighted


# Smoke test: weight the 183/184 cosine similarities by the merged counts.
# NOTE(review): within this part of the file
# lemma_occurrence_count_merged_183_184_dict is only assigned in commented-out
# code — verify that it is still defined when this cell runs.
if TEST:
    lemma_cos_sim_183_184_weighted_dict = weight_and_filter_lemmas(lemma_cos_sim_183_184_dict, lemma_occurrence_count_merged_183_184_dict)

### DONE: calculate_trajectory_from_lemma

In [None]:
def calculate_trajectory_from_lemma(index_a: Index, index_b: Index, index_c: Index, lemma: Lemma) -> Trajectory:
    """Compute the dot product of the two consecutive movements of a lemma.

    With the embeddings ``a``, ``b`` and ``c`` of the lemma in the three
    indices, the result is ``dot(a - b, b - c)``.

    Returns:
        The dot product, or ``None`` if the lemma is missing in any index.
    """
    a, b, c = (query_generic(index, lemma) for index in (index_a, index_b, index_c))
    if a is None or b is None or c is None:
        return None
    return np.dot(a - b, b - c)


# Smoke test: print the trajectory value of a few sample lemmas across decade
# 183 and the aligned decades 184 and 185.
if TEST:
    for lemma in ["d", "und", "gehen", "wohnen", "Fürst"]:
        ab_bc_trajectory = calculate_trajectory_from_lemma(
            word2vec_183_index,
            word2vec_184_aligned_index,
            word2vec_185_aligned_index,
            lemma,
        )
        print("calculate_trajectory_from_diff_per_lemma:", lemma, ab_bc_trajectory)

### DONE: calculate_trajectory_dict_from_index

In [None]:
def calculate_trajectory_dict_from_index(index_a: Index, index_b: Index, index_c: Index) -> LemmaDiffDict:
    """Compute the trajectory value of every lemma shared by three indices.

    Returns:
        Dict of lemma to the result of ``calculate_trajectory_from_lemma``,
        ordered by descending value.
    """
    print("calculate_trajectory_dict_from_index: start")
    shared_lemmas = get_common_keys(index_a[1], index_b[1], index_c[1])
    scored_lemmas = [
        (lemma, calculate_trajectory_from_lemma(index_a, index_b, index_c, lemma))
        for lemma in shared_lemmas
    ]
    # negate the value to sort from largest to smallest
    scored_lemmas.sort(key=lambda scored: -scored[1])
    return dict(scored_lemmas)


# Smoke test: build the trajectory dict for decade 183 and the aligned decades
# 184 and 185, then print how many lemmas it contains.
if TEST:
    lemma_trajectory_183_184_185_dict = calculate_trajectory_dict_from_index(
        word2vec_183_index,
        word2vec_184_aligned_index,
        word2vec_185_aligned_index,
    )
    print("len(lemma_trajectory_diff_dict_183_184_185)", len(lemma_trajectory_183_184_185_dict))

### DONE: calculate_relative_diff_from_lemma

In [None]:
def calculate_relative_diff_from_lemma(
    index_a: Index,
    index_b: Index,
    lemma_occurrence_count_a_dict: LemmaOccurrenceCountDict,
    lemma_occurrence_count_b_dict: LemmaOccurrenceCountDict,
    lemma: Lemma,
) -> float:
    """Compare the neighbourhood of a lemma in two indices.

    The 10 nearest neighbours of the lemma are queried in each index. For
    every neighbour that can be resolved in both indices, the absolute
    difference of its value in the two neighbour dicts is summed up and the
    sum is divided by the number of such neighbours.

    Args:
        index_a: first index tuple.
        index_b: second index tuple.
        lemma_occurrence_count_a_dict: not used by this function.
        lemma_occurrence_count_b_dict: not used by this function.
        lemma: lemma to analyse.

    Returns:
        The averaged absolute difference, or ``None`` if the lemma has no
        embedding in one of the indices.

    Raises:
        ZeroDivisionError: if no neighbour could be resolved in both indices.
    """
    diff = None

    # key and dict synchronization
    distances_a_dict = query_related(index_a, lemma, n=10)
    distances_b_dict = query_related(index_b, lemma, n=10)
    embedding_index_a_lemma = query_generic(index_a, lemma)
    embedding_index_b_lemma = query_generic(index_b, lemma)
    if embedding_index_a_lemma is not None and embedding_index_b_lemma is not None:
        # snapshot of the original neighbour sets; the dicts are extended below
        distances_a_lemma_set = set(distances_a_dict.keys())
        distances_b_lemma_set = set(distances_b_dict.keys())
        lemma_all = set()
        for lemma_a in distances_a_lemma_set:
            is_in_both = True
            if lemma_a not in distances_b_lemma_set:
                # neighbour only known from index a: add its value for index b
                embedding_index_b_lemma_a = query_generic(index_b, lemma_a)
                if embedding_index_b_lemma_a is not None:
                    # NOTE(review): this pairs the lemma's embedding from index a with
                    # the neighbour's embedding from index b — confirm that a
                    # cross-index comparison is intended here.
                    # NOTE(review): query_related stores 1 - distance, whereas this
                    # stores the result of calculate_cos_distance — verify both are
                    # on the same scale.
                    distances_b_dict[lemma_a] = calculate_cos_distance(embedding_index_a_lemma, embedding_index_b_lemma_a)
                else:
                    is_in_both = False
            if is_in_both:
                lemma_all.add(lemma_a)
        for lemma_b in distances_b_lemma_set:
            is_in_both = True
            if lemma_b not in distances_a_lemma_set:
                # mirror case: neighbour only known from index b
                embedding_index_a_lemma_b = query_generic(index_a, lemma_b)
                if embedding_index_a_lemma_b is not None:
                    distances_a_dict[lemma_b] = calculate_cos_distance(embedding_index_b_lemma, embedding_index_a_lemma_b)
                else:
                    is_in_both = False
            if is_in_both:
                lemma_all.add(lemma_b)

        # difference calculation
        diff = 0
        for lemma_related in lemma_all:
            distance_a = distances_a_dict[lemma_related]
            distance_b = distances_b_dict[lemma_related]
            diff += abs(distance_a - distance_b)
        # average over all neighbours that are available in both indices
        diff /= len(lemma_all)

    return diff


# Smoke test: print the relative neighbourhood difference between the decades
# 183 and 184 (unaligned indices) for a few sample lemmas.
if TEST:
    for lemma in ["d", "und", "gehen", "wohnen", "Fürst"]:
        print(
            lemma,
            calculate_relative_diff_from_lemma(
                word2vec_183_index,
                word2vec_184_index,
                lemma_occurrence_count_183_dict,
                lemma_occurrence_count_184_dict,
                lemma,
            ),
        )

### DONE: calculate_relative_diff_dict_from_index

In [None]:
def calculate_relative_diff_dict_from_index(
    index_a: Index,
    index_b: Index,
    lemma_occurrence_count_a_dict: LemmaOccurrenceCountDict,
    lemma_occurrence_count_b_dict: LemmaOccurrenceCountDict,
    min_occurrence: int = None,
    max_occurrence: int = None,
) -> LemmaDiffDict:
    """Compute the relative neighbourhood difference of all shared lemmas.

    Args:
        index_a: first index tuple; element 1 is its lemma-to-id dict.
        index_b: second index tuple with the same layout.
        lemma_occurrence_count_a_dict: occurrence counts belonging to index_a.
        lemma_occurrence_count_b_dict: occurrence counts belonging to index_b.
        min_occurrence: if given, both counts of a lemma must be at least this.
        max_occurrence: if given, both counts of a lemma must be at most this.

    Returns:
        Dict of lemma to the result of ``calculate_relative_diff_from_lemma``,
        ordered by ascending value.
    """
    print("calculate_relative_diff_dict_from_index: start")
    diff_per_lemma = {}
    for lemma in get_common_keys(index_a[1], index_b[1]):
        count_a = lemma_occurrence_count_a_dict[lemma]
        count_b = lemma_occurrence_count_b_dict[lemma]
        too_rare = min_occurrence is not None and not (count_a >= min_occurrence and count_b >= min_occurrence)
        if too_rare:
            continue
        too_frequent = max_occurrence is not None and not (count_a <= max_occurrence and count_b <= max_occurrence)
        if too_frequent:
            continue
        diff_per_lemma[lemma] = calculate_relative_diff_from_lemma(
            index_a,
            index_b,
            lemma_occurrence_count_a_dict,
            lemma_occurrence_count_b_dict,
            lemma,
        )
    diff_per_lemma = sort_dict_by_value(diff_per_lemma, desc=False)
    print("create_diff_dict_from_index: len(lemma_diff_dict):", len(diff_per_lemma))
    return diff_per_lemma


def create_relative_diff_dict_from_index_test():
    """Return the relative diff dicts for the decade pairs 183/184 and 184/185."""
    return (
        calculate_relative_diff_dict_from_index(
            word2vec_183_index,
            word2vec_184_index,
            lemma_occurrence_count_183_dict,
            lemma_occurrence_count_184_dict,
        ),
        calculate_relative_diff_dict_from_index(
            word2vec_184_index,
            word2vec_185_index,
            lemma_occurrence_count_184_dict,
            lemma_occurrence_count_185_dict,
        ),
    )


# Smoke test: obtain both relative diff dicts (183/184 and 184/185) through
# load_cache_or_run.
if TEST:
    lemma_relative_diff_183_184_dict, lemma_diff_184_185_dict = load_cache_or_run(create_relative_diff_dict_from_index_test)

### normalize_diff_table

In [None]:
def normalize_diff_table(lemma_value_dict, enable_inversion=False):
    """Rescale the values of a lemma-to-value mapping linearly to [-1, 1].

    The smallest value is mapped to -1 and the largest to 1.

    Args:
        lemma_value_dict: mapping of lemma to a numeric value.
        enable_inversion: multiply every normalized value by -1, so that the
            smallest input maps to 1 and the largest to -1.

    Returns:
        A new dict with the same keys in the same order. An empty input gives
        an empty dict. If all values are identical there is no range to
        stretch, so the scale factor falls back to 1 and every lemma maps to
        -1 (1 with inversion) instead of raising ``ZeroDivisionError``.
    """
    print("normalize_lemma_value_dict: start")
    lemma_value_dict_normalized = {}
    if lemma_value_dict:
        move = min(lemma_value_dict.values())
        value_range = max(lemma_value_dict.values()) - move
        # guard against a zero range (all values equal)
        scale = 2 / value_range if value_range else 1
        lemma_value_dict_normalized = {lemma: (diff - move) * scale - 1 for lemma, diff in lemma_value_dict.items()}
        if enable_inversion:
            lemma_value_dict_normalized = {lemma: diff * -1 for lemma, diff in lemma_value_dict_normalized.items()}
    print("normalize_lemma_value_dict: len(lemma_value_dict_normalized):", len(lemma_value_dict_normalized))
    return lemma_value_dict_normalized


# Smoke test: normalize the cosine similarity, trajectory and relative diff
# dicts; only the relative diff values are inverted.
if TEST:
    lemma_cos_sim_183_184_normalized_dict = normalize_diff_table(lemma_cos_sim_183_184_dict)
    lemma_trajectory_183_184_185_normalized_dict = normalize_diff_table(lemma_trajectory_183_184_185_dict)
    lemma_relative_diff_183_184_normalized_dict = normalize_diff_table(lemma_relative_diff_183_184_dict, enable_inversion=True)

### create_random_lemma_value_dict

In [None]:
def create_random_lemma_value_dict(lemma_value_dict, range_min=-1, range_max=1):
    """Create a dict with the same lemmas but random values.

    Each value is a random integer from ``[range_min * 1000, range_max * 1000]``
    divided by 1000, i.e. a random number with three decimal places.

    Args:
        lemma_value_dict: dict whose keys are reused; its values are ignored.
        range_min: lower bound of the random values.
        range_max: upper bound of the random values.

    Returns:
        Dict of lemma to random value, in the key order of the input.
    """
    lower_bound, upper_bound = range_min * 1000, range_max * 1000
    return {lemma: random.randint(lower_bound, upper_bound) / 1000 for lemma in lemma_value_dict.keys()}


# Smoke test: create random baseline dicts for the three diff measures.
# NOTE(review): lemma_random_cos_sim_dict takes its lemmas from the relative
# diff dict rather than from lemma_cos_sim_183_184_normalized_dict — confirm
# whether that is intended.
if TEST:
    lemma_random_cos_sim_dict = create_random_lemma_value_dict(lemma_relative_diff_183_184_normalized_dict)
    lemma_random_trajectory_dict = create_random_lemma_value_dict(lemma_trajectory_183_184_185_normalized_dict)
    random_relative_diff_dict = create_random_lemma_value_dict(lemma_relative_diff_183_184_normalized_dict)

### create_lemma_diff_diff_dicts

In [None]:
def create_lemma_diff_diff_dicts(*lemma_value_dict_list):
    """Measure how strongly several lemma-to-value dicts disagree per lemma.

    For every lemma contained in all dicts, the absolute differences of its
    values are summed over every pair of dicts. The average and the (upper)
    median of these sums are printed.

    Args:
        *lemma_value_dict_list: two or more dicts mapping lemma to a number.

    Returns:
        Dict of lemma to summed pairwise difference, ordered by ascending
        value. It is empty when fewer than two dicts are passed or when the
        dicts share no lemma; in that case no statistics are printed.
    """
    print("create_lemma_diff_diff_dicts: start")
    common_lemma = get_common_keys(*lemma_value_dict_list)
    # every unordered pair of input dicts, built once instead of per lemma
    dict_pair_list = [
        (lemma_value_dict_a, lemma_value_dict_b)
        for position, lemma_value_dict_a in enumerate(lemma_value_dict_list)
        for lemma_value_dict_b in lemma_value_dict_list[position + 1 :]
    ]
    lemma_diff_dict = {}
    for lemma in common_lemma:
        for lemma_value_dict_a, lemma_value_dict_b in dict_pair_list:
            diff = lemma_diff_dict.get(lemma, 0)
            diff += abs(lemma_value_dict_a[lemma] - lemma_value_dict_b[lemma])
            lemma_diff_dict[lemma] = diff
    lemma_diff_dict = sort_dict_by_value(lemma_diff_dict, desc=False)
    if lemma_diff_dict:
        diff_list = list(lemma_diff_dict.values())
        diff_avg = sum(diff_list) / len(diff_list)
        # the values are sorted ascending, so the middle element is the median
        diff_median = diff_list[len(diff_list) // 2]
        print("create_lemma_diff_diff_dicts: avg_diff:", diff_avg)
        print("create_lemma_diff_diff_dicts: diff_median:", diff_median)
    else:
        print("create_lemma_diff_diff_dicts: nothing to compare, no statistics available")
    return lemma_diff_dict


# Smoke test: compare every pair of the three normalized measures (cosine
# similarity, trajectory, relative diff) and, as a baseline, every pair of the
# three random dicts.
if TEST:
    lemma_diff_cos_sim_trajectory_183_185_dict = create_lemma_diff_diff_dicts(
        lemma_cos_sim_183_184_normalized_dict,
        lemma_trajectory_183_184_185_normalized_dict,
    )
    lemma_diff_cos_sim_relative_diff_183_185_dict = create_lemma_diff_diff_dicts(
        lemma_cos_sim_183_184_normalized_dict,
        lemma_relative_diff_183_184_normalized_dict,
    )
    lemma_diff_trajectory_relative_diff_183_185_dict = create_lemma_diff_diff_dicts(
        lemma_trajectory_183_184_185_normalized_dict,
        lemma_relative_diff_183_184_normalized_dict,
    )
    lemma_diff_random_cos_sim_trajectory_dict = create_lemma_diff_diff_dicts(
        lemma_random_cos_sim_dict,
        lemma_random_trajectory_dict,
    )
    random_diff_cos_sim_relative_diff_dict = create_lemma_diff_diff_dicts(
        lemma_random_cos_sim_dict,
        random_relative_diff_dict,
    )
    random_diff_trajectory_relative_diff_dict = create_lemma_diff_diff_dicts(
        lemma_random_trajectory_dict,
        random_relative_diff_dict,
    )

### merge_lemma_diff_diff_dict

In [None]:
def merge_lemma_diff_diff_dict(lemma_diff_diff_dict, lemma_diff_dict_a, lemma_diff_dict_b):
    """Interleave the values of two dicts under prefixed lemma keys.

    For every lemma of ``lemma_diff_diff_dict`` (in its order) the result gets
    the entry ``"1:<lemma>"`` with the value from ``lemma_diff_dict_a``
    followed by ``"2:<lemma>"`` with the value from ``lemma_diff_dict_b``.

    Args:
        lemma_diff_diff_dict: dict whose keys select and order the lemmas.
        lemma_diff_dict_a: source of the ``"1:"`` values.
        lemma_diff_dict_b: source of the ``"2:"`` values.

    Returns:
        The merged dict with two entries per lemma.
    """
    print("merge_lemma_diff_diff_dict: start")
    merged = {}
    for lemma in lemma_diff_diff_dict.keys():
        merged.update(
            {
                "1:" + lemma: lemma_diff_dict_a[lemma],
                "2:" + lemma: lemma_diff_dict_b[lemma],
            }
        )
    print("merge_lemma_diff_diff_dict: len(lemma_diff_averaged_dict):", len(merged))
    return merged


# Smoke test: for each compared pair of measures, merge the two underlying
# value dicts, using the keys of the corresponding pairwise diff dict.
if TEST:
    lemma_cos_sim_trajectory_183_185_merged_dict = merge_lemma_diff_diff_dict(
        lemma_diff_cos_sim_trajectory_183_185_dict,
        lemma_cos_sim_183_184_normalized_dict,
        lemma_trajectory_183_184_185_normalized_dict,
    )
    lemma_cos_sim_relative_diff_183_185_merged_dict = merge_lemma_diff_diff_dict(
        lemma_diff_cos_sim_relative_diff_183_185_dict,
        lemma_cos_sim_183_184_normalized_dict,
        lemma_relative_diff_183_184_normalized_dict,
    )
    lemma_trajectory_relative_diff_183_185_merged_dict = merge_lemma_diff_diff_dict(
        lemma_diff_trajectory_relative_diff_183_185_dict,
        lemma_trajectory_183_184_185_normalized_dict,
        lemma_relative_diff_183_184_normalized_dict,
    )
    lemma_merged_random_diff_dict = merge_lemma_diff_diff_dict(
        lemma_diff_random_cos_sim_trajectory_dict,
        lemma_random_cos_sim_dict,
        lemma_random_trajectory_dict,
    )

### calculate_average_diff

In [None]:
def calculate_average_diff(global_diff_dict):
    """Average the per-lemma values of every decade.

    Args:
        global_diff_dict: dict of lemma to a dict of decade to value.

    Returns:
        Dict of decade to the mean of all values recorded for that decade,
        with the decades in order of first appearance.
    """
    values_per_decade = {}
    for decade_value_dict in global_diff_dict.values():
        for decade, value in decade_value_dict.items():
            values_per_decade.setdefault(decade, []).append(value)
    return {decade: sum(values) / len(values) for decade, values in values_per_decade.items()}

## aggregate functions

### create_decade_data

In [None]:
def create_decade_data(decade: Decade) -> DecadeData:
    """Build the data bundle of one decade.

    Returns:
        List ``[word2vec_index, None, None, occurrence_count_dict,
        occurrence_position_dict]``; the count dict is ordered by descending
        count and positions 1 and 2 are left as ``None``.
    """
    print("create_decade_data: start")
    index = create_index_from_word2vec(decade)
    occurrence_counts, occurrence_positions = create_occurrence_dicts(decade, index)
    return [index, None, None, sort_dict_by_value(occurrence_counts), occurrence_positions]


def create_decade_data_test():
    """Return the decade data bundles of the decades 183, 184 and 185 as a tuple."""
    return tuple(create_decade_data(decade) for decade in (183, 184, 185))


# Smoke test: obtain the data bundles of the decades 183-185 through
# load_cache_or_run.
if TEST:
    decade_183_data, decade_184_data, decade_185_data = load_cache_or_run(create_decade_data_test)

### align_decade_data

In [None]:
def align_decade_data(decade_a_data: DecadeData, decade_b_data: DecadeData) -> DecadeData:
    """Replace the index of ``decade_b_data`` with one aligned to ``decade_a_data``.

    ``decade_b_data`` is modified in place (its element 0 is overwritten) and
    returned.

    NOTE(review): this passes two indices and two occurrence count dicts to
    ``calculate_procrustes_alignment``; verify that this matches the signature
    of the definition that is active when this function runs.
    """
    print("align_decade_data: start")
    aligned_index = calculate_procrustes_alignment(
        decade_a_data[0],
        decade_b_data[0],
        decade_a_data[3],
        decade_b_data[3],
    )
    decade_b_data[0] = aligned_index
    return decade_b_data


def align_decade_data_test():
    """Align decade 184 to 183 and then decade 185 to the aligned 184.

    The module-level ``decade_184_data`` and ``decade_185_data`` are rebound to
    the aligned bundles, which are also returned.
    """
    global decade_184_data, decade_185_data
    decade_184_data = align_decade_data(decade_183_data, decade_184_data)
    # 185 is aligned against the already aligned 184
    decade_185_data = align_decade_data(decade_184_data, decade_185_data)
    return decade_184_data, decade_185_data


# Smoke test: obtain the aligned bundles of the decades 184 and 185 through
# load_cache_or_run.
if TEST:
    decade_184_data, decade_185_data = load_cache_or_run(align_decade_data_test)

### preprocess_and_persist_decade_data

In [None]:
def preprocess_and_persist_decade_data(decade_list):
    """Create, align and pickle the data bundle of every decade not yet cached.

    The first decade is stored unaligned; each later decade is aligned to its
    predecessor in ``decade_list`` before it is stored. Decades whose pickle
    already exists are not recomputed. If such a cached decade is needed as
    the alignment reference of its successor, it is loaded from its pickle.

    Args:
        decade_list: decades in the order in which they are chained; an empty
            list is a no-op.

    NOTE(review): the existence check assumes that ``pickle_save(data, name)``
    writes to ``CACHE_FOLDER + name + ".pkl"`` and that ``pickle_load(name)``
    reads the same file — confirm against those helpers, and make sure
    ``CACHE_FOLDER`` is defined at module level.
    """

    def cache_name(decade):
        # name under which the bundle of one decade is persisted
        return f"preprocess_and_persist_decade_data__{decade}"

    def does_exist(decade):
        return os.path.exists(f"{CACHE_FOLDER}{cache_name(decade)}.pkl")

    print("preprocess_and_persist_decade_data: start: decade_list:", decade_list)
    decade_previous = None
    # bundle of decade_previous; None while it only exists on disk
    decade_previous_data = None
    for decade in decade_list:
        if does_exist(decade):
            # already persisted: load lazily, only if a successor needs it
            decade_data = None
        else:
            decade_data = create_decade_data(decade)
            if decade_previous is not None:
                if decade_previous_data is None:
                    decade_previous_data = pickle_load(cache_name(decade_previous))
                decade_data = align_decade_data(decade_previous_data, decade_data)
            pickle_save(decade_data, cache_name(decade))
        decade_previous = decade
        decade_previous_data = decade_data


# Smoke test: build the module-level decade_list and make sure the data of all
# its decades is persisted.
if TEST:
    decade_list = create_decades_list()
    preprocess_and_persist_decade_data(decade_list)

### load_persisted_decade_data

In [None]:
def load_persisted_decade_data(decade):
    """Load the pickled data bundle that was persisted for ``decade``."""
    cache_name = f"preprocess_and_persist_decade_data__{decade}"
    return pickle_load(cache_name)


# Smoke test: load the persisted data bundle of decade 184.
if TEST:
    decade_184_data = load_persisted_decade_data(184)

### load_embeddings_from_persisted_decade_data

In [None]:
def load_embeddings_from_persisted_decade_data(decade_list, lemma_list):
    """Collect the embeddings of the given lemmas from the persisted decades.

    Args:
        decade_list: decades whose persisted data is loaded, one after another.
        lemma_list: lemmas to look up in the index of every decade.

    Returns:
        Dict of lemma to a dict of decade to embedding. Decades in which a
        lemma has no embedding are left out; lemmas without any embedding do
        not appear at all.
    """
    print("load_embeddings_from_persisted_decade_data: start")
    embeddings_per_lemma = {}
    for decade in decade_list:
        decade_data = load_persisted_decade_data(decade)
        for lemma in lemma_list:
            # element 0 of the decade data is the index
            embedding = query_generic(decade_data[0], lemma)
            if embedding is not None:
                embeddings_per_lemma.setdefault(lemma, {})[decade] = embedding
    print("load_embeddings_from_persisted_decade_data: len(lemma_decade_embdding_dict):", len(embeddings_per_lemma))
    return embeddings_per_lemma


def load_embeddings_from_persisted_decade_data_test():
    """Look up a fixed sample of lemmas across the notebook-level ``decade_list``."""
    sample_lemmas = ["d", "und", "gehen", "wohnen", "Fürst"]
    return load_embeddings_from_persisted_decade_data(decade_list, sample_lemmas)


# Smoke test: run the sample embedding lookup through load_cache_or_run and keep the result at notebook level.
if TEST:
    lemma_decade_embdding_dict = load_cache_or_run(load_embeddings_from_persisted_decade_data_test)

### average_decade_lemma_diff_dict

In [None]:
def average_decade_lemma_diff_dict(decade_lemma_decade_dict: DecadeLemmaDiffDict, enable_avg=True) -> dict[Lemma, float]:
    """Collapse per-decade diff values into a single value per lemma.

    Args:
        decade_lemma_decade_dict: mapping of decade key -> {lemma: diff}.
        enable_avg: when true, the summed diffs of a lemma are divided by the
            number of decade entries the lemma occurs in; when false, the plain
            sum is kept.

    Returns:
        Mapping lemma -> aggregated diff, passed through ``sort_dict_by_value``.
    """
    print("average_decade_lemma_diff_dict: start")
    # group every diff under its lemma, regardless of the decade it came from
    diffs_per_lemma = {}
    for lemma_dict in decade_lemma_decade_dict.values():
        for lemma, diff in lemma_dict.items():
            diffs_per_lemma.setdefault(lemma, []).append(diff)
    aggregated = {}
    for lemma, diffs in diffs_per_lemma.items():
        total = sum(diffs)
        aggregated[lemma] = total / len(diffs) if enable_avg else total
    aggregated = sort_dict_by_value(aggregated)
    print("average_decade_lemma_diff_dict: len(lemma_diff_averaged_dict):", len(aggregated))
    return aggregated


# Smoke test with a small hand-written decade -> {lemma: diff} table:
# the first call prints the per-lemma averages, the second (enable_avg=False) the per-lemma sums.
if TEST:
    lemma_decade_dict_test = {
        183: {
            "gehen": 0.2,
            "laufen": 0.4,
        },
        184: {
            "gehen": 0.3,
            "laufen": 0.6,
        },
        185: {
            "gehen": 0.1,
            "laufen": 0.1,
        },
    }
    print(average_decade_lemma_diff_dict(lemma_decade_dict_test))
    print(average_decade_lemma_diff_dict(lemma_decade_dict_test, enable_avg=False))

### calculate_cos_sim_between_decades

In [None]:
def calculate_cos_sim_between_decades(decade_list) -> DecadeLemmaDiffDict:
    """Compute cosine similarities between each pair of consecutive decades.

    Args:
        decade_list: ordered decades; the persisted data of each one is loaded
            via ``load_persisted_decade_data``.

    Returns:
        Dict keyed by ``"<earlier decade>-<later decade>"`` whose values are the
        results of ``calculate_cos_sim_between_indices`` for the two decades'
        indices (first element of the persisted decade data).

    Raises:
        IndexError: if ``decade_list`` is empty.
    """
    print("calculate_cos_sim_between_decades: start")
    decade_a = decade_list[0]
    decade_a_data = load_persisted_decade_data(decade_a)
    decade_lemma_cos_sim_dict = {}
    for decade_b in decade_list[1:]:
        decade_b_data = load_persisted_decade_data(decade_b)
        lemma_cos_sim_dict = calculate_cos_sim_between_indices(decade_a_data[0], decade_b_data[0])
        decade_lemma_cos_sim_dict[f"{decade_a}-{decade_b}"] = lemma_cos_sim_dict
        # Slide the window: the label must advance together with the data,
        # otherwise every key would start with the very first decade.
        decade_a, decade_a_data = decade_b, decade_b_data
    print("calculate_cos_sim_between_decades: len(decade_lemma_cos_sim_dict):", len(decade_lemma_cos_sim_dict))
    return decade_lemma_cos_sim_dict


def calculate_cos_sim_between_decades_test():
    """Return the per-decade cosine similarities and their per-lemma average for the notebook-level ``decade_list``."""
    per_decade = calculate_cos_sim_between_decades(decade_list)
    averaged = average_decade_lemma_diff_dict(per_decade)
    return per_decade, averaged


# Smoke test: compute (or fetch via load_cache_or_run) the cosine similarity tables and keep them at notebook level.
if TEST:
    decade_lemma_cos_sim_dict, lemma_cos_sim_averaged_dict = load_cache_or_run(calculate_cos_sim_between_decades_test)

### calculate_trajectories_between_decades

In [None]:
def calculate_trajectories_between_decades(decade_list: DecadeList) -> DecadeLemmaDiffDict:
    """Compute trajectories over every window of three consecutive decades.

    Args:
        decade_list: ordered decades; the persisted data of each one is loaded
            via ``load_persisted_decade_data``.

    Returns:
        Dict keyed by ``"<first decade>-<third decade>"`` of each window whose
        values are the results of ``calculate_trajectory_dict_from_index`` for
        the three decades' indices (first element of the persisted decade data).
        Empty when fewer than three decades are given.

    Raises:
        IndexError: if ``decade_list`` has fewer than two elements.
    """
    print("calculate_trajectories_between_decades: start")
    decade_a = decade_list[0]
    decade_b = decade_list[1]
    decade_a_data = load_persisted_decade_data(decade_a)
    decade_b_data = load_persisted_decade_data(decade_b)
    decade_lemma_trajectory_dict = {}
    for decade_c in decade_list[2:]:
        decade_c_data = load_persisted_decade_data(decade_c)
        lemma_trajectory_dict_abc = calculate_trajectory_dict_from_index(
            decade_a_data[0],
            decade_b_data[0],
            decade_c_data[0],
        )
        # Label with the decade identifiers, not with the loaded data objects.
        decade_lemma_trajectory_dict[f"{decade_a}-{decade_c}"] = lemma_trajectory_dict_abc
        # Slide the three-decade window by one; labels advance together with the data.
        decade_a, decade_a_data = decade_b, decade_b_data
        decade_b, decade_b_data = decade_c, decade_c_data
    print("calculate_trajectories_between_decades: len(decade_lemma_trajectory_dict):", len(decade_lemma_trajectory_dict))
    return decade_lemma_trajectory_dict


def calculate_trajectories_between_decades_test():
    """Return the per-window trajectories and their per-lemma average for a freshly created decade list."""
    per_window = calculate_trajectories_between_decades(create_decades_list())
    averaged = average_decade_lemma_diff_dict(per_window)
    return per_window, averaged


# Smoke test: compute (or fetch via load_cache_or_run) the trajectory tables and keep them at notebook level.
if TEST:
    decade_lemma_trajectory_dict, lemma_trajectory_averaged_dict = load_cache_or_run(calculate_trajectories_between_decades_test)

### calculate_relative_diff_between_decades

In [None]:
def calculate_relative_diff_between_decades(decade_list: DecadeList) -> DecadeLemmaDiffDict:
    """Compute relative differences between each pair of consecutive decades.

    Args:
        decade_list: ordered decades; the persisted data of each one is loaded
            via ``load_persisted_decade_data``.

    Returns:
        Dict keyed by ``"<earlier decade>-<later decade>"`` whose values are the
        results of ``calculate_relative_diff_dict_from_index`` for the two
        decades.

    Raises:
        IndexError: if ``decade_list`` is empty.
    """
    print("calculate_relative_diff_between_decades: start")
    decade_a = decade_list[0]
    decade_a_data = load_persisted_decade_data(decade_a)
    decade_lemma_relative_diff_dict = {}
    for decade_b in decade_list[1:]:
        decade_b_data = load_persisted_decade_data(decade_b)
        # Elements 0 and 3 of the persisted decade data are handed to the helper;
        # presumably the index and the lemma occurrence counts - TODO confirm.
        lemma_relative_diff_dict = calculate_relative_diff_dict_from_index(
            decade_a_data[0],
            decade_b_data[0],
            decade_a_data[3],
            decade_b_data[3],
        )
        decade_lemma_relative_diff_dict[f"{decade_a}-{decade_b}"] = lemma_relative_diff_dict
        # Slide the window: the label must advance together with the data,
        # otherwise every key would start with the very first decade.
        decade_a, decade_a_data = decade_b, decade_b_data
    print("calculate_relative_diff_between_decades: len(decade_lemma_relative_diff_dict):", len(decade_lemma_relative_diff_dict))
    return decade_lemma_relative_diff_dict


def calculate_relative_diff_between_decades_test():
    """Return the per-decade relative differences and their per-lemma average for a freshly created decade list."""
    per_decade = calculate_relative_diff_between_decades(create_decades_list())
    averaged = average_decade_lemma_diff_dict(per_decade)
    return per_decade, averaged


# Smoke test: compute (or fetch via load_cache_or_run) the relative difference tables and keep them at notebook level.
if TEST:
    decade_lemma_relative_diff_dict, lemma_relative_diff_averaged_dict = load_cache_or_run(calculate_relative_diff_between_decades_test)

# Analysis

## prepare_all

In [None]:
def prepare_all():
    """Run the whole preparation pipeline and return everything the analysis cells need.

    Persists the preprocessed decade data first, then derives the three
    per-decade measures (cosine similarity, trajectory, relative difference)
    together with their per-lemma averages.

    Returns:
        Tuple of ``(decade_list, cos_sim_per_decade, cos_sim_averaged,
        trajectory_per_decade, trajectory_averaged, relative_diff_per_decade,
        relative_diff_averaged)``.
    """
    decades = create_decades_list()
    # the measures below read the persisted data, so persisting has to come first
    preprocess_and_persist_decade_data(decades)
    cos_sim_per_decade = calculate_cos_sim_between_decades(decades)
    cos_sim_averaged = average_decade_lemma_diff_dict(cos_sim_per_decade)
    trajectory_per_decade = calculate_trajectories_between_decades(decades)
    trajectory_averaged = average_decade_lemma_diff_dict(trajectory_per_decade)
    relative_diff_per_decade = calculate_relative_diff_between_decades(decades)
    relative_diff_averaged = average_decade_lemma_diff_dict(relative_diff_per_decade)
    return (
        decades,
        cos_sim_per_decade,
        cos_sim_averaged,
        trajectory_per_decade,
        trajectory_averaged,
        relative_diff_per_decade,
        relative_diff_averaged,
    )


# Run the full preparation (through load_cache_or_run) unconditionally, i.e. also when TEST is off,
# and bind the results to the notebook-level names used by all analysis cells below.
(
    decade_list,
    decade_lemma_cos_sim_dict,
    lemma_cos_sim_averaged_dict,
    decade_lemma_trajectory_dict,
    lemma_trajectory_averaged_dict,
    decade_lemma_relative_diff_dict,
    lemma_relative_diff_averaged_dict,
) = load_cache_or_run(prepare_all)

## global changes analysis

### global cosine similarity changes

In [None]:
# Scatter plot of the per-lemma averaged cosine similarity (unfiltered).
plot_2d_scatter(lemma_cos_sim_averaged_dict, "global average change between decades regarding cosine similarity")

### global trajectories

In [None]:
# Scatter plot of the per-lemma averaged trajectory values (unfiltered).
plot_2d_scatter(lemma_trajectory_averaged_dict, "global average change between decades regarding trajectory")

In [None]:
# Scatter plot of the per-lemma averaged relative differences (unfiltered).
plot_2d_scatter(lemma_relative_diff_averaged_dict, "global average change between decades regarding relative differences")

### filter_on_decades

In [None]:
def filter_on_decades(decade_lemma_diff_dict, num_decades=2):
    """Keep only lemmas that occur in at least ``num_decades`` decade entries.

    Args:
        decade_lemma_diff_dict: mapping of decade key -> {lemma: diff}.
        num_decades: minimum number of decade entries a lemma must appear in.

    Returns:
        New dict with the same decade keys; each inner dict contains only the
        lemmas that passed the threshold (and may therefore be empty). The
        input is not modified.
    """
    # how many decade entries does each lemma show up in?
    appearances = {}
    for per_decade in decade_lemma_diff_dict.values():
        for lemma in per_decade:
            appearances[lemma] = appearances.get(lemma, 0) + 1
    kept_lemmas = {lemma for lemma, count in appearances.items() if count >= num_decades}
    return {
        decade: {lemma: diff for lemma, diff in per_decade.items() if lemma in kept_lemmas}
        for decade, per_decade in decade_lemma_diff_dict.items()
    }


# Smoke test with the default threshold (2): "gehen" occurs in three decades and "wandern" in two,
# so both are kept; "laufen" and "Haus" occur in one decade only and are dropped.
if TEST:
    decade_lemma_dict_filtered = filter_on_decades(
        {
            180: {
                "gehen": 1,
                "laufen": 1,
                "Haus": 1,
            },
            181: {
                "gehen": 1,
                "wandern": 1,
            },
            182: {
                "gehen": 1,
                "wandern": 1,
            },
        }
    )
    print(decade_lemma_dict_filtered)

In [None]:
# Cosine similarity: drop lemmas present in fewer than two decade pairs, average per lemma, plot.
decade_lemma_cos_sim_filtered_dict = filter_on_decades(decade_lemma_cos_sim_dict)
decade_lemma_cos_sim_filtered_merged_dict = average_decade_lemma_diff_dict(decade_lemma_cos_sim_filtered_dict)
plot_2d_scatter(decade_lemma_cos_sim_filtered_merged_dict, "global filtered average change between decades regarding cosine similarity")

In [None]:
# Trajectory: drop lemmas present in fewer than two decade windows, average per lemma, plot.
decade_lemma_trajectory_dict_filtered = filter_on_decades(decade_lemma_trajectory_dict)
decade_lemma_trajectory_dict_filtered_merged = average_decade_lemma_diff_dict(decade_lemma_trajectory_dict_filtered)
plot_2d_scatter(decade_lemma_trajectory_dict_filtered_merged, "global filtered average change between decades regarding trajectory")

In [None]:
# Relative difference: same filter/average/plot steps as for the other two measures.
# NOTE(review): the threshold here is 3 decade pairs, while the cosine similarity and
# trajectory cells use the default of 2 - confirm this is intended.
decade_lemma_relative_diff_dict_filtered = filter_on_decades(decade_lemma_relative_diff_dict, num_decades=3)
decade_lemma_relative_diff_dict_filtered_merged = average_decade_lemma_diff_dict(decade_lemma_relative_diff_dict_filtered)
plot_2d_scatter(
    decade_lemma_relative_diff_dict_filtered_merged, "global filtered average change between decades regarding relative differences"
)

### compare cos sim and trajectories differences

### normalize global diffs

In [None]:
# Normalize the filtered, averaged cosine similarities via normalize_diff_table and plot them.
decade_lemma_cos_sim_filtered_merged_dict_normalized = normalize_diff_table(decade_lemma_cos_sim_filtered_merged_dict)
plot_2d_scatter(
    decade_lemma_cos_sim_filtered_merged_dict_normalized,
    "global normalized filtered average change between decades regarding cosine similarity",
)

In [None]:
# Normalize the filtered, averaged trajectory values via normalize_diff_table and plot them.
decade_lemma_trajectory_dict_filtered_merged_normalized = normalize_diff_table(decade_lemma_trajectory_dict_filtered_merged)
plot_2d_scatter(
    decade_lemma_trajectory_dict_filtered_merged_normalized,
    "global normalized filtered average change between decades regarding trajectory",
)

In [None]:
# Normalize the filtered, averaged relative differences and plot them.
# NOTE(review): enable_inversion=True is only used for this measure; presumably it flips the
# scale so it points the same way as the other two measures - confirm against normalize_diff_table.
decade_lemma_relative_diff_dict_filtered_merged_normalized = normalize_diff_table(
    decade_lemma_relative_diff_dict_filtered_merged,
    enable_inversion=True,
)
# The title names the measure that is actually plotted (relative differences, not trajectory).
plot_2d_scatter(
    decade_lemma_relative_diff_dict_filtered_merged_normalized,
    "global normalized filtered average change between decades regarding relative differences",
)

In [None]:
# Compare the normalized cosine similarity table with the normalized trajectory table
# via create_lemma_diff_diff_dicts and plot the result.
decade_lemma_diffs_compared = create_lemma_diff_diff_dicts(
    decade_lemma_cos_sim_filtered_merged_dict_normalized, decade_lemma_trajectory_dict_filtered_merged_normalized
)
plot_2d_scatter(decade_lemma_diffs_compared, "difference between normalized cos sim and trajectories")

In [None]:
# Merge the comparison result with the two normalized input tables and plot the merged structure (no title given).
decade_lemma_diffs_compared_merged = merge_lemma_diff_diff_dict(
    decade_lemma_diffs_compared,
    decade_lemma_cos_sim_filtered_merged_dict_normalized,
    decade_lemma_trajectory_dict_filtered_merged_normalized,
)
plot_2d_scatter(decade_lemma_diffs_compared_merged)

In [None]:
# Repeat the compare/merge/plot steps on tables produced by create_random_lemma_value_dict.
# NOTE(review): presumably a random baseline to judge the real comparison plot against - confirm.
random_dict_a = create_random_lemma_value_dict(decade_lemma_cos_sim_filtered_merged_dict_normalized)
random_dict_b = create_random_lemma_value_dict(decade_lemma_trajectory_dict_filtered_merged_normalized)
random_lemma_diff_diff_dict = create_lemma_diff_diff_dicts(random_dict_a, random_dict_b)
decade_lemma_diffs_compared_merged_random = merge_lemma_diff_diff_dict(random_lemma_diff_diff_dict, random_dict_a, random_dict_b)
plot_2d_scatter(decade_lemma_diffs_compared_merged_random)

### compare cos sim and relative diff differences

### average differences per decade

In [None]:
# Aggregate the unfiltered cosine similarity table with calculate_average_diff and plot it as a line.
average_cos_sim_per_decade_dict = calculate_average_diff(decade_lemma_cos_sim_dict)
plot_2d_scatter(average_cos_sim_per_decade_dict, draw_line=True)

In [None]:
# Aggregate the unfiltered trajectory table with calculate_average_diff and plot it as a line.
average_trajectory_per_decade_dict = calculate_average_diff(decade_lemma_trajectory_dict)
plot_2d_scatter(average_trajectory_per_decade_dict, draw_line=True)

In [None]:
# Aggregate the unfiltered relative difference table with calculate_average_diff and plot it as a line.
average_relative_diff_per_decade_dict = calculate_average_diff(decade_lemma_relative_diff_dict)
plot_2d_scatter(average_relative_diff_per_decade_dict, draw_line=True)

## sample analysis

### get_lemma_by_position

In [None]:
def get_lemma_by_position(lemma_dict, index_start=0, n=10, direction_forward=True, index_start_is_percent=False):
    """Select a window of entries from ``lemma_dict`` by position.

    Only entries whose key is accepted by ``is_word`` are considered; all
    positions refer to that filtered sequence.

    Args:
        lemma_dict: mapping whose iteration order defines the positions.
        index_start: position of the first selected entry, or a percentage of
            the filtered length when ``index_start_is_percent`` is true.
        n: maximum number of entries to select.
        direction_forward: when false, positions are counted from the end of
            ``lemma_dict``.
        index_start_is_percent: interpret ``index_start`` as a percentage.

    Returns:
        Dict with the selected key/value pairs in traversal order.
    """
    entries = list(lemma_dict.items())
    if not direction_forward:
        entries.reverse()
    word_entries = [(key, value) for key, value in entries if is_word(key)]
    if index_start_is_percent:
        index_start = int((len(word_entries) / 100) * index_start)
    index_end = index_start + n
    return {key: value for position, (key, value) in enumerate(word_entries) if index_start <= position < index_end}

### sample_and_plot_lemmas

In [None]:
def sample_and_plot_lemmas(
    decade_lemma_cos_sim_dict,
    lemma_cos_sim_averaged_dict,
    decade_lemma_trajectory_dict,
    lemma_trajectory_averaged_dict,
    decade_lemma_relative_diff_dict,
    lemma_relative_diff_averaged_dict,
    index_start,
    n=10,
    direction_forward=True,
    index_start_is_percent=False,
    plot_limit=3,
):
    """Sample lemmas by position for each measure and plot how they develop over the decades.

    For each of the three measures (cosine similarity, trajectory, relative
    difference) a window of lemmas is taken from the averaged table with
    ``get_lemma_by_position``; the first ``plot_limit`` of them are plotted per
    decade. Finally all plotted lemmas are handed to
    ``plot_lemma_and_decade_from_lemma_list``.

    Args:
        decade_lemma_cos_sim_dict: decade key -> {lemma: cosine similarity}.
        lemma_cos_sim_averaged_dict: lemma -> value, used for sampling by position.
        decade_lemma_trajectory_dict: decade key -> {lemma: trajectory}.
        lemma_trajectory_averaged_dict: lemma -> value, used for sampling by position.
        decade_lemma_relative_diff_dict: decade key -> {lemma: relative difference}.
        lemma_relative_diff_averaged_dict: lemma -> value, used for sampling by position.
        index_start: forwarded to ``get_lemma_by_position``.
        n: forwarded to ``get_lemma_by_position``.
        direction_forward: forwarded to ``get_lemma_by_position``.
        index_start_is_percent: forwarded to ``get_lemma_by_position``.
        plot_limit: maximum number of sampled lemmas plotted per measure.
    """

    def sample_and_plot_lemmas_internal(
        decade_lemma_diff_dict,
        lemma_diff_averaged_dict,
        title,
    ):
        """Sample from one measure, plot the first ``plot_limit`` lemmas and return their names."""
        lemma_diff_sampled_dict = get_lemma_by_position(
            lemma_diff_averaged_dict,
            index_start=index_start,
            n=n,
            direction_forward=direction_forward,
            index_start_is_percent=index_start_is_percent,
        )
        print(lemma_diff_sampled_dict)
        lemma_diff_list_sampled = list(lemma_diff_sampled_dict)[:plot_limit]
        for lemma in lemma_diff_list_sampled:
            # collect this lemma's value for every decade entry that contains it
            decade_diff_dict = {}
            for decade, lemma_diff_dict in decade_lemma_diff_dict.items():
                diff = lemma_diff_dict.get(lemma)
                if diff is not None:
                    decade_diff_dict[decade] = diff
            plot_2d_scatter(decade_diff_dict, draw_line=True, title=title + ": " + lemma)
        return lemma_diff_list_sampled

    lemma_diff_list_sampled = sample_and_plot_lemmas_internal(decade_lemma_cos_sim_dict, lemma_cos_sim_averaged_dict, "cosine similarity")
    lemma_diff_list_sampled += sample_and_plot_lemmas_internal(decade_lemma_trajectory_dict, lemma_trajectory_averaged_dict, "trajectory")
    # Use the parameter that actually exists (the previous name was undefined) and label the plots
    # with the measure they show.
    lemma_diff_list_sampled += sample_and_plot_lemmas_internal(
        decade_lemma_relative_diff_dict, lemma_relative_diff_averaged_dict, "relative difference"
    )
    # de-duplicate lemmas that were sampled by more than one measure
    lemma_diff_list_sampled = list(set(lemma_diff_list_sampled))
    plot_lemma_and_decade_from_lemma_list(
        lemma_diff_list_sampled, "embeddings samples regarding cosine similarity, trajectory and relative difference"
    )

### top lemmas

In [None]:
# Sample from the start of each averaged table (positions 0-9) and plot the first three lemmas per measure.
# NOTE(review): cosine similarity and trajectory are sampled from the decade-filtered averages,
# relative difference from the unfiltered average - confirm this asymmetry is intended.
sample_and_plot_lemmas(
    decade_lemma_cos_sim_dict,
    decade_lemma_cos_sim_filtered_merged_dict,
    decade_lemma_trajectory_dict,
    decade_lemma_trajectory_dict_filtered_merged,
    decade_lemma_relative_diff_dict,
    lemma_relative_diff_averaged_dict,
    index_start=0,
    n=10,
    plot_limit=3,
)

### middle lemmas

In [None]:
# Sample ten lemmas starting at the 50 % position of each averaged table and plot the first three per measure.
# The two relative difference tables are required positional parameters of sample_and_plot_lemmas;
# they are passed in the same way as in the "top" cell.
sample_and_plot_lemmas(
    decade_lemma_cos_sim_dict,
    decade_lemma_cos_sim_filtered_merged_dict,
    decade_lemma_trajectory_dict,
    decade_lemma_trajectory_dict_filtered_merged,
    decade_lemma_relative_diff_dict,
    lemma_relative_diff_averaged_dict,
    index_start=50,
    n=10,
    plot_limit=3,
    index_start_is_percent=True,
)

### bottom lemmas

In [None]:
# Sample ten lemmas counted from the end of each averaged table and plot the first three per measure.
# The two relative difference tables are required positional parameters of sample_and_plot_lemmas;
# they are passed in the same way as in the "top" cell.
sample_and_plot_lemmas(
    decade_lemma_cos_sim_dict,
    decade_lemma_cos_sim_filtered_merged_dict,
    decade_lemma_trajectory_dict,
    decade_lemma_trajectory_dict_filtered_merged,
    decade_lemma_relative_diff_dict,
    lemma_relative_diff_averaged_dict,
    index_start=0,
    n=10,
    plot_limit=3,
    direction_forward=False,
)

### together

In [None]:
# top
# Pick two lemmas each from the start, the 50 % position and the end of the filtered,
# averaged cosine similarity table and plot them together.
# NOTE(review): `sample_diff_dict` is also bound by each chained assignment but is not read
# in this cell - confirm whether it is still needed.

# top: first two entries
global_top_cos_sim = sample_diff_dict = get_lemma_by_position(
    decade_lemma_cos_sim_filtered_merged_dict,
    index_start=0,
    n=2,
)
global_top_cos_sim = list(global_top_cos_sim.keys())

# middle: two entries starting at 50 % of the word-filtered entries
global_middle_cos_sim = sample_diff_dict = get_lemma_by_position(
    decade_lemma_cos_sim_filtered_merged_dict,
    index_start=50,
    n=2,
    index_start_is_percent=True,
)
global_middle_cos_sim = list(global_middle_cos_sim.keys())

# bottom: two entries counted from the end
global_bottom_cos_sim = sample_diff_dict = get_lemma_by_position(
    decade_lemma_cos_sim_filtered_merged_dict,
    index_start=0,
    n=2,
    direction_forward=False,
)
global_bottom_cos_sim = list(global_bottom_cos_sim.keys())

# together: plot all sampled lemmas; the lemma list is part of the plot title
global_together_cos_sim = global_top_cos_sim + global_middle_cos_sim + global_bottom_cos_sim
plot_lemma_and_decade_from_lemma_list(global_together_cos_sim, "sampled via cosine similarity: " + str(global_together_cos_sim))

In [None]:
# top
# Pick two lemmas each from the start, the 50 % position and the end of the filtered,
# averaged trajectory table and plot them together.
# NOTE(review): `sample_diff_dict` is also bound by each chained assignment but is not read
# in this cell - confirm whether it is still needed.

# top: first two entries
global_top_trajectory = sample_diff_dict = get_lemma_by_position(
    decade_lemma_trajectory_dict_filtered_merged,
    index_start=0,
    n=2,
)
global_top_trajectory = list(global_top_trajectory.keys())

# middle: two entries starting at 50 % of the word-filtered entries
global_middle_trajectory = sample_diff_dict = get_lemma_by_position(
    decade_lemma_trajectory_dict_filtered_merged,
    index_start=50,
    n=2,
    index_start_is_percent=True,
)
global_middle_trajectory = list(global_middle_trajectory.keys())

# bottom: two entries counted from the end
global_bottom_trajectory = sample_diff_dict = get_lemma_by_position(
    decade_lemma_trajectory_dict_filtered_merged,
    index_start=0,
    n=2,
    direction_forward=False,
)
global_bottom_trajectory = list(global_bottom_trajectory.keys())

# together: plot all sampled lemmas; the lemma list is part of the plot title
global_together_trajectory = global_top_trajectory + global_middle_trajectory + global_bottom_trajectory
plot_lemma_and_decade_from_lemma_list(global_together_trajectory, "sampled via trajectory: " + str(global_together_trajectory))

# DB Close

In [None]:
# Close the database cursor first, then the connection (both are created outside this section).
cursor.close()
conn.close()