## setup and loading

In [None]:
import os
import pickle
import random
from dataclasses import dataclass
from typing import TypeAlias

import hnswlib
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import spacy
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the small German spaCy pipeline; it is used below to look up the lemma of vocabulary words.
nlp = spacy.load("de_core_news_sm")
# Seed Python's `random` module so that `random.shuffle` calls in this notebook are repeatable.
random.seed(42)

In [None]:
# When True, the `if ENABLE_TEST:` snippets that follow the function definitions are executed.
ENABLE_TEST = True
# Folder scanned for "<decade>.bin" Word2Vec model files.
FOLDER_MODELS_WORD2VEC = "/veld/input/models/"
# Folder holding one "<decade>.txt" text file per decade.
FOLDER_TEXTS = "/veld/input/texts/"

## type aliases

In [None]:
# Plain-type aliases that name the role a value plays in the signatures below.
Lemma: TypeAlias = str
Word: TypeAlias = str
Decade: TypeAlias = int  # stem of the model/text file name, e.g. 168 for "168.bin"
LineNumber: TypeAlias = int
WordNumber: TypeAlias = int
OccurrenceCount: TypeAlias = int
Embedding: TypeAlias = np.ndarray
Model: TypeAlias = Word2Vec
# line number -> positions of the matching words within that line
LineNumberDict: TypeAlias = dict[LineNumber, list[WordNumber]]
IdToLemmaDict: TypeAlias = dict[int, Lemma]
LemmaToIdDict: TypeAlias = dict[Lemma, int]
# NOTE: the `list[...]` aliases below describe fixed-position records that are kept in plain
# lists (slots are filled by index assignment); the bracketed entries name the slots in order.
# Index record: [hnswlib index, lemma -> id, id -> lemma]
Index: TypeAlias = list[hnswlib.Index, LemmaToIdDict, IdToLemmaDict]
# Per-lemma record: [words sharing the lemma, lemma embedding, occurrence positions, occurrence count]
LemmaDict: TypeAlias = dict[Lemma, list[list[Word], Embedding, LineNumberDict, OccurrenceCount]]
# Per-decade record: [model, lemma dict, index record]
DecadeDict: TypeAlias = dict[Decade, list[Model, LemmaDict, Index]]

## create_decades_dict

In [None]:
def create_decades_dict(folder) -> DecadeDict:
    """Create an empty per-decade mapping from the ``.bin`` files found in ``folder``.

    Args:
        folder: Directory whose entries are listed with ``os.listdir``.

    Returns:
        A dict with one key per ``<decade>.bin`` file (the part before ``.bin``
        converted to ``int``); every value is ``None`` as a placeholder.

    Raises:
        ValueError: If the part before ``.bin`` is not an integer.
    """
    bin_files = (name for name in os.listdir(folder) if name.endswith(".bin"))
    decades: DecadeDict = {int(name.split(".bin")[0]): None for name in bin_files}
    print("create_decades_dict: decade_dict:", decades)
    return decades


# Smoke test: discover the decades available in the model folder.
if ENABLE_TEST:
    decade_dict = create_decades_dict(FOLDER_MODELS_WORD2VEC)

## load_model

In [None]:
def load_model(model_path) -> Model:
    """Load a Word2Vec model from ``model_path`` and log the path and the model.

    Args:
        model_path: Path handed to ``Word2Vec.load``.

    Returns:
        The loaded model.
    """
    loaded = Word2Vec.load(model_path)
    # Logging happens only after a successful load.
    print("load_model_word2vec: model_path:", model_path)
    print("load_model_word2vec: model:", loaded)
    return loaded


# Smoke test: load two decades; slots 1 (lemma dict) and 2 (index) are filled in by later cells.
if ENABLE_TEST:
    decade_dict[168] = [load_model(FOLDER_MODELS_WORD2VEC + "168.bin"), None, None]
    decade_dict[169] = [load_model(FOLDER_MODELS_WORD2VEC + "169.bin"), None, None]

## create_lemma_dict

In [None]:
def create_lemma_dict(decade: Decade, model: Model) -> LemmaDict:
    """Group a model's vocabulary by lemma and locate every lemma in the decade's text.

    Each word of ``model.wv.index_to_key`` is mapped to the lemma of its first
    spaCy token, and words sharing a lemma are collected. Every lemma receives
    the L2-normalized mean of its words' vectors. The text file
    ``FOLDER_TEXTS + str(decade) + ".txt"`` is then scanned to record where the
    lemma's words occur.

    Args:
        decade: Decade identifier; selects the text file that is scanned.
        model: Word2Vec model supplying the vocabulary and the word vectors.

    Returns:
        Mapping ``lemma -> [word_list, embedding, line_number_dict, occurrence_count]``.
        ``line_number_dict`` maps a 0-based line number to the 0-based positions
        (after splitting the line on single spaces) of the lemma's words.
    """
    lemma_dict: LemmaDict = {}
    word_to_lemma_dict = {}

    # Group the vocabulary words by lemma.
    for word in model.wv.index_to_key:
        lemma = nlp(word)[0].lemma_
        word_to_lemma_dict[word] = lemma
        lemma_dict.setdefault(lemma, [[], None, {}, 0])[0].append(word)

    # One embedding per lemma: mean of its word vectors, scaled to unit length.
    for lemma_dict_value in lemma_dict.values():
        word_embedding_list = [model.wv[word] for word in lemma_dict_value[0]]
        embedding_average = np.mean(np.array(word_embedding_list), axis=0)
        norm = np.linalg.norm(embedding_average)
        # A zero vector cannot be normalized; keep it unchanged instead of producing NaNs.
        lemma_dict_value[1] = embedding_average / norm if norm > 0 else embedding_average

    with open(FOLDER_TEXTS + str(decade) + ".txt", "r") as f:
        for line_number, line in enumerate(f):
            # Strip the line terminator first: otherwise the last token of every line
            # carries a trailing newline and never matches a vocabulary word.
            for word_number, word in enumerate(line.rstrip("\r\n").split(" ")):
                lemma = word_to_lemma_dict.get(word)
                if lemma is None:
                    continue
                lemma_dict_value = lemma_dict[lemma]
                line_number_dict: LineNumberDict = lemma_dict_value[2]
                line_number_dict.setdefault(line_number, []).append(word_number)
                lemma_dict_value[3] += 1

    print("create_lemma_dict: len(lemma_dict):", len(lemma_dict))
    return lemma_dict


# Smoke test: build the lemma dictionaries for both loaded decades (stored in slot 1).
if ENABLE_TEST:
    decade_dict[168][1] = create_lemma_dict(168, decade_dict[168][0])
    decade_dict[169][1] = create_lemma_dict(169, decade_dict[169][0])
    # NOTE(review): slot 0 holds the model, so these lines print the models rather than
    # the freshly created lemma dicts — confirm that this is intended.
    print(decade_dict[168][0])
    print(decade_dict[169][0])

## create_index

In [None]:
def create_index(lemma_dict: LemmaDict) -> Index:
    """Build an hnswlib cosine-space index over the lemma embeddings.

    Args:
        lemma_dict: Mapping whose values carry the lemma embedding in slot 1.

    Returns:
        ``[index, lemma_to_id_dict, id_to_lemma_dict]``; ids are the positions
        of the lemmas in the iteration order of ``lemma_dict``.

    Raises:
        IndexError: If ``lemma_dict`` is empty (no first embedding to read the
            dimension from).
    """
    # prepare data: ids are simply the enumeration positions of the lemmas
    id_to_lemma: IdToLemmaDict = dict(enumerate(lemma_dict))
    lemma_to_id: LemmaToIdDict = {lemma: lemma_id for lemma_id, lemma in id_to_lemma.items()}
    vectors = np.array([entry[1] for entry in lemma_dict.values()])

    # create index, sized exactly for the given embeddings
    hnsw = hnswlib.Index(space="cosine", dim=vectors[0].shape[0])
    hnsw.init_index(max_elements=len(vectors), ef_construction=100, M=16)
    hnsw.add_items(vectors, ids=list(id_to_lemma))
    return [hnsw, lemma_to_id, id_to_lemma]


# Smoke test: build the nearest-neighbour index for both decades (stored in slot 2).
if ENABLE_TEST:
    decade_dict[168][2] = create_index(decade_dict[168][1])
    decade_dict[169][2] = create_index(decade_dict[169][1])

## query_related

In [None]:
def query_related(index: Index, lemma: Lemma, n=10):
    """Return the nearest neighbours of ``lemma`` in an index record.

    Args:
        index: Record ``[hnswlib index, lemma_to_id_dict, id_to_lemma_dict]``.
        lemma: Lemma whose neighbours are requested.
        n: Number of neighbours requested from the index (``k`` of the query).
            The query lemma itself is removed from the answer, so usually
            ``n - 1`` entries are returned.

    Returns:
        Dict ``related lemma -> distance`` in the order returned by the index,
        or ``None`` (after printing ``"not found"``) when ``lemma`` is unknown.
    """
    hnsw_index, lemma_to_id_dict, id_to_lemma_dict = index
    try:
        id_embedding = lemma_to_id_dict[lemma]
    except KeyError:
        print("not found")
        return None

    # np.asarray: tolerate `get_items` returning plain lists instead of arrays.
    embedding = np.asarray(hnsw_index.get_items([id_embedding])[0])
    ids, distances = hnsw_index.knn_query(embedding.reshape(1, -1), k=n)

    result = {}
    for id_other, distance in zip(ids[0], distances[0]):
        # Drop the query lemma by id; it is not guaranteed to be the first hit
        # (e.g. when another lemma has an identical embedding).
        if id_other == id_embedding:
            continue
        result[id_to_lemma_dict[id_other]] = distance
    return result


# Smoke test: neighbours of "gehen" in both decades.
if ENABLE_TEST:
    print(query_related(decade_dict[168][2], "gehen", n=10))
    print(query_related(decade_dict[169][2], "gehen", n=10))

## query_cos_distance

In [None]:
def query_cos_distance(index, lemma_a, lemma_b):
    """Return ``1 - dot(a, b)`` for the stored vectors of two lemmas.

    NOTE(review): this equals the cosine distance only if the vectors stored in
    the index are unit-length — presumably true for this notebook's index;
    confirm.

    Args:
        index: Record ``[hnswlib index, lemma_to_id_dict, id_to_lemma_dict]``.
        lemma_a: First lemma.
        lemma_b: Second lemma.

    Returns:
        The distance, or ``None`` when either lemma is not in the index.
        Any other failure (e.g. a broken index object) is raised, not hidden.
    """
    try:
        id_a = index[1][lemma_a]
        id_b = index[1][lemma_b]
    except KeyError:
        # An unknown lemma is the only expected, silently tolerated failure.
        return None
    embedding_a, embedding_b = index[0].get_items([id_a, id_b])
    return 1 - np.dot(embedding_a, embedding_b)


# Smoke test: distance between "gehen" and "laufen" in both decades.
if ENABLE_TEST:
    print(query_cos_distance(decade_dict[168][2], "gehen", "laufen"))
    print(query_cos_distance(decade_dict[169][2], "gehen", "laufen"))

## print_occurrence

In [None]:
def print_occurrence(decade: Decade, lemma_dict: LemmaDict, lemma: Lemma, max_print=5):
    """Print text lines containing ``lemma``, with its words wrapped in ``>>> <<<``.

    Args:
        decade: Selects the text file ``FOLDER_TEXTS + str(decade) + ".txt"``.
        lemma_dict: Mapping whose entry for ``lemma`` carries, in slot 2, the
            line number -> word positions dictionary.
        lemma: Lemma to highlight.
        max_print: Stop after this many printed lines; a falsy value prints all.

    Raises:
        KeyError: If ``lemma`` is not in ``lemma_dict``.
    """
    positions_by_line = lemma_dict[lemma][2]
    printed = 0
    with open(FOLDER_TEXTS + str(decade) + ".txt", "r") as f:
        for line_number, line in enumerate(f):
            positions = positions_by_line.get(line_number)
            if not positions:
                continue
            marked = set(positions)
            # Every token is prefixed with a space; marked tokens get the arrows as well.
            text = "".join(
                " >>> " + token + " <<<" if position in marked else " " + token
                for position, token in enumerate(line.split(" "))
            )
            print(text)
            printed += 1
            if max_print and printed == max_print:
                break


# Smoke test, guarded like the other snippets: `decade_dict` is only populated
# when ENABLE_TEST is set, so an unguarded call would fail with a NameError.
if ENABLE_TEST:
    print_occurrence(168, decade_dict[168][1], "gehen", max_print=1)
    print_occurrence(169, decade_dict[169][1], "gehen", max_print=1)

## create_diff_of_lemma

In [None]:
def create_diff_of_lemma(index_a_dict, index_b_dict, lemma):
    """Sum the absolute distance differences of a lemma's neighbourhoods in two indices.

    The neighbours of ``lemma`` are queried in both indices. For every lemma
    that is a neighbour in at least one of them, the distance to ``lemma`` is
    taken from the neighbour query, or computed with ``query_cos_distance``
    for the index in which it is not a neighbour. The absolute differences
    are added up.

    Args:
        index_a_dict: First index record.
        index_b_dict: Second index record.
        lemma: Lemma to compare.

    Returns:
        The summed difference, or ``None`` when ``lemma`` is missing from
        either index. Related lemmas that exist in only one index are skipped.
    """
    distances_a_dict = query_related(index_a_dict, lemma)
    distances_b_dict = query_related(index_b_dict, lemma)
    # Only a missing lemma (None) aborts; an empty neighbour dict is a valid answer.
    if distances_a_dict is None or distances_b_dict is None:
        return None

    diff_total = 0
    for lemma_related in set(distances_a_dict) | set(distances_b_dict):
        distance_a = distances_a_dict.get(lemma_related)
        if distance_a is None:
            distance_a = query_cos_distance(index_a_dict, lemma, lemma_related)
        distance_b = distances_b_dict.get(lemma_related)
        if distance_b is None:
            distance_b = query_cos_distance(index_b_dict, lemma, lemma_related)
        # Compare against None explicitly: a distance of 0.0 is a real value.
        if distance_a is not None and distance_b is not None:
            diff_total += abs(distance_a - distance_b)
    return diff_total


# Smoke test: the index records live in slot 2 of each decade entry.
if ENABLE_TEST:
    print(create_diff_of_lemma(decade_dict[168][2], decade_dict[169][2], "gehen"))

## create_diff_between_indices

In [None]:
def create_diff_between_indices(index_a_dict, index_b_dict):
    """Compute ``create_diff_of_lemma`` for every lemma present in both indices.

    Args:
        index_a_dict: First index record ``[index, lemma_to_id_dict, id_to_lemma_dict]``.
        index_b_dict: Second index record of the same shape.

    Returns:
        Dict ``lemma -> diff`` for the lemmas common to both records.
    """
    # Slot 1 of an index record is the lemma -> id mapping (the record is a list,
    # so it must be addressed by position, as the query helpers do).
    lemma_common = set(index_a_dict[1]) & set(index_b_dict[1])
    return {lemma: create_diff_of_lemma(index_a_dict, index_b_dict, lemma) for lemma in lemma_common}


# Smoke test: the index records live in slot 2 of each decade entry.
if ENABLE_TEST:
    diff_between_indices = create_diff_between_indices(decade_dict[168][2], decade_dict[169][2])

# OLD

In [None]:
def create_global_word_set(model_list):
    """Return the union of the vocabularies (``wv.index_to_key``) of all models.

    Args:
        model_list: Iterable of models exposing ``wv.index_to_key``.

    Returns:
        A set with every word that occurs in at least one model.
    """
    return set().union(*(model.wv.index_to_key for model in model_list))


# NOTE(review): `model_dict` is not defined anywhere in the visible notebook — presumably
# a decade -> model mapping from an earlier revision; confirm before running this cell.
word_set = create_global_word_set(list(model_dict.values()))
len(word_set)

In [None]:
def create_word_set_sample(word_set, model_list, limit, must_be_in_all_model):
    """Draw up to ``limit`` words from ``word_set`` in shuffled order.

    Args:
        word_set: Words to sample from.
        model_list: Models consulted when ``must_be_in_all_model`` is set.
        limit: Maximum number of words to return.
        must_be_in_all_model: If truthy, keep only words contained in the
            ``wv`` of every model in ``model_list``.

    Returns:
        A set with the sampled words. The order is driven by the module-level
        ``random`` state.
    """
    candidates = list(word_set)
    random.shuffle(candidates)
    sample = set()
    accepted = 0
    for candidate in candidates:
        if accepted == limit:
            break
        if must_be_in_all_model and not all(candidate in model.wv for model in model_list):
            continue
        sample.add(candidate)
        accepted += 1
    return sample


# Sample 100 words without requiring them to exist in every model.
# NOTE(review): `model_dict` is not defined anywhere in the visible notebook — confirm.
word_set_sample = create_word_set_sample(word_set, list(model_dict.values()), 100, False)
print(len(word_set_sample))

# analysis functions

In [None]:
def calculate_cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Args:
        vec1: First vector.
        vec2: Second vector.

    Returns:
        The dot product divided by the product of the Euclidean norms.
    """
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return dot_product / norm_product

In [None]:
def calculate_gini_coefficient(x):
    """Return the Gini coefficient of the values in ``x``.

    Computed from the ascending-sorted values ``x_(1) <= ... <= x_(n)`` as
    ``2 * sum(i * x_(i)) / (n * sum(x)) - (n + 1) / n``.

    Args:
        x: Sequence of numbers (converted to a ``float64`` array).

    Returns:
        The coefficient as a float. By convention ``0.0`` is returned for an
        empty input and for values summing to zero, where the formula is
        undefined.
    """
    x = np.array(x, dtype=np.float64)
    n = len(x)
    if n == 0:
        return 0.0
    x_sorted = np.sort(x)
    total = np.sum(x_sorted)
    if total == 0:
        return 0.0
    ranks = np.arange(1, n + 1)
    return (2 * np.sum(ranks * x_sorted)) / (n * total) - (n + 1) / n


# Example call (evaluates to roughly 0.74 for this input).
calculate_gini_coefficient([1, 3, 2, 100, 2])

In [None]:
def get_diff_between_decades(w, model_prev, model_current, debug=False):
    """Measure how much the neighbourhood of ``w`` differs between two models.

    The ``most_similar`` neighbours of ``w`` from both models are merged. For
    each neighbour known to both models, the cosine similarity to ``w`` is
    computed in each model, and the absolute differences are averaged.

    Args:
        w: Word to inspect.
        model_prev: Model of the earlier decade.
        model_current: Model of the later decade.
        debug: If True, print the per-neighbour similarities and differences.

    Returns:
        The mean absolute similarity difference, or ``None`` when ``w`` is
        missing from either model or no neighbour exists in both.
    """
    try:
        v_prev = model_prev.wv[w]
        v_current = model_current.wv[w]
        neighbours_prev = model_prev.wv.most_similar(w)
        neighbours_current = model_current.wv.most_similar(w)
    except KeyError:
        # The word is unknown to at least one model.
        return None

    word_set_both = {w_rel for w_rel, _ in neighbours_prev}
    word_set_both.update(w_rel for w_rel, _ in neighbours_current)

    diffs = []
    for w_rel in word_set_both:
        try:
            v_rel_prev = model_prev.wv[w_rel]
            v_rel_current = model_current.wv[w_rel]
        except KeyError:
            # The neighbour exists in only one model; it cannot be compared.
            continue
        cos_sim_prev = calculate_cosine_similarity(v_prev, v_rel_prev)
        cos_sim_current = calculate_cosine_similarity(v_current, v_rel_current)
        diff_prev_current = abs(cos_sim_prev - cos_sim_current)
        diffs.append(diff_prev_current)
        if debug:
            print(
                "w_rel:",
                w_rel,
                "cos_sim_prev:",
                cos_sim_prev,
                "cos_sim_current:",
                cos_sim_current,
                "diff_prev_current:",
                diff_prev_current,
            )

    if not diffs:
        return None
    return sum(diffs) / len(diffs)


# Debug run for a single word between two decades.
# NOTE(review): `model_dict` is not defined anywhere in the visible notebook — confirm.
get_diff_between_decades("gesetze", model_dict[178], model_dict[179], True)

In [None]:
def create_relative_history(word_set, model_dict, print_progress=False):
    """Compute per-word neighbourhood differences between consecutive models.

    ``model_dict`` is walked in its iteration order, and each model is compared
    with the one before it via ``get_diff_between_decades``.
    NOTE(review): this presumes the dict is ordered chronologically — confirm.

    Args:
        word_set: Words to process.
        model_dict: Mapping ``decade -> model``.
        print_progress: If True, print the totals and a progress line roughly
            every percent of the words.

    Returns:
        Dict ``word -> {"diff_decade_avg", "diff_decade_gini", "diff_decade_dict"}``.
        ``diff_decade_dict`` is keyed by ``str(decade) + "0s"``. The average and
        the Gini coefficient only use decades in ``152..191`` (inclusive), and
        words without any value in that range are omitted.
    """
    len_total = len(word_set)
    # At least 1: for fewer than 50 words the rounded segment length would be 0
    # and the modulo below would raise ZeroDivisionError.
    len_segment = max(1, round(len_total / 100))
    if print_progress:
        print("len_total:", len_total)
        print("len_segment:", len_segment)

    word_diff_history = {}
    for i, w in enumerate(word_set):
        if print_progress and i % len_segment == 0:
            print("i:", i)
        model_prev = None
        diff_between_decade_dict = {}
        diff_between_decade_total = []
        for decade, model_current in model_dict.items():
            if model_prev is not None:
                diff_between_decade = get_diff_between_decades(w, model_prev, model_current)
                # None means "not comparable"; 0.0 is a legitimate measurement.
                if diff_between_decade is not None:
                    diff_between_decade_dict[str(decade) + "0s"] = diff_between_decade
                    if 152 <= decade <= 191:
                        diff_between_decade_total.append(diff_between_decade)
            model_prev = model_current

        if diff_between_decade_total:
            diff_between_decade_avg = sum(diff_between_decade_total) / len(diff_between_decade_total)
            diff_between_decade_gini = calculate_gini_coefficient(diff_between_decade_total)
            word_diff_history[w] = {
                "diff_decade_avg": diff_between_decade_avg,
                "diff_decade_gini": diff_between_decade_gini,
                "diff_decade_dict": diff_between_decade_dict,
            }
    return word_diff_history


# word_diff_history = create_relative_history(word_set, model_dict, True)
# word_diff_history

In [None]:
# with open("/veld/output/word_diff_history.pkl", "wb") as f:
#    pickle.dump(word_diff_history, f)

# Reload the previously stored history instead of recomputing it.
# NOTE(review): unpickling can execute arbitrary code — only load files this notebook wrote itself.
with open("/veld/output/word_diff_history.pkl", "rb") as f:
    word_diff_history = pickle.load(f)

In [None]:
# Display the loaded history as the cell output.
word_diff_history

In [None]:
def sort_word_diff_history(word_diff_history, sort_key):
    """Return a copy of the history ordered by ``sort_key``, largest value first.

    Args:
        word_diff_history: Dict ``word -> stats dict``.
        sort_key: Key of the stats dict to order by.

    Returns:
        A new dict in descending order of ``stats[sort_key]``; entries with
        equal values keep their original relative order.
    """

    def descending(item):
        _, stats = item
        return -stats[sort_key]

    ordered = sorted(word_diff_history.items(), key=descending)
    return dict(ordered)


# Two orderings of the same history: by average difference and by Gini coefficient (both descending).
word_diff_history_sorted_avg = sort_word_diff_history(word_diff_history, "diff_decade_avg")
word_diff_history_sorted_gini = sort_word_diff_history(word_diff_history, "diff_decade_gini")

In [None]:
# Preview the first three entries of the unsorted history ...
for i, w_diff_history in enumerate(word_diff_history.items()):
    if i == 3:
        break
    print(w_diff_history)

print("------------")

# ... of the ordering by average difference ...
for i, w_diff_history in enumerate(word_diff_history_sorted_avg.items()):
    if i == 3:
        break
    print(w_diff_history)

print("------------")

# ... and of the ordering by Gini coefficient.
for i, w_diff_history in enumerate(word_diff_history_sorted_gini.items()):
    if i == 3:
        break
    print(w_diff_history)

In [None]:
def get_sample(
    word_diff_history,
    from_top,
    min_num_decades,
    max_sample,
    index_range=None,
    word_set=None,
):
    """Select entries from an (ordered) word history.

    Args:
        word_diff_history: Dict ``word -> stats`` whose order defines the ranking.
        from_top: If truthy, walk the entries front to back, otherwise back to
            front. Positions used by ``index_range`` follow the walking direction.
        min_num_decades: If truthy, require at least this many entries in the
            word's ``"diff_decade_dict"``.
        max_sample: Maximum number of entries to return.
        index_range: Optional ``[start, stop]`` pair (list or tuple) limiting
            the positions considered. A falsy bound means "from the first" or
            "to the last" position. The argument is never modified.
        word_set: Optional collection; if truthy, only these words are eligible.

    Returns:
        Dict with the selected entries in walking order.
    """
    if index_range:
        # Resolve open bounds into locals instead of writing into the caller's list.
        range_start = index_range[0] or 0
        range_stop = index_range[1] or len(word_diff_history)

    items = list(word_diff_history.items())
    if not from_top:
        items.reverse()

    word_diff_history_sample = {}
    for i, (w, w_diff) in enumerate(items):
        if len(word_diff_history_sample) == max_sample:
            break
        if word_set and w not in word_set:
            continue
        if index_range and not range_start <= i < range_stop:
            continue
        if min_num_decades and len(w_diff["diff_decade_dict"]) < min_num_decades:
            continue
        word_diff_history_sample[w] = w_diff
    return word_diff_history_sample

In [None]:
# First three entries of the average-diff ordering that have at least 30 recorded decades.
get_sample(
    word_diff_history_sorted_avg,
    from_top=True,
    min_num_decades=30,
    max_sample=3,
    index_range=[None, None],
    word_set=None,
)

# plotting

In [None]:
def plot_history(word_diff_history):
    """Plot each word's per-decade differences as a line chart.

    Args:
        word_diff_history: Dict ``word -> stats``; the ``"diff_decade_dict"``
            of each word supplies the values, keyed by decade label.

    The x axis is fixed to the labels ``"1470s"`` to ``"1950s"``. Decades
    without a value are bridged (``connectgaps=True``). The figure is shown,
    nothing is returned.
    """
    decades = [str(year) + "s" for year in range(1470, 1960, 10)]
    data = {w: diff_history["diff_decade_dict"] for w, diff_history in word_diff_history.items()}

    # One row per decade label, one column per word, plus the labels as a "Decade" column.
    df = pd.DataFrame(data).reindex(decades).reset_index().rename(columns={"index": "Decade"})

    fig = go.Figure()
    for category in data:
        trace = go.Scatter(
            x=df["Decade"],
            y=df[category],
            mode="lines+markers",
            name=category,
            connectgaps=True,  # Ensures lines are drawn across missing data
        )
        fig.add_trace(trace)
    fig.update_layout(
        title="Data points by decade",
        xaxis_title="Decades",
        yaxis_title="Value",
        xaxis=dict(categoryorder="array", categoryarray=decades),
    )
    fig.show()

In [None]:
def show_plot_tsne(vector_dict, title=None, word_list=None):
    """Show a 2-D t-SNE scatter plot of selected word vectors.

    Args:
        vector_dict: Mapping ``word -> vector``.
        title: Optional plot title.
        word_list: Words to plot. Defaults to the module-level ``WORD_LIST``.
            NOTE(review): ``WORD_LIST`` is not defined in the visible notebook —
            pass ``word_list`` explicitly or confirm where it comes from.

    Raises:
        KeyError: If a requested word is missing from ``vector_dict``.
    """
    # Local imports: neither name is imported before this cell in the notebook.
    import matplotlib.pyplot as plt
    from sklearn.manifold import TSNE

    if word_list is None:
        word_list = WORD_LIST
    labels = list(word_list)
    values = np.array([vector_dict[w] for w in labels])

    tsne = TSNE(n_components=2, perplexity=5, random_state=42)
    reduced_vectors_tsne = tsne.fit_transform(values)

    plt.figure(figsize=(8, 6))
    plt.scatter(reduced_vectors_tsne[:, 0], reduced_vectors_tsne[:, 1], c="blue", alpha=0.7)

    # Label every point with its word.
    for i, label in enumerate(labels):
        plt.text(
            reduced_vectors_tsne[i, 0],
            reduced_vectors_tsne[i, 1],
            label,
            fontsize=9,
            ha="right",
            color="black",
        )
    if title:
        plt.title(title)
    plt.show()

In [None]:
# Plot up to three words found from position 9000 onwards in the average-diff ordering
# (at least 30 recorded decades each).
plot_history(
    get_sample(
        word_diff_history_sorted_avg,
        from_top=True,
        min_num_decades=30,
        max_sample=3,
        index_range=[9000, None],
        word_set=None,
    )
)

In [None]:
# Plot up to three words found from position 10000 onwards in the Gini ordering
# (at least 30 recorded decades each).
plot_history(
    get_sample(
        word_diff_history_sorted_gini,
        from_top=True,
        min_num_decades=30,
        max_sample=3,
        index_range=[10000, None],
        word_set=None,
    )
)

In [None]:
# NOTE(review): stale cell — `plot`, `get_cos_sim_sample` and `word_relative_history_list`
# are not defined in the visible notebook; TODO port to plot_history/get_sample or remove.
plot(
    get_cos_sim_sample(
        word_relative_history_list,
        from_top=True,
        min_num_decades=30,
        max_sample=3,
        index_range=[1200, None],
        word_set=None,
    )
)

In [None]:
# NOTE(review): stale cell — `plot`, `get_cos_sim_sample` and `word_relative_history_list`
# are not defined in the visible notebook; TODO port to plot_history/get_sample or remove.
plot(
    get_cos_sim_sample(
        word_relative_history_list,
        from_top=True,
        min_num_decades=30,
        num_sample=3,
        index_range=[2000, None],
        word_set=None,
    )
)

In [None]:
# NOTE(review): stale cell — `plot`, `get_cos_sim_sample` and `word_relative_history_list`
# are not defined in the visible notebook; TODO port to plot_history/get_sample or remove.
plot(
    get_cos_sim_sample(
        word_relative_history_list,
        from_top=True,
        min_num_decades=30,
        num_sample=3,
        index_range=[10000, None],
        word_set=None,
    )
)

In [None]:
# NOTE(review): stale cell — `plot`, `get_cos_sim_sample` and `word_relative_history_list`
# are not defined in the visible notebook; TODO port to plot_history/get_sample or remove.
plot(
    get_cos_sim_sample(
        word_relative_history_list,
        from_top=False,
        min_num_decades=30,
        num_sample=3,
        index_range=[None, None],
        word_set=None,
    )
)

In [None]:
# NOTE(review): stale cell — `plot`, `get_cos_sim_sample` and `word_relative_history_list`
# are not defined in the visible notebook; TODO port to plot_history/get_sample or remove.
plot(
    get_cos_sim_sample(
        word_relative_history_list,
        from_top=False,
        min_num_decades=10,
        num_sample=3,
        index_range=[40000, None],
        word_set=None,
    )
)

In [None]:
# Print the history entries of two specific words.
# NOTE(review): stale cell — `word_relative_history_list` is not defined in the visible notebook.
for w in word_relative_history_list:
    if w[0] in ["dar", "sollst"]:
        print(w)

In [None]:
# NOTE(review): stale cell — `plot`, `get_cos_sim_sample` and `word_relative_history_list`
# are not defined in the visible notebook; TODO port to plot_history/get_sample or remove.
plot(
    get_cos_sim_sample(
        word_relative_history_list,
        from_top=True,
        min_num_decades=30,
        num_sample=3,
        index_range=[None, None],
        word_set=["gesetz", "himmel"],
    )
)

In [None]:
# NOTE(review): stale cell — `plot`, `get_cos_sim_sample` and `word_relative_history_list`
# are not defined in the visible notebook; TODO port to plot_history/get_sample or remove.
plot(
    get_cos_sim_sample(
        word_relative_history_list,
        from_top=True,
        min_num_decades=None,
        num_sample=3,
        index_range=[None, None],
        word_set=["demokratie"],
    )
)

In [None]:
# NOTE(review): stale cell — `plot`, `get_cos_sim_sample` and `word_relative_history_list`
# are not defined in the visible notebook; TODO port to plot_history/get_sample or remove.
plot(
    get_cos_sim_sample(
        word_relative_history_list,
        from_top=True,
        min_num_decades=None,
        num_sample=3,
        index_range=[None, None],
        word_set=["frau", "mann"],
    )
)

In [None]:
# NOTE(review): stale cell — `plot`, `get_cos_sim_sample` and `word_relative_history_list`
# are not defined in the visible notebook; TODO port to plot_history/get_sample or remove.
plot(
    get_cos_sim_sample(
        word_relative_history_list,
        from_top=True,
        min_num_decades=None,
        num_sample=3,
        index_range=[None, None],
        word_set=["mutter", "vater"],
    )
)

In [None]:
# NOTE(review): stale cell — `plot`, `get_cos_sim_sample` and `word_relative_history_list`
# are not defined in the visible notebook; TODO port to plot_history/get_sample or remove.
plot(
    get_cos_sim_sample(
        word_relative_history_list,
        from_top=True,
        min_num_decades=None,
        num_sample=3,
        index_range=[None, None],
        word_set=["könig", "kaiser"],
    )
)

In [None]:
# NOTE(review): stale cell — `plot`, `get_cos_sim_sample` and `word_relative_history_list`
# are not defined in the visible notebook; TODO port to plot_history/get_sample or remove.
plot(
    get_cos_sim_sample(
        word_relative_history_list,
        from_top=True,
        min_num_decades=None,
        num_sample=4,
        index_range=[None, None],
        word_set=["wasser", "erde", "brot", "haus"],
    )
)

In [None]:
# NOTE(review): stale cell — `plot`, `get_cos_sim_sample` and `word_relative_history_list`
# are not defined in the visible notebook; TODO port to plot_history/get_sample or remove.
plot(
    get_cos_sim_sample(
        word_relative_history_list,
        from_top=True,
        min_num_decades=None,
        num_sample=10,
        index_range=[None, None],
        word_set=["haus", "kaiser"],
    )
)

In [None]:
# NOTE(review): stale cell — `plot`, `get_cos_sim_sample` and `word_relative_history_list`
# are not defined in the visible notebook; TODO port to plot_history/get_sample or remove.
plot(
    get_cos_sim_sample(
        word_relative_history_list,
        from_top=True,
        min_num_decades=None,
        num_sample=10,
        index_range=[None, None],
        word_set=["mensch", "gott", "welt"],
    )
)

In [None]:
# NOTE(review): stale cell — `plot`, `get_cos_sim_sample` and `word_relative_history_list_gini`
# are not defined in the visible notebook; TODO port to plot_history/get_sample or remove.
plot(
    get_cos_sim_sample(
        word_relative_history_list_gini,
        from_top=True,
        min_num_decades=20,
        num_sample=5,
        index_range=[None, None],
        word_set=None,
    )
)

# various snippets

In [None]:
# Standalone hnswlib demo with random vectors: build a cosine index, query it, map ids back to labels.
# NOTE: this cell rebinds module-level names such as `index`, `dim`, `vectors` and `distances`.
import hnswlib
import numpy as np

# Your data
texts = ["apple", "banana", "orange"]
vectors = np.random.rand(3, 128).astype("float32")
vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)

# Map text labels to integer IDs
text_to_id = {text: i for i, text in enumerate(texts)}
id_to_text = {i: text for text, i in text_to_id.items()}

# Initialize the index
dim = 128
index = hnswlib.Index(space="cosine", dim=dim)
index.init_index(max_elements=10, ef_construction=100, M=16)
index.add_items(vectors, ids=list(text_to_id.values()))

# Query
query_vector = vectors[0].reshape(1, -1)
labels, distances = index.knn_query(query_vector, k=2)

# Convert back to text labels
results = [(id_to_text[label], dist) for label, dist in zip(labels[0], distances[0])]
print(results)

In [None]:
# TODO: probably not useful
def create_word_cos_sim_history_list(word_set, model_dict):
    """Track each word's cosine similarity to its own vector in the previous model.

    ``model_dict`` is walked in its iteration order. Models that do not contain
    the word are skipped, so the comparison is always with the most recent
    model that had the word.

    Args:
        word_set: Words to process.
        model_dict: Mapping ``decade -> model``.

    Returns:
        List of ``(word, total_diff, cos_sim_history_dict)`` tuples sorted by
        ``total_diff`` descending. ``total_diff`` accumulates ``1 - cos_sim``,
        and the history dict is keyed by ``str(decade) + "0s"``. Words without
        any comparison are omitted.
    """
    word_cos_sim_history_list = []
    for w in word_set:
        cos_sim_history_dict = {}
        w_vec_prev = None
        total_diff = 0
        for decade, model_current in model_dict.items():
            try:
                w_vec_current = model_current.wv[w]
            except KeyError:
                # Word absent from this model: keep the previous vector for the next comparison.
                continue
            if w_vec_prev is not None:
                cos_sim = calculate_cosine_similarity(w_vec_prev, w_vec_current)
                total_diff += 2 - (cos_sim + 1)
                cos_sim_history_dict[str(decade) + "0s"] = cos_sim
            w_vec_prev = w_vec_current
        if cos_sim_history_dict:
            word_cos_sim_history_list.append((w, total_diff, cos_sim_history_dict))
    word_cos_sim_history_list.sort(key=lambda x: -x[1])
    return word_cos_sim_history_list


# word_cos_sim_history_list = create_word_cos_sim_history_list(word_set, model_dict)
# print(len(word_cos_sim_history_list))

In [None]:
# Standalone plotly demo: sparse per-decade sample data drawn as lines that bridge missing decades.
import pandas as pd
import plotly.graph_objects as go

# Define the full range of decades
decades = [f"{1900+10*i}s" for i in range(10)]  # 1900s to 1990s

# Sample data
data = {
    "a": {"1920s": 0.6, "1930s": 0.5, "1980s": 0.1},
    "b": {"1930s": 0.2, "1980s": 0.4},
}

# Convert to DataFrame with explicit ordering
df = pd.DataFrame(data).reindex(decades).reset_index()
df = df.rename(columns={"index": "Decade"})

# Create the plot
fig = go.Figure()

for category in data.keys():
    fig.add_trace(
        go.Scatter(
            x=df["Decade"],
            y=df[category],
            mode="lines+markers",
            name=category,
            connectgaps=True,  # Ensures lines are drawn across missing data
        )
    )

# Format the layout
fig.update_layout(
    title="Data points by decade",
    xaxis_title="Decades",
    yaxis_title="Value",
    xaxis=dict(categoryorder="array", categoryarray=decades),
)

fig.show()

In [None]:
# Standalone matplotlib demo: missing decades are passed to the plot as None.
import matplotlib.pyplot as plt

# Sample data
data = {
    "a": {"1920s": 1, "1930s": 0.5, "1980s": 0.1},
    "b": {"1930s": 0.2, "1940s": 0, "1950s": 0.7, "1980s": 0.4},
}

# Define all decades (ensuring consistent x-axis)
decades = [f"{1900 + 10*i}s" for i in range(10)]

# Plot each ID
plt.figure(figsize=(10, 5))
for identifier, values in data.items():
    y_values = [values.get(decade, None) for decade in decades]  # Use None for missing data
    plt.plot(decades, y_values, marker="o", label=identifier)

# Customize plot
plt.xlabel("Decades")
plt.ylabel("Value")
plt.title("Data points by decade")
plt.legend()
plt.grid(True)

plt.show()

In [None]:
# Standalone matplotlib demo: missing decades are passed to the plot as NaN.
import matplotlib.pyplot as plt
import numpy as np

# Sample data
data = {
    "a": {"1920s": 1, "1930s": 0.5, "1980s": 0.1},
    "b": {"1930s": 0.2, "1950s": 0.8, "1980s": 0.4},
}

# Define all decades (ensuring consistent x-axis)
decades = [f"{1900 + 10*i}s" for i in range(10)]

# Plot each ID
plt.figure(figsize=(10, 5))
for identifier, values in data.items():
    y_values = [values.get(decade, np.nan) for decade in decades]  # Use np.nan to avoid connecting missing points
    plt.plot(decades, y_values, marker="o", label=identifier)

# Customize plot
plt.xlabel("Decades")
plt.ylabel("Value")
plt.title("Data points by decade")
plt.legend()
plt.grid(True)

plt.show()

In [None]:
# Standalone matplotlib demo: missing decades are filled by linear interpolation before plotting.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Sample data
data = {
    "a": {"1920s": 1, "1930s": 0.5, "1980s": 0.1},
    "b": {"1930s": 0.2, "1950s": 0.8, "1980s": 0.4},
}

# Define all decades (ensuring consistent x-axis)
decades = [f"{1900 + 10*i}s" for i in range(10)]

# Convert to a DataFrame for interpolation
df = pd.DataFrame({key: {d: data[key].get(d, np.nan) for d in decades} for key in data})

# Interpolate missing values (linear interpolation)
df.interpolate(method="linear", inplace=True)

# Plot each ID
plt.figure(figsize=(10, 5))
for identifier in df.columns:
    plt.plot(decades, df[identifier], marker="o", label=identifier)

# Customize plot
plt.xlabel("Decades")
plt.ylabel("Value")
plt.title("Data points by decade (with interpolation)")
plt.legend()
plt.grid(True)

plt.show()

In [None]:
!pip install plotly

In [None]:
# Standalone plotly express demo: long-format sample data, one line per category.
import pandas as pd
import plotly.express as px

# Sample data
data = {
    "Decade": ["1920s", "1930s", "1980s", "1930s", "1950s", "1980s"],
    "Value": [1, 0.5, 0.1, 0.2, 0.8, None],
    "Category": ["a", "a", "a", "b", "b", "b"],
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Plot using Plotly Express
fig = px.line(
    df,
    x="Decade",
    y="Value",
    color="Category",
    markers=True,
    title="Data points by decade",
)

fig.show()

In [None]:
# Sanity check: similarity of a vector with itself.
# NOTE(review): `model_dict` is not defined anywhere in the visible notebook — confirm.
calculate_cosine_similarity(model_dict[180].wv["mann"], model_dict[180].wv["mann"])

In [None]:
# Ten nearest neighbours of "mann" in model 180.
# NOTE(review): `model_dict` is not defined anywhere in the visible notebook — confirm.
model_dict[180].wv.most_similar("mann", topn=10)

In [None]:
# Ten nearest neighbours of "mann" in model 181.
# NOTE(review): `model_dict` is not defined anywhere in the visible notebook — confirm.
model_dict[181].wv.most_similar("mann", topn=10)

In [None]:
# Pairwise cosine similarities among the first 100 vocabulary words of one model.
# NOTE(review): `model` is not defined at module level in the visible notebook — confirm.
# Extract vocabulary and corresponding vectors
words = list(model.wv.index_to_key)[:100]  # List of words in vocabulary
vectors = np.array([model.wv[word] for word in words])  # Word vectors

# Compute cosine similarity matrix
similarity_matrix = cosine_similarity(vectors)

# Convert to a dictionary of word pairs (optional)
similarity_dict = {(words[i], words[j]): similarity_matrix[i, j] for i in range(len(words)) for j in range(len(words)) if i != j}

# Example: Print top 10 most similar word pairs
sorted_pairs = sorted(similarity_dict.items(), key=lambda x: x[1], reverse=True)
for pair, sim in sorted_pairs[:10]:
    print(f"{pair}: {sim}")

In [None]:
# Recompute the pairwise similarity matrix from whatever `vectors` currently holds.
similarity_matrix = cosine_similarity(vectors)

In [None]:
# Display the matrix as the cell output.
similarity_matrix

In [None]:
# Convert to a dictionary of word pairs (optional)
similarity_dict = {(words[i], words[j]): similarity_matrix[i, j] for i in range(len(words)) for j in range(len(words)) if i != j}

# Print all word pairs, most similar first
sorted_pairs = sorted(similarity_dict.items(), key=lambda x: x[1], reverse=True)
for pair, sim in sorted_pairs:
    print(f"{pair}: {sim}")