In [1]:
from typing import List

import numpy as np
import pandas as pd
import scipy.stats as stats

from bokeh.plotting import figure, show

from corpora import corpora as countries
from stemmers import stemmers
from algo import algos
from settings import model_dir

from create import tokenize_values  # , load_source, calc_occurences
import similarity
from similarity import viz_params, viz_fns

# from similarity import render, calc_dist, calc_agg, calc_shift

TypeError: Type Tuple cannot be instantiated; use tuple() instead

In [None]:
# --- Run configuration ---------------------------------------------------------
# Exactly one value per setting is active; the commented-out lines are the
# alternatives kept for quick switching.

# Tokenizer key; passed to tokenize_values() and to the similarity calc_* call.
tkn = "sb"
# tkn = "dummy"

# Embedding algorithm key.
# NOTE: the comparison cell further down loops over both "w2v" and "ft" and
# rebinds this name, so the value set here does not select what gets plotted.
algo = "ft"
# algo = "w2v"

# Corpus selection passed to the similarity calc_* call.
# culture = "I"
culture = "all"
# culture = "Germany"
# culture = "Italy"
# culture = "Portugal"

iteration = 0  # sb values; dummy corpus
# iteration = 1 # sb values and corpus

# Not referenced by any of the cells visible in this notebook section.
epochs = 200

# Active visualisation; used as the key into viz_fns and viz_params.
# (Previously "average" was assigned and then immediately overwritten by "stdev".)
# viz = "average"
# viz = "similarity"
# viz = "shift"
viz = "stdev"


# Deliberately overrides the model_dir imported from settings with a
# machine-specific absolute path; adjust when running on another machine.
model_dir = "/home/mapto/models/20230713"

In [None]:
# Tokenize the value terms using the tokenizer key configured in `tkn`.
# `values` must expose .keys() (the keyword list is built from it in the next cell).
# `valuesbackref` is presumably a reverse lookup from token back to the original
# value - TODO confirm against the `create` module; it is not used in the visible cells.
values, valuesbackref = tokenize_values(tkn)
# Disabled steps: corpus loading and occurrence counting (their helpers are also
# commented out of the `create` import at the top of the notebook).
# fulltexts, tokenized = load_source(stemmers[tkn], countries)
# occurences, occurences_tv, occurences_backref = calc_occurences(values, tokenized)

In [None]:
# Keywords handed to the similarity calc_* call: the keys of `values`, sorted.
# sorted() accepts any iterable and returns a new list, so no intermediate
# list() copy of the keys is needed.
# Alternative source (requires the disabled occurrence counting):
# keywords = sorted(set(occurences_backref.keys()))
keywords = sorted(values.keys())

In [None]:
# Compute the selected metric once per embedding algorithm and flatten it into a
# plain list of values, stored under the algorithm name in `results`.
results = {}

# The calc_* function is looked up by name on the similarity module through
# viz_fns[viz]. It does not depend on the algorithm, so it is resolved once
# instead of on every loop iteration.
fn = getattr(similarity, f"calc_{viz_fns[viz]}")

# NOTE: the loop variable rebinds the notebook-level `algo` setting; after this
# cell `algo` is "ft" regardless of what the configuration cell assigned.
for algo in ["w2v", "ft"]:
    dist = fn(
        viz_params[viz],
        keywords,
        culture,
        tkn,
        model_dir,
        algo=algo,
        iteration=iteration,
    )
    # `dist` is consumed through .items() at two nesting levels; only the
    # innermost values are kept, in iteration order. The keys are discarded.
    results[algo] = [
        value for _, inner in dist.items() for _, value in inner.items()
    ]

In [None]:
# One 8-digit hex colour per embedding algorithm (the trailing "99" is presumably
# the alpha channel, keeping overlapping histograms readable - TODO confirm).
colors = {"w2v": "#0000FF99", "ft": "#00FF0099"}

p = figure(width=670, height=400, toolbar_location=None, title="Embedding Stability")

for name, series in results.items():
    samples = [float(item) for item in series]
    mu = np.average(samples)
    sigma = np.std(samples)
    shade = colors[name]

    # Empirical distribution: 40 evenly spaced edges (i.e. 39 bins) spanning the
    # observed range, normalised to a density so it shares a scale with the
    # fitted curve drawn below.
    bin_edges = np.linspace(min(samples), max(samples), 40)
    density, edges = np.histogram(samples, density=True, bins=bin_edges)
    p.quad(
        left=edges[:-1],
        right=edges[1:],
        bottom=0,
        top=density,
        fill_color=shade,
        line_color=shade,
        legend_label=name,
    )

    # Normal PDF with the sample mean and standard deviation, evaluated on
    # 100 points across mu +/- 3 sigma.
    grid = np.linspace(mu - 3 * sigma, mu + 3 * sigma, 100)
    p.line(grid, stats.norm.pdf(grid, mu, sigma), line_width=2, line_color=shade)

# Anchor the density axis at zero and label both axes before rendering.
p.y_range.start = 0
p.xaxis.axis_label = "x"
p.yaxis.axis_label = "PDF(x)"

show(p)