In [None]:
import json

import altair as alt
import numpy as np
import polars as pl
from sklearn.cluster import AgglomerativeClustering as CLUSTERER
from sklearn.manifold import TSNE as EMBEDDING

# from sklearn.manifold import MDS as EMBEDDING

In [None]:
# https://gist.githubusercontent.com/thomascamminady/c5da0b7acb41faf6abd6c99aff10e144/raw/f2b074e2e3945c0102eceb4983eb998da1432d44/theme.json


def loader():
    """Load the Altair theme definition from ``theme.json``.

    The file is looked up relative to the current working directory.

    Returns:
        The parsed JSON content of ``theme.json``.

    Raises:
        FileNotFoundError: If ``theme.json`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read the file as UTF-8 explicitly instead of relying on the platform's
    # default encoding, which would mis-decode non-ASCII text on some systems.
    with open("theme.json", encoding="utf-8") as f:
        return json.load(f)


# Disable Altair's maximum-row check on the data passed to charts.
alt.data_transformers.disable_max_rows()
# alt.renderers.enable("browser")
# Register the theme returned by `loader` under the name "wahoo_theme" and
# then activate it by that name; registration has to come first.
alt.themes.register("wahoo_theme", loader)
alt.themes.enable("wahoo_theme")

In [None]:
def metric(x, y):
    """Return the distance between ``x`` and ``y``.

    The distance is the default ``np.linalg.norm`` of the element-wise
    difference, i.e. the Euclidean (2-norm) distance for 1-D vectors.
    """
    difference = x - y
    return np.linalg.norm(difference)


# Load the answers. After dropping the two question columns, every remaining
# column holds the answers of one party and every row is one question.
df = pl.read_csv("../data/wahl-o-mat-2024.csv").drop("question", "question_id")

# Transpose so that each party is one sample:
# X[i_party, j_question], shape of X is (number of parties, number of questions).
X = df.to_numpy().T
n_parties = X.shape[0]

# Seed for the embedding so that re-running the notebook reproduces the same
# layout; without it the party positions change on every run.
RANDOM_STATE = 42

# Group the parties into a fixed number of clusters based on their answers.
cluster = CLUSTERER(n_clusters=6)
labels = cluster.fit(X).labels_

# Embed the parties and keep the first two coordinates for plotting.
embedding = EMBEDDING(random_state=RANDOM_STATE)
coords = embedding.fit_transform(X)
x, y = coords[:, 0], coords[:, 1]

# Pairwise distances between the answer vectors of all parties. The rows of X
# already are the per-party vectors, so they are reused instead of extracting
# every column from the DataFrame again inside the loop.
distances = np.zeros((n_parties, n_parties))
for i in range(n_parties):
    for j in range(n_parties):
        distances[i, j] = metric(X[i], X[j])

In [None]:
# One record per ordered pair of distinct parties. The number of parties is
# taken from the data instead of being hard-coded.
party_names = df.columns
d = [
    {
        "party": source_party,
        "target": target_party,
        "weight": distances[i, j],
    }
    for i, source_party in enumerate(party_names)
    for j, target_party in enumerate(party_names)
    if i != j
]

# Turn distances into a per-party similarity in [0, 1]. The steps are kept in
# separate `with_columns` calls because each one needs the result of the
# previous one (expressions inside one call all see the same input frame).
connections = (
    pl.DataFrame(d)
    # Shift so that, for each source party, the smallest distance becomes 0 ...
    .with_columns(
        weight=pl.col("weight") - pl.col("weight").min().over("party")
    )
    # ... and scale so that the largest distance becomes 1.
    .with_columns(
        weight=pl.col("weight") / pl.col("weight").max().over("party")
    )
    # Invert (small distance -> large weight), square, and keep three decimals.
    .with_columns(weight=((1 - pl.col("weight")) ** 2).round(3))
)

parties = pl.DataFrame(
    {
        "party": party_names,
        "x": np.round(x, 3),
        "y": np.round(y, 3),
        "label": labels,
    }
)

# Store the coordinates as integer thousandths. Rounding (instead of flooring
# with `// 1`) avoids off-by-one results when the scaled float lands just
# below the intended integer, e.g. 1.001 * 1000 == 1000.9999999999999.
parties = parties.with_columns(
    (pl.col("x") * 1000).round().cast(pl.Int64),
    (pl.col("y") * 1000).round().cast(pl.Int64),
)

# Store the weight as an integer percentage, truncated as before. Going
# through exact integer thousandths first avoids float artefacts such as
# 0.29 * 100 == 28.999999999999996, which would otherwise floor to 28.
connections = connections.with_columns(
    (pl.col("weight") * 1000).round().cast(pl.Int64) // 10
)

In [None]:
# Lookup table keyed by party name; the connection chart uses it to fetch
# the "x", "y" and "label" fields of a party.

lookup_data = alt.LookupData(
    data=parties, key="party", fields=["x", "y", "label"]
)
# Create pointerover selection on the "party" field (nearest point is picked).
select_party = alt.selection_point(
    on="pointerover", nearest=True, fields=["party"], empty=True
)

# Encoding channels shared by every mark placed at a party's position.
x_position = alt.X("x:Q").scale(zero=False).axis(None)
y_position = alt.Y("y:Q").scale(zero=False).axis(None)
cluster_color = alt.Color("label:N").title("Cluster ID")

base = (
    alt.Chart(parties)
    .encode(x=x_position, y=y_position, color=cluster_color)
    .properties(width=1000, height=1000)
)

# One filled dot per party, colored by cluster.
party_dots = base.mark_point(filled=True, size=100)

# The party name next to its dot, always drawn in black.
party_captions = base.mark_text(
    align="left", dx=3, dy=-3, baseline="bottom", fontSize=16
).encode(text=alt.Text("party"), color=alt.value("black"))

# Dots and captions layered together; the hover selection is attached here.
c1 = alt.layer(party_dots, party_captions).add_params(select_party)


# Lines between parties, built from the `connections` table and filtered by
# the `select_party` selection.
c2 = (
    alt.Chart(connections)
    # Keep only connections whose weight exceeds 20.
    .transform_filter(alt.datum.weight > 20)
    .mark_rule()
    .encode(
        x=alt.X("x:Q").axis(None),
        y=alt.Y("y:Q").axis(None),
        x2=alt.X2("x2:Q"),
        y2=alt.Y2("y2:Q"),
        # Line color follows the label looked up for the target party.
        color="label2:N",
        strokeWidth=alt.StrokeWidth("weight:Q")
        .scale(domain=[20, 100])
        .legend(title="Similarity measure"),
        opacity=alt.Opacity("weight:Q").scale(range=[0, 0.7]).legend(None),
    )
    # Line start: fields of the source party, looked up by "party".
    .transform_lookup(lookup="party", from_=lookup_data)
    # Line end: fields of the target party, stored as x2, y2 and label2.
    .transform_lookup(
        lookup="target", from_=lookup_data, as_=["x2", "y2", "label2"]
    )
    .transform_filter(select_party)
)

# Final chart: connection lines (c2) layered underneath the party points and
# names (c1), with grid lines and the view border switched off.
c = (
    (c2 + c1)
    .configure_axis(grid=False)
    .configure_view(stroke=None)
    # .properties(
    #     title="Similarity between German parties for the EU elections 2024",
    #     # subtitle="Based on their Wahl-O-Mat answers."
    # )
    # Title with a multi-line subtitle; each list entry is one line of text.
    .properties(
        title={
            "text": [
                "Similarity between German parties for the EU elections 2024"
            ],
            "subtitle": [
                "Based on their Wahl-O-Mat answers, crawled from www.wahl-o-mat.de (file: module_definition.js).",
                "Embedding via t-SNE, clustering via agglomerative clustering, both using scikit-learn.",
                "Source code and data: https://github.com/thomascamminady/wahl_o_mat_2024.",
                "Author: Thomas Camminady",
            ],
            "anchor": "start",
            "subtitlePadding": 10,
        }
    )
)
# Write the chart to "Chart.json" and to "Chart.png" (with scale_factor=2),
# then display it.
c.save("Chart.json")
c.save("Chart.png", scale_factor=2)
c.show()

In [None]:
# Print the URL that Altair generates for the finished chart.
print(c.to_url())