SEMMAP - Erstellung zweier Keyword Visualisierungen für TRUMP und MUSK

In [None]:
cut -f2 ~/trump_copy.vrt | sort | uniq > keywords.tsv

In [None]:
cut -f2 ~/trump_copy.vrt | sort | uniq -c | awk '$1 > 50 {print $2}' > keywords_frequent.tsv

In [None]:
cut -f2 ~/trump_copy.vrt \
  | grep -E '^[a-zA-ZäöüÄÖÜß]+$' \
  | sort \
  | uniq \
  > keywords_clean.tsv

In [None]:
cut -f2 ~/trump_copy.vrt \
  | grep -E '^[a-zA-ZäöüÄÖÜß]+$' \
  | sort \
  | uniq -c \
  | awk '$1 > 50 {print $2}' \
  > keywords_frequent_clean.tsv

In [None]:
conda create -n semmap37 python=3.8
conda activate semmap37
pip install pymagnitude pandas scikit-learn sentence-transformers numpy matplotlib seaborn annoy
git clone https://github.com/ausgerechnet/semmap.git
cd semmap
cd bin
# jetzt Embeddings erstellen:
./create-embeddings ~/keywords_frequent_clean.tsv

In [None]:
## create-embeddings.py ##
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import argparse
import os

from semmap.embeddings import create_embeddings
from semmap.embeddings_store import create_embeddings_store


if __name__ == '__main__':

    parser = argparse.ArgumentParser(
        description="create embeddings store (three files)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument('path_in', help="text file with one item per line, e.g. from 'cwb-lexdecode -P P_ATT CWB_ID'")
    parser.add_argument('--model_name', default='sentence-transformers/paraphrase-multilingual-mpnet-base-v2', help="")
    parser.add_argument('--path_settings', help="path to semmap settings file")
    parser.add_argument('--path_db', help="path to database of items")
    parser.add_argument('--path_annoy', help="path to annoy index")
    parser.add_argument('--n_trees', default=100, help="", type=int)
    parser.add_argument('--metric', default='angular', help="")
    parser.add_argument('--random_seed', default=42, help="", type=int)
    parser.add_argument('--as_text', action="store_true", default=False, help="store only as gzipped text file?")
    parser.add_argument('--path_text_out', default=None, help="path to text file")
    args = parser.parse_args()

    if args.as_text:
        path_out = f'{args.path_in}.txt.gz' if args.path_text_out is None else args.path_text_out
        with open(args.path_in, "rt") as f:
            items = f.read().rstrip().split("\n")
        create_embeddings(items, args.model_name, path_out)

    path_settings = f'{args.path_in}.semmap' if args.path_settings is None else args.path_settings
    path_db = f'{args.path_in}.sqlite' if args.path_db is None else args.path_db
    path_annoy = f'{args.path_in}.annoy' if args.path_annoy is None else args.path_annoy

    if os.path.exists(path_settings) or os.path.exists(path_db) or os.path.exists(path_annoy):
        raise FileExistsError()

    create_embeddings_store(args.path_in, path_settings, path_db, path_annoy,
                            args.n_trees, args.metric, args.model_name, args.random_seed)
# args.model_name ist wohl 'default='sentence-transformers/paraphrase-multilingual-mpnet-base-v2'

In [3]:
from semmap.embeddings import create_embeddings
help(create_embeddings)

Help on function create_embeddings in module semmap.embeddings:

create_embeddings(items, model_name='sentence-transformers/paraphrase-multilingual-mpnet-base-v2', path_out=None, mode='wt', as_is=False)
    create context-free embeddings from list of types



In [None]:
# bash:
pymagnitude:
nano /Users/vivien/opt/anaconda3/envs/semmap311/lib/python3.11/site-packages/pymagnitude/third_party/allennlp/common/params.py
# statt: from collections import MutableMapping, OrderedDict
# durch das ersetzen: from collections.abc import MutableMapping
#from collections import OrderedDict
python -c "from semmap.semspace import SemanticSpace; print('Import klappt jetzt komplett!')"

2D erstellen (Aus Embeddings)

In [4]:
from semmap.semspace import SemanticSpace
import pandas as pd
import os

# Pfad zu deinen SemMap-Dateien
path_base = os.path.expanduser("~/semmap/bin/keywords_frequent_clean.tsv")

# Prüfen, ob die Dateien existieren
for ext in [".semmap", ".sqlite", ".annoy"]:
    if not os.path.exists(path_base + ext):
        raise FileNotFoundError(f"Die Datei {path_base + ext} wurde nicht gefunden!")

# SemanticSpace-Objekt erstellen
space = SemanticSpace(path_base + ".semmap")

# Keywords aus der TSV-Datei laden
with open(path_base, "r") as f:
    keywords = [line.strip() for line in f]

# 2D-Koordinaten berechnen
coords_df = space.generate2d(items=keywords)

# Ergebnis in eine TSV-Datei speichern
output_file = os.path.expanduser("~/semmap/bin/keywords_2d.tsv")
coords_df.to_csv(output_file, sep="\t", index=False)

print(f"✅ Fertig! Die 2D-Koordinaten wurden in {output_file} gespeichert.")
print(coords_df.head())

✅ Fertig! Die 2D-Koordinaten wurden in /Users/vivien/semmap/bin/keywords_2d.tsv gespeichert.
              x          y
item                      
A    -16.102224   6.669508
ABC    0.294166   2.549005
AG    15.427487  -1.069275
ASAP -29.669147  35.072575
Abe   -5.945231  47.352264


In [None]:
## Assoziationsmaße

In [None]:
import math

# === Log-Likelihood Ratio (LLR) berechnen ===
llr_values = {}

# Gesamtzahl der Token im Korpus
N = total_tokens

for w1 in cooc_counts:
    for w2, k11 in cooc_counts[w1].items():
        # Häufigkeiten der einzelnen Wörter
        f1 = token_counts.get(w1, 0)
        f2 = token_counts.get(w2, 0)
        
        # 2x2 Kontingenztafel
        k12 = f1 - k11        # w1 ohne w2
        k21 = f2 - k11        # w2 ohne w1
        k22 = N - (k11 + k12 + k21)
        
        # Erwartungswerte
        E11 = (f1 * f2) / N
        E12 = (f1 * (N - f2)) / N
        E21 = ((N - f1) * f2) / N
        E22 = ((N - f1) * (N - f2)) / N
        
        # Log-Likelihood berechnen
        llr = 0
        for k, E in [(k11, E11), (k12, E12), (k21, E21), (k22, E22)]:
            if k > 0 and E > 0:
                llr += k * math.log(k / E)
        llr_values[(w1, w2)] = 2 * llr

Visualisierung: 2D (x, y-Werte) + Assoziationsmaße als Größe der Repräsentation (z-Werte)

In [None]:
import pandas as pd
import plotly.express as px

# Pfad zu deiner 2D-Datei mit Assoziationsmaßen
df = pd.read_csv("~/semmap/bin/keywords_2d.tsv", sep="\t")

# Falls du ein z-Maß hast, sonst z.B. Frequenz oder zufällige Werte
if 'z' not in df.columns:
    import numpy as np
    df['z'] = np.random.rand(len(df)) * 10  # Dummy-Werte

# Interaktiver 3D-Scatter-Plot
fig = px.scatter_3d(
    df,
    x='x',
    y='y',
    z='z',
    text='keyword',  # Name deiner Keyword-Spalte
    color='z',       # Farben nach Z-Wert
    size='z',        # Größe der Punkte nach Z-Wert
    color_continuous_scale='Viridis',
    size_max=10,
    opacity=0.8
)

fig.update_layout(
    scene=dict(
        xaxis_title='X',
        yaxis_title='Y',
        zaxis_title='Association Measure (Z)'
    ),
    title="Interaktive 3D-Visualisierung der Keywords"
)

fig.show()

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from adjustText import adjust_text  # pip install adjustText

# Daten laden
df = pd.read_csv("~/semmap/bin/keywords_2d.tsv", sep="\t")

# Wenn noch kein z-Wert: z.B. Assoziationsmaß oder Dummy
if 'z' not in df.columns:
    import numpy as np
    df['z'] = np.random.rand(len(df)) * 10

# Figure und 3D Achse
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')

# Punkte plotten
sc = ax.scatter(df['x'], df['y'], df['z'], c=df['z'], cmap='viridis', s=50)

# Labels vorbereiten
texts = [ax.text(df['x'][i], df['y'][i], df['z'][i], df['keyword'][i], fontsize=9)
         for i in range(len(df))]

# Overlap bereinigen
adjust_text(texts, arrowprops=dict(arrowstyle='->', color='red'))

# Achsenbeschriftungen
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z (Association measure)')

plt.title("3D-Keyword-Visualisierung mit Overlap-Anpassung")
plt.colorbar(sc, label='Z-Wert')
plt.show()


ModuleNotFoundError: No module named 'adjustText'

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from adjustText import adjust_text  # pip install adjustText
import numpy as np

# ------------------------------
# 1️⃣ Daten laden
# ------------------------------
df = pd.read_csv("~/semmap/bin/keywords_2d.tsv", sep="\t")

# Überprüfen, ob z-Wert vorhanden ist
if 'z' not in df.columns:
    print("Keine Z-Werte gefunden. Es wird ein Dummy-Wert verwendet.")
    df['z'] = np.random.rand(len(df)) * 10

# Optional: Top-N Keywords nach z-Wert auswählen (z.B. die 50 wichtigsten)
top_n = 50
df_top = df.nlargest(top_n, 'z').reset_index(drop=True)

# ------------------------------
# 2️⃣ 3D-Plot erstellen
# ------------------------------
fig = plt.figure(figsize=(14, 10))
ax = fig.add_subplot(111, projection='3d')

# Punkte plotten, farblich nach z-Wert
sc = ax.scatter(df_top['x'], df_top['y'], df_top['z'], c=df_top['z'], cmap='viridis', s=60)

# Labels vorbereiten
texts = [ax.text(df_top['x'][i], df_top['y'][i], df_top['z'][i], df_top['keyword'][i], fontsize=9)
         for i in range(len(df_top))]

# Overlap bereinigen
adjust_text(texts, arrowprops=dict(arrowstyle='->', color='red', lw=0.5))

# Achsenbeschriftungen
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z (Association measure)')

# Titel + Farbskala
plt.title(f"Top-{top_n} Keywords im 3D-SemMap-Space", fontsize=16)
plt.colorbar(sc, label='Z-Wert')

plt.show()


ChatGPT komplett:

In [None]:
cut -f2 ~/trump_copy.vrt \
  | grep -E '^[a-zA-ZäöüÄÖÜß]+$' \
  | sort \
  | uniq -c \
  | awk '{print $2 "\t" $1}' \
  > keywords_frequent_with_counts.tsv

In [2]:
import pandas as pd
from collections import Counter, defaultdict
from itertools import combinations
import re
import os
from semmap.semspace import SemanticSpace
import math

# === Parameter ===
corpus_file = "trump_copy.vrt"
keywords_file = "~/semmap/keywords_frequent_with_counts.tsv"
window_size = 5  # Wörter links/rechts für Co-Occurrence

# === Keywords laden ===
keywords_df = pd.read_csv(os.path.expanduser(keywords_file), sep="\t", header=None, names=["lemma", "freq"])

# Nur gültige Strings als Keywords behalten
keywords_df["lemma"] = keywords_df["lemma"].astype(str)
keywords_df = keywords_df[keywords_df["lemma"].str.strip() != ""]
keywords_set = set(keywords_df["lemma"])

# === Korpus lesen und Tokenisieren ===
corpus_tokens = []
with open(corpus_file, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split("\t")
        if len(parts) >= 2 and re.match(r'^[a-zA-ZäöüÄÖÜß]+$', parts[1]):
            corpus_tokens.append(parts[1])

# === Co-Occurrence zählen ===
cooc_counts = defaultdict(Counter)
for i, token in enumerate(corpus_tokens):
    if token in keywords_set:
        window_start = max(i - window_size, 0)
        window_end = min(i + window_size + 1, len(corpus_tokens))
        window_tokens = corpus_tokens[window_start:i] + corpus_tokens[i+1:window_end]
        for t in window_tokens:
            if t in keywords_set:
                cooc_counts[token][t] += 1

# === Assoziationsmaß berechnen (PMI) ===
token_counts = dict(zip(keywords_df["lemma"], keywords_df["freq"]))
total_tokens = sum(token_counts.values())

pmi_values = {}
for t1 in cooc_counts:
    for t2, c in cooc_counts[t1].items():
        p_xy = c / total_tokens
        p_x = token_counts[t1] / total_tokens
        p_y = token_counts[t2] / total_tokens
        pmi_values[(t1, t2)] = math.log2(p_xy / (p_x * p_y)) if p_xy > 0 else 0

# === 2D-Koordinaten aus SemMap laden ===
path_semmap = os.path.expanduser("~/semmap/bin/keywords_frequent_clean.tsv.semmap")
space = SemanticSpace(path_semmap)

# Nur gültige Strings an SemMap übergeben
keywords_clean = [k for k in keywords_set if isinstance(k, str) and k.strip() != ""]
coords_df = space.generate2d(items=keywords_clean)
coords_df = coords_df.set_index("item")

# === Z-Wert als Summe der PMI-Werte mit anderen Tokens ===
z_values = {}
for token in keywords_clean:
    z = sum(pmi_values.get((token, t2), 0) for t2 in keywords_clean if t2 != token)
    z_values[token] = z

# === DataFrame für 3D-Plot erstellen ===
plot_df = coords_df.copy()
plot_df["z"] = plot_df.index.map(z_values)

# === Speichern ===
plot_df.to_csv("keywords_3d.tsv", sep="\t")

print("3D-Daten in 'keywords_3d.tsv' gespeichert!")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


KeyError: "None of ['item'] are in the columns"

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from adjustText import adjust_text

# === Daten laden ===
df = pd.read_csv("keywords_3d.tsv", sep="\t", index_col=0)

x = df['x'].values
y = df['y'].values
z = df['z'].values
labels = df.index.tolist()

# === 3D-Plot erstellen ===
fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')

# Punkte plotten
sc = ax.scatter(x, y, z, c=z, cmap='viridis', s=60)
cbar = plt.colorbar(sc)
cbar.set_label('Assoziationswert (z)')

# Textlabels hinzufügen
texts = []
for xi, yi, zi, label in zip(x, y, z, labels):
    texts.append(ax.text(xi, yi, zi, label, fontsize=9))

# adjustText anwenden, damit sich Texte nicht überlappen
adjust_text(texts, only_move={'points':'y', 'texts':'y'}, 
            arrowprops=dict(arrowstyle="-", color='gray', lw=0.5))

# Achsen
ax.set_xlabel('SemMap x')
ax.set_ylabel('SemMap y')
ax.set_zlabel('Assoziationsmaß z')
ax.set_title('3D-Visualisierung der Keywords mit Assoziationswerten')

plt.show()