## import Wahlomat dataset
A public, scientific analysis based on the official Wahl-O-Mat dataset for the 2025 Bundestag election:
[Wahl-O-Mat 2025 data source](https://www.bpb.de/themen/wahl-o-mat/bundestagswahl-2025/558463/download/)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns # corr heatmap
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA # PCA chart
from ipywidgets import interact # for slider 
import ipywidgets as widgets
from ipywidgets import interactive
import zipfile
import os
# The plotting cells save their figures and the GIF under "img/"; exist_ok=True makes
# re-running this cell harmless when the directory is already there.
os.makedirs("img", exist_ok=True) # create new directory "img" to hold images

### download the Wahl-O-Mat Bundestagswahl dataset from www.bpb.de and unzip it into a new directory

In [None]:
!curl -o Wahl-O-Mat_Bundestagswahl_2025.zip https://www.bpb.de/system/files/datei/Wahl-O-Mat_Bundestagswahl_2025_Datensatz_v1.01.zip

zip_path = "Wahl-O-Mat_Bundestagswahl_2025.zip"
extract_folder = "data"

# Extract the ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

# List the extracted files
os.listdir(extract_folder)

# remove zip file after unzipping
os.remove(zip_path)


### read dataset from Excel file and pivot it

In [None]:
# Numeric encoding applied to the 'Position: Position' column while reading:
# agree -> 1, neutral -> 0, disagree -> -1 (any other cell value becomes None).
POSITION_CODES = {'stimme zu': 1, 'neutral': 0, 'stimme nicht zu': -1}

# Read only the party label, thesis number and answer columns; the party label
# becomes the index of the long-format frame.
df = pd.read_excel(
    'data/Wahl-O-Mat Bundestagswahl 2025_Datensatz_v1.01.xlsx',
    sheet_name='Datensatz BTW 2025',
    header=0,
    index_col='Partei: Kurzbezeichnung',
    usecols=['Partei: Kurzbezeichnung', 'These: Nr.', 'Position: Position'],
    converters={'Position: Position': POSITION_CODES.get},
)

# Reshape to one row per party and one column per thesis number,
# taking the first answer found for each (party, thesis) pair.
party_by_thesis = pd.pivot_table(
    df,
    values='Position: Position',
    index='Partei: Kurzbezeichnung',
    columns='These: Nr.',
    aggfunc='first',
)

# The row labelled 'Verjüngungsforschung' is excluded from all further analysis.
pivot_df = party_by_thesis.drop('Verjüngungsforschung')

### explore the dataset (raw and pivoted)

In [None]:
# Preview the first rows of the pivoted party-by-thesis matrix.
# Other quick checks that can be swapped in here while exploring:
#   df.head(), df.shape, df.columns, df.info(), pivot_df.T.corr(), pivot_df.info()
pivot_df.head(n=5)


### create correlation matrix, generate a heatmap (sorted by avg)

In [None]:
# Pairwise correlation between parties: DataFrame.corr() compares columns, so the
# party-by-thesis matrix is transposed first to put the parties on the columns.
corr_matrix = pivot_df.T.corr()

# Reorder rows and columns identically, from highest to lowest mean correlation.
sorted_indices = corr_matrix.mean().sort_values(ascending=False).index
sorted_corr_matrix = corr_matrix.loc[sorted_indices, sorted_indices]

# Boolean mask that is True on the diagonal and below it; seaborn leaves masked cells
# blank, so only the cells above the diagonal are drawn.
mask = np.tril(np.ones(sorted_corr_matrix.shape, dtype=bool))

# Draw the annotated heatmap on an explicit figure/axes pair.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
heatmap_options = dict(
    mask=mask,
    cmap="coolwarm_r",
    center=0,
    annot=True,
    fmt=".2f",
    linewidths=0.5,
    cbar=True,
)
sns.heatmap(sorted_corr_matrix, ax=heatmap_ax, **heatmap_options)

heatmap_ax.set_title("Zustimmungsähnlichkeit der Parteien (Obere Dreiecksmatrix)")
heatmap_ax.set_xlabel("")
heatmap_ax.set_ylabel("")

# Write the figure to disk, then display it.
heatmap_fig.savefig("img/Zustimmungsähnlichkeit.png", dpi=300, bbox_inches="tight")
plt.show()


### PCA analysis of the pivoted dataset (2D)

In [None]:
# Perform PCA on the party-by-thesis matrix (reduce to 2 dimensions)
pca = PCA(n_components=2)
pca_result = pca.fit_transform(pivot_df)

# Create a DataFrame for PCA results, one row per party
pca_df = pd.DataFrame(pca_result, index=pivot_df.index, columns=["PC1", "PC2"])

# Flip the chart horizontally (PC1 is somewhat mapped to the left/right party tendency)
pca_df["PC1"] *= -1

# Colours for the highlighted parties (extracted from an image); keys must match the
# party labels used in the index of pivot_df
party_colors = {
    "CDU / CSU": "#000000",  # Black
    "AfD": "#0047AB",  # Dark Blue
    "SPD": "#E3001B",  # Red
    "GRÜNE": "#1A7F22",  # Green
    "Die Linke": "#C60084",  # Magenta/Pink
    "BSW": "#5E1D4D",  # Dark Purple
    "FDP": "#FFD700",  # Bright Yellow
}

# Plot PCA scatter plot
plt.figure(figsize=(10, 7))

# Highlighted parties get their own colour, a larger marker and a bold label;
# every other party is drawn small and gray
for party, (x, y) in pca_df.iterrows():
    highlighted = party in party_colors
    color = party_colors.get(party, "gray")
    size = 150 if highlighted else 50
    fontweight = 'bold' if highlighted else 'normal'

    plt.scatter(x, y, color=color, s=size, alpha=0.7)
    plt.text(x, y, party, fontsize=10, fontweight=fontweight, ha='right', color=color)

# Arrows from the origin, half as long as the party's position vector.
# The party list and the coordinate array are built from the same filtered sequence so
# that each arrow keeps its own party's colour even if a highlighted party is missing
# from the data.
arrow_parties = [party for party in party_colors if party in pca_df.index]
vectors = pca_df.loc[arrow_parties].to_numpy()

for (x, y), party in zip(vectors, arrow_parties):
    plt.arrow(0, 0, x * 0.5, y * 0.5, color=party_colors[party], alpha=0.7, width=0.01, head_width=0.1)

# Labels and grid
plt.xlabel("Hauptkomponente 1")
plt.ylabel("Hauptkomponente 2")
plt.title("Clustering der Parteien nach Positionen – Parteien mit ähnlicher Meinung stehen nah beieinander")
plt.axhline(0, color='black', linewidth=0.5)
plt.axvline(0, color='black', linewidth=0.5)
plt.grid(True, linestyle='--', alpha=0.6)

# Save the figure, then show it
plt.savefig("img/Clustering der Parteien nach Positionen.png", dpi=300, bbox_inches="tight")
plt.show()

### PCA in 3D

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

# Project the party-by-thesis matrix onto its first three principal components.
pca_3d = PCA(n_components=3)
pca_result_3d = pca_3d.fit_transform(pivot_df)
pca_df_3d = pd.DataFrame(
    pca_result_3d,
    index=pivot_df.index,
    columns=["PC1", "PC2", "PC3"],
)

# Mirror PC1 (it is somewhat mapped to the left/right party tendency).
pca_df_3d["PC1"] *= -1

# Colours of the highlighted parties; all other parties are drawn in gray.
party_colors = {
    "CDU / CSU": "#000000",   # black
    "AfD": "#0047AB",         # dark blue
    "SPD": "#E3001B",         # red
    "GRÜNE": "#1A7F22",       # green
    "Die Linke": "#C60084",   # magenta / pink
    "BSW": "#5E1D4D",         # dark purple
    "FDP": "#FFD700",         # bright yellow
}

# 3D canvas
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

# Marker area grows linearly with the PC3 coordinate: 50 at the smallest PC3 value,
# 250 at the largest.
z_values = pca_df_3d["PC3"].values
min_z, max_z = z_values.min(), z_values.max()
z_span = max_z - min_z
scaled_sizes = 50 + 200 * (z_values - min_z) / z_span

# One marker and one text label per party; the scatter artists are collected so the
# animation callback can return them.
scatters = []
for party, coords, size in zip(pca_df_3d.index, pca_df_3d.values, scaled_sizes):
    is_highlighted = party in party_colors
    color = party_colors[party] if is_highlighted else "gray"
    fontweight = 'bold' if is_highlighted else 'normal'
    scatters.append(ax.scatter(*coords, color=color, s=size, alpha=0.7))
    ax.text(*coords, party, fontsize=10, fontweight=fontweight, color=color)

# Axis labels and title (the title reports the variance explained by all three components)
explained_pct = sum(pca_3d.explained_variance_ratio_) * 100
ax.set_xlabel("Hauptkomponente 1")
ax.set_ylabel("Hauptkomponente 2")
ax.set_zlabel("Hauptkomponente 3")
ax.set_title(f"PCA 3D-Clustering der Parteien – Erklärte Varianz: {explained_pct:.2f}%")


def update(frame):
    """Animation callback: set the camera azimuth to `frame` degrees at a fixed 20° elevation."""
    ax.view_init(elev=20, azim=frame)
    return scatters


# One full turn in 2-degree steps, 50 ms between frames when played interactively.
ani = FuncAnimation(fig, update, frames=np.arange(0, 360, 2), interval=50)

# Write the rotation to a GIF with the Pillow writer at 20 frames per second.
ani.save("img/PCA_3D_Clustering.gif", writer="pillow", fps=20)

plt.show()


### Verteilung der Parteipositionen

In [None]:
# Fixed answer scale with its tick labels, in the order the bars are drawn.
answer_labels = {-1: "-1 (Ablehnung)", 0: "0 (Neutral)", 1: "1 (Zustimmung)"}

# Count every answer value across all parties and theses. Reindexing onto the fixed
# scale keeps bar order, colours and tick labels aligned even if one of the values
# never occurs (its bar is then drawn with height 0 instead of shifting the others).
answer_counts = (
    pivot_df.stack()
    .value_counts()
    .reindex(list(answer_labels), fill_value=0)
)

plt.figure(figsize=(10, 5))
answer_counts.plot(kind="bar", color=["red", "gray", "blue"])
plt.xticks(ticks=[0, 1, 2], labels=list(answer_labels.values()), rotation=0)
plt.xlabel("Antwortwert")
plt.ylabel("Anzahl")
plt.title("Verteilung der Parteipositionen über alle Thesen")

# Save the figure, then show it
plt.savefig("img/Verteilung der Parteipositionen.png", dpi=300, bbox_inches="tight")
plt.show()


### party positions by These (interactive view)

In [None]:
import plotly.express as px

def plot_party_positions(topic_index):
    """Show a Plotly bar chart with every party's answer to one thesis.

    Args:
        topic_index: 0-based column position in ``pivot_df``. The chart title
            displays ``topic_index + 1``.
    """
    # Turn the selected column into a two-column frame: party label, answer value.
    positions = pivot_df.iloc[:, topic_index].reset_index()
    positions.columns = ["Partei", "Antwortwert"]

    chart = px.bar(
        positions,
        x="Partei",
        y="Antwortwert",
        color="Partei",
        title=f"Parteipositionen zur These {topic_index+1}",
    )
    chart.show()

# Interactive thesis picker. A slider is used rather than a blocking, never-ending
# input() prompt so that "Restart & Run All" can continue past this cell. The slider
# bounds restrict the selection to valid column positions of pivot_df, so no separate
# range or integer validation is needed.
interact(
    plot_party_positions,
    topic_index=widgets.IntSlider(
        value=0,
        min=0,
        max=pivot_df.shape[1] - 1,
        step=1,
        description="These (Index)",
        continuous_update=False,  # redraw only when the slider is released
    ),
);


### cumulative explained variance by number of PCA components

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

# PCA cannot extract more components than min(n_samples, n_features), so the count is
# capped by both dimensions of the party-by-thesis matrix, and by 10 to keep the chart
# readable.
n_parties, n_theses = pivot_df.shape
num_components = min(n_parties, n_theses, 10)
pca_full = PCA(n_components=num_components)
pca_full.fit(pivot_df)

# Cumulative share of variance explained by the first k components, in percent
explained_variance = np.cumsum(pca_full.explained_variance_ratio_) * 100

# Plot variance explained by increasing PCA components
plt.figure(figsize=(10, 5))
plt.plot(range(1, num_components + 1), explained_variance, marker='o', linestyle='-')
plt.xlabel("Anzahl der Hauptkomponenten")
plt.ylabel("Kumulierte erklärte Varianz (%)")
plt.title("Erklärte Varianz durch zunehmende Hauptkomponenten")
plt.grid(True, linestyle='--', alpha=0.6)
plt.savefig("img/Erklaerte_Varianz_PCA.png", dpi=300, bbox_inches="tight")
plt.show()
