In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Move the working directory one level up (presumably to the project root so
# that local packages such as `load` and `preprocessing` import - confirm).
# NOTE(review): this relative change is not idempotent; re-running the cell in
# the same kernel moves up another level each time.
%cd '..'

In [None]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine as cosine_distance

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

from load.utils import load_df_from_parquet
from preprocessing.utils import (
    load_event_comments,
    normalize,
)
from preprocessing.constants import OUTPUT_DIR
from eda.constants import FIGURES_DIR
from events.event_constants import EVENTS_INFO

In [None]:
# Events to process, one (theme, event name, time granularity) triple each.
# The granularity string selects weekly ("week") or monthly binning downstream.
#
# Currently disabled gun-control events (kept for reference; they would also
# need a granularity entry before being re-enabled):
#   ("gun_control", "mass_shootings_gun_control")
#   ("gun_control", "mass_shootings")
EVENT_NAMES = [
    *(
        ("elections", election, granularity)
        for election, granularity in (
            ("us_elections_2012", "month"),
            ("us_elections_2016", "week"),
            ("us_midterms_2014", "month"),
            ("us_midterms_2018", "week"),
        )
    ),
    ("abortion", "abortion", "month"),
]

In [None]:
# Seed for the random party assignment baseline. A fixed value keeps the
# "random_*" outputs reproducible across kernel restarts and re-runs.
RANDOM_PARTY_SEED = 42


def _mean_pool(embeddings):
    """Stack a group of embedding vectors, average them element-wise and pass the result through `normalize`."""
    return normalize(np.vstack(embeddings).mean(axis=0))


def _max_pool(embeddings):
    """Stack a group of embedding vectors, take the element-wise maximum and pass the result through `normalize`."""
    return normalize(np.vstack(embeddings).max(axis=0))


def _dem_author_share(comments, period_col):
    """Return one row per period with the share of unique authors labelled "dem".

    Args:
        comments: frame with "author", "party" and `period_col` columns.
        period_col: name of the column identifying the period ("week" or "month").

    Returns:
        DataFrame with columns [`period_col`, "prob_dem"].
    """
    total_authors = comments.groupby(period_col)["author"].nunique()
    dem_authors = (
        comments[comments["party"] == "dem"].groupby(period_col)["author"].nunique()
    )
    # Periods without any "dem" author must yield 0 rather than go missing.
    share = dem_authors.reindex(total_authors.index, fill_value=0) / total_authors
    return share.rename("prob_dem").reset_index()


def _pool_embeddings_by_party(user_embeddings, party_col, freq):
    """Pool per-user embeddings into one embedding per (period, party).

    Args:
        user_embeddings: frame with "date", "author", "mean", "max" and `party_col`.
        party_col: column holding the party label to group on
            ("party" or "random_party").
        freq: pandas offset alias used to bin "date".

    Returns:
        DataFrame with columns "date", `party_col`, "count", "mean", "max".
    """
    return (
        user_embeddings.groupby(by=[pd.Grouper(key="date", freq=freq), party_col])
        .agg(
            count=("author", "size"),
            mean=("mean", _mean_pool),
            max=("max", _max_pool),
        )
        .reset_index()
    )


def _co_polarization(party_embeddings, party_col):
    """Compute the cosine distance between the "dem" and "rep" embeddings per date.

    Dates on which one of the two parties is absent get NaN scores instead of
    raising, so a single sparse period cannot abort the whole event.

    Args:
        party_embeddings: output of `_pool_embeddings_by_party`.
        party_col: column holding the party label ("party" or "random_party").

    Returns:
        DataFrame with "date", "polarization_mean", "polarization_max" and
        "count" (plus the positional "level_1" column produced by the
        groupby/apply/reset_index round trip, kept for schema compatibility
        with previously written files).
    """

    def _score(group):
        dem_rows = group[group[party_col] == "dem"]
        rep_rows = group[group[party_col] == "rep"]

        if dem_rows.empty or rep_rows.empty:
            polarization_mean = polarization_max = np.nan
        else:
            polarization_mean = cosine_distance(
                dem_rows["mean"].iloc[0], rep_rows["mean"].iloc[0]
            )
            polarization_max = cosine_distance(
                dem_rows["max"].iloc[0], rep_rows["max"].iloc[0]
            )

        return pd.DataFrame(
            {
                "polarization_mean": [polarization_mean],
                "polarization_max": [polarization_max],
                "count": [group["count"].sum()],
            }
        )

    return party_embeddings.groupby(by="date").apply(_score).reset_index()


for event_theme, event_name, granularity in EVENT_NAMES:
    print(event_theme, event_name)

    # Offset alias used for every date binning of this event.
    freq = "W" if granularity == "week" else "M"
    output_prefix = f"{OUTPUT_DIR}/polarization/{event_theme}"

    # Re-seed per event so each event's random baseline does not depend on
    # which other events are processed, or in which order.
    rng = np.random.default_rng(RANDOM_PARTY_SEED)

    print("Loading comments...")

    comments_with_embeddings = load_event_comments(
        theme=event_theme,
        event_name=event_name + "_with_embeddings",
    )

    # convert date to datetime
    comments_with_embeddings["date"] = pd.to_datetime(
        comments_with_embeddings["created_utc"], unit="s"
    )

    # Vectorized ISO week number (same value as Timestamp.weekofyear per row).
    comments_with_embeddings["week"] = (
        comments_with_embeddings["date"].dt.isocalendar().week
    )
    comments_with_embeddings["month"] = comments_with_embeddings["date"].dt.month

    print("Computing party probabilities...")

    # NOTE(review): the period key is week-of-year / month-of-year only, so
    # data spanning several years pools same-numbered periods - confirm the
    # events never span more than one year.
    probs_parties = _dem_author_share(comments_with_embeddings, granularity)

    # join party probabilities to comments

    print("Joining party probabilities to comments...")

    comments_with_embeddings = comments_with_embeddings.merge(
        probs_parties,
        on=granularity,
        how="left",
    )

    print("Computing user embeddings...")

    def _draw_random_party(prob_dem):
        # prob_dem is constant within a period, so any value of the group works.
        p_dem = prob_dem.iloc[0]
        return rng.choice(["dem", "rep"], p=[p_dem, 1 - p_dem])

    user_embeddings = (
        comments_with_embeddings.groupby(
            by=[pd.Grouper(key="date", freq=freq), "author", "party"]
        )
        .agg(
            count=("author", "size"),
            mean=("embedding", _mean_pool),
            max=("embedding", _max_pool),
            random_party=("prob_dem", _draw_random_party),
        )
        .reset_index()
    )

    print(user_embeddings)

    # Save user embeddings

    print(f"Saving {len(user_embeddings)} user embeddings...")

    user_embeddings.to_parquet(
        f"{output_prefix}/{granularity}ly_{event_name}_user_embeddings.parquet"
    )

    print("Computing party embeddings...")

    party_embeddings = _pool_embeddings_by_party(user_embeddings, "party", freq)

    # Save party embeddings

    print(f"Saving {len(party_embeddings)} party embeddings...")

    party_embeddings.to_parquet(
        f"{output_prefix}/{granularity}ly_{event_name}_party_embeddings.parquet"
    )

    print("Computing CO Polarization...")

    cosine_distances = _co_polarization(party_embeddings, "party")

    print(cosine_distances)

    # save to parquet

    print("Saving CO Polarization...")

    cosine_distances.to_parquet(
        f"{output_prefix}/{event_name}_co_polarization.parquet"
    )

    ###############################################################

    print("Computing random party embeddings...")

    random_party_embeddings = _pool_embeddings_by_party(
        user_embeddings, "random_party", freq
    )

    # The random party embeddings are not written to disk; only the
    # polarization scores derived from them are saved below.
    print(f"Computed {len(random_party_embeddings)} random party embeddings.")

    print("Computing random CO Polarization...")

    random_cosine_distances = _co_polarization(
        random_party_embeddings, "random_party"
    )

    print(random_cosine_distances)

    # save to parquet

    print("Saving random CO Polarization...")

    random_cosine_distances.to_parquet(
        f"{output_prefix}/random_{event_name}_co_polarization.parquet"
    )


## Plots

In [None]:
# Periods whose "count" value is at or below this threshold are dropped from
# the contextualized-estimator curves before plotting.
MIN_COUNT_PER_PERIOD = 500


def _load_co_polarization(path):
    """Read a CO polarization parquet file and keep rows with "count" above the threshold."""
    scores = pd.read_parquet(path)
    return scores[scores["count"] > MIN_COUNT_PER_PERIOD]


def _mark_event(axes, date, label, color):
    """Draw a dashed vertical line at `date` and a rotated text label next to it.

    The label's x position is in data coordinates and its y position (0.1) is
    in axes coordinates, so it stays near the bottom whatever the y range is.
    """
    axes.axvline(date, linestyle="--", color=color)
    axes.text(
        date,
        0.1,
        label,
        fontsize='x-small',
        transform=axes.get_xaxis_transform(),
        rotation=-90,
    )


for event_theme, event_name, granularity in EVENT_NAMES:
    print(event_theme, event_name)

    # Load LO polarization
    lo_pol_by_time_events = pd.read_parquet(
        f"{OUTPUT_DIR}/{event_name}_leaveout_polarization_by_{granularity}.parquet",
    )
    lo_pol_by_time_events["date"] = pd.to_datetime(lo_pol_by_time_events["date"])

    # Load CO Polarization (real and random party assignment)
    cosine_distances = _load_co_polarization(
        f"{OUTPUT_DIR}/polarization/{event_theme}/{event_name}_co_polarization.parquet"
    )
    random_cosine_distances = _load_co_polarization(
        f"{OUTPUT_DIR}/polarization/{event_theme}/random_{event_name}_co_polarization.parquet"
    )

    # plot polarization_mean by date
    fig, ax = plt.subplots(figsize=(20, 5))

    # legend=False on every curve: a single combined legend is built below so
    # the automatic per-axes legends of the twin axes cannot overlap.
    sns.lineplot(
        x=cosine_distances["date"],
        y=cosine_distances["polarization_mean"],
        label="Contextualized estimator",
        marker="o",
        color="black",
        legend=False,
        ax=ax,
    )

    sns.lineplot(
        x=random_cosine_distances["date"],
        y=random_cosine_distances["polarization_mean"],
        label="Contextualized estimator with random party assignment",
        marker="o",
        color="orange",
        legend=False,
        ax=ax,
    )

    # The leave-out estimator is drawn against its own y axis on the right.
    ax2 = ax.twinx()
    sns.lineplot(
        x=lo_pol_by_time_events["date"],
        y=lo_pol_by_time_events["polarization"],
        label="Leave-out estimator",
        marker="o",
        color="blue",
        linestyle="--",
        legend=False,
        ax=ax2,
        alpha=0.3,
    )

    event_details = EVENTS_INFO[event_name]

    # Main event date (optional) in red, related dates in blue. Markers go on
    # the twin axes, which is where the implicit pyplot calls used to draw them.
    if "date" in event_details:
        _mark_event(ax2, event_details["date"], event_details['name'], "red")

    # "relevant_dates" is treated as optional, like "date" above.
    for relevant_event, relevant_event_date in event_details.get(
        "relevant_dates", {}
    ).items():
        _mark_event(ax2, relevant_event_date, relevant_event, "blue")

    ax.set(
        xlabel="Date",
        ylabel="Contextualized partisanship score estimate",
    )

    ax2.set(
        xlabel="Date",
        ylabel="Leave-out partisanship score estimate",
    )

    # One legend covering the curves of both y axes.
    left_handles, left_labels = ax.get_legend_handles_labels()
    right_handles, right_labels = ax2.get_legend_handles_labels()
    ax2.legend(left_handles + right_handles, left_labels + right_labels, loc="best")

    fig.savefig(
        fname=f"{FIGURES_DIR}/polarization/{event_name}/{event_name}_co_polarization.pdf",
        bbox_inches="tight",
    )

    plt.show()

    # Release the figure so looping over many events does not accumulate them.
    plt.close(fig)
