In [None]:
# Reload edited modules automatically before each cell runs, so changes to the
# project packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Move the working directory one level up; the relative paths used later in this
# notebook (e.g. "data/logs/...", "data/figures/...") are resolved from there.
# NOTE(review): this is a relative move, so re-running the cell climbs one more
# level each time - run it exactly once per kernel session.
%cd '..'

In [None]:
import logging
import sys

# Send INFO-and-above records both to a log file and to the cell output.
# NOTE(review): FileHandler opens its file as soon as it is created, so the
# "data/logs" directory must already exist relative to the working directory.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        # Plain string literal: the path has no placeholders, so no f-prefix.
        logging.FileHandler("data/logs/overall_polarization.log"),
        logging.StreamHandler(stream=sys.stdout),
    ],
)


In [None]:
import json

import pandas as pd
from tqdm import tqdm
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme to every figure created in this notebook.
sns.set_theme()

# Register tqdm's progress-bar variants of pandas methods (e.g. `progress_apply`).
tqdm.pandas()

from polarization.utils import (
    calculate_polarization,
    calculate_polarization_by_time,
)
from preprocessing.utils import (
    build_vocab,
    save_event_vocab,
    load_event_vocab,
    tokenize_comment,
)
from preprocessing.constants import (
    OUTPUT_DIR,
    MIN_OCCURENCE_FOR_VOCAB,
)

from load.utils import (
    load_df_from_parquet,
    save_df_as_parquet,
)
from eda.constants import PARTIES_COLORS, FIGURES_DIR


In [None]:
# Years to (re)compute in this run. Every processed year writes its own
# "overall_{year}_leaveout_polarization_by_month.parquet" file.
YEARS = [2013]

# When True, rebuild the yearly vocabulary from the comments and persist it;
# when False (default), load the vocabulary saved by an earlier run.
# NOTE(review): the rebuild path assumes a "tokens" column in the comments frame
# and that build_vocab returns what load_event_vocab loads - confirm before use.
REBUILD_VOCAB = False

# One monthly-polarization frame per processed year.
monthly_frames = []

for year in YEARS:
    logging.info(year)
    logging.info("Load partisan comments")
    partisan_comments_year = load_df_from_parquet(
        file_name=f"user_partisan_comments_{year}.parquet",
    )

    # Keep only comments whose "party" is one of the two compared labels.
    partisan_comments_year = partisan_comments_year[
        partisan_comments_year["party"].isin(["dem", "rep"])
    ].copy()

    logging.info(f"Length of comments {year}: {len(partisan_comments_year)}")

    if REBUILD_VOCAB:
        logging.info("Building vocab...")
        year_vocab = build_vocab(
            partisan_comments_year["tokens"],
            ngram_range=(1, 2),
            min_df=MIN_OCCURENCE_FOR_VOCAB,
        )

        logging.info("Saving vocab...")
        save_event_vocab(
            year_vocab,
            theme="overall",
            event_name=f"user_partisan_comments_{year}",
        )
    else:
        year_vocab = load_event_vocab(
            theme="overall",
            event_name=f"user_partisan_comments_{year}",
        )
    logging.info(f"Length of vocab {year}: {len(year_vocab)}")

    logging.info("Calculating leave-out polarization by month...")

    # "created_utc" is parsed as seconds since the Unix epoch.
    partisan_comments_year["datetime"] = pd.to_datetime(
        partisan_comments_year["created_utc"], unit="s"
    )

    logging.info(partisan_comments_year.dtypes)

    pol_by_month_year = calculate_polarization_by_time(
        partisan_comments_year,
        ngram_range=(1, 2),
        event_vocab=year_vocab,
        freq="M",
        equalize_users=True,
    )

    logging.info(f"Saving monthly polarization data {year}...")
    pol_by_month_year.to_parquet(
        f"{OUTPUT_DIR}/overall_{year}_leaveout_polarization_by_month.parquet",
        index=False,
    )

    monthly_frames.append(pol_by_month_year)

logging.info("Concatenating monthly polarization data...")

# Combined frame over all processed years; kept under its original name.
pol_by_month = pd.concat(monthly_frames)

logging.info("Saving monthly polarization data...")

pol_by_month.to_parquet(
    f"{OUTPUT_DIR}/overall_leaveout_polarization_by_month.parquet",
    index=False,
)

## Plot

In [None]:
# Years whose per-year monthly polarization files are loaded for plotting.
YEARS = [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]

logging.info("Load monthly polarization data")

pol_by_month = []

for year in YEARS:
    logging.info(year)
    pol_by_month_year = load_df_from_parquet(
        file_name=f"overall_{year}_leaveout_polarization_by_month.parquet",
    )

    pol_by_month_year["date"] = pd.to_datetime(pol_by_month_year["date"])

    # Per-party monthly value: mean of the stored user-level estimates in each row.
    for party in ("dem", "rep"):
        pol_by_month_year[f"{party}_polarization"] = pol_by_month_year[
            f"{party}_user_polarizations"
        ].apply(np.mean)

    pol_by_month.append(pol_by_month_year)


# ignore_index gives the combined frame a unique RangeIndex; without it every
# year contributes the same index labels, which makes label-based operations
# on the result ambiguous.
pol_years_by_month = pd.concat(pol_by_month, ignore_index=True)


In [None]:
# Inspect the combined monthly polarization table built from the per-year files.
display(pol_years_by_month)


In [None]:
# Line plot: monthly polarization next to the random-user-assignment series.

fig, ax = plt.subplots(figsize=(12, 4))

# Each entry: (column to plot, legend label, extra line styling).
series_specs = [
    ("polarization", "Polarization", {"marker": "o", "color": "k"}),
    (
        "random_polarization",
        "Polarization with random user assignment",
        {"color": "orange"},
    ),
]

for column, legend_label, styling in series_specs:
    sns.lineplot(
        data=pol_years_by_month,
        x="date",
        y=column,
        label=legend_label,
        ax=ax,
        **styling,
    )

ax.set_ylabel("Leave-out partisanship estimate")
ax.set_xlabel("Date")

fig.savefig(
    "data/figures/polarization/overall/overall_monthly_partisanship.pdf",
    bbox_inches="tight",
)

plt.show()


In [None]:
# Show the combined monthly table again as this cell's output (same frame as displayed earlier).
pol_years_by_month

In [None]:
def _expand_user_polarizations(monthly, column, party):
    """Flatten one party's per-row user-level values into long format.

    Parameters
    ----------
    monthly : pd.DataFrame
        Frame with a "date" column and a ``column`` whose cells each hold a
        sequence of values.
    column : str
        Name of the column holding the per-row sequences.
    party : str
        Label written to the "Party" column of every produced row.

    Returns
    -------
    pd.DataFrame
        One row per individual value, with columns "value", "date" and "Party".
    """
    user_values = monthly[column]
    # Number of values per row; each row's date is repeated that many times.
    counts = [len(values) for values in user_values]
    return pd.DataFrame(
        {
            "value": np.concatenate(user_values.values),
            "date": np.repeat(monthly["date"], counts),
            "Party": np.repeat(party, sum(counts)),
        }
    )


# Long-format frames per party (variable names kept from the original cell).
df_values1 = _expand_user_polarizations(
    pol_years_by_month, "dem_user_polarizations", "Democrat"
)
df_values2 = _expand_user_polarizations(
    pol_years_by_month, "rep_user_polarizations", "Republican"
)

# Concatenate the two DataFrames
pol_years_by_month_expanded = pd.concat([df_values1, df_values2], ignore_index=True)


In [None]:
# Line plot: user-level estimates per party over time. seaborn aggregates the
# values sharing a date and draws a 95% confidence interval around the line.
# matplotlib/seaborn are imported and the theme is set once in the notebook's
# import cell, so they are not repeated here.

fig, ax = plt.subplots(figsize=(12, 4))

sns.lineplot(
    data=pol_years_by_month_expanded,
    x="date",
    y="value",
    hue="Party",
    errorbar=("ci", 95),
    # NOTE(review): assumes PARTIES_COLORS covers the "Democrat" and
    # "Republican" hue levels - confirm against eda.constants.
    palette=PARTIES_COLORS,
    ax=ax,
)

ax.set_ylabel("Leave-out partisanship estimate")
ax.set_xlabel("Date")

fig.savefig(
    "data/figures/polarization/overall/party_monthly_partisanship.pdf",
    bbox_inches="tight",
)

plt.show()


## Event Comparison Democrats vs Republicans

In [None]:
# Event names grouped by theme; insertion order defines the order of EVENTS.
_EVENTS_BY_THEME = {
    "gun_control": ["mass_shootings_gun_control"],
    "elections": [
        "us_elections_2012",
        "us_elections_2016",
        "us_midterms_2014",
        "us_midterms_2018",
    ],
    "abortion": ["abortion"],
}

# Flat list of (theme, event_name) pairs.
EVENTS = [
    (theme, event_name)
    for theme, event_names in _EVENTS_BY_THEME.items()
    for event_name in event_names
]

In [None]:
def _build_user_polarization_frame(dem_polarization, rep_polarization):
    """Combine both parties' user-level values into one long-format frame.

    Parameters
    ----------
    dem_polarization, rep_polarization : sequence of float
        User-level values for each party; the two may differ in length.

    Returns
    -------
    pd.DataFrame
        Columns "Polarization" and "Party", Democrat rows first.
    """
    return pd.DataFrame(
        {
            "Polarization": list(dem_polarization) + list(rep_polarization),
            # Each label block is sized by its own group. The original sized the
            # Republican block with the Democrat count, which silently dropped
            # Republican users whenever that group was the larger one.
            "Party": ["Democrat"] * len(dem_polarization)
            + ["Republican"] * len(rep_polarization),
        }
    )


for event_theme, event_name in EVENTS:
    print(event_name)

    # Context manager closes the file handle even if parsing fails.
    with open(
        f"{OUTPUT_DIR}/{event_name}_leaveout_polarization.json",
    ) as polarization_file:
        event_polarization = json.load(polarization_file)

    # Stored layout: [total, [dem user values, rep user values]].
    total_pol, (dem_polarization, rep_polarization) = event_polarization

    print("total pol", total_pol)
    print("dem pol", np.mean(dem_polarization))
    print("rep pol", np.mean(rep_polarization))

    # Plot user polarization

    user_pols = _build_user_polarization_frame(dem_polarization, rep_polarization)

    fig, ax = plt.subplots(figsize=(15, 2))

    sns.violinplot(
        data=user_pols,
        x="Polarization",
        y="Party",
        hue="Party",
        palette=PARTIES_COLORS,
        ax=ax,
    )

    # The legend identifies the parties, so the y axis carries no ticks or label.
    ax.get_yaxis().set_ticks([])
    ax.set_ylabel("")
    ax.set_xlabel("User-level leave-out partisanship estimation")

    # add legend
    handles, labels = ax.get_legend_handles_labels()

    ax.legend(
        handles=handles,
        labels=labels,
        title="Party",
    )

    fig.savefig(
        fname=f"{FIGURES_DIR}/polarization/{event_theme}/{event_name}_user_leaveout_polarization.pdf",
        bbox_inches="tight",
        pad_inches=0,
        format="pdf",
    )

    plt.show()
    # Release the figure so figures do not accumulate across loop iterations.
    plt.close(fig)
