In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# project packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Step up one directory so the relative paths and project imports used below
# resolve (presumably to the repository root — confirm).
# NOTE: this is not idempotent; re-running the cell moves up one more level.
%cd '..'

In [None]:
import logging
import os
import sys

# Log file for this notebook, relative to the current working directory.
LOG_FILE = "data/logs/elections_polarization.log"

# logging.FileHandler opens the file immediately and raises FileNotFoundError
# when the parent directory is missing, so create it first.
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

# Send INFO-and-above records both to the log file and to the cell output.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler(stream=sys.stdout),
    ],
)


In [None]:
import json

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm  # type: ignore
from scipy import stats

# Activate seaborn's default plotting theme.
sns.set_theme()

from eda.constants import (
    FIGURES_DIR,
    FIG_SIZE,
)
from polarization.utils import (
    calculate_polarization,
    calculate_polarization_by_time,
)
from preprocessing.utils import (
    load_event_comments,
    load_event_vocab,
)
from preprocessing.constants import (
    OUTPUT_DIR,
)
from events.elections_constants import (
    ELECTIONS_EVENTS_INFO,
)

# Use the project's FIG_SIZE as the default figure size for the plots below.
# NOTE(review): seaborn documents `sns.set` as an alias of `sns.set_theme`, so this
# call presumably makes the bare `sns.set_theme()` above redundant — confirm.
sns.set(rc={"figure.figsize": FIG_SIZE})


In [None]:
# Election year under analysis. EVENT_KEY is derived from it and is used to look
# up the event metadata and to name every figure and data file saved below.
YEAR = 2016
EVENT_KEY = f"us_elections_{YEAR}"
# Metadata entry for this event; the cells below read its "name", "date" and
# "relevant_dates" fields.
EVENT_DETAILS = ELECTIONS_EVENTS_INFO[EVENT_KEY]

logging.info(EVENT_DETAILS)

In [None]:
# Announce the event, then load its comments and vocabulary from the
# "elections" theme using the same selector for both loaders.
logging.info(EVENT_DETAILS["name"])

event_selector = {"theme": "elections", "event_name": EVENT_KEY}
event_comments = load_event_comments(**event_selector)
event_vocab = load_event_vocab(**event_selector)

In [None]:
# Sanity check on the loaded comments: log the frame's shape, then its columns.
for comments_summary in (event_comments.shape, event_comments.columns):
    logging.info(comments_summary)

In [None]:
# Comment counts per subreddit, split by party, restricted to the 20
# subreddits that contribute the most comments.
top_subreddits = event_comments["subreddit"].value_counts().iloc[:20].index
party_palette = {"rep": "red", "dem": "blue"}

plt.figure(figsize=(20, 12))
sns.countplot(
    data=event_comments,
    y="subreddit",
    hue="party",
    order=top_subreddits,
    palette=party_palette,
)

# Persist the figure before showing it.
subreddit_hist_path = f"{FIGURES_DIR}/{EVENT_KEY}_subreddit_hist.pdf"
plt.savefig(
    fname=subreddit_hist_path,
    bbox_inches="tight",
    pad_inches=0,
    format="pdf",
)

plt.show()


In [None]:
# Number of comments per party, drawn as a narrow bar chart.
plt.figure(figsize=(2, 5))
sns.countplot(data=event_comments, x="party")

# Persist the figure before showing it.
party_hist_path = f"{FIGURES_DIR}/{EVENT_KEY}_party_hist.pdf"
plt.savefig(
    fname=party_hist_path,
    bbox_inches="tight",
    pad_inches=0,
    format="pdf",
)

plt.show()

In [None]:
# Log the comment frame's shape once more before the (optional) user filter below.
logging.info(event_comments.shape)

## Filter users

In [None]:
# MAY_1_2016_TIMESTAMP = 1462060800

# early_comments = event_comments[event_comments["created_utc"] <= MAY_1_2016_TIMESTAMP]

# early_unique_users = early_comments["author"].unique()

# logging.info(early_unique_users.shape)

# event_comments = event_comments[
#     event_comments["author"].isin(early_unique_users)
# ]

# logging.info(event_comments.shape)

In [None]:
# Log the shape and columns of the comments that go into the polarization
# computations below.
comments_shape = event_comments.shape
comments_columns = event_comments.columns
logging.info(comments_shape)
logging.info(comments_columns)

### Total polarization

In [None]:
# Leave-out polarization for the whole event over unigrams and bigrams, with
# equalize_users enabled; the result is written to JSON in OUTPUT_DIR.
leaveout_json_path = f"{OUTPUT_DIR}/{EVENT_KEY}_leaveout_polarization.json"
leaveout_settings = {
    "ngram_range": (1, 2),
    "event_vocab": event_vocab,
    "method": "leaveout",
    "equalize_users": True,
}

logging.info("Calculating leave-out polarization...")
event_polarization = calculate_polarization(event_comments, **leaveout_settings)

logging.info("Saving leave-out polarization...")
with open(leaveout_json_path, "w") as out_file:
    json.dump(event_polarization, out_file)


In [None]:
# Reload the saved leave-out polarization from disk. The context manager closes
# the file handle deterministically instead of leaving it to garbage collection.
with open(
    f"{OUTPUT_DIR}/{EVENT_KEY}_leaveout_polarization.json",
) as polarization_file:
    event_polarization = json.load(polarization_file)

In [None]:
# The result is a pair: the total polarization and a (democrat, republican)
# pair of score collections (plotted later as per-user polarization).
total_pol, per_party_scores = event_polarization
dem_polarization, rep_polarization = per_party_scores

logging.info(f"Total polarization: {total_pol}")


In [None]:
# Two-sample t-test on the democrat vs. republican scores; equal_var=False
# means the two groups are not assumed to share a variance.
party_samples = (dem_polarization, rep_polarization)
t = stats.ttest_ind(*party_samples, equal_var=False)

logging.info(t)

In [None]:
# Plot user polarization

# One row per score, tagged with the party it belongs to. Each label run is
# sized from its own group (the Republican labels used to be sized with the
# Democrat count, which silently dropped rows when the groups differed).
# list() keeps `+` a concatenation even if the scores are not plain lists.
user_pols = pd.DataFrame(
    {
        "Polarization": list(dem_polarization) + list(rep_polarization),
        "Affiliation": (
            ["Democrats"] * len(dem_polarization)
            + ["Republicans"] * len(rep_polarization)
        ),
    }
)

plt.figure()

ax = sns.boxplot(
    y="Polarization",
    x="Affiliation",
    data=user_pols,
)

ax.set_xlabel("Affiliation")
ax.set_ylabel("User polarization")
# total_pol is indexed here, so it is expected to be a sequence whose first
# item is the headline score.
ax.set_title(f"{EVENT_DETAILS['name']} (Total polarization: {total_pol[0]:.3f})")

# The legend reports medians, matching both its wording and the centre line of
# each box (it previously printed the mean under a "Median" label).
# NOTE(review): a labels-only legend is matched to the axes' artists implicitly
# by order — confirm the entries line up with the intended boxes.
ax.legend(
    labels=[
        f"Median democrat polarization: {np.median(dem_polarization):.3f}",
        f"Median republican polarization: {np.median(rep_polarization):.3f}",
    ],
)

plt.savefig(
    fname=f"{FIGURES_DIR}/{EVENT_KEY}_user_leaveout_polarization.pdf",
    bbox_inches="tight",
    pad_inches=0,
    format="pdf",
)

plt.show()


In [None]:
# Add a "datetime" column by parsing "created_utc" as seconds since the epoch.
comment_timestamps = pd.to_datetime(event_comments["created_utc"], unit="s")
event_comments["datetime"] = comment_timestamps


### Monthly polarization

In [None]:
# Leave-out polarization over time with freq="M" (monthly, per the log message),
# using unigrams and bigrams; the result is written to parquet in OUTPUT_DIR.
monthly_parquet_path = (
    f"{OUTPUT_DIR}/{EVENT_KEY}_leaveout_polarization_by_month.parquet"
)
monthly_settings = {
    "ngram_range": (1, 2),
    "event_vocab": event_vocab,
    "freq": "M",
    "equalize_users": True,
}

logging.info("Calculating leave-out polarization by month...")
pol_by_month_events = calculate_polarization_by_time(
    event_comments, **monthly_settings
)

logging.info("Saving monthly polarization data...")
pol_by_month_events.to_parquet(monthly_parquet_path, index=False)


In [None]:
# Plot monthly polarization

plt.figure()

# Observed score first, then the random-assignment baseline, on the same axes.
ax = sns.lineplot(
    data=pol_by_month_events,
    x="date",
    y="polarization",
    label="Polarization",
)
sns.lineplot(
    data=pol_by_month_events,
    x="date",
    y="random_polarization",
    label="Polarization with random user assignment",
    ax=ax,
)

event_info = ELECTIONS_EVENTS_INFO[EVENT_KEY]

# One dashed blue line per related date, labelled "<name> (MM/DD/YYYY)".
for marker_name, marker_date in event_info["relevant_dates"].items():
    marker_day = marker_date.strftime("%m/%d/%Y")
    ax.axvline(
        marker_date,
        linestyle="--",
        color="blue",
        label=f"{marker_name} ({marker_day})",
    )

# The event's own date is drawn last, in red.
event_date = event_info["date"]
ax.axvline(
    event_date,
    linestyle="--",
    color="red",
    label=f"{event_info['name']} ({event_date.strftime('%m/%d/%Y')})",
)

ax.set_xlabel("Date")
ax.set_ylabel("Polarization")
ax.set_title("Monthly polarization score")
ax.legend()

monthly_figure_path = f"{FIGURES_DIR}/{EVENT_KEY}_leaveout_polarization_by_month.pdf"
plt.savefig(
    fname=monthly_figure_path,
    bbox_inches="tight",
    pad_inches=0,
    format="pdf",
)
plt.show()


In [None]:
# Plot monthly user count

plt.figure()

ax = sns.lineplot(data=pol_by_month_events, x="date", y="user_cnt")

event_info = ELECTIONS_EVENTS_INFO[EVENT_KEY]

# Vertical markers as (date, colour, legend label): the related dates in blue,
# followed by the event's own date in red.
date_markers = [
    (when, "blue", f"{what} ({when.strftime('%m/%d/%Y')})")
    for what, when in event_info["relevant_dates"].items()
]
date_markers.append(
    (
        event_info["date"],
        "red",
        f"{event_info['name']} ({event_info['date'].strftime('%m/%d/%Y')})",
    )
)

for marker_date, marker_color, marker_label in date_markers:
    ax.axvline(
        marker_date,
        linestyle="--",
        color=marker_color,
        label=marker_label,
    )

ax.set_xlabel("Date")
ax.set_ylabel("Number of users")
ax.set_title("Monthly number of users discussing event")
ax.legend()

monthly_users_figure_path = f"{FIGURES_DIR}/{EVENT_KEY}_nr_users_by_month.pdf"
plt.savefig(
    fname=monthly_users_figure_path,
    bbox_inches="tight",
    pad_inches=0,
    format="pdf",
)
plt.show()


### Weekly polarization

In [None]:
# Leave-out polarization over time with freq="W" (weekly, per the log message),
# using unigrams and bigrams; the result is written to parquet in OUTPUT_DIR.
weekly_parquet_path = (
    f"{OUTPUT_DIR}/{EVENT_KEY}_leaveout_polarization_by_week.parquet"
)
weekly_settings = {
    "ngram_range": (1, 2),
    "event_vocab": event_vocab,
    "freq": "W",
    "equalize_users": True,
}

logging.info("Calculating leave-out polarization by week...")
pol_by_week_events = calculate_polarization_by_time(
    event_comments, **weekly_settings
)

logging.info("Saving weekly polarization data...")
pol_by_week_events.to_parquet(weekly_parquet_path, index=False)


In [None]:
# pol_by_week_events = pd.read_parquet(
#     f"{OUTPUT_DIR}/{EVENT_KEY}_leaveout_polarization_by_week.parquet"
# )

# pol_by_week_events = pd.read_csv(
#     f"{OUTPUT_DIR}/{EVENT_KEY}_polarization_by_week.csv",
#     index_col=False,
# )

# pol_by_week_events["date"] = pd.to_datetime(pol_by_week_events["date"])


In [None]:
# Plot weekly polarization

plt.figure()

# Observed score first, then the random-assignment baseline, on the same axes.
ax = sns.lineplot(
    data=pol_by_week_events,
    x="date",
    y="polarization",
    label="Polarization",
)
sns.lineplot(
    data=pol_by_week_events,
    x="date",
    y="random_polarization",
    label="Polarization with random user assignment",
    ax=ax,
)

event_info = ELECTIONS_EVENTS_INFO[EVENT_KEY]

# One dashed blue line per related date, labelled "<name> (MM/DD/YYYY)".
for marker_name, marker_date in event_info["relevant_dates"].items():
    marker_day = marker_date.strftime("%m/%d/%Y")
    ax.axvline(
        marker_date,
        linestyle="--",
        color="blue",
        label=f"{marker_name} ({marker_day})",
    )

# The event's own date is drawn last, in red.
event_date = event_info["date"]
ax.axvline(
    event_date,
    linestyle="--",
    color="red",
    label=f"{event_info['name']} ({event_date.strftime('%m/%d/%Y')})",
)

ax.set_xlabel("Date")
ax.set_ylabel("Polarization")
ax.set_title("Weekly polarization score")
ax.legend()

weekly_figure_path = f"{FIGURES_DIR}/{EVENT_KEY}_leaveout_polarization_by_week.pdf"
plt.savefig(
    fname=weekly_figure_path,
    bbox_inches="tight",
    pad_inches=0,
    format="pdf",
)
plt.show()


In [None]:
# Plot weekly user count

plt.figure()

ax = sns.lineplot(data=pol_by_week_events, x="date", y="user_cnt")

event_info = ELECTIONS_EVENTS_INFO[EVENT_KEY]

# Vertical markers as (date, colour, legend label): the related dates in blue,
# followed by the event's own date in red.
date_markers = [
    (when, "blue", f"{what} ({when.strftime('%m/%d/%Y')})")
    for what, when in event_info["relevant_dates"].items()
]
date_markers.append(
    (
        event_info["date"],
        "red",
        f"{event_info['name']} ({event_info['date'].strftime('%m/%d/%Y')})",
    )
)

for marker_date, marker_color, marker_label in date_markers:
    ax.axvline(
        marker_date,
        linestyle="--",
        color=marker_color,
        label=marker_label,
    )

ax.set_xlabel("Date")
ax.set_ylabel("Number of users")
ax.set_title("Weekly number of users discussing event")
ax.legend()

weekly_users_figure_path = f"{FIGURES_DIR}/{EVENT_KEY}_nr_users_by_week.pdf"
plt.savefig(
    fname=weekly_users_figure_path,
    bbox_inches="tight",
    pad_inches=0,
    format="pdf",
)
plt.show()


### Daily polarization

In [None]:
# Leave-out polarization over time with freq="D" (daily, per the log message),
# using unigrams and bigrams; the result is written to parquet in OUTPUT_DIR.
daily_parquet_path = (
    f"{OUTPUT_DIR}/{EVENT_KEY}_leaveout_polarization_by_day.parquet"
)
daily_settings = {
    "ngram_range": (1, 2),
    "event_vocab": event_vocab,
    "freq": "D",
    "equalize_users": True,
}

logging.info("Calculating leave-out polarization by day...")
pol_by_day_events = calculate_polarization_by_time(
    event_comments, **daily_settings
)

logging.info("Saving daily polarization data...")
pol_by_day_events.to_parquet(daily_parquet_path, index=False)


In [None]:
# Read the saved daily results back from parquet (presumably so the plots
# below can be redrawn without recomputing — confirm).
daily_results_path = f"{OUTPUT_DIR}/{EVENT_KEY}_leaveout_polarization_by_day.parquet"
pol_by_day_events = pd.read_parquet(daily_results_path)

# pol_by_day_events = pd.read_csv(
#     f"{OUTPUT_DIR}/{EVENT_KEY}_polarization_by_day.csv",
#     index_col=False,
# )

# pol_by_day_events["date"] = pd.to_datetime(pol_by_day_events["date"])

In [None]:
# Plot daily polarization

plt.figure()

# Observed score first, then the random-assignment baseline, on the same axes.
ax = sns.lineplot(
    data=pol_by_day_events,
    x="date",
    y="polarization",
    label="Polarization",
)
sns.lineplot(
    data=pol_by_day_events,
    x="date",
    y="random_polarization",
    label="Polarization with random user assignment",
    ax=ax,
)

event_info = ELECTIONS_EVENTS_INFO[EVENT_KEY]

# One dashed blue line per related date, labelled "<name> (MM/DD/YYYY)".
for marker_name, marker_date in event_info["relevant_dates"].items():
    marker_day = marker_date.strftime("%m/%d/%Y")
    ax.axvline(
        marker_date,
        linestyle="--",
        color="blue",
        label=f"{marker_name} ({marker_day})",
    )

# The event's own date is drawn last, in red.
event_date = event_info["date"]
ax.axvline(
    event_date,
    linestyle="--",
    color="red",
    label=f"{event_info['name']} ({event_date.strftime('%m/%d/%Y')})",
)

ax.set_xlabel("Date")
ax.set_ylabel("Polarization")
ax.set_title("Daily polarization score")
ax.legend()

daily_figure_path = f"{FIGURES_DIR}/{EVENT_KEY}_leaveout_polarization_by_day.pdf"
plt.savefig(
    fname=daily_figure_path,
    bbox_inches="tight",
    pad_inches=0,
    format="pdf",
)
plt.show()


In [None]:
# Plot daily user count

plt.figure()

ax = sns.lineplot(data=pol_by_day_events, x="date", y="user_cnt")

event_info = ELECTIONS_EVENTS_INFO[EVENT_KEY]
related_dates = event_info["relevant_dates"]

# One shade of green per related date, sampled evenly from 0.4 to 1 on the
# Greens colormap and consumed in the same order as the dates.
green_shades = cm.Greens(np.linspace(0.4, 1, len(related_dates)))

for (marker_name, marker_date), shade in zip(related_dates.items(), green_shades):
    marker_day = marker_date.strftime("%m/%d/%Y")
    ax.axvline(
        marker_date,
        linestyle="--",
        color=shade,
        label=f"{marker_name} ({marker_day})",
    )

# The event's own date is drawn last, in red.
event_date = event_info["date"]
ax.axvline(
    event_date,
    linestyle="--",
    color="red",
    label=f"{event_info['name']} ({event_date.strftime('%m/%d/%Y')})",
)

ax.set_xlabel("Date")
ax.set_ylabel("Number of users")
ax.set_title("Daily number of users discussing event")
ax.legend()

daily_users_figure_path = f"{FIGURES_DIR}/{EVENT_KEY}_nr_users_by_day.pdf"
plt.savefig(
    fname=daily_users_figure_path,
    bbox_inches="tight",
    pad_inches=0,
    format="pdf",
)
plt.show()


In [None]:
# Final log record, marking that every cell above has run.
logging.info("End of script")