In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Move the working directory one level up; the relative paths used later in
# this notebook (e.g. data/logs/...) are resolved from there.
# NOTE(review): not idempotent - re-running this cell moves up another level.
%cd '..'

In [None]:
import logging
import sys
import os

# Log to a file (persistent record) and to stdout (visible in the notebook).
log_file_path = "data/logs/mass_shootings_polarization.log"

# logging.FileHandler opens the file immediately and raises FileNotFoundError
# when the parent directory is missing, so create it up front.
os.makedirs(os.path.dirname(log_file_path), exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(log_file_path),
        logging.StreamHandler(stream=sys.stdout),
    ],
)


In [None]:
import json

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

sns.set_theme()

from eda.constants import FIGURES_DIR
from polarization.utils import (
    calculate_polarization,
    calculate_polarization_by_time,
)
from preprocessing.utils import (
    load_event_comments,
    load_event_vocab,
)
from preprocessing.constants import (
    OUTPUT_DIR,
)

from events.mass_shootings_constants import (
    MASS_SHOOTINGS_EVENTS_INFO,
)

# Default to wide figures; individual cells override this via plt.figure(figsize=...).
sns.set_theme(rc={"figure.figsize": (40, 12)})

# "month_day" stamp of the current run, used to name today's output folders.
CURRENT_DATE = pd.to_datetime("today").strftime("%m_%d")


In [None]:
# THEME and EVENT_KEY are passed to the loaders below and are also used to
# build the figure/output directory and file names.
THEME = "gun_control"
EVENT_KEY = "mass_shootings"

# Record the event metadata in the log for reference.
logging.info(MASS_SHOOTINGS_EVENTS_INFO)

In [None]:
# Load the comments and the vocabulary for the mass-shootings event.
# NOTE(review): the loaders live in preprocessing.utils; the exact schema of
# what they return is not visible here.
event_comments = load_event_comments(theme=THEME, event_name=EVENT_KEY)
event_vocab = load_event_vocab(theme=THEME, event_name=EVENT_KEY)

In [None]:
# Sanity check: log the shape of the loaded comments.
logging.info(event_comments.shape)


In [None]:

# Comment counts for the 20 most frequent subreddits, split by party.
plt.figure(figsize=(20, 12))
sns.countplot(
    y="subreddit",
    data=event_comments,
    hue="party",
    # value_counts() sorts descending, so the first 20 are the most frequent.
    order=event_comments["subreddit"].value_counts().iloc[:20].index,
    palette={"rep": "#E81B23", "dem": "#00AEF3"},
)

# Today's figure directory; the later plotting cells reuse FIGURES_DIR_TODAY.
FIGURES_DIR_TODAY = f"{FIGURES_DIR}/{THEME}/{CURRENT_DATE}"

# exist_ok=True replaces the separate exists() check and is safe on re-runs.
os.makedirs(FIGURES_DIR_TODAY, exist_ok=True)

plt.savefig(
    fname=f"{FIGURES_DIR_TODAY}/{EVENT_KEY}_subreddit_hist.pdf",
    bbox_inches="tight",
    pad_inches=0,
    format="pdf",
)

plt.show()


In [None]:
# Overall number of comments per party.
plt.figure(figsize=(20, 3))
sns.countplot(
    data=event_comments,
    y="party",
    palette={"rep": "#E81B23", "dem": "#00AEF3"},
)

party_hist_path = f"{FIGURES_DIR_TODAY}/{EVENT_KEY}_party_hist.pdf"
plt.savefig(fname=party_hist_path, bbox_inches="tight", pad_inches=0, format="pdf")

plt.show()


### Total polarization

In [None]:
# Compute the leave-out polarization over all event comments (uni- and bigrams)
# and store the raw result as JSON so it can be reloaded without recomputing.
logging.info("Calculating leave-out polarization...")

event_polarization = calculate_polarization(
    event_comments,
    ngram_range=(1, 2),
    event_vocab=event_vocab,
    method="leaveout",
    equalize_users=True,
)

logging.info("Saving leave-out polarization...")
# Today's output directory; later cells reuse OUTPUT_DIR_TODAY.
OUTPUT_DIR_TODAY = f"{OUTPUT_DIR}/polarization/{THEME}/{CURRENT_DATE}"

# exist_ok=True replaces the separate exists() check and is safe on re-runs.
os.makedirs(OUTPUT_DIR_TODAY, exist_ok=True)

with open(f"{OUTPUT_DIR_TODAY}/{EVENT_KEY}_leaveout_polarization.json", "w") as file:
    json.dump(event_polarization, file)


In [None]:
# Load leave-out polarization json

# event_polarization = json.load(
#     open(
#         f"{OUTPUT_DIR_TODAY}/{EVENT_KEY}_leaveout_polarization.json",
#     )
# )

In [None]:
# Split the result into the overall value and the two per-party sequences.
# NOTE(review): assumes calculate_polarization returns
# (total, (dem_values, rep_values)) - confirm in polarization.utils.
total_pol, (dem_polarization, rep_polarization) = event_polarization

In [None]:
# Plot user polarization

# One row per user value, labelled with the party it belongs to. Each label
# run must be as long as its own value list; otherwise zip() silently drops
# the trailing Republican values when there are more of them than Democrats.
user_pols = pd.DataFrame(
    zip(
        dem_polarization + rep_polarization,
        ["Democrats"] * len(dem_polarization) + ["Republicans"] * len(rep_polarization),
    ),
    columns=["Polarization", "Affiliation"],
)

plt.figure()

ax = sns.boxplot(
    y="Polarization",
    x="Affiliation",
    data=user_pols,
    palette={"Republicans": "#E81B23", "Democrats": "#00AEF3"},
)

ax.set_xlabel("Affiliation")
ax.set_ylabel("User polarization")
ax.set_title(f"Mass Shootings (Total polarization: {total_pol[0]:.3f})")

# The reported numbers are means (np.mean), so the legend says "Mean"
# rather than "Median".
ax.legend(
    labels=[
        f"Mean democrat polarization: {np.mean(dem_polarization):.3f}",
        f"Mean republican polarization: {np.mean(rep_polarization):.3f}",
    ],
)

plt.savefig(
    fname=f"{FIGURES_DIR_TODAY}/{EVENT_KEY}_user_leaveout_polarization.pdf",
    bbox_inches="tight",
    pad_inches=0,
    format="pdf",
)

plt.show()


In [None]:
# Two-sample t-test between the Democrat and Republican user values;
# equal_var=False selects Welch's variant (no equal-variance assumption).
t = stats.ttest_ind(dem_polarization, rep_polarization, equal_var=False)

logging.info(t)

In [None]:
# Leave-out polarization of each individual shooting.
# Loop-local names are used on purpose: reusing event_polarization / total_pol /
# dem_polarization / rep_polarization here would overwrite the overall results
# computed above, so re-running the plot or t-test cell afterwards would
# silently use the last shooting only.
for shooting_name, shooting_comments in event_comments.groupby("event_name"):
    # Only the total value is logged; the per-party values are not needed here.
    shooting_total_pol, _ = calculate_polarization(
        shooting_comments,
        ngram_range=(1, 2),
        event_vocab=event_vocab,
        method="leaveout",
        equalize_users=False,
    )

    logging.info(f"{shooting_name} - polarization: {shooting_total_pol}")



### Approach 2

In [None]:
# Event key for the second approach; used to load its comments/vocabulary and
# to name its output files.
EVENT_KEY_GUN_CONTROL = "mass_shootings_gun_control"

In [None]:
# Load the data for the second approach.
# NOTE(review): this overwrites event_comments / event_vocab from the first
# approach, so cells above must not be re-run after this one without reloading.
event_comments = load_event_comments(theme=THEME, event_name=EVENT_KEY_GUN_CONTROL)
event_vocab = load_event_vocab(theme=THEME, event_name=EVENT_KEY_GUN_CONTROL)

In [None]:
# Leave-out polarization for the data loaded under EVENT_KEY_GUN_CONTROL,
# written to today's output directory as JSON.
logging.info("Calculating leave-out polarization...")
event_polarization = calculate_polarization(
    event_comments,
    event_vocab=event_vocab,
    ngram_range=(1, 2),
    method="leaveout",
    equalize_users=True,
)

logging.info("Saving leave-out polarization...")
gun_control_json_path = (
    f"{OUTPUT_DIR_TODAY}/{EVENT_KEY_GUN_CONTROL}_leaveout_polarization.json"
)
with open(gun_control_json_path, "w") as json_file:
    json.dump(event_polarization, json_file)

In [None]:
# Load leave-out polarization json

# event_polarization = json.load(
#     open(
#         f"{OUTPUT_DIR_TODAY}/{EVENT_KEY_GUN_CONTROL}_leaveout_polarization.json",
#     )
# )

In [None]:
# Split the result into the overall value and the two per-party sequences.
# NOTE(review): assumes calculate_polarization returns
# (total, (dem_values, rep_values)) - confirm in polarization.utils.
total_pol, (dem_polarization, rep_polarization) = event_polarization

In [None]:
# Plot user polarization

# One row per user value, labelled with the party it belongs to. Each label
# run must be as long as its own value list; otherwise zip() silently drops
# the trailing Republican values when there are more of them than Democrats.
user_pols = pd.DataFrame(
    zip(
        dem_polarization + rep_polarization,
        ["Democrats"] * len(dem_polarization) + ["Republicans"] * len(rep_polarization),
    ),
    columns=["Polarization", "Affiliation"],
)

plt.figure()

ax = sns.boxplot(
    y="Polarization",
    x="Affiliation",
    data=user_pols,
    palette={"Republicans": "#E81B23", "Democrats": "#00AEF3"},
)

ax.set_xlabel("Affiliation")
ax.set_ylabel("User polarization")
ax.set_title(f"Mass Shootings (Total polarization: {total_pol[0]:.3f})")

# The reported numbers are means (np.mean), so the legend says "Mean"
# rather than "Median".
ax.legend(
    labels=[
        f"Mean democrat polarization: {np.mean(dem_polarization):.3f}",
        f"Mean republican polarization: {np.mean(rep_polarization):.3f}",
    ],
)

plt.savefig(
    fname=f"{FIGURES_DIR_TODAY}/{EVENT_KEY_GUN_CONTROL}_user_leaveout_polarization.pdf",
    bbox_inches="tight",
    pad_inches=0,
    format="pdf",
)

plt.show()


In [None]:
# For every shooting: compute and save the total leave-out polarization, compute
# and save the daily polarization, then plot daily polarization and user counts.
for shooting_name, shooting_comments in event_comments.groupby("event_name"):
    # Key used in the names of every file written for this shooting.
    output_key = f"{shooting_name}_gun_control"

    # Loop-local names keep the overall event_polarization / total_pol of the
    # cells above intact.
    shooting_polarization = calculate_polarization(
        shooting_comments,
        ngram_range=(1, 2),
        event_vocab=event_vocab,
        method="leaveout",
        equalize_users=False,
    )
    shooting_total_pol, _ = shooting_polarization

    logging.info("Saving leave-out polarization...")
    with open(f"{OUTPUT_DIR_TODAY}/{output_key}_leaveout_polarization.json", "w") as file:
        json.dump(shooting_polarization, file)

    logging.info(f"{output_key} - polarization: {shooting_total_pol}")

    logging.info("Calculating leave-out polarization by day...")

    # assign() returns a new frame, so the groupby slice is not mutated in
    # place (in-place column assignment can raise SettingWithCopyWarning).
    shooting_comments = shooting_comments.assign(
        datetime=pd.to_datetime(shooting_comments["created_utc"], unit="s"),
    )

    # Computed once per shooting; the previous version repeated this call and
    # the parquet write a second time with identical arguments.
    pol_by_day_events = calculate_polarization_by_time(
        shooting_comments,
        ngram_range=(1, 2),
        event_vocab=event_vocab,
        freq="D",
        equalize_users=True,
    )

    logging.info("Saving daily polarization data...")
    daily_parquet_path = (
        f"{OUTPUT_DIR_TODAY}/{output_key}_leaveout_polarization_by_day.parquet"
    )
    pol_by_day_events.to_parquet(daily_parquet_path, index=False)

    # Plot from the stored file, so the figures match what was persisted.
    pol_by_day_events = pd.read_parquet(daily_parquet_path)

    # Plot daily polarization

    polarization_fig = plt.figure()

    ax = sns.lineplot(
        data=pol_by_day_events,
        x="date",
        y="polarization",
        label="Polarization",
    )
    sns.lineplot(
        data=pol_by_day_events,
        x="date",
        y="random_polarization",
        label="Polarization with random user assignment",
        ax=ax,
    )

    ax.set_xlabel("Date")
    ax.set_ylabel("Polarization")
    ax.set_title("Daily polarization score")
    ax.legend()

    polarization_fig.savefig(
        fname=f"{FIGURES_DIR_TODAY}/{output_key}_leaveout_polarization_by_day.pdf",
        bbox_inches="tight",
        pad_inches=0,
        format="pdf",
    )
    plt.show()
    # Release the figure; otherwise every iteration keeps its figures alive.
    plt.close(polarization_fig)

    # Plot daily user count

    users_fig = plt.figure()

    ax = sns.lineplot(
        data=pol_by_day_events,
        x="date",
        y="user_cnt",
    )

    ax.set_xlabel("Date")
    ax.set_ylabel("Number of users")
    ax.set_title("Daily number of users discussing event")
    # No legend here: the single line has no label, so plt.legend() only
    # produced a "no artists with labels" warning.

    users_fig.savefig(
        fname=f"{FIGURES_DIR_TODAY}/{output_key}_nr_users_by_day.pdf",
        bbox_inches="tight",
        pad_inches=0,
        format="pdf",
    )
    plt.show()
    plt.close(users_fig)
