In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Move the working directory up one level; presumably this makes the relative
# paths used later (e.g. "data/logs/...") resolve from there.
# NOTE(review): this is not idempotent — re-running the cell moves up again.
%cd '..'

In [None]:

import os
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

import seaborn as sns
import matplotlib.pyplot as plt

from load.utils import load_df_from_parquet

from affection.utils import (
    get_compound_sentiment_score,
    get_comments_mentioning_opposition,
)
from preprocessing.utils import (
    load_event_comments,
)
from preprocessing.constants import OUTPUT_DIR

from affection.constants import (
    REPUBLICAN_KEYWORDS,
    DEMOCRACT_KEYWORDS,
)

from events.utils import get_event_regex
from eda.constants import PARTIES_COLORS


In [None]:
import logging
import sys

# Every log record goes both to a file and to the notebook's stdout.
LOG_FILE = "data/logs/affective_polarization.log"

# FileHandler opens its target as soon as it is constructed and raises when the
# parent directory is missing, so make sure the directory exists first.
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler(stream=sys.stdout),
    ],
)

In [None]:
# One regex per party key, each built by get_event_regex from that party's
# keyword list with an empty second list, the "or" mode, stem=False and
# capture=False.
PARTY_REGEXES = {
    party_key: get_event_regex(
        party_keywords,
        [],
        "or",
        stem=False,
        capture=False,
    )
    for party_key, party_keywords in (
        ("dem", DEMOCRACT_KEYWORDS),
        ("rep", REPUBLICAN_KEYWORDS),
    )
}


In [None]:
# Write both party patterns to the log so the run records what was matched.
for _party_key in ("dem", "rep"):
    logging.info(PARTY_REGEXES[_party_key])

## EVENT-SPECIFIC

In [None]:
# (theme, event_name) pairs; each pair is handed to load_event_comments below.
EVENT_NAMES = [
    ("gun_control", "mass_shootings_gun_control"),
    ("gun_control", "mass_shootings"),
    ("elections", "us_elections_2012"),
    ("elections", "us_elections_2016"),
    ("elections", "us_midterms_2014"),
    ("elections", "us_midterms_2018"),
    ("abortion", "abortion"),
]

# Resolve the dated output folder once, before the loop, so every event of this
# run is written to the same place even if scoring runs past midnight.
# Note that the folder name holds month and day only (no year).
current_date = pd.Timestamp.now().strftime("%m_%d")
OUTPUT_DIR_TODAY = f"{OUTPUT_DIR}/{current_date}"

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(OUTPUT_DIR_TODAY, exist_ok=True)

for THEME, EVENT in EVENT_NAMES:
    logging.info(f"Loading {EVENT}")

    event_comments = load_event_comments(
        theme=THEME,
        event_name=EVENT,
    )

    # Comments by Democrats mentioning Republicans, and the reverse
    # (as the result names indicate; the selection itself happens in
    # get_comments_mentioning_opposition).
    dem_event_comments_about_reps = get_comments_mentioning_opposition(
        event_comments,
        body="body_cleaned",
        party="dem",
        regexes=PARTY_REGEXES,
    )

    rep_event_comments_about_dems = get_comments_mentioning_opposition(
        event_comments,
        body="body_cleaned",
        party="rep",
        regexes=PARTY_REGEXES,
    )

    logging.info("Dem comments")
    logging.info(dem_event_comments_about_reps.shape)
    logging.info("Rep comments")
    logging.info(rep_event_comments_about_dems.shape)

    event_comments_about_opposing_party = pd.concat(
        [
            dem_event_comments_about_reps,
            rep_event_comments_about_dems,
        ]
    )

    # Score the cross-party subset and persist it.
    event_comments_about_opposing_party[
        "sentiment_score"
    ] = event_comments_about_opposing_party["body_cleaned"].progress_apply(
        get_compound_sentiment_score
    )

    event_comments_about_opposing_party.to_parquet(
        f"{OUTPUT_DIR_TODAY}/{EVENT}_comments_about_opposing_party.parquet",
    )

    # Score every comment of the event as well and persist the full frame.
    event_comments["sentiment_score"] = event_comments["body_cleaned"].progress_apply(
        get_compound_sentiment_score
    )

    event_comments.to_parquet(
        f"{OUTPUT_DIR_TODAY}/{EVENT}_comments.parquet",
    )


## GENERAL

In [None]:
# Resolve the dated output folder once, before the loop, so every year of this
# run is written to the same place even if scoring runs past midnight.
# Note that the folder name holds month and day only (no year).
current_date = pd.Timestamp.now().strftime("%m_%d")
OUTPUT_DIR_TODAY = f"{OUTPUT_DIR}/{current_date}"

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(OUTPUT_DIR_TODAY, exist_ok=True)

for YEAR in [2014, 2015, 2016, 2017, 2018, 2019]:
    # Say which year is in progress; the messages below do not include it.
    logging.info(f"Loading partisan comments for {YEAR}")

    partisan_comments = load_df_from_parquet(
        file_name=f"user_partisan_comments_{YEAR}.parquet",
    )

    # filter comments by democrats mentioning republicans

    logging.info("Filtering comments by democrats mentioning republican")
    dem_comments_about_reps = get_comments_mentioning_opposition(
        partisan_comments,
        body="body_cleaned",
        party="dem",
        regexes=PARTY_REGEXES,
    )

    logging.info("Filtering comments by republicans mentioning democrats")
    rep_comments_about_dems = get_comments_mentioning_opposition(
        partisan_comments,
        body="body_cleaned",
        party="rep",
        regexes=PARTY_REGEXES,
    )

    # concatenate the two dataframes

    comments_about_opposing_party = pd.concat(
        [
            dem_comments_about_reps,
            rep_comments_about_dems,
        ]
    )

    logging.info("Total comments")
    logging.info(comments_about_opposing_party.shape)
    logging.info("Dem comments")
    logging.info(dem_comments_about_reps.shape)
    logging.info("Rep comments")
    logging.info(rep_comments_about_dems.shape)

    logging.info("Calculating sentiment scores")

    # NOTE(review): this column is "sentiment_scores" (plural) while the
    # event-specific outputs use "sentiment_score"; the analysis cells below
    # read the plural name, so it is kept as is.
    comments_about_opposing_party["sentiment_scores"] = comments_about_opposing_party[
        "body_cleaned"
    ].progress_apply(get_compound_sentiment_score)

    logging.info("Saving sentiment scores")

    comments_about_opposing_party.to_parquet(
        f"{OUTPUT_DIR_TODAY}/comments_about_opposing_party_{YEAR}.parquet",
    )


In [None]:
# Which yearly output the analysis cells below look at.
ANALYSIS_YEAR = 2016

# NOTE(review): the scoring loop saves into a month_day folder named after the
# day it ran, so passing today's date presumably only finds files produced
# today — confirm how load_df_from_parquet uses `date`.
current_date = pd.Timestamp.now().strftime("%m_%d")

comments_about_opposing_party = load_df_from_parquet(
    f"comments_about_opposing_party_{ANALYSIS_YEAR}.parquet",
    date=current_date,
)

In [None]:
# Average sentiment score within each value of the "type" column.
# NOTE(review): the following cells group by "party" — confirm "type" is the
# intended column here.
(
    comments_about_opposing_party
    .groupby("type")["sentiment_scores"]
    .mean()
)


In [None]:
# Mean sentiment score per (author, party) pair, flattened back to columns.
user_sentiment = (
    comments_about_opposing_party
    .groupby(["author", "party"])["sentiment_scores"]
    .mean()
    .reset_index()
)


In [None]:
sns.set_theme()

# Explicit figure/axes instead of the implicit pyplot state, so the labels below
# are attached to exactly this plot.
fig, ax = plt.subplots()

# Distribution of the per-author mean sentiment scores, one box per party.
sns.boxplot(
    data=user_sentiment,
    x="party",
    y="sentiment_scores",
    palette=PARTIES_COLORS,
    ax=ax,
)

# Title and axis labels so the figure can be read on its own.
ax.set(
    title="Per-author mean sentiment in comments about the opposing party",
    xlabel="Party",
    ylabel="Mean sentiment score",
)

plt.show()


In [None]:
# Parse the epoch-second "created_utc" values once, then store both the full
# timestamp ("date") and its calendar day ("day") on the frame.
created_at = pd.to_datetime(comments_about_opposing_party["created_utc"], unit="s")

comments_about_opposing_party["date"] = created_at
comments_about_opposing_party["day"] = created_at.dt.date


In [None]:
# Explicit figure/axes instead of the implicit pyplot state, so the labels below
# are attached to exactly this plot.
fig, ax = plt.subplots(figsize=(20, 10))

# One line per party; seaborn aggregates the many comments that share a day
# (by default the mean, drawn with a confidence band).
sns.lineplot(
    data=comments_about_opposing_party,
    x="day",
    y="sentiment_scores",
    hue="party",
    ax=ax,
)

# Title and axis labels so the figure can be read on its own.
ax.set(
    title="Daily sentiment in comments about the opposing party, by party",
    xlabel="Day",
    ylabel="Sentiment score",
)

plt.show()