In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Move the working directory up one level; the relative paths used later
# (e.g. "data/logs/...") are resolved from there.
# NOTE(review): this is not idempotent - re-running the cell climbs one more
# directory each time. Presumably the parent is the repository root; confirm.
%cd '..'

In [None]:

import os
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

import seaborn as sns
sns.set_theme()
import matplotlib.pyplot as plt

from load.utils import load_df_from_parquet

from affection.utils import (
    get_compound_sentiment_score,
    get_comments_mentioning_opposition,
)
from preprocessing.utils import (
    load_event_comments,
)
from preprocessing.constants import OUTPUT_DIR
from eda.constants import FIGURES_DIR

from affection.constants import (
    REPUBLICAN_KEYWORDS,
    DEMOCRACT_KEYWORDS,
)

from events.utils import get_event_regex
from events.mass_shootings_constants import MASS_SHOOTINGS_EVENTS_INFO
from events.event_constants import EVENTS_INFO
from eda.constants import PARTIES_COLORS


In [None]:
import logging
import sys

# Send INFO-level (and above) records both to a log file and to the
# notebook's stdout, using a timestamped "[LEVEL] message" format.
LOG_FILE = "data/logs/affective_polarization.log"

# logging.FileHandler opens the file immediately and does not create missing
# parent directories, so make sure the log folder exists first.
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler(stream=sys.stdout),
    ],
)

In [None]:
# One regex per party key ("dem" / "rep"), each built by get_event_regex from
# that party's keyword list. Every other argument is identical for both
# parties: an empty second list, the "or" mode, stem=False and capture=False.
PARTY_REGEXES = {
    party_key: get_event_regex(
        party_keywords,
        [],
        "or",
        stem=False,
        capture=False,
    )
    for party_key, party_keywords in (
        ("dem", DEMOCRACT_KEYWORDS),
        ("rep", REPUBLICAN_KEYWORDS),
    )
}


In [None]:
# Write both party patterns to the log (Democrat first, then Republican) so
# the exact regexes used for this run are recorded.
for party_key in ("dem", "rep"):
    logging.info(PARTY_REGEXES[party_key])

## EVENT-SPECIFIC

In [None]:
# (theme, event) pairs iterated by the event-specific cells below.
# Currently disabled gun-control entries:
#   ("gun_control", "mass_shootings_gun_control")
#   ("gun_control", "mass_shootings")
EVENT_NAMES = [
    *(
        ("elections", election_event)
        for election_event in (
            "us_elections_2012",
            "us_elections_2016",
            "us_midterms_2014",
            "us_midterms_2018",
        )
    ),
    ("abortion", "abortion"),
]

In [None]:
# For every configured event: load its comments, select the comments in which
# each party mentions the opposing one, score sentiment, and persist both the
# opposing-party subset and the full set of event comments.

# Resolve today's dated output folder once, before the loop, so every event of
# this run is written to the same directory. exist_ok avoids the separate
# existence check.
current_date = pd.Timestamp.now().strftime("%m_%d")
OUTPUT_DIR_TODAY = f"{OUTPUT_DIR}/{current_date}"
os.makedirs(OUTPUT_DIR_TODAY, exist_ok=True)

for THEME, EVENT in EVENT_NAMES:
    logging.info(f"Loading {EVENT}")

    event_comments = load_event_comments(
        theme=THEME,
        event_name=EVENT,
    )

    dem_event_comments_about_reps = get_comments_mentioning_opposition(
        event_comments,
        body="body_cleaned",
        party="dem",
        regexes=PARTY_REGEXES,
    )

    rep_event_comments_about_dems = get_comments_mentioning_opposition(
        event_comments,
        body="body_cleaned",
        party="rep",
        regexes=PARTY_REGEXES,
    )

    logging.info("Dem comments")
    logging.info(dem_event_comments_about_reps.shape)
    logging.info("Rep comments")
    logging.info(rep_event_comments_about_dems.shape)

    event_comments_about_opposing_party = pd.concat(
        [
            dem_event_comments_about_reps,
            rep_event_comments_about_dems,
        ]
    )

    # Sentiment of the comments that mention the opposing party.
    event_comments_about_opposing_party[
        "sentiment_score"
    ] = event_comments_about_opposing_party["body_cleaned"].progress_apply(
        get_compound_sentiment_score
    )

    # Both outputs of a run live side by side in today's dated folder.
    event_comments_about_opposing_party.to_parquet(
        f"{OUTPUT_DIR_TODAY}/{EVENT}_comments_about_opposing_party.parquet",
    )

    # Sentiment of every comment of the event, regardless of mentions.
    event_comments["sentiment_score"] = event_comments["body_cleaned"].progress_apply(
        get_compound_sentiment_score
    )

    event_comments.to_parquet(
        f"{OUTPUT_DIR_TODAY}/{EVENT}_comments.parquet",
    )


In [None]:
# Lift the column-width limit so long comment bodies are shown in full.
pd.options.display.max_colwidth = None

# Displays the frame left over from the last iteration of the scoring loop.
event_comments_about_opposing_party

In [None]:
# For every configured event, plot the sentiment of comments about the
# opposing party over time, once aggregated per week and once per month, and
# save each figure as a PDF.
print("Event comments about opposing party")

for THEME, EVENT in EVENT_NAMES:
    logging.info(f"Loading {EVENT}")

    event_comments_about_opposing_party = load_df_from_parquet(
        file_name=f"{EVENT}_comments_about_opposing_party_sentiment.parquet",
    )

    # Human-readable party labels for the legend.
    event_comments_about_opposing_party["Party"] = event_comments_about_opposing_party[
        "party"
    ].replace({"dem": "Democrat", "rep": "Republican"})

    # "created_utc" is parsed as seconds since the epoch.
    event_comments_about_opposing_party["date"] = pd.to_datetime(
        event_comments_about_opposing_party["created_utc"],
        unit="s",
    )

    # Time buckets: calendar day, and the start of the enclosing week / month.
    event_comments_about_opposing_party["day"] = event_comments_about_opposing_party[
        "date"
    ].dt.date
    event_comments_about_opposing_party["week"] = (
        event_comments_about_opposing_party["date"].dt.to_period("W").dt.start_time
    )
    event_comments_about_opposing_party["month"] = (
        event_comments_about_opposing_party["date"].dt.to_period("M").dt.start_time
    )

    # One figure per granularity. The axis labels are set BEFORE savefig so
    # that they are part of the saved PDF as well as the inline output.
    for time_column, granularity in (("week", "weekly"), ("month", "monthly")):
        plt.figure(figsize=(12, 3))

        sns.lineplot(
            event_comments_about_opposing_party,
            x=time_column,
            y="sentiment_score",
            hue="Party",
            palette=PARTIES_COLORS,
            marker="o",
            errorbar=("ci", 95),
        )

        plt.ylabel("Sentiment score")
        plt.xlabel("Date")

        plt.savefig(
            f"{FIGURES_DIR}/affective_pol/{EVENT}_opposing_{granularity}_sentiment_scores.pdf",
            bbox_inches="tight",
        )

        plt.show()


In [None]:
# For every configured event, plot the monthly sentiment of all event comments
# per party and save the figure as a PDF.
print("All event comments")

for THEME, EVENT in EVENT_NAMES:
    logging.info(f"Loading {EVENT}")

    event_comments = load_df_from_parquet(
        file_name=f"{EVENT}_comments_sentiment.parquet",
    )

    # Human-readable party labels for the legend.
    party_labels = {'dem': 'Democrat', 'rep': 'Republican'}
    event_comments["Party"] = event_comments["party"].replace(party_labels)

    # Parse the epoch-second timestamps once and derive the time buckets
    # (calendar day, start of week, start of month) from the parsed series.
    created_at = pd.to_datetime(event_comments["created_utc"], unit="s")
    event_comments["date"] = created_at
    event_comments["day"] = created_at.dt.date
    event_comments["week"] = created_at.dt.to_period("W").dt.start_time
    event_comments["month"] = created_at.dt.to_period('M').dt.start_time

    fig, ax = plt.subplots(figsize=(12, 3))

    sns.lineplot(
        data=event_comments,
        x="month",
        y="sentiment_score",
        hue="Party",
        palette=PARTIES_COLORS,
        marker="o",
        errorbar=("ci", 95),
        ax=ax,
    )

    ax.set_ylabel("Sentiment score")
    ax.set_xlabel("Date")

    fig.savefig(
        f"{FIGURES_DIR}/affective_pol/{EVENT}_monthly_sentiment_scores.pdf",
        bbox_inches='tight',
    )

    plt.show()


## GENERAL

In [None]:
# For each year: load the partisan comments, keep those in which a party
# mentions the opposing one, score their sentiment and write the result to
# today's dated output folder.

# Resolve and create the dated output folder once; it does not depend on the
# year being processed. exist_ok avoids the separate existence check.
current_date = pd.Timestamp.now().strftime("%m_%d")
OUTPUT_DIR_TODAY = f"{OUTPUT_DIR}/{current_date}"
os.makedirs(OUTPUT_DIR_TODAY, exist_ok=True)

# NOTE(review): this loop covers 2014-2019, while the cell that loads
# "comments_about_opposing_party_{YEAR}.parquet" iterates 2012-2019 - confirm
# which range is intended.
for YEAR in [2014, 2015, 2016, 2017, 2018, 2019]:

    partisan_comments = load_df_from_parquet(
        file_name=f"user_partisan_comments_{YEAR}.parquet",
    )

    # filter comments by democrats mentioning republicans

    logging.info("Filtering comments by democrats mentioning republican")
    dem_comments_about_reps = get_comments_mentioning_opposition(
        partisan_comments,
        body="body_cleaned",
        party="dem",
        regexes=PARTY_REGEXES,
    )

    logging.info("Filtering comments by republicans mentioning democrats")
    rep_comments_about_dems = get_comments_mentioning_opposition(
        partisan_comments,
        body="body_cleaned",
        party="rep",
        regexes=PARTY_REGEXES,
    )

    # concatenate the two dataframes

    comments_about_opposing_party = pd.concat(
        [
            dem_comments_about_reps,
            rep_comments_about_dems,
        ]
    )

    logging.info("Total comments")
    logging.info(comments_about_opposing_party.shape)
    logging.info("Dem comments")
    logging.info(dem_comments_about_reps.shape)
    logging.info("Rep comments")
    logging.info(rep_comments_about_dems.shape)

    logging.info("Calculating sentiment scores")

    # Note the plural column name "sentiment_scores": the cells that read
    # these yearly files use the same spelling.
    comments_about_opposing_party["sentiment_scores"] = comments_about_opposing_party[
        "body_cleaned"
    ].progress_apply(get_compound_sentiment_score)

    logging.info("Saving sentiment scores")

    comments_about_opposing_party.to_parquet(
        f"{OUTPUT_DIR_TODAY}/comments_about_opposing_party_{YEAR}.parquet",
    )


In [None]:
# For each year: load the scored comments about the opposing party, show
# summary statistics, draw a per-user box plot and a monthly line plot, and
# collect the yearly frames in a list for the combined plot further down.
current_date = pd.Timestamp.now().strftime("%m_%d")

print("Comments about opposing party for the whole year")

comments_about_opposing_party = []

for YEAR in [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]:
    print(YEAR)
    comments_about_opposing_party_year = load_df_from_parquet(
        f"comments_about_opposing_party_{YEAR}.parquet",
        date=current_date,
    )

    # Mean sentiment per "type". A bare expression inside a loop is never
    # rendered by the notebook, so display it explicitly.
    display(comments_about_opposing_party_year.groupby("type")["sentiment_scores"].mean())

    # Average sentiment per (author, party), so that every user contributes
    # one point to the box plot regardless of how many comments they wrote.
    user_sentiment = comments_about_opposing_party_year.groupby(
        [
            "author",
            "party",
        ]
    )["sentiment_scores"].mean()

    user_sentiment = user_sentiment.reset_index()

    # NOTE(review): x uses the raw "party" codes while the line plot below
    # uses the relabelled "Party" column with the same palette - confirm that
    # PARTIES_COLORS covers both sets of values.
    sns.boxplot(
        user_sentiment,
        x="party",
        y="sentiment_scores",
        palette=PARTIES_COLORS,
    )

    plt.show()

    ################

    # "created_utc" is parsed as seconds since the epoch.
    comments_about_opposing_party_year["date"] = pd.to_datetime(
        comments_about_opposing_party_year["created_utc"],
        unit="s",
    )

    # replace the values in the 'party' column using the mapping dictionary
    comments_about_opposing_party_year['Party'] = comments_about_opposing_party_year['party'].replace( {'dem': 'Democrat', 'rep': 'Republican'})

    # Time buckets: calendar day, and the start of the enclosing week / month.
    comments_about_opposing_party_year["day"] = comments_about_opposing_party_year["date"].dt.date
    comments_about_opposing_party_year["week"] = (
        comments_about_opposing_party_year["date"].dt.to_period("W").dt.start_time
    )
    comments_about_opposing_party_year["month"] = (
        comments_about_opposing_party_year["date"].dt.to_period("M").dt.start_time
    )

    plt.figure(figsize=(12, 3))

    sns.lineplot(
        comments_about_opposing_party_year,
        x="month",
        y="sentiment_scores",
        hue="Party",
        palette=PARTIES_COLORS,
        marker="o",
        errorbar=("ci", 95),
    )

    plt.ylabel("Sentiment score")
    plt.xlabel("Date")

    plt.savefig(
        f"{FIGURES_DIR}/affective_pol/opposing_party_{YEAR}_monthly_sentiment_scores.pdf",
        bbox_inches='tight',
    )

    plt.show()

    comments_about_opposing_party.append(comments_about_opposing_party_year)


In [None]:
# Merge the per-year frames into a single DataFrame. The name is rebound from
# a list to a DataFrame, so guard on the type: re-running this cell would
# otherwise hand a DataFrame to pd.concat and raise.
if isinstance(comments_about_opposing_party, list):
    comments_about_opposing_party = pd.concat(comments_about_opposing_party)

In [None]:
# Rich-display the combined multi-year frame of comments about the opposing party.
display(comments_about_opposing_party)

In [None]:
# Monthly sentiment towards the opposing party across all loaded years, one
# line per party with a 95% confidence band; saved as a PDF and shown inline.
fig, ax = plt.subplots(figsize=(12, 3))

sns.lineplot(
    data=comments_about_opposing_party,
    x="month",
    y="sentiment_scores",
    hue="Party",
    palette=PARTIES_COLORS,
    marker="o",
    errorbar=("ci", 95),
    ax=ax,
)

ax.set_ylabel("Sentiment score")
ax.set_xlabel("Date")

fig.savefig(
    f"{FIGURES_DIR}/affective_pol/opposing_party_monthly_sentiment_scores.pdf",
    bbox_inches='tight',
)

plt.show()


In [None]:
# Theme and event analysed by the remaining cells (mass shootings / gun control).
event_theme, event_name = "gun_control", "mass_shootings_gun_control"

import math

In [None]:
# Build one row per comment, annotated with per-(event, party) statistics and
# the date of the event, for the annotated plot below.
print("Event comments about opposing party")

event_comments_about_opposing_party = load_df_from_parquet(
    file_name=f"{event_name}_comments_about_opposing_party_sentiment.parquet",
)

# Human-readable party labels for the legend.
party_labels = {'dem': 'Democrat', 'rep': 'Republican'}
event_comments_about_opposing_party['Party'] = (
    event_comments_about_opposing_party['party'].replace(party_labels)
)

# Per (event, party): the list of individual scores, their mean, and the
# number of comments.
per_event_party = (
    event_comments_about_opposing_party
    .groupby(["event_name", "Party"])
    .agg(
        sentiment_score=("sentiment_score", list),
        mean=("sentiment_score", "mean"),
        count=("id", "count"),
    )
    .reset_index()
)

# Attach the date recorded for each event in MASS_SHOOTINGS_EVENTS_INFO.
per_event_party["date"] = per_event_party["event_name"].map(
    lambda name: MASS_SHOOTINGS_EVENTS_INFO[name]["date"]
)

# Explode the score lists back into one row per comment. The index label of
# each (event, party) row is repeated for all of its exploded rows; the
# plotting cell relies on that.
event_comments_about_opposing_party_by_event = per_event_party.explode('sentiment_score')



In [None]:
# Inspect the per-(event, party) frame after the sentiment-score lists were exploded.
event_comments_about_opposing_party_by_event

In [None]:
# Inspect the event metadata; the surrounding cells read its "date" and "name" entries.
MASS_SHOOTINGS_EVENTS_INFO

In [None]:
# Sentiment towards the opposing party around each event, one line per party,
# with every event annotated by name at its mean score.

# Hand-tuned label placement to limit overlap: events (numbered by half their
# even row label in the grouped frame) whose text is anchored differently
# from the default "right" / "bottom".
LEFT_ALIGNED_EVENTS = (0, 2, 3, 7, 14, 8, 18)
TOP_ALIGNED_EVENTS = (19,)

plt.figure(figsize=(15, 4))
sns.lineplot(
    data=event_comments_about_opposing_party_by_event,
    x="date",
    y="sentiment_score",
    hue="Party",
    marker="o",
    color="k",
    palette=PARTIES_COLORS,
)

# The frame holds one row per comment, and all rows exploded from the same
# (event, party) group share one index label. Only the first row per label is
# needed for annotation, so iterate over those instead of every comment.
label_rows = event_comments_about_opposing_party_by_event[
    ~event_comments_about_opposing_party_by_event.index.duplicated(keep="first")
]

for i, row in label_rows.iterrows():
    # Annotate each event once, using the even-labelled group only.
    # NOTE(review): this assumes every event has exactly two party rows, so
    # even labels are the first party of each event - confirm.
    if i % 2 != 0:
        continue

    event_index = i // 2
    print(event_index, row["event_name"])
    plt.text(
        row["date"],
        row["mean"],
        # Drop the last space-separated token of the event's display name.
        MASS_SHOOTINGS_EVENTS_INFO[row["event_name"]]["name"].rsplit(' ', 1)[0],
        horizontalalignment="left" if event_index in LEFT_ALIGNED_EVENTS else "right",
        verticalalignment="top" if event_index in TOP_ALIGNED_EVENTS else "bottom",
        fontsize="x-small"
    )

plt.legend(loc='lower right')

plt.ylabel("Sentiment score")
plt.xlabel("Date")

plt.savefig(
    f"{FIGURES_DIR}/affective_pol/{event_name}_opposing_party_monthly_sentiment_scores.pdf",
    bbox_inches="tight",
)

plt.show()
