In [None]:
# Enable the autoreload extension in mode 2 so edits to imported modules are
# picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Move the working directory one level up.
# NOTE(review): presumably this makes the project packages imported below and the
# relative "data/logs" path resolvable — confirm against the repository layout.
# The path is relative, so re-running this cell moves up one more level each time;
# run it exactly once per kernel session.
%cd '..'

In [None]:

import os
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

from load.utils import load_df_from_parquet

from affection.utils import (
    get_compound_sentiment_score,
)
from preprocessing.constants import OUTPUT_DIR

from affection.constants import (
    REPUBLICAN_KEYWORDS,
    DEMOCRACT_KEYWORDS,
)

from events.utils import get_event_regex


In [None]:
import logging
import os
import sys

# Log file path, relative to the current working directory.
LOG_FILE = "data/logs/affective_polarization.log"

# logging.FileHandler opens the file immediately and does not create missing
# parent directories, so make sure the log directory exists first.
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

# Send INFO-and-above records both to the log file and to stdout (so they show
# up in the notebook output).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler(stream=sys.stdout),
    ],
)

## EVENT-SPECIFIC

In [None]:
# TODO: implement the event-specific analysis (this section is still an empty placeholder).

## GENERAL

In [None]:
# One regex per party, built from that party's keyword list by get_event_regex
# (called with an empty second argument, "or", and stem=True).
dem_keywords_regex, rep_keywords_regex = (
    get_event_regex(party_keywords, [], "or", stem=True)
    for party_keywords in (DEMOCRACT_KEYWORDS, REPUBLICAN_KEYWORDS)
)


In [None]:
# Write both patterns to the log so each run records the regexes it used.
for keyword_regex in (dem_keywords_regex, rep_keywords_regex):
    logging.info(keyword_regex)

In [None]:
# Year whose partisan-comments file is analysed; also used in the output file name.
YEAR = 2016

partisan_comments_file = f"user_partisan_comments_{YEAR}.parquet"
partisan_comments = load_df_from_parquet(file_name=partisan_comments_file)

In [None]:
# Keep only "out-party" comments: comments written by a member of one party that
# match the opposing party's keyword regex and do NOT match their own party's.


def _matches(text, pattern):
    """Return a boolean mask marking the rows of ``text`` that match ``pattern``.

    Missing text counts as "no match" (``na=False``). Without it,
    ``str.contains`` returns NaN for missing values, which produces an
    object-dtype mask that cannot be safely negated with ``~``.
    """
    return text.str.contains(pattern, regex=True, na=False)


logging.info("Filtering comments by democrats mentioning republican")

# NOTE(review): the republican filter previously matched against the "tokens"
# column while the democrat filter used "body_cleaned". Both now use
# "body_cleaned" — the same text that is scored for sentiment further down —
# so the two groups are selected symmetrically. Confirm this is the intended column.
comment_text = partisan_comments["body_cleaned"]

# Evaluate each regex once over the whole frame and reuse the masks for both groups.
mentions_dem = _matches(comment_text, dem_keywords_regex)
mentions_rep = _matches(comment_text, rep_keywords_regex)

dem_comments_about_reps = partisan_comments[
    (partisan_comments["party"] == "dem") & mentions_rep & ~mentions_dem
].copy()
dem_comments_about_reps["type"] = "dem_comments_about_reps"

logging.info("Filtering comments by republicans mentioning democrats")
rep_comments_about_dems = partisan_comments[
    (partisan_comments["party"] == "rep") & mentions_dem & ~mentions_rep
].copy()
rep_comments_about_dems["type"] = "rep_comments_about_dems"

# Stack the two groups; the "type" column records which group each row came from.
comments_about_opposing_party = pd.concat(
    [dem_comments_about_reps, rep_comments_about_dems]
)


In [None]:
# Log (rows, columns) of the combined frame as a quick sanity check on the filter.
combined_shape = comments_about_opposing_party.shape
logging.info(combined_shape)

In [None]:
logging.info("Calculating sentiment scores")

# Score the cleaned body of every selected comment. progress_apply is the
# tqdm-enabled variant of apply (registered by tqdm.pandas() in the import cell).
comments_about_opposing_party["sentiment_scores"] = comments_about_opposing_party[
    "body_cleaned"
].progress_apply(get_compound_sentiment_score)

logging.info("Saving sentiment scores")

# Results go into a per-day sub-folder of OUTPUT_DIR named MM_DD.
# NOTE(review): the folder name carries no year, so runs on the same calendar day
# in different years write to the same folder — confirm this is acceptable.
current_date = pd.Timestamp.now().strftime("%m_%d")
OUTPUT_DIR_TODAY = f"{OUTPUT_DIR}/{current_date}"

# exist_ok=True creates the folder when missing and is a no-op when it already
# exists, avoiding the separate exists() check and its check-then-create race.
os.makedirs(OUTPUT_DIR_TODAY, exist_ok=True)

comments_about_opposing_party.to_parquet(
    f"{OUTPUT_DIR_TODAY}/comments_about_opposing_party_{YEAR}.parquet", index=False
)
