In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to the project's .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Move the working directory one level up.
# NOTE(review): presumably the notebook lives in a sub-folder and the project
# packages / relative "data/..." paths resolve from the parent -- confirm.
# Re-running this cell moves up one more level each time, so run it only once
# per kernel session.
%cd '..'

In [None]:
import logging
import sys
from pathlib import Path

# Log file for this notebook, relative to the current working directory.
LOG_FILE = Path("data/logs/abortion_filter_comments.log")

# logging.FileHandler opens its file as soon as it is constructed and raises
# FileNotFoundError when the parent directory is missing, so create it first.
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

# Send every record at INFO level or above to both the log file and stdout.
# Note: basicConfig is a no-op when the root logger already has handlers.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler(stream=sys.stdout),
    ],
)


In [None]:
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

from load.utils import (
    load_df_from_parquet,
)
from preprocessing.utils import (
    save_event_comments,
    load_event_comments,
    build_vocab,
    save_event_vocab,
)

from preprocessing.constants import (
    MIN_OCCURENCE_FOR_VOCAB,
)

from events.abortion_constants import (
    ABORTION_EVENTS_INFO,
    ABORTION_KEYWORDS,
)
from events.utils import get_event_regex

In [None]:
# Calendar years whose partisan-comment parquet files are loaded (2015-2019 inclusive)
YEARS = list(range(2015, 2020))


## Filter event comments

In [None]:
logging.info("Load partisan comments")

# Read one parquet file per year and keep only the rows whose "party" value
# is "dem" or "rep"; the per-year frames are stacked into one table below.
yearly_frames = []
for year in YEARS:
    comments_of_year = load_df_from_parquet(
        file_name=f"user_partisan_comments_{year}.parquet",
    )
    is_dem_or_rep = comments_of_year["party"].isin({"dem", "rep"})
    yearly_frames.append(comments_of_year[is_dem_or_rep])

# ignore_index=True gives the combined frame a fresh 0..n-1 index
partisan_comments = pd.concat(yearly_frames, ignore_index=True)

logging.info("Partisan comments dataset")
logging.info(partisan_comments.shape)

In [None]:
# Labels passed as `theme` / `event_name` when saving comments and vocabularies
THEME = "abortion"
OVERALL_EVENT_NAME = "abortion_events"

# Names of all configured abortion events, in the order the mapping yields them
EVENT_NAMES = [*ABORTION_EVENTS_INFO.keys()]

In [None]:
# For every abortion event, keep the partisan comments that were created in a
# window around the event date and whose tokens match the event's regex; the
# per-event frames are concatenated into `overall_event_comments` at the end.
overall_event_comments_list = []

# Converting the epoch-second "created_utc" column to datetimes does not depend
# on the event, so do it once here rather than twice per loop iteration.
comment_created_at = pd.to_datetime(partisan_comments["created_utc"], unit="s")

for event_name in EVENT_NAMES:
    event_info = ABORTION_EVENTS_INFO[event_name]
    logging.info(
        event_name + " " + event_info["date"].strftime("%Y-%m-%d"),
    )
    event_regex = event_info["regex"]

    logging.info(event_regex)

    # filter comments based on event date: keep the window from one day before
    # the event up to 30 days after it (both bounds inclusive)
    logging.info("Filtering event data based on date...")
    in_event_window = comment_created_at.between(
        event_info["date"] - pd.Timedelta(days=1),
        event_info["date"] + pd.Timedelta(days=30),
    )
    # No explicit copy here: this intermediate frame is only read by the
    # keyword filter below, which produces the copy that gets modified.
    event_comments = partisan_comments[in_event_window]

    logging.info("Filtering event data based on keywords...")
    event_comments = event_comments[
        event_comments["tokens"].str.contains(
            event_regex,
            regex=True,
        )
    ].copy()
    logging.info("finished keyword filtering")

    logging.info(event_comments.shape)

    logging.info(f"Nr of event comments: {len(event_comments)}")

    # Tag each row with its event so rows stay attributable after the concat
    event_comments["event_name"] = event_name

    overall_event_comments_list.append(event_comments)

# The datetime series was only needed for the date filter above
del comment_created_at

overall_event_comments = pd.concat(
    overall_event_comments_list,
    ignore_index=True,
)


In [None]:
# Log the size of the combined per-event table, then hand it to
# save_event_comments under the overall event name.
logging.info("Overall event comments dataset")
logging.info(overall_event_comments.shape)

save_event_comments(overall_event_comments, theme=THEME, event_name=OVERALL_EVENT_NAME)


## Build event vocabulary

In [None]:
# Build the vocabulary from the tokens of all event comments.
# ngram_range=(1, 2) and min_df are forwarded to build_vocab unchanged.
event_tokens = overall_event_comments["tokens"]
event_vocab = build_vocab(event_tokens, ngram_range=(1, 2), min_df=MIN_OCCURENCE_FOR_VOCAB)


In [None]:
# Report the size of the event vocabulary built above
event_vocab_size = len(event_vocab)
logging.info("Vocabulary length")
logging.info(event_vocab_size)

In [None]:
# Store the event vocabulary under the same theme / event name as the comments
logging.info("Saving event vocab")
save_event_vocab(event_vocab, theme=THEME, event_name=OVERALL_EVENT_NAME)


### Filter comments based on overall abortion keywords

In [None]:
# Regex built from the general abortion keywords.
# NOTE(review): the empty list and the "or" mode are passed positionally;
# their exact meaning is defined in events.utils.get_event_regex -- confirm there.
ABORTION_REGEX = get_event_regex(
    ABORTION_KEYWORDS,
    [],
    "or",
)

In [None]:
logging.info("Filtering event data based on keywords...")
# Keep every partisan comment (no date restriction) whose tokens match the
# general abortion regex; .copy() detaches the result from partisan_comments.
abortion_comments = partisan_comments.loc[
    partisan_comments["tokens"].str.contains(ABORTION_REGEX, regex=True)
].copy()


In [None]:
# Drop the reference to the full partisan-comment table; the remaining cells
# only use abortion_comments, so its memory can be reclaimed.
del partisan_comments

In [None]:
# Hand the keyword-filtered comments to save_event_comments; the theme name
# doubles as the event name for this overall (non-event-specific) set.
save_event_comments(abortion_comments, theme=THEME, event_name=THEME)

In [None]:


# Build the vocabulary from the tokens of all comments matching the general
# abortion regex, with the same ngram_range / min_df as the event vocabulary.
abortion_vocab = build_vocab(
    abortion_comments["tokens"],
    ngram_range=(1, 2),
    min_df=MIN_OCCURENCE_FOR_VOCAB,
)

logging.info("Vocabulary length")
# Log the size of the vocabulary built in this cell (abortion_vocab), not the
# earlier event_vocab.
logging.info(len(abortion_vocab))

logging.info("Saving event vocab")
save_event_vocab(
    abortion_vocab,
    theme=THEME,
    event_name=THEME,
)