In [None]:
# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

# Move the working directory one level up; the relative data paths and the
# project imports (`load`, `preprocessing`) used below presumably resolve from
# there — TODO confirm. Re-running this cell moves up one more level each time.
%cd '..'

In [None]:
import logging
import sys
from pathlib import Path

# Log file location, relative to the working directory at the time this cell runs.
LOG_FILE = Path("data/logs/mass_shootings_filter_comments.log")

# logging.FileHandler opens its file as soon as it is constructed and raises
# FileNotFoundError when the parent directory is missing, so create it first.
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

# Send INFO-and-above records both to the log file and to stdout, so they also
# appear in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler(stream=sys.stdout),
    ],
)


In [None]:
import pandas as pd
from tqdm import tqdm
# Register tqdm's progress-bar variants of the pandas apply methods
# (e.g. `progress_apply`). No cell visible in this notebook calls them.
tqdm.pandas()

from load.utils import (
    load_df_from_parquet,
)
from preprocessing.utils import (
    save_event_comments,
    load_event_comments,
    build_vocab,
    save_event_vocab,
)

from preprocessing.constants import (
    MASS_SHOOTINGS_EVENTS_INFO,
    MASS_SHOOTINGS_KEYWORDS,
    MIN_OCCURENCE_FOR_VOCAB,
    get_event_regex,
)

In [None]:
# Calendar years whose partisan-comment files are loaded further down.
YEARS = list(range(2015, 2019))

# Name under which the combined (all-events) outputs are saved.
OVERALL_EVENT_NAME = "mass_shootings"
# One entry per individual event configured in MASS_SHOOTINGS_EVENTS_INFO.
EVENT_NAMES = [*MASS_SHOOTINGS_EVENTS_INFO.keys()]

# Flat list of every event's keywords, in event order (duplicates are kept).
MASS_SHOOTING_EVENTS_KEYWORDS = [
    kw for name in EVENT_NAMES for kw in MASS_SHOOTINGS_EVENTS_INFO[name]["keywords"]
]


In [None]:
# Build both patterns from the same two keyword lists; only the combining
# operator differs: "and" for the strict pattern, "or" for the loose one.
# (How the operator shapes the regex is defined inside get_event_regex.)
MASS_SHOOTING_REGEX_STRICT, MASS_SHOOTING_REGEX_LOOSE = (
    get_event_regex(
        MASS_SHOOTINGS_KEYWORDS,
        MASS_SHOOTING_EVENTS_KEYWORDS,
        operator=mode,
    )
    for mode in ("and", "or")
)

# Record the generated patterns in the log for later inspection.
logging.info(MASS_SHOOTING_REGEX_STRICT)
logging.info(MASS_SHOOTING_REGEX_LOOSE)

## Filter event comments

In [None]:
logging.info("Load partisan comments")
# Load one parquet file per year and keep only the rows whose "party" is
# "dem" or "rep"; filtering per year avoids holding all unfiltered frames at once.
partisan_comments_list = []
for year in YEARS:
    partisan_comments_year = load_df_from_parquet(
        file_name=f"user_partisan_comments_{year}.parquet",
    )
    partisan_comments_list.append(
        partisan_comments_year[
            partisan_comments_year["party"].isin(["dem", "rep"])
        ]
    )

partisan_comments = pd.concat(partisan_comments_list, ignore_index=True)

# pd.concat builds a new combined frame, so the per-year frames are redundant
# from here on. Drop the names that still reference them; otherwise they stay
# alive in the kernel and a later `del partisan_comments` frees only the
# combined copy.
del partisan_comments_list, partisan_comments_year


In [None]:
logging.info("Filtering event data based on keywords...")
# Keep the comments whose "tokens" text matches the loose pattern.
# `na=False` counts rows with missing tokens as non-matching; without it
# str.contains yields NaN for those rows and the boolean indexing raises
# ValueError. The .copy() detaches the subset from partisan_comments.
event_comments_loose = partisan_comments[
    partisan_comments["tokens"].str.contains(
        MASS_SHOOTING_REGEX_LOOSE,
        na=False,
    )
].copy()
logging.info("finished keyword filtering")


In [None]:
# Log the size and the column dtypes of the loosely filtered frame.
for _summary in (event_comments_loose.shape, event_comments_loose.dtypes):
    logging.info(_summary)

In [None]:
# Persist the loosely filtered comments under the overall name plus "_loose".
save_event_comments(event_comments_loose, f"{OVERALL_EVENT_NAME}_loose")

In [None]:
# Release the full partisan frame: event_comments_loose was created with
# .copy(), and no later cell in this notebook reads partisan_comments.
del partisan_comments

In [None]:
logging.info("Filtering event data based on keywords...")
# Narrow the loose subset down to the comments that match the strict pattern.
_strict_hits = event_comments_loose["tokens"].str.contains(MASS_SHOOTING_REGEX_STRICT)
event_comments = event_comments_loose[_strict_hits].copy()
logging.info("finished keyword filtering")


In [None]:
# Log the size and the column dtypes of the strictly filtered frame.
for _summary in (event_comments.shape, event_comments.dtypes):
    logging.info(_summary)

In [None]:
# Row count of the strictly filtered frame.
logging.info(f"Nr of event comments: {event_comments.shape[0]}")

In [None]:
# Persist the strictly filtered comments under the overall event name.
logging.info("saving event comments...")
save_event_comments(event_comments, OVERALL_EVENT_NAME)

## Build event vocabulary

In [None]:
# Optional: reload previously saved event comments instead of re-running the
# filter cells above (left disabled).
# event_comments = load_event_comments(
#     OVERALL_EVENT_NAME,
# )

# Build the vocabulary from the "tokens" column of the strictly filtered
# comments; MIN_OCCURENCE_FOR_VOCAB is passed as the minimum comment frequency
# (the exact thresholding is defined inside build_vocab).
event_vocab = build_vocab(
    event_comments["tokens"],
    min_comment_freq=MIN_OCCURENCE_FOR_VOCAB,
)


In [None]:
# Log the vocabulary size as two records: a label, then the count.
logging.info("Vocabulary length")
logging.info(len(event_vocab))

In [None]:
# Persist the vocabulary under the overall event name.
logging.info("Saving event vocab")
save_event_vocab(event_vocab, OVERALL_EVENT_NAME)