In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell runs, so edits to the project packages are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Move the working directory up one level; the relative data path and the
# project package imports used below are resolved from there.
# NOTE(review): this is not idempotent — re-running the cell moves up another
# level each time, so run it once per kernel session.
%cd '..'

In [None]:
import logging
import sys
from pathlib import Path

import pandas as pd

# Log to a file and to the notebook's stdout, so a run leaves a persistent
# record while progress stays visible in the cell output.
LOG_FILE = Path("data/logs/elections_filter_comments.log")

# FileHandler opens its file immediately and raises FileNotFoundError when the
# parent directory is missing (e.g. on a fresh checkout), so create it first.
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler(stream=sys.stdout),
    ],
)


In [None]:
from tqdm import tqdm
# Register tqdm's progress_apply / progress_map helpers on pandas objects.
tqdm.pandas()

from load.utils import (
    load_df_from_parquet,
)
from preprocessing.utils import (
    save_event_comments,
    load_event_comments,
    build_vocab,
    save_event_vocab,
)

from preprocessing.constants import (
    MIN_OCCURENCE_FOR_VOCAB,
)

from events.elections_constants import (
    ELECTIONS_EVENTS_INFO,
)

In [None]:
# Election year to process; it selects both the input parquet file and the
# event entry looked up below.
YEAR = 2016

# Key of this event in ELECTIONS_EVENTS_INFO (here "us_elections_2016").
EVENT_NAME = f"us_elections_{YEAR}"
# Per-event settings; the "regex" entry drives the keyword filtering step.
EVENT_INFO = ELECTIONS_EVENTS_INFO[EVENT_NAME]

## Filter event comments

In [None]:
logging.info(f"Load partisan comments from {YEAR}")

# The input parquet file is selected by YEAR.
partisan_file = f"user_partisan_comments_{YEAR}.parquet"
partisan_comments = load_df_from_parquet(file_name=partisan_file)

# Log the shape as a quick sanity check of what was loaded.
logging.info(partisan_comments.shape)

In [None]:
logging.info("Filtering event data based on keywords...")

# Keep only the rows whose "tokens" value matches the event's keyword regex.
# na=False makes rows with missing tokens count as non-matches; without it
# str.contains yields NaN for those rows and the boolean indexing raises
# "Cannot mask with non-boolean array containing NA / NaN values".
# .copy() makes the result an independent DataFrame.
event_comments = partisan_comments[
    partisan_comments["tokens"].str.contains(
        EVENT_INFO["regex"],
        regex=True,
        na=False,
    )
].copy()

logging.info("finished keyword filtering")


In [None]:
# Record the shape and the column dtypes of the filtered frame in the log.
logging.info(event_comments.shape)
logging.info(event_comments.dtypes)

In [None]:
# Drop the reference to the full partisan frame; the cells below only use
# event_comments.
# NOTE: re-running this cell on its own raises NameError, because the name has
# already been removed.
del partisan_comments

In [None]:
# Log the row count of the filtered event frame.
logging.info(f"Nr of event comments: {len(event_comments)}")

In [None]:
# Persist the filtered comments under the event name via the project helper
# (storage location and format are decided inside save_event_comments).
logging.info("saving event comments...")
save_event_comments(event_comments, EVENT_NAME)

## Build event vocabulary

In [None]:
# Load the event comments through the project loader.
# NOTE(review): presumably this reads back what save_event_comments wrote —
# confirm in preprocessing.utils.
event_comments = load_event_comments(theme="elections", event_name=EVENT_NAME)

logging.info("Building event vocabulary...")

# ngram_range=(1, 2) and min_df=MIN_OCCURENCE_FOR_VOCAB are forwarded to
# build_vocab; presumably they select unigrams + bigrams and drop n-grams below
# the minimum frequency threshold — TODO confirm whether the threshold counts
# documents or raw occurrences.
event_tokens = event_comments["tokens"]
event_vocab = build_vocab(
    event_tokens,
    ngram_range=(1, 2),
    min_df=MIN_OCCURENCE_FOR_VOCAB,
)


In [None]:
# Log the vocabulary size (emitted as two records: the label, then the number).
logging.info("Vocabulary length")
logging.info(len(event_vocab))

In [None]:
# Persist the vocabulary under the event name via the project helper
# (storage location and format are decided inside save_event_vocab).
logging.info("Saving event vocab")
save_event_vocab(event_vocab, EVENT_NAME)