In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell executes so edits to the project's .py files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Move the working directory up one level. Relative paths used later in the
# notebook (e.g. data/logs/...) are resolved from there; presumably the
# `load` and `preprocessing` packages are importable from there as well.
%cd '..'

In [None]:
from load.utils import (
    load_df_from_parquet,
    load_comments,
    load_subreddits,
    save_df_as_parquet,
)

from preprocessing.utils import (
    tokenize_comment,
)

from preprocessing.utils import (
    build_vocab,
    save_event_vocab,
)
from preprocessing.constants import (
    MIN_OCCURENCE_FOR_VOCAB,
)


In [None]:

import logging
import os
import sys

# Log file path, relative to the working directory selected at the top of the
# notebook.
LOG_FILE = "data/logs/user_partisan_comments.log"

# logging.FileHandler opens its file as soon as it is constructed and raises
# FileNotFoundError when the parent directory is missing, so create the
# directory first to keep a fresh checkout runnable.
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

# Send INFO-and-above records both to the log file and to the cell output
# (stdout), using the same timestamped format for both handlers.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler(stream=sys.stdout),
    ],
)

In [None]:
from tqdm import tqdm

# Register tqdm's pandas integration; this is what provides the
# `progress_apply` method used when tokenizing comments in the per-year loop.
tqdm.pandas()


In [None]:
# Years to process and the month bounds handed to load_comments for each
# yearly run.
START_MONTH, STOP_MONTH = 1, 12
YEARS = list(range(2014, 2020))

# Record the configured years in the run log.
logging.info(YEARS)

In [None]:
# Load the subreddit table and collect its "subreddit" column into a set,
# which the per-year loop uses for membership filtering of comments.
subreddits = load_subreddits()
eligible_subreddits = {*subreddits["subreddit"].values}

In [None]:
# For each configured year: load comments and user party labels, keep comments
# by dem/rep users in eligible subreddits, tokenize them, then persist both the
# tokenized comments and a vocabulary built from the tokens.
for year in YEARS:
    logging.info(f"Loading comments {year}")
    comments_year = load_comments(
        years=year, start_month=START_MONTH, stop_month=STOP_MONTH, engine="pandas"
    )

    # Party labels for this year, restricted to users labelled "dem" or "rep".
    user_party_year = load_df_from_parquet(file_name=f"user_party_{year}.parquet")
    user_party_year = user_party_year[["author", "party"]]
    is_dem_or_rep = user_party_year["party"].isin({"dem", "rep"})
    user_party_year = user_party_year[is_dem_or_rep]

    # Keep only comments posted in one of the eligible subreddits.
    in_eligible_subreddit = comments_year["subreddit"].isin(eligible_subreddits)
    comments_year = comments_year[in_eligible_subreddit]

    logging.info("Merging comments and user partisanship...")
    # Inner join: comments whose author has no dem/rep label are dropped.
    user_partisan_comments_year = comments_year.merge(
        user_party_year, how="inner", on="author"
    )

    # The unmerged comments (and their row mask) are no longer needed.
    del comments_year, in_eligible_subreddit

    logging.info("Tokenizing comments...")
    cleaned_bodies = user_partisan_comments_year["body_cleaned"]
    user_partisan_comments_year["tokens"] = cleaned_bodies.progress_apply(
        tokenize_comment
    ).astype("string")

    logging.info("Saving user partisan tokenized comments")

    save_df_as_parquet(
        user_partisan_comments_year,
        target_file=f"user_partisan_comments_{year}.parquet",
    )

    logging.info("Building vocab...")

    # Vocabulary over unigrams and bigrams, subject to the minimum-occurrence
    # threshold from the preprocessing constants.
    year_vocab = build_vocab(
        user_partisan_comments_year["tokens"],
        min_df=MIN_OCCURENCE_FOR_VOCAB,
        ngram_range=(1, 2),
    )

    logging.info("Saving vocab...")
    save_event_vocab(
        year_vocab,
        event_name=f"user_partisan_comments_{year}",
        theme="overall",
    )
