In [None]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Move the working directory one level up. The relative paths used further
# down (e.g. "data/logs/...") are resolved against this directory.
# NOTE(review): this is not idempotent — re-running this cell in a live kernel
# climbs one more directory each time.
%cd '..'

In [None]:
import json

import numpy as np

from load.utils import (
    load_users,
    load_comments,
    save_df_as_parquet,
)
from preprocessing.constants import METADATA_DIR

In [None]:
import logging
import sys

# Root-logger setup: records at INFO and above are written to a log file and
# echoed to stdout so they also show up in the cell output.
# The log file path is relative, so it is resolved against the current working
# directory at the time this cell runs.
_log_handlers = [
    logging.FileHandler("data/logs/partisan_comments.log"),
    logging.StreamHandler(stream=sys.stdout),
]

logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
    handlers=_log_handlers,
)

In [None]:
from tqdm import tqdm

# Register tqdm's progress-bar integration with pandas.
# NOTE(review): no `progress_*` call is visible in this part of the notebook —
# confirm this registration is still needed.
tqdm.pandas()


In [None]:
# Years and month bounds of the data to process.
START_MONTH, STOP_MONTH = 1, 12
YEARS = list(range(2015, 2019))  # 2015 through 2018

# Record the configured years in the run log.
logging.info(YEARS)

In [None]:
def flatten(l):
    """Concatenate the items of every iterable in `l` into one flat list.

    Only one level of nesting is removed; the relative order of the items is
    preserved.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [None]:
def _load_subreddits(party):
    """Return the set of subreddits listed for `party` across all of YEARS.

    For every year in YEARS the file
    ``{METADATA_DIR}/{party}_subreddits_{year}_unweighted.json`` is read and
    its items are added to the result, so the return value is the union over
    all years.

    Parameters
    ----------
    party : str
        File-name prefix selecting the list to load ("dem" or "rep" here).

    Returns
    -------
    set
        All items found in the per-year files.
        (Presumably each file holds a JSON list of subreddit names — TODO confirm.)

    Raises
    ------
    FileNotFoundError
        If one of the per-year files does not exist.
    """
    subreddits = set()
    for year in YEARS:
        # `with` closes the file handle as soon as the JSON is parsed; a bare
        # `open(...)` inside the expression would leave it to the garbage
        # collector.
        with open(f"{METADATA_DIR}/{party}_subreddits_{year}_unweighted.json") as f:
            subreddits.update(json.load(f))
    return subreddits


dem_subreddits = _load_subreddits("dem")
rep_subreddits = _load_subreddits("rep")

# Every subreddit that appears in either party's list.
partisan_subreddits = dem_subreddits | rep_subreddits


In [None]:
# Load the user table and log its column dtypes for the run record.
# NOTE(review): users are loaded with engine="polars", while the comments below
# are loaded with engine="pandas" and joined through pandas' `.merge` — confirm
# that `load_users` returns an object pandas can merge with.
users = load_users(engine="polars")

logging.info(users.dtypes)

In [None]:
# pandas' `DataFrame.merge` only accepts pandas objects. `users` is loaded with
# engine="polars" (presumably a polars DataFrame — TODO confirm against
# `load_users`), so convert it once up front when a `to_pandas` method is
# available; an object without that method is used unchanged.
users_pd = users.to_pandas() if hasattr(users, "to_pandas") else users

# For each year: load the comments, keep those posted in partisan subreddits,
# tag them with a party, join them with the user table and store the result.
for year in YEARS:
    comments_year = load_comments(
        years=year,
        start_month=START_MONTH,
        stop_month=STOP_MONTH,
        engine="pandas",
    )

    partisan_comments_year = comments_year.query("subreddit in @partisan_subreddits")
    logging.info("Add party information to comments...")

    # `.assign` returns a new frame, so the column is not written into the
    # filtered result of `.query` (which would raise SettingWithCopyWarning).
    # A subreddit present in both party sets is labelled "dem", because the
    # dem membership test is the one being evaluated.
    partisan_comments_year = partisan_comments_year.assign(
        party=lambda df: np.where(df["subreddit"].isin(dem_subreddits), "dem", "rep")
    ).astype({"party": "string"})

    logging.info("Filter out bots & automoderators comments...")

    # Inner join: only comments whose author appears in the user table survive.
    # (Presumably the user table already excludes bots/automoderators — verify
    # against `load_users`.)
    partisan_comments_year = partisan_comments_year.merge(
        users_pd,
        on="author",
        how="inner",
    )

    save_df_as_parquet(
        partisan_comments_year,
        target_file=f"partisan_comments_{year}.parquet",
    )
