In [None]:
# Reload imported modules automatically before each cell runs, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Move one directory up (presumably to the project root, so the `load` and
# `preprocessing` packages can be imported - TODO confirm).
# NOTE: the path is relative, so re-running this cell moves up one more level.
%cd '..'

In [None]:
import logging

# Configure the root logger so that the `logging.info(...)` progress and
# diagnostic messages used throughout this notebook are actually emitted:
# the root logger defaults to WARNING, which silently drops INFO records.
# To also write to a file, add
# logging.FileHandler("data/logs/preprocess_comments_notebook.log") to
# `handlers` (the target directory must already exist).
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()],
)
# basicConfig() is a no-op when the root logger already has handlers, so the
# level is set explicitly to make this cell effective in every case.
logging.getLogger().setLevel(logging.INFO)

In [None]:
import json

import numpy as np
import polars as pl
from tqdm import tqdm
# Register tqdm with pandas; this provides the `progress_apply` methods that
# are used later in this notebook.
tqdm.pandas()

from load.utils import (
    load_users,
    load_subreddits,
    load_user_party,
    load_comments,
    save_df_as_parquet,
)
from preprocessing.utils import (
    tokenize_comment,
    calculate_user_party,
    save_event_comments,
    build_vocab,
    save_event_vocab,
)

from preprocessing.constants import (
    METADATA_DIR,
    ELECTIONS_EVENTS_INFO,
    MIN_OCCURENCE_FOR_VOCAB,
)

In [None]:
# Year and month range of the comments to process (passed to `load_comments`).
YEAR = 2008
START_MONTH = 1
STOP_MONTH = 12

# Key into ELECTIONS_EVENTS_INFO; also the name under which the event's
# comments and vocabulary are saved.
EVENT_NAME = f"us_elections_{YEAR}"
# Settings of the event; its "regex" entry drives the keyword filtering.
EVENT_INFO = ELECTIONS_EVENTS_INFO[EVENT_NAME]

## User Affiliation

In [None]:
# Load the subreddit table; the cells below rely on its "subreddit" and
# "party" columns.
subreddits = load_subreddits()

# Log the column dtypes as a quick schema check.
logging.info(subreddits.dtypes)

In [None]:
# Partisan subreddit lists for YEAR. These take the network structure into
# account to find other partisan subreddits which are not labeled.
# Each file is opened in a `with` block so its handle is closed as soon as the
# JSON has been parsed (the bare `open(...)` calls never closed them).
with open(f"{METADATA_DIR}/dem_subreddits_{YEAR}_unweighted.json") as dem_file:
    dem_subreddits = set(json.load(dem_file))

with open(f"{METADATA_DIR}/rep_subreddits_{YEAR}_unweighted.json") as rep_file:
    rep_subreddits = set(json.load(rep_file))

# Union of both sides; used to filter the subreddit table and the comments.
partisan_subreddits = dem_subreddits | rep_subreddits


In [None]:
# Keep only the rows whose subreddit belongs to the partisan set.
# (An earlier variant filtered on the "party" label being "dem" or "rep".)
is_partisan = subreddits["subreddit"].isin(partisan_subreddits)
subreddits = subreddits[is_partisan]

In [None]:
# Number of partisan subreddits per party.
subreddits_per_party = subreddits.groupby("party")["subreddit"].count()
logging.info(subreddits_per_party)


In [None]:
logging.info("Load comments...")

# Load the comments for the configured year and month range. The polars
# engine is requested because the result is filtered with polars expressions
# before it is converted to pandas.
comments_pl = load_comments(
    year=YEAR,
    start_month=START_MONTH,
    stop_month=STOP_MONTH,
    engine="polars",
)

In [None]:
# Restrict the comments to those posted in a partisan subreddit.
partisan_subreddit_names = list(partisan_subreddits)
in_partisan_subreddit = pl.col("subreddit").is_in(partisan_subreddit_names)
partisan_comments_pl = comments_pl.filter(in_partisan_subreddit)

In [None]:
# Target pyarrow-backed dtypes for the pandas copy of the filtered comments.
arrow_dtypes = {
    "author": "string[pyarrow]",
    "body_cleaned": "string[pyarrow]",
    "created_utc": "int64[pyarrow]",
    "subreddit": "string[pyarrow]",
}

# Convert with the default settings (use_pyarrow_extension_array is left off)
# and cast the columns explicitly afterwards.
partisan_comments = partisan_comments_pl.to_pandas().astype(arrow_dtypes)


In [None]:
# Display the column dtypes to check the pyarrow-backed casts.
partisan_comments.dtypes

In [None]:
# Earlier pandas-based filtering, kept for reference only; it refers to names
# (`comments`, `t`) that are not defined in this notebook.
# partisan_comments = comments.query("subreddit in @partisan_subreddits").copy()
# partisan_comments = comments.loc[t] # .copy()

# Log (rows, columns) of the partisan comments.
logging.info(partisan_comments.shape)


In [None]:
logging.info("Add party information to comments...")

# A comment is labelled "dem" when its subreddit is in the democratic set and
# "rep" otherwise (a subreddit listed on both sides therefore counts as "dem").
posted_in_dem = partisan_comments["subreddit"].isin(dem_subreddits)
party_labels = np.where(posted_in_dem, "dem", "rep")

partisan_comments["party"] = party_labels
# Store the labels with the same pyarrow string dtype as the other text columns.
partisan_comments["party"] = partisan_comments["party"].astype("string[pyarrow]")


In [None]:
# Display the dtypes again, now including the new "party" column.
partisan_comments.dtypes

In [None]:
logging.info(partisan_comments.shape)

# Persist the labelled partisan comments of this year as parquet.
partisan_comments_file = f"partisan_comments_{YEAR}.parquet"
save_df_as_parquet(data=partisan_comments, target_file=partisan_comments_file)


In [None]:
# Apply `calculate_user_party` to each author's comments (with a progress bar).
comments_by_author = partisan_comments.groupby(by="author")
user_scores = comments_by_author.progress_apply(calculate_user_party)

# Drop users whose score is exactly 0 and turn the author index into a column.
nonzero_score = user_scores["score"] != 0
user_party = user_scores.loc[nonzero_score].reset_index().copy()
user_party["party"] = user_party["party"].astype("string[pyarrow]")


In [None]:
# Display the dtypes of the per-user party table.
user_party.dtypes

In [None]:
# Total number of users that received a party.
logging.info(f"Nr of users: {len(user_party)}")

# Number of users per party.
users_per_party = user_party.groupby(by="party")["author"].count()
logging.info(users_per_party)

In [None]:
# Persist the per-user party table of this year as parquet.
save_df_as_parquet(
    data=user_party,
    target_file=f"user_party_{YEAR}.parquet",
)

# Log the dtypes of the table that was just saved.
logging.info(user_party.dtypes)

## Filter event comments

In [None]:
# Optional: reload the saved user party table instead of recomputing it
# (disabled; `user_party` from the cells above is used as is).
# logging.info("Load user party")
# user_party = load_user_party(year=YEAR)

logging.info(user_party.dtypes)

In [None]:
# Log (rows, columns) of the partisan comments before the user join.
logging.info(partisan_comments.shape)

In [None]:
# Load the users table; it is inner-joined onto the comments by "author" below.
# NOTE(review): engine="polars" presumably yields a polars frame, while the
# later join is a pandas DataFrame.merge - confirm the two are compatible.
users = load_users(engine="polars")

# Log the column dtypes as a quick schema check.
logging.info(users.dtypes)

In [None]:
# Log (rows, columns) of the users table.
logging.info(users.shape)

In [None]:
logging.info("Filter out bots & automoderators comments...")
# `users` was loaded with engine="polars", but pandas' DataFrame.merge only
# accepts pandas objects and raises a TypeError for anything else, so a polars
# frame is converted first (a frame that is already pandas is passed through).
# Inner join: only comments whose author appears in `users` are kept.
partisan_comments = partisan_comments.merge(
    users.to_pandas() if isinstance(users, pl.DataFrame) else users,
    on="author",
    how="inner",
)

In [None]:
# Log the dtypes after the join to see which user columns were added.
logging.info(partisan_comments.dtypes)

In [None]:
logging.info("Filtering event data based on keywords...")

# Flag the comments whose cleaned body matches the event's keyword regex.
event_regex = EVENT_INFO["regex"]
mentions_event = partisan_comments["body_cleaned"].str.contains(event_regex, regex=True)

# Work on an independent copy so that new columns can be added to it later.
event_comments = partisan_comments[mentions_event].copy()
logging.info("finished keyword filtering")

In [None]:
# Log (rows, columns) of the keyword-filtered event comments.
logging.info(event_comments.shape)

In [None]:
logging.info("save temp event comments")
# Save the not-yet-tokenized event comments under a "temp_"-prefixed event name.
save_event_comments(event_comments, f"temp_{EVENT_NAME}")


In [None]:
# Release the large intermediates; the remaining cells of this notebook only
# work on `event_comments`.
del user_party
del users
del subreddits
del comments_pl
# The filtered polars frame is a second full copy of the partisan comments and
# is not used after the conversion to pandas, so it is released as well.
del partisan_comments_pl
del partisan_comments

In [None]:
# Log the dtypes of the event comments before tokenization.
logging.info(event_comments.dtypes)

## Tokenize and stem comments

In [None]:
logging.info("Tokenizing comments...")

# Run `tokenize_comment` on every cleaned body (with a progress bar) and store
# the result as a pyarrow-backed string column.
tokenized_bodies = event_comments["body_cleaned"].progress_apply(tokenize_comment)
event_comments["tokens"] = tokenized_bodies.astype("string[pyarrow]")

logging.info("Finish tokenizing comments")

In [None]:
# Log the dtypes again, now including the new "tokens" column.
logging.info(event_comments.dtypes)

In [None]:
# Log how many event comments are left after filtering.
logging.info(f"Nr of event comments: {len(event_comments)}")

In [None]:
logging.info("saving event comments...")
# Save the tokenized event comments under the event name.
save_event_comments(event_comments, EVENT_NAME)

## Build event vocabulary

In [None]:
# Optional: read the saved event data back instead of using the in-memory
# frame (disabled; `load_event_comments` is not imported in this notebook).
# Read event data
# events_comments = load_event_comments(
#     event_comments,
#     EVENT_NAME,
#     file_type="parquet",
# )

# Build the vocabulary from the "tokens" column, passing
# MIN_OCCURENCE_FOR_VOCAB as the minimum comment frequency.
event_vocab = build_vocab(
    event_comments["tokens"],
    min_comment_freq=MIN_OCCURENCE_FOR_VOCAB,
)


In [None]:
# Log the number of entries in the event vocabulary.
logging.info("Vocabulary length")
logging.info(len(event_vocab))

In [None]:
logging.info("Saving event vocab")
# Save the vocabulary under the event name.
save_event_vocab(event_vocab, EVENT_NAME)