In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to project code are picked up.
%load_ext autoreload
%autoreload 2
# Switch the working directory to the repository root.
# NOTE(review): this is a hardcoded absolute path (presumably the dev-container
# checkout location) - confirm it matches the environment running the notebook.
%cd '/workspaces/polarization_reddit'

In [None]:
import time

# from load.utils import load_comments
from load.utils import (
    load_users,
    load_user_party,
    load_subreddits,
    load_comments,
    save_df_as_json,
    load_comments_dask,
)
from preprocessing.utils import (
    tokenize_comment,
    calculate_user_party,
    load_event_comments,
    save_event_comments,
)
from preprocessing.constants import (
    EVENTS_DIR,
    ELECTIONS_REGEX,
    MIN_OCCURENCE_FOR_VOCAB,
)


In [None]:
# Event configuration: the year and the month bounds handed to the comment
# loader, plus the name identifying this event.
YEAR = 2008
START_MONTH, STOP_MONTH = 10, 12

EVENT_NAME = f"us_election_{YEAR}"

## User affiliation

In [None]:
# Only the subreddit name and its party label are needed downstream.
subreddit_labels = load_subreddits()[["subreddit", "party"]]

# TODO: take into account network structure to find other partisan subreddits
# which are not labeled
# Keep the subreddits whose party label is "dem" or "rep".
is_partisan = subreddit_labels["party"].isin({"dem", "rep"})
subreddits = subreddit_labels[is_partisan]


In [None]:
# Count the labelled subreddits in each party group.
party_sizes = subreddits.groupby("party").count()
party_sizes


In [None]:
print("Load comments...")

# Affiliation only needs to know who commented and in which subreddit.
# NOTE(review): stop_month is set to START_MONTH here rather than STOP_MONTH -
# confirm that restricting the affiliation data this way is intentional.
affiliation_columns = ["author", "subreddit"]
comments = load_comments_dask(
    year=YEAR, start_month=START_MONTH, stop_month=START_MONTH,
)[affiliation_columns]

print("Merge party information to comments...")
# The inner join keeps only comments posted in a party-labelled subreddit.
comments_party = comments.merge(subreddits, how="inner", on="subreddit")


In [None]:
# Show the first ten rows of the comment/party join as a sanity check.
comments_party_preview = comments_party.head(n=10)
display(comments_party_preview)

In [None]:
# Output schema declared for the result of calculate_user_party.
user_party_meta = {
    "dem_cnt": "int",
    "rep_cnt": "int",
    "score": "int",
    "party": "string",
}

# Apply calculate_user_party to each author's group of partisan comments
# (presumably yielding one affiliation row per author - TODO confirm).
comments_by_author = comments_party.groupby(by="author")
user_party_scored = comments_by_author.apply(
    calculate_user_party,
    meta=user_party_meta,
)

# Discard authors whose score is exactly zero.
has_nonzero_score = user_party_scored["score"] != 0
user_party = user_party_scored[has_nonzero_score]

In [None]:
# Show the first rows of the per-author affiliation table.
# (An in-place reset_index() was tried at this point and is left disabled.)
user_party_preview = user_party.head()
display(user_party_preview)

In [None]:
# Plot how many users were assigned to each party.
# user_party is a Dask DataFrame, and Dask group-by objects do not offer
# ``hist``; so the per-party counts are aggregated with value_counts, the
# (small) result is materialised as a pandas Series, and that is plotted.
party_counts = user_party["party"].value_counts().compute()
ax = party_counts.plot.bar()
ax.set(title="Users per party", xlabel="party", ylabel="users");

In [None]:
# Persist the affiliation table; the file name carries the event year.
user_party_file = f"user_party_{YEAR}.json"
save_df_as_json(data=user_party, target_file=user_party_file)

## Filter event comments

In [None]:
# The affiliation table computed earlier in this notebook is reused here;
# reloading a stored one through load_user_party is left disabled.

# Load the comments again for START_MONTH..STOP_MONTH, without any column
# selection this time.
comments = load_comments_dask(
    year=YEAR, start_month=START_MONTH, stop_month=STOP_MONTH,
)

# Keep only the comments whose cleaned body matches the elections regex.
mentions_election = comments["body_cleaned"].str.contains(
    ELECTIONS_REGEX, regex=True,
)
election_comments = comments[mentions_election]

# Attach each author's party; the inner join drops comments from authors
# that have no entry in user_party.
event_comments = election_comments.merge(user_party, how="inner", on="author")


## Tokenize and stem comments

In [None]:
# Report how many event comments are about to be tokenized.
row_count = len(event_comments)
print(f"Tokenizing body... (nr_rows={row_count})")

started_at = time.perf_counter()

# Add a "tokens" column obtained by running tokenize_comment on each cleaned body.
# NOTE(review): if event_comments is a lazy Dask frame, this apply only
# schedules the work, so the elapsed time printed below would not include the
# tokenization itself - confirm.
event_comments["tokens"] = event_comments["body_cleaned"].apply(
    tokenize_comment, meta=("tokens", "string"),
)

elapsed = time.perf_counter() - started_at

print(f"\tTokenized dataframe in {elapsed:0.4f} seconds")

In [None]:
# Persist the tokenized event comments under this event's name.
# EVENT_NAME is the constant defined in the configuration cell; the lowercase
# ``event_name`` used here before is never defined and raised a NameError.
save_event_comments(event_comments, EVENT_NAME)

## Build event vocabulary

In [None]:
# Build the event vocabulary from the "tokens" column of the event comments
# prepared earlier in this notebook (``data`` was never defined and raised a
# NameError). Reloading saved comments via load_event_comments is an
# alternative source that is currently not used.
# NOTE(review): build_vocab is not imported in the notebook's import cell -
# add it to the matching import before running this cell.
event_vocab = build_vocab(
    event_comments["tokens"],
    min_words=MIN_OCCURENCE_FOR_VOCAB,
)

In [None]:
# Report the number of entries in the event vocabulary.
vocab_size = len(event_vocab)
print(vocab_size)

In [None]:
# Persist the vocabulary under this event's name.
# EVENT_NAME is the constant defined in the configuration cell; the lowercase
# ``event_name`` used here before is never defined and raised a NameError.
# NOTE(review): save_event_vocab is not imported in the notebook's import
# cell - add it to the matching import before running this cell.
save_event_vocab(event_vocab, EVENT_NAME)