In [1]:
%load_ext autoreload
%autoreload 2
%cd '/workspaces/polarization_reddit'

/workspaces/polarization_reddit


In [2]:
import time

import pandas as pd
import numpy as np

# from load.utils import load_comments
from load.utils import (
    load_users,
    load_user_party,
    load_user_party_parquet,
    load_subreddits,
    load_comments,
    save_df_as_json,
    save_df_as_parquet,
    load_comments_dask,
)
from preprocessing.utils import (
    tokenize_comment,
    calculate_user_party,
    load_event_comments,
    save_event_comments,
    build_vocab,
    save_event_vocab,
)

from preprocessing.constants import (
    EVENTS_DIR,
    ELECTIONS_REGEX,
    MIN_OCCURENCE_FOR_VOCAB,
)

In [3]:
YEAR = 2008
START_MONTH = 1
STOP_MONTH = 12

EVENT_NAME = f"us_election_{YEAR}"

### User Affiliation

In [4]:
subreddits = load_subreddits()[["subreddit", "party"]]

# TODO: take into account network structure to find other partisan subreddits
# which are not labeled
# Filter partisan subreddits
subreddits = subreddits[subreddits["party"].isin({"dem", "rep"})]


In [5]:
subreddits.groupby("party").count()


Unnamed: 0_level_0,subreddit
party,Unnamed: 1_level_1
dem,43
rep,19


In [6]:
print("Load comments...")

comments = load_comments_dask(
    year=YEAR,
    start_month=START_MONTH,
    stop_month=START_MONTH,
)[["author", "subreddit"]]

print("Merge party information to comments...")
comments_party = comments.merge(subreddits, on="subreddit", how="inner")


Load comments...
Merge party information to comments...


In [19]:
print(len(comments_party))

25


In [7]:
user_party = comments_party.groupby(by="author").apply(
    calculate_user_party,
    meta={
        "dem_cnt": "int",
        "rep_cnt": "int",
        "score": "int",
        "party": "string",
    },
)
user_party = user_party[user_party["score"] != 0].compute()

In [8]:
user_party = user_party.reset_index()
user_party["author"] = user_party["author"].astype("string")
user_party["party"] = user_party["party"].astype("string")

In [9]:
print(f"Nr of users: {len(user_party)}")

print(user_party.groupby(by="party").count())

Nr of users: 18
dem    18
Name: party, dtype: Int64


In [10]:
save_df_as_parquet(
    data=user_party,
    target_file=f"user_party_{YEAR}.parquet",
)

## Filter event comments

In [11]:
# print("Load user party")
# user_party = load_user_party_parquet(year=YEAR)

In [12]:
comments = load_comments_dask(
    year=YEAR,
    start_month=START_MONTH,
    stop_month=STOP_MONTH,
)

user_comments = comments.merge(
    user_party,
    right_on="author",
    left_on="author",
    how="inner",
)

# Filter event data based on keywords
event_comments = user_comments[
    user_comments["body_cleaned"].str.contains(
        ELECTIONS_REGEX[YEAR],
        regex=True,
    )
]

## Tokenize and stem comments

In [13]:
event_comments["tokens"] = event_comments["body_cleaned"].apply(
    tokenize_comment,
    meta=("tokens", "string"),
).persist()

In [14]:
print(f"Nr of event comments: {len(event_comments)}")

Nr of event comments: 5964


In [15]:
save_event_comments(event_comments, EVENT_NAME, file_type="parquet")

## Build event vocabulary

In [16]:
# Read event data
# events_comments= load_event_comments(event_comments, EVENT_NAME, file_type="parquet")

event_vocab = build_vocab(
    event_comments["tokens"],
    min_words=MIN_OCCURENCE_FOR_VOCAB,
)

In [17]:
print(len(event_vocab))

2443


In [18]:
save_event_vocab(event_vocab, EVENT_NAME)