In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Change into the repository root (presumably so the project-local `load` and
# `preprocessing` packages imported below resolve — confirm).
# NOTE(review): both paths are absolute and machine-specific; adjust to your checkout.
# %cd '/workspaces/polarization_reddit'
%cd '/home/xavi_oliva/Documents/Github/polarization_reddit'

/home/xavi_oliva/Documents/Github/polarization_reddit


In [22]:
import time

import pandas as pd
import numpy as np

# from load.utils import load_comments
from load.utils import (
    load_users,
    load_user_party,
    load_user_party_parquet,
    load_subreddits,
    load_comments,
    save_df_as_json,
    save_df_as_parquet,
    load_comments_dask,
)
from preprocessing.utils import (
    tokenize_comment,
    calculate_user_party,
    load_event_comments,
    save_event_comments,
    build_vocab,
    save_event_vocab,
)

from preprocessing.constants import (
    EVENTS_DIR,
    ELECTIONS_REGEX,
    MIN_OCCURENCE_FOR_VOCAB,
)

In [3]:
# from dask.distributed import Client, LocalCluster

# cluster = LocalCluster(
#     environ={"MALLOC_TRIM_THRESHOLD_": "65536"},
#     processes=False,
# )

# client = Client(cluster)
# print(client)


In [4]:
# Event configuration. YEAR, START_MONTH and STOP_MONTH are passed to
# load_comments() below; the recorded run iterated over 12 months for 1..12.
YEAR = 2012
START_MONTH = 1
STOP_MONTH = 12

# Name under which the event comments and the event vocabulary are saved.
EVENT_NAME = f"us_elections_{YEAR}"

## User Affiliation

In [5]:
# Load the subreddit table; the cells below rely on its "subreddit" and
# "party" columns.
subreddits = load_subreddits()

In [6]:
# Keep only the subreddits whose party label is "dem" or "rep".
# TODO: take into account network structure to find other partisan subreddits
# which are not labeled
subreddits = subreddits.loc[
    lambda frame: frame["party"].isin({"dem", "rep"})
]

In [7]:
# Number of (non-null) subreddit names per party label.
subreddits.groupby("party")["subreddit"].count()

party
dem    43
rep    19
Name: subreddit, dtype: int64

In [8]:
# Load the comments for the configured year and month range.
# Slow step: the recorded run needed about 6 minutes for 12 months.
print("Load comments...")

comments = load_comments(
    year=YEAR,
    start_month=START_MONTH,
    stop_month=STOP_MONTH,
)


Load comments...


Months: 100%|██████████| 12/12 [06:09<00:00, 30.77s/it]


In [9]:
print("Merge party information to comments...")
# Inner join on the subreddit name: a comment survives only if its subreddit is
# present in `subreddits`, and it is tagged with that subreddit's party.
# Only the author/subreddit columns of the comments are carried along.
comments_party = pd.merge(
    comments[["author", "subreddit"]],
    subreddits[["subreddit", "party"]],
    how="inner",
    on="subreddit",
)

Merge party information to comments...


In [10]:
# Sanity check: (rows, columns) of the comment/party join.
print(comments_party.shape)

(93629, 3)


In [11]:
# Apply `calculate_user_party` to each author's partisan comments, then discard
# the authors whose resulting `score` is exactly 0.
# Pandas variant. The earlier dask variant additionally passed
# meta={"dem_cnt": "int", "rep_cnt": "int", "score": "int", "party": "string"}
# to apply() and finished with .compute().
user_party = (
    comments_party
    .groupby(by="author")
    .apply(calculate_user_party)
    .loc[lambda scored: scored["score"] != 0]
)

In [12]:
# Move the `author` group key out of the index into a regular column.
# The guard keeps the cell safe to re-run: calling reset_index() a second time
# on the already-reset frame would insert a spurious "index" column, which would
# then be written to the parquet file.
if "author" not in user_party.columns:
    user_party = user_party.reset_index()

# Store both text columns with pandas' dedicated "string" dtype. astype() with a
# mapping returns a new frame instead of assigning column by column.
user_party = user_party.astype({"author": "string", "party": "string"})

In [13]:
# Report how many users received a party label: in total, then per party.
print(f"Nr of users: {len(user_party)}")

print(user_party.groupby(by="party")["author"].count())

Nr of users: 12305
party
dem    5074
rep    7231
Name: author, dtype: int64


In [14]:
# Preview the first rows of the per-user party table.
user_party.head()

Unnamed: 0,author,dem_cnt,rep_cnt,score,party
0,00TSZ,4,0,4,dem
1,00Uth,30,0,30,dem
2,00YLw,1,0,1,dem
3,00vz9,0,2,-2,rep
4,01BDh,2,0,2,dem


In [15]:
# Persist the per-user party table under a year-specific parquet file name.
# The target directory is decided inside save_df_as_parquet (not shown here).
save_df_as_parquet(
    data=user_party,
    target_file=f"user_party_{YEAR}.parquet",
)

In [16]:
# Drop the intermediate join; it is not referenced again in the cells below.
del comments_party

## Filter event comments

In [17]:
# print("Load user party")
# user_party = load_user_party_parquet(year=YEAR)
# comments = load_comments_dask(
#     year=YEAR,
#     start_month=START_MONTH,
#     stop_month=STOP_MONTH,
# )

In [18]:
# Size of the full comment table before any user or keyword filtering.
comments.shape

(7625490, 4)

In [19]:
# Load the user table; only its "author" and "gender" columns are used below.
users = load_users()

In [20]:
# Inner join against the user table: comments whose author is not listed in
# `users` are dropped, and every remaining comment gains the author's `gender`.
# NOTE(review): the progress message implies that `users` excludes bots and
# automoderators — confirm against load_users().
print("Filter out bots & automoderators from comments...")
user_comments = comments.merge(
    users[["author", "gender"]],
    on="author",
    how="inner",
)

Filter out bots & automoderators information to comments...


In [23]:
# Attach each author's party information (inner join: authors without an entry
# in `user_party` are dropped).
# The merged frame gets its own name instead of overwriting `user_comments`, so
# re-running this cell does not merge `user_party` a second time and end up with
# suffixed duplicate columns (`party_x`, `party_y`, ...).
partisan_comments = user_comments.merge(
    user_party,
    on="author",
    how="inner",
)

# Filter event data based on keywords.
# - na=False: a missing `body_cleaned` counts as "no match" instead of putting
#   NaN into the boolean mask.
# - .copy(): makes `event_comments` an independent frame, so adding columns to
#   it later does not raise SettingWithCopyWarning.
event_comments = partisan_comments[
    partisan_comments["body_cleaned"].str.contains(
        ELECTIONS_REGEX[YEAR],
        regex=True,
        na=False,
    )
].copy()

# The merged intermediate is not needed once the event subset has been copied.
del partisan_comments

In [24]:
# Release the large intermediate frames; the remaining cells only use
# `event_comments`.
del user_party, users, subreddits, comments, user_comments

## Tokenize and stem comments

In [25]:
# Run `tokenize_comment` on every comment body and store the result in a
# `tokens` column.
# `assign` builds a new frame instead of writing into `event_comments` in place:
# this avoids pandas' SettingWithCopyWarning when the frame comes from a
# boolean-mask selection, and the cell can be re-run safely (the column is
# simply recomputed).
event_comments = event_comments.assign(
    tokens=event_comments["body_cleaned"].apply(tokenize_comment),
)

In [26]:
# Number of comments that matched the event keywords.
print(f"Nr of event comments: {len(event_comments)}")

Nr of event comments: 960942


In [27]:
# Persist the tokenized event comments under the event name, as parquet.
save_event_comments(event_comments, EVENT_NAME, file_type="parquet")

## Build event vocabulary

In [28]:
# To resume from this point without recomputing, the saved event comments could
# be reloaded with load_event_comments instead.
# NOTE(review): the previous commented-out call assigned to `events_comments`
# (typo) and passed `event_comments` as first argument — check the helper's
# actual signature before re-enabling something like:
# event_comments = load_event_comments(EVENT_NAME, file_type="parquet")

# Build the vocabulary from the tokenized comments.
# NOTE(review): `min_comment_freq` presumably drops tokens that occur in fewer
# comments than MIN_OCCURENCE_FOR_VOCAB — confirm in build_vocab.
event_vocab = build_vocab(
    event_comments["tokens"],
    min_comment_freq=MIN_OCCURENCE_FOR_VOCAB,
)

In [29]:
# Vocabulary size.
print(len(event_vocab))

246399


In [30]:
# Persist the vocabulary under the event name.
save_event_vocab(event_vocab, EVENT_NAME)