In [25]:
%load_ext autoreload
%autoreload 2

# %cd '/workspaces/polarization_reddit
%cd '/home/xavi_oliva/Documents/Github/polarization_reddit'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/home/xavi_oliva/Documents/Github/polarization_reddit


In [26]:
import time

import pandas as pd
import numpy as np

# from load.utils import load_comments
from load.utils import (
    load_users,
    load_user_party,
    load_user_party_parquet,
    load_subreddits,
    load_comments,
    save_df_as_json,
    save_df_as_parquet,
    load_comments_dask,
)
from preprocessing.utils import (
    tokenize_comment,
    calculate_user_party,
    load_event_comments,
    save_event_comments,
    build_vocab,
    save_event_vocab,
)

from preprocessing.constants import (
    EVENTS_DIR,
    ELECTIONS_REGEX,
    MIN_OCCURENCE_FOR_VOCAB,
)

In [27]:
# from dask.distributed import Client, LocalCluster

# cluster = LocalCluster(
#     environ={"MALLOC_TRIM_THRESHOLD_": "65536"},
#     processes=False,
# )

# client = Client(cluster)
# print(client)


In [28]:
YEAR = 2012
START_MONTH = 1
STOP_MONTH = 12

EVENT_NAME = f"us_elections_{YEAR}"

### User Affiliation

In [29]:
subreddits = load_subreddits()

In [30]:
# TODO: take into account network structure to find other partisan subreddits
# which are not labeled
# Filter partisan subreddits
subreddits = subreddits[subreddits["party"].isin({"dem", "rep"})]

In [31]:
subreddits.groupby("party")["subreddit"].count()

party
dem    43
rep    19
Name: subreddit, dtype: int64

In [33]:
print("Load comments...")

comments = load_comments(
    year=YEAR,
    start_month=START_MONTH,
    stop_month=STOP_MONTH,
)


Load comments...


Months:  75%|███████▌  | 9/12 [04:21<01:33, 31.28s/it]

: 

: 

In [None]:
print("Merge party information to comments...")
comments_party = comments[["author", "subreddit"]].merge(
    subreddits[["subreddit", "party"]],
    on="subreddit",
    how="inner",
)

Merge party information to comments...


In [None]:
print(comments_party.shape)

(22438, 3)


In [None]:
user_party = comments_party.groupby(by="author").apply(
    calculate_user_party,
    # meta={
    #     "dem_cnt": "int",
    #     "rep_cnt": "int",
    #     "score": "int",
    #     "party": "string",
    # },
)
user_party = user_party[user_party["score"] != 0] # .compute()

In [None]:
user_party = user_party.reset_index()
user_party["author"] = user_party["author"].astype("string")
user_party["party"] = user_party["party"].astype("string")

In [None]:
print(f"Nr of users: {len(user_party)}")

print(user_party.groupby(by="party")["author"].count())

Nr of users: 3966
party
dem    3966
Name: author, dtype: int64


In [None]:
user_party.head()

Unnamed: 0,author,dem_cnt,rep_cnt,score,party
0,00Uth,79,0,79,dem
1,02Fgp,11,0,11,dem
2,03CQj,3,0,3,dem
3,04jgp,3,0,3,dem
4,08lAf,1,0,1,dem


In [None]:
save_df_as_parquet(
    data=user_party,
    target_file=f"user_party_{YEAR}.parquet",
)

## Filter event comments

In [None]:
# print("Load user party")
# user_party = load_user_party_parquet(year=YEAR)
# comments = load_comments_dask(
#     year=YEAR,
#     start_month=START_MONTH,
#     stop_month=STOP_MONTH,
# )

In [None]:
comments.shape

(868535, 16)

In [None]:
user_comments = comments.merge(
    user_party,
    right_on="author",
    left_on="author",
    how="inner",
)

# Filter event data based on keywords
event_comments = user_comments[
    user_comments["body_cleaned"].str.contains(
        ELECTIONS_REGEX[YEAR],
        regex=True,
    )
]

## Tokenize and stem comments

In [None]:
event_comments["tokens"] = event_comments["body_cleaned"].apply(
    tokenize_comment,
    # meta=("tokens", "string"),
) # .persist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  event_comments["tokens"] = event_comments["body_cleaned"].apply(


In [None]:
print(f"Nr of event comments: {len(event_comments)}")

Nr of event comments: 211753


In [None]:
save_event_comments(event_comments, EVENT_NAME, file_type="parquet")

## Build event vocabulary

In [None]:
# Read event data
# events_comments= load_event_comments(event_comments, EVENT_NAME, file_type="parquet")

event_vocab = build_vocab(
    event_comments["tokens"],
    min_words=MIN_OCCURENCE_FOR_VOCAB,
)

In [None]:
print(len(event_vocab))

64247


In [None]:
save_event_vocab(event_vocab, EVENT_NAME)