In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Move the working directory one level up — presumably to the project root so the
# project packages imported by this notebook resolve (TODO confirm).
# NOTE(review): this is not idempotent; re-running the cell moves up another level.
%cd '..'

In [None]:
import seaborn as sns
sns.set_theme()
from matplotlib import pyplot as plt
import pandas as pd
import matplotlib as mpl

from preprocessing.utils import (
    load_event_comments,
    load_event_vocab,
)

from load.utils import (
    load_df_from_parquet
)

from events.mass_shootings_constants import (
    MASS_SHOOTINGS_EVENTS_INFO,
)
from eda.utils import plot_event_comments_distribution
from eda.constants import PARTIES_COLORS

In [None]:
# (theme, event_name) pairs for every event summarised below, grouped by theme.
# The list is flattened in declaration order: themes first, then events per theme.
EVENTS = [
    (theme, event_name)
    for theme, event_names in {
        "gun_control": (
            "mass_shootings_gun_control",
            "mass_shootings",
        ),
        "elections": (
            "us_elections_2012",
            "us_elections_2016",
            "us_midterms_2014",
            "us_midterms_2018",
        ),
        "abortion": ("abortion",),
    }.items()
    for event_name in event_names
]

In [None]:
# Print headline statistics for each configured event: total comments, distinct
# authors (overall and per party label) and distinct subreddits.
for event_theme, event_name in EVENTS:
    print(event_name)
    event_comments = load_event_comments(theme=event_theme, event_name=event_name)

    print(f"Number of comments: {len(event_comments)}")

    # dropna=False keeps a missing value as one distinct entry, matching len(unique()).
    n_users = event_comments["author"].nunique(dropna=False)
    print(f"Number of unique users: {n_users}")

    for party_code in ("dem", "rep"):
        in_party = event_comments["party"] == party_code
        n_party_users = event_comments.loc[in_party, "author"].nunique(dropna=False)
        print(f"Number of {party_code} users: {n_party_users}")

    n_subreddits = event_comments["subreddit"].nunique(dropna=False)
    print(f"Number of unique subreddits: {n_subreddits}")

In [None]:
# Theme / event selection used by the mass-shootings cells that follow.
THEME, EVENT_KEY = "gun_control", "mass_shootings"

In [None]:
# Load the comments for the selected event and drop every row belonging to the
# "charleston_church_shooting" event.
# NOTE(review): the reason for excluding that event is not visible here — confirm.
event_comments = (
    load_event_comments(theme=THEME, event_name=EVENT_KEY)
    .loc[lambda comments: comments["event_name"] != "charleston_church_shooting"]
)

In [None]:
# Comment counts per (event, party), labelled with each event's display name and
# date from MASS_SHOOTINGS_EVENTS_INFO, ordered chronologically and then by party.
data = (
    event_comments
    .groupby(["event_name", "party"], as_index=False)
    # "count" tallies the non-null ids in each group.
    .agg(number_comments=("id", "count"))
    .assign(
        # Order matters: the date lookup needs the original event key, so it must
        # run before event_name is replaced by the display name.
        event_date=lambda counts: counts["event_name"].apply(
            lambda event_key: MASS_SHOOTINGS_EVENTS_INFO[event_key]["date"]
        ),
        event_name=lambda counts: counts["event_name"].apply(
            lambda event_key: MASS_SHOOTINGS_EVENTS_INFO[event_key]["name"]
        ),
    )
    .sort_values(by=["event_date", "party"])
)


In [None]:
# Plot the per-event / per-party comment counts held in `data`.
# NOTE(review): THEME and EVENT_KEY are reassigned in a later cell, so this cell
# must run before that one for the arguments to refer to the mass-shootings event.
plot_event_comments_distribution(data, THEME, EVENT_KEY)

In [None]:
# Switch the selection to the 2016 US elections and load its comments.
# This rebinds THEME, EVENT_KEY and event_comments for the cells that follow.
THEME, EVENT_KEY = "elections", "us_elections_2016"

event_comments = load_event_comments(event_name=EVENT_KEY, theme=THEME)

In [None]:
# Bar plot of the number of distinct authors per calendar month, split by party.

# `created_utc` is parsed as seconds since the epoch so comments can be bucketed by month.
event_comments["date"] = pd.to_datetime(event_comments["created_utc"], unit="s")

# Distinct authors per (month, party). as_index=False keeps "date" and "party" as
# ordinary columns so they can be used for labelling below.
# NOTE: pandas >= 2.2 deprecates the "M" alias in favour of "ME"; "M" is kept here
# so the cell still runs on older pandas versions.
data = (
    event_comments
    .groupby([pd.Grouper(key="date", freq="M"), "party"], as_index=False)
    .agg(number_users=("author", "nunique"))
)

# Display columns: a "YYYY-MM" label for the x axis and a readable party name.
# Any party value other than "dem" is labelled "Republican".
data["month"] = data["date"].dt.strftime("%Y-%m")
data["Party"] = data["party"].apply(lambda x: "Democrat" if x == "dem" else "Republican")

# Draw on an explicit Axes instead of relying on pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(
    x="month",
    y="number_users",
    hue="Party",
    data=data,
    palette=PARTIES_COLORS,
    ax=ax,
)
# Thousands separators on the y axis; rotated month labels to avoid overlap.
ax.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'))
ax.tick_params(axis="x", labelrotation=45)
ax.set_xlabel("Time")
ax.set_ylabel(f"Number of users {EVENT_KEY}")

plt.show()

## Overall

In [None]:
# Print the same headline statistics for each yearly partisan-comments file,
# covering 2012 through 2019 (the range end is exclusive).
for year in range(2012, 2020):
    partisan_comments = load_df_from_parquet(
        file_name=f"user_partisan_comments_{year}.parquet",
    )
    is_dem = partisan_comments["party"].eq("dem")
    is_rep = partisan_comments["party"].eq("rep")

    print(f"Year: {year}")
    print(f"Number of comments: {partisan_comments.shape[0]}")
    # dropna=False keeps a missing value as one distinct entry, matching len(unique()).
    print(f"Number of unique users: {partisan_comments['author'].nunique(dropna=False)}")
    print(f"Number of dem users: {partisan_comments.loc[is_dem, 'author'].nunique(dropna=False)}")
    print(f"Number of rep users: {partisan_comments.loc[is_rep, 'author'].nunique(dropna=False)}")
    print(f"Number of unique subreddits: {partisan_comments['subreddit'].nunique(dropna=False)}")