In [None]:
# First, let's load the Reddit API creds.
# `%load_ext dotenv` makes the `%dotenv` line magic available in this kernel
# (presumably via python-dotenv's IPython extension -- confirm it is installed).
%load_ext dotenv
# Each `%dotenv <path>` call loads variables from that env file into the process
# environment; paths are relative to the notebook's working directory.
# NOTE(review): assumes praw_creds.env defines REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET -- confirm.
%dotenv ../ingest/praw_creds.env
# NOTE(review): presumably ../.env is where VERSION comes from -- confirm.
%dotenv ../.env

import os

# Reddit API credentials, read from the process environment.
# `.get` yields None when a variable is absent; these are passed straight to
# the Reddit client constructor further down.
REDDIT_CLIENT_ID = os.environ.get("REDDIT_CLIENT_ID")
REDDIT_CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET")

# Subreddit whose comments this notebook explores.
SUBREDDIT = "politics"

# Version tag embedded in the user agent, suffixed to mark traffic from this EDA notebook.
# Fall back to a placeholder when VERSION is unset: `None + "-eda"` would otherwise
# fail here with an opaque TypeError instead of pointing at the missing variable.
VERSION = os.environ.get("VERSION", "dev") + "-eda"

In [None]:
import asyncpraw

"""
Set up the Reddit client instance.
I'm using a read-only PRAW instance because I have no need to post comments.
I'm using async PRAW pretty much just because `ingest` does so.

Credentials need to be supplied via env var.
"""
# Build the user-agent string up front so the constructor call stays short.
# It embeds the notebook's VERSION tag.
reddit_user_agent = f"python:vivshaw/politeiamancer:{VERSION} (by /u/vivshaw)"

# Only the app's client ID and secret are supplied -- no user login.
reddit = asyncpraw.Reddit(
    user_agent=reddit_user_agent,
    client_secret=REDDIT_CLIENT_SECRET,
    client_id=REDDIT_CLIENT_ID,
)

In [None]:
"""
Let's load some comments!
"""
subreddit = await reddit.subreddit(SUBREDDIT)

comments = []

async for comment in subreddit.comments(limit=100):
    comment_as_dict = {
        # ID
        "fullname": comment.name,
        # Comment details
        "author": comment.author,
        "body": comment.body,
        "permalink": comment.permalink,
        # Time
        "created_utc": int(comment.created_utc),
    }
    comments.append(comment_as_dict)

In [None]:
import pandas as pd

"""
Now that we've loaded them, we need to get 'em into a Pandas dataframe.
"""

# One row per fetched comment, one column per key of the per-comment dicts.
df = pd.DataFrame(data=comments)
# Peek at the first few rows; a bare last expression gets rich display in Jupyter.
df.head(n=5)

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

"""
Time for some sentiment analysis!
"""


def sentiment_score(text: str) -> dict:
    """
    Calculate sentiment scores for a piece of text using VADER.

    The analyzer is built on the first call and cached on the function object,
    so applying this over a whole column constructs it once instead of once
    per row.

    Args:
        text: The text to score.

    Returns:
        The mapping produced by `SentimentIntensityAnalyzer.polarity_scores`
        for `text`. The rest of this notebook reads its "neg", "neu", "pos"
        and "compound" entries.
    """
    analyzer = getattr(sentiment_score, "_analyzer", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        sentiment_score._analyzer = analyzer
    return analyzer.polarity_scores(text)


# Score every comment body, then expand the per-comment score dicts into columns.
# Building the frame from a list of dicts (aligned on df's index) avoids the slow
# row-by-row `.apply(pd.Series)` expansion.
ratings_df = pd.DataFrame(df["body"].apply(sentiment_score).tolist(), index=df.index)

# Drop score columns left over from a previous run of this cell before concatenating.
# Otherwise a re-run appends duplicate columns, and `df["neg"]` etc. would return
# a DataFrame instead of a Series.
df = pd.concat(
    [df.drop(columns=ratings_df.columns, errors="ignore"), ratings_df],
    axis=1,
)

In [None]:
"""
Let's see some summary stats.
"""

# Descriptive statistics for the sentiment score columns concatenated onto `df`.
df.loc[:, ["neg", "neu", "pos", "compound"]].describe()

In [None]:
"""
How 'bout looking at our most-negative, most-neutral, and most-positive comment?
"""

def _print_extreme_comment(frame, column, label, lowest=False):
    """
    Print the body of the comment with the highest (or lowest) value in `column`.

    Args:
        frame: DataFrame with a "body" column and the score column to rank by.
        column: Name of the score column.
        label: Text for the header line, printed as "Most {label} comment:".
        lowest: When True, pick the row with the minimum score instead of the maximum.

    Returns:
        The selected row, so the caller can keep it for further inspection.
    """
    scores = frame[column]
    row = frame.loc[scores.idxmin() if lowest else scores.idxmax()]
    print(f"Most {label} comment:")
    print(row["body"])
    # Blank lines between entries so long comment bodies stay visually separated.
    print("\n")
    return row


most_negative = _print_extreme_comment(df, "neg", "negative")
most_neutral = _print_extreme_comment(df, "neu", "neutral")
most_positive = _print_extreme_comment(df, "pos", "positive")

"""
OK, how about by compound score?
"""
compound_most_negative = _print_extreme_comment(
    df, "compound", "negative compound score", lowest=True
)
compound_most_positive = _print_extreme_comment(
    df, "compound", "positive compound score"
)

# Alright, seems like compound scores are the way to go. The raw scores are not that informative.

In [None]:
from nrclex import NRCLex

"""
Next up, we'll analyze some emotional valence.
"""
# Emotion categories tracked throughout the rest of the notebook.
emotions = [
    "anger",
    "disgust",
    "fear",
    "joy",
    "sadness",
    "surprise",
    "trust",
]


def emotion_scores(text):
    """
    Score `text` for each tracked emotion using NRCLex.

    Returns a dict with one entry per name in `emotions`, in that order. An
    emotion that is absent from NRCLex's affect frequencies for this text is
    reported as 0.
    """
    reported = NRCLex(text).affect_frequencies
    return {name: (reported[name] if name in reported else 0) for name in emotions}


# Score every comment body, then expand the per-comment emotion dicts into columns.
# Building the frame from a list of dicts (aligned on df's index) avoids the slow
# row-by-row `.apply(pd.Series)` expansion.
emotions_df = pd.DataFrame(df["body"].apply(emotion_scores).tolist(), index=df.index)

# Drop emotion columns left over from a previous run of this cell before concatenating.
# Otherwise a re-run appends duplicate columns, and `df["anger"]` etc. would return
# a DataFrame instead of a Series.
df = pd.concat(
    [df.drop(columns=emotions_df.columns, errors="ignore"), emotions_df],
    axis=1,
)

In [None]:
"""
Let's see some summary stats.
"""
# Descriptive statistics for every emotion score column.
df.loc[:, emotions].describe()

In [None]:
"""
OK, most emotional comments?
"""
def _print_most_emotional_comment(frame, emotion, label):
    """
    Print the body of the comment with the highest score for `emotion`.

    Args:
        frame: DataFrame with a "body" column and one column per emotion.
        emotion: Name of the emotion score column to rank by.
        label: Adjective for the header line, printed as "Most {label} comment:".

    Returns:
        The selected row, so the caller can keep it for further inspection.
    """
    row = frame.loc[frame[emotion].idxmax()]
    print(f"Most {label} comment:")
    print(row["body"])
    return row


most_angry = _print_most_emotional_comment(df, "anger", "angry")
most_disgusted = _print_most_emotional_comment(df, "disgust", "disgusted")
most_fearful = _print_most_emotional_comment(df, "fear", "fearful")
most_joyful = _print_most_emotional_comment(df, "joy", "joyful")
most_sad = _print_most_emotional_comment(df, "sadness", "sad")
most_surprised = _print_most_emotional_comment(df, "surprise", "surprised")
most_trustful = _print_most_emotional_comment(df, "trust", "trustful")


# Alright, emotional analysis is not amazingly accurate. But it's worth poking at.

In [None]:
"""
Let's grab the comments mentioning Trump.
"""
# Lower-case every body once so the substring checks are case-insensitive.
# `bodies` is reused by the Biden filter further down.
bodies = df["body"].str.lower()

# Boolean mask: True where the lower-cased body contains "trump".
trump_filter = bodies.str.contains("trump")
trump_df = df[trump_filter]
trump_df.head(n=5)

In [None]:
# Summary statistics for the compound sentiment score and every emotion score,
# restricted to comments mentioning Trump. The column list is derived from
# `emotions` (as in the Biden summary) instead of being spelled out again, so
# the two summaries cannot drift apart.
trump_df[["compound"] + emotions].agg(
    ["count", "min", "max", "mean", "median", "skew", "std"]
)

In [None]:
"""
Now, those referring to Biden.
"""

# Boolean mask over the lower-cased `bodies`: True where the body contains "biden".
biden_filter = bodies.str.contains("biden")
biden_df = df[biden_filter]
biden_df.head(n=5)

In [None]:
# Summary statistics for the compound sentiment score and every emotion score,
# restricted to comments mentioning Biden.
biden_df[["compound", *emotions]].agg(
    func=["count", "min", "max", "mean", "median", "skew", "std"]
)

In [None]:
# What do the emotional deltas look like?
# Per-column mean of the Biden subset minus the mean of the Trump subset, so a
# positive value means the Biden comments average higher on that score.
# Both sides build the column list from `emotions` so they always compare the
# same set of columns.
(
    biden_df[["compound"] + emotions].mean()
    - trump_df[["compound"] + emotions].mean()
)

In [None]:
import seaborn as sns

# Apply seaborn's default theme before drawing anything.
sns.set_theme()

# Pairwise plots of the compound sentiment score against every emotion score.
sns.pairplot(df[["compound", *emotions]])

In [None]:
# After a number of runs, it seems like there aren't any big patterns emerging that I'd want to build a model for.
# That's OK- my main goal is just to stream and visualize the data anyways!
# What I think I _have_ seen is:
#  - Biden results seem to have consistently higher skewness than Trump, as if Biden has a fatter positive-sentiment tail or is skewed by negative outliers