In [6]:
import praw
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
import pandas as pd
import datetime

import secrets
import config

Hugging Face models used in this notebook:
* [News Classifier](https://huggingface.co/mrm8488/bert-mini-finetuned-age_news-classification)
* [Sentiment Analysis Pipeline](https://huggingface.co/docs/transformers/main_classes/pipelines)

In [2]:
# Load the news-topic classifier. The tokenizer and the sequence-classification
# model are both looked up by the identifiers stored in `config`.
news_tokenizer = AutoTokenizer.from_pretrained(config.HF_TOKENIZER_NEWS_CLASSIFIER)
news_model = AutoModelForSequenceClassification.from_pretrained(config.HF_MODEL_NEWS_CLASSIFIER)

# Name the sentiment checkpoint explicitly instead of relying on the pipeline's
# implicit default. This is the checkpoint the default resolved to when this
# cell was last run; pinning it keeps results stable if the default changes
# and silences the "No model was supplied" warning.
sentiment_model = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [3]:
# Build the PRAW client from the credentials kept in the local `secrets` module,
# then obtain a handle on the subreddit(s) named by config.NEWS_SUBREDDITS.
reddit = praw.Reddit(
    user_agent=secrets.REDDIT_API_USER_AGENT,
    client_secret=secrets.REDDIT_API_CLIENT_SECRET,
    client_id=secrets.REDDIT_API_CLIENT_ID,
)
subreddit = reddit.subreddit(config.NEWS_SUBREDDITS)

In [None]:
# Collect submissions from the subreddit stream until we hold exactly
# config.NUM_SUBMISSION_TO_GET of them.
submissions = []

for submission in subreddit.stream.submissions():
    submissions.append(submission)
    # `>=` rather than `>`: stop at the requested count, not one past it.
    # The check stays after the append because checking first would pull one
    # more item from the stream only to discard it.
    if len(submissions) >= config.NUM_SUBMISSION_TO_GET:
        break

In [7]:
class RedditSubmission:
    """A Reddit submission together with the labels inferred from its title.

    On construction the title is classified by the news-topic model and the
    sentiment pipeline, unless the caller supplies those labels directly.
    """

    subreddit: str
    title: str
    # Timezone-aware UTC datetime built from the epoch value given to __init__.
    time_created: datetime.datetime
    author: str
    # One of config.NEWS_CLASSES.
    inference_subject: str
    # The "label" field of the sentiment pipeline's first result.
    inference_sentiment: str

    def __init__(self, subreddit: str, title: str, time_created: float, author: str, inference_subject: str = None, inference_sentiment: str = None):
        """Store the submission fields and fill in any missing inference labels.

        Args:
            subreddit: Name of the subreddit the submission was posted to.
            title: Submission title; this is the text both models are run on.
            time_created: Creation time as seconds since the Unix epoch.
            author: Name of the submitting account.
            inference_subject: Precomputed news subject. When None (default)
                the news classifier is run on the title.
            inference_sentiment: Precomputed sentiment label. When None
                (default) the sentiment pipeline is run on the title.
        """
        self.subreddit = subreddit
        self.title = title
        self.time_created = self.convert_time_to_datetime(time_created)
        self.author = author
        # Honour labels supplied by the caller; only run a model for the
        # ones that were left out.
        if inference_subject is None:
            inference_subject = self.run_subject_analysis()
        if inference_sentiment is None:
            inference_sentiment = self.run_sentiment_analysis()
        self.inference_subject = inference_subject
        self.inference_sentiment = inference_sentiment

    def convert_time_to_datetime(self, time_created) -> datetime.datetime:
        """Convert an epoch timestamp (seconds) into a UTC-aware datetime.

        Passing an explicit UTC tzinfo keeps the result independent of the
        timezone of the machine running the notebook.
        """
        return datetime.datetime.fromtimestamp(time_created, tz=datetime.timezone.utc)

    def map_news_output_to_class(self, inference_output: torch.Tensor) -> str:
        """Return the config.NEWS_CLASSES entry for the highest-scoring output.

        Args:
            inference_output: 1-D tensor of per-class scores from the news
                classifier, ordered like config.NEWS_CLASSES.
        """
        best_index = int(torch.argmax(inference_output))
        return config.NEWS_CLASSES[best_index]

    def run_subject_analysis(self) -> str:
        """Classify the title with the news classifier and return its class name."""
        inputs = news_tokenizer(self.title, return_tensors="pt")
        # Inference only: no labels are passed (so no loss is computed) and
        # gradient tracking is disabled.
        with torch.no_grad():
            outputs = news_model(**inputs)  # unpack tokenizer output into keyword args
        # Batch size is 1, so row 0 holds the scores for this title.
        return self.map_news_output_to_class(outputs.logits[0])

    def run_sentiment_analysis(self) -> str:
        """Run the sentiment pipeline on the title and return the predicted label."""
        sentiment = sentiment_model(self.title)
        return sentiment[0]["label"]

In [8]:
# Wrap each raw PRAW submission in a RedditSubmission, which runs both models
# on the title as it is constructed.
reddit_submission_objects = []

for submission in submissions:
    # PRAW exposes `subreddit` and `author` as model objects, not strings.
    # Convert them here so the stored fields are plain strings, as the class
    # annotates them. `author` can be missing (e.g. a deleted account), so
    # keep None rather than the literal string "None".
    reddit_submission = RedditSubmission(
        str(submission.subreddit),
        submission.title,
        submission.created_utc,
        str(submission.author) if submission.author is not None else None,
    )
    reddit_submission_objects.append(reddit_submission)

In [None]:
# Turn every RedditSubmission into a dict of its instance attributes and show
# the resulting table (one row per submission, one column per attribute).
pd.DataFrame(list(map(vars, reddit_submission_objects)))

Next up:
1. How do I find the top 100 posts of all time from my favorite subreddits?
2. How do I parse comments from the post?
3. And finally, how do I parse replies from that comment?
4. Bonus! If you have time, browse Hugging Face and try to find an out-of-the-box model to apply to your favorite Reddit data. Even if you can't code it up, how would you implement the algorithm(s), given enough time?