In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import os
import matplotlib.pyplot as plt
import praw

import re
import string

In [None]:
# Load the Reddit API credentials from local text files (one value per file).
# strip() removes surrounding whitespace such as the trailing newline that most
# editors append; without it the newline would become part of the credential.
with open('../../../.secret/reddit/ZSDSFI_client_id.txt') as f:
    client_id = f.read().strip()

with open('../../../.secret/reddit/ZSDSFI_client_secret.txt') as f:
    client_secret = f.read().strip()

In [None]:
# Build the PRAW client from the credentials loaded above.
# Only the app id/secret and a user agent are supplied (no username/password).
reddit = praw.Reddit(
    user_agent="ZS",
    client_secret=client_secret,
    client_id=client_id,
)

In [None]:
def get_reddit_posts(
    reddit_praw: "praw.reddit.Reddit",
    sub_name: str,
    sort_by: str,
    time_filter: str = None,
    limit: int = 1000,
    return_time = True
    ):
    """Collect submissions from a subreddit into a one-column DataFrame.

    Each row of the returned DataFrame holds one PRAW submission object in the
    'submission' column. These objects carry the body, comments, and other
    aspects of a Reddit post.

    Parameters
    ----------
    reddit_praw : praw.reddit.Reddit
        PRAW client used to look up the subreddit.
    sub_name : str
        Name of the subreddit (without the 'r/' prefix).
    sort_by : str
        Listing to read: 'new', 'hot', or 'top'.
    time_filter : str, optional
        Only used when sort_by is 'top'. Selects the period of top posts to
        retrieve (hour, day, week, month, year, all). When omitted, 'all' is
        used, matching PRAW's own default.
    limit : int
        Maximum number of submissions to request.
    return_time : bool
        When True (default), also return the time the function was run.

    Returns
    -------
    (DataFrame, datetime.datetime) if return_time is True, otherwise DataFrame.
    It is important to note when a set of submissions was retrieved, since
    subreddits are getting new submissions constantly and Reddit only keeps
    1000 of them on the website at any given time.

    If sort_by is not supported, a message is printed and None is returned.
    """
    time_run = dt.datetime.now()

    subreddit = reddit_praw.subreddit(sub_name)

    if sort_by == 'new':
        submissions = subreddit.new(limit=limit)
    elif sort_by == 'hot':
        submissions = subreddit.hot(limit=limit)
    elif sort_by == 'top':
        # PRAW validates time_filter against a fixed set of periods, so None
        # must not be forwarded; fall back to the library default instead.
        if time_filter is None:
            time_filter = 'all'
        submissions = subreddit.top(limit=limit, time_filter=time_filter)
    else:
        print(f'Function currently does not support sorting by {sort_by}.')
        return

    # One single-element row per submission. Passing `columns=` to the
    # constructor (instead of assigning df.columns afterwards) keeps an empty
    # listing from raising a length-mismatch error.
    posts = [[submission] for submission in submissions]
    df = pd.DataFrame(posts, columns=['submission'])

    # Only mention a time period when it was actually applied (top listings).
    if sort_by == 'top':
        if time_filter == 'all':
            tf_str = ' (from all time) '
        else:
            tf_str = f' (in the past {time_filter}) '
    else:
        tf_str = ' '

    print(f'Collected {len(posts)} {sort_by.upper()} submissions{tf_str}from r/{sub_name} as of {time_run.strftime("%m/%d/%Y at %H:%M")}.')

    if return_time:
        return (df, time_run)
    else:
        return df

In [None]:
def get_posts_data(
        submission_df,
        time_run: dt.datetime = None,
        drop_24 = False
        ):
    """Return a DataFrame with one column per extracted submission field.

    Takes in a DataFrame output by get_reddit_posts() containing a 'submission'
    column, which should consist of PRAW submission objects. The input frame is
    copied, not modified. The following columns are appended, in this order:
    title, created_utc, datetime, id, url, selftext, post_hint, score,
    upvote_ratio, num_comments, comments.

    'datetime' is created_utc converted with datetime.fromtimestamp(), i.e. a
    naive timestamp in the machine's local time zone. 'post_hint' is None for
    submissions that do not carry that attribute.

    Parameters
    ----------
    submission_df : DataFrame
        Frame with a 'submission' column.
    time_run : datetime.datetime, optional
        Reference time for the drop_24 filter (normally the retrieval time
        returned by get_reddit_posts()). Defaults to the current time when
        drop_24 is True and no value is given.
    drop_24 : bool
        If True, drops any submissions less than 24 hours old relative to
        time_run, which may not have a sufficient number of comments/votes
        for analysis.
    """
    df = submission_df.copy()
    submissions = df['submission']

    def _column(attribute):
        # Read one attribute from every submission object.
        return submissions.apply(lambda sub: getattr(sub, attribute))

    df['title'] = _column('title')
    df['created_utc'] = _column('created_utc')
    df['datetime'] = df['created_utc'].apply(lambda ts: dt.datetime.fromtimestamp(ts))
    df['id'] = _column('id')
    df['url'] = _column('url')
    df['selftext'] = _column('selftext')
    # Look the attribute up in the instance dict rather than with getattr():
    # only attributes already present on the object are consulted.
    df['post_hint'] = submissions.apply(lambda sub: vars(sub).get('post_hint'))
    df['score'] = _column('score')
    df['upvote_ratio'] = _column('upvote_ratio')
    df['num_comments'] = _column('num_comments')

    # There is occasionally an HTTP 429 error when reading comments, but simply
    # trying again usually works, so retry once. `Exception` (not a bare
    # except) is caught so that interrupting the kernel still stops the run.
    try:
        df['comments'] = _column('comments')
    except Exception:
        df['comments'] = _column('comments')

    if drop_24:
        # Without a reference time the cutoff cannot be computed; fall back to
        # "now" instead of failing on `None - timedelta`.
        if time_run is None:
            time_run = dt.datetime.now()
        cutoff = time_run - dt.timedelta(days=1)
        df = df[df['datetime'] < cutoff]

    return df

___
## New

In [None]:
# Request up to 1000 submissions from the NEW listing of r/askreddit and keep
# the timestamp of the request alongside the frame.
df_new, time_run_new = get_reddit_posts(
    reddit_praw=reddit,
    sub_name='askreddit',
    sort_by='new',
    limit=1000,
)
print(time_run_new)

In [None]:
df_new

In [None]:
# Expand each submission object into flat columns.
# If Reddit answers with HTTP 429, running this cell a second time may be needed.

df_new = get_posts_data(
    submission_df=df_new,
    time_run=None,
    drop_24=False,
)

In [None]:
df_new.head()

In [None]:
df_new.shape

In [None]:
df_new['datetime'].max()

In [None]:
df_new['datetime'].min()

In [None]:
# # Keeping only posts that are at least 24 hours old

# df_new_all = df_new.copy()

# df_new = df_new[df_new['datetime'] < time_run_new - dt.timedelta(days=1)]

In [None]:
df_new['datetime'].max()

In [None]:
df_new.shape

In [None]:
df_new.head()

In [None]:
df_new.info()

In [None]:
# Persist the processed frame; the file name carries the retrieval date
# (month_day_year), so a second run on the same day overwrites the file.
# Create the target directory first so saving works when ./data is missing.
os.makedirs('./data', exist_ok=True)
df_new.to_pickle(
    f'./data/askreddit_new_reddit_data_{time_run_new.strftime("%m_%d_%Y")}.pkl'
)

In [None]:
# # Replace string below with appropriate file name

# reddit_data_file = './data/askreddit_new_reddit_data_09_15_2023.pkl'

# df_new = pd.read_pickle(reddit_data_file)

In [None]:
# df_new.head()

___
## Top (year)

In [None]:
# Request up to 1000 submissions from the TOP listing of r/askreddit,
# restricted to the past year, and keep the timestamp of the request.
df_top_year, time_run_top_year = get_reddit_posts(
    reddit_praw=reddit,
    sub_name='askreddit',
    sort_by='top',
    time_filter='year',
    limit=1000,
)

In [None]:
df_top_year

In [None]:
# Expand each submission object into flat columns.
# If Reddit answers with HTTP 429, running this cell a second time may be needed.

df_top_year = get_posts_data(submission_df=df_top_year)

In [None]:
df_top_year

In [None]:
df_top_year.shape

In [None]:
df_top_year['datetime'].max()

In [None]:
df_top_year['datetime'].min()

In [None]:
# Persist the processed frame; the file name carries the retrieval date
# (month_day_year), so a second run on the same day overwrites the file.
# Create the target directory first so saving works when ./data is missing.
os.makedirs('./data', exist_ok=True)
df_top_year.to_pickle(
    f'./data/askreddit_top_year_reddit_data_{time_run_top_year.strftime("%m_%d_%Y")}.pkl'
)

In [None]:
# # Replace string below with appropriate file name

# reddit_data_file = './data/askreddit_top_year_reddit_data_09_15_2023.pkl'

# df_top_year = pd.read_pickle(reddit_data_file)

In [None]:
# df_top_year.head()

___
## Top (month)

In [None]:
# Request up to 1000 submissions from the TOP listing of r/askreddit,
# restricted to the past month, and keep the timestamp of the request.
df_top_month, time_run_top_month = get_reddit_posts(
    reddit_praw=reddit,
    sub_name='askreddit',
    sort_by='top',
    time_filter='month',
    limit=1000,
)

In [None]:
df_top_month

In [None]:
# Expand each submission object into flat columns.
# If Reddit answers with HTTP 429, running this cell a second time may be needed.

df_top_month = get_posts_data(submission_df=df_top_month)

In [None]:
df_top_month

In [None]:
df_top_month.shape

In [None]:
df_top_month['datetime'].max()

In [None]:
df_top_month['datetime'].min()

In [None]:
# Persist the processed frame; the file name carries the retrieval date
# (month_day_year), so a second run on the same day overwrites the file.
# Create the target directory first so saving works when ./data is missing.
os.makedirs('./data', exist_ok=True)
df_top_month.to_pickle(
    f'./data/askreddit_top_month_reddit_data_{time_run_top_month.strftime("%m_%d_%Y")}.pkl'
)

In [None]:
# # Replace string below with appropriate file name

# reddit_data_file = './data/askreddit_top_month_reddit_data_09_15_2023.pkl'

# df_top_month = pd.read_pickle(reddit_data_file)

In [None]:
# df_top_month.head()

___
## Top (all)

In [None]:
# Request up to 1000 submissions from the all-time TOP listing of r/askreddit
# and keep the timestamp of the request.
df_top_all, time_run_top_all = get_reddit_posts(
    reddit_praw=reddit,
    sub_name='askreddit',
    sort_by='top',
    time_filter='all',
    limit=1000,
)

In [None]:
df_top_all

In [None]:
# Expand each submission object into flat columns.
# If Reddit answers with HTTP 429, running this cell a second time may be needed.

df_top_all = get_posts_data(submission_df=df_top_all)

In [None]:
df_top_all

In [None]:
df_top_all.shape

In [None]:
df_top_all['datetime'].max()

In [None]:
df_top_all['datetime'].min()

In [None]:
# Persist the processed frame; the file name carries the retrieval date
# (month_day_year), so a second run on the same day overwrites the file.
# Create the target directory first so saving works when ./data is missing.
os.makedirs('./data', exist_ok=True)
df_top_all.to_pickle(
    f'./data/askreddit_top_all_reddit_data_{time_run_top_all.strftime("%m_%d_%Y")}.pkl'
)

In [None]:
# # Replace string below with appropriate file name

# reddit_data_file = './data/askreddit_top_all_reddit_data_09_15_2023.pkl'

# df_top_all = pd.read_pickle(reddit_data_file)

In [None]:
# df_top_all.head()

___
## Hot

In [None]:
# Request up to 1000 submissions from the HOT listing of r/askreddit and keep
# the timestamp of the request.
df_hot, time_run_hot = get_reddit_posts(
    reddit_praw=reddit,
    sub_name='askreddit',
    sort_by='hot',
    limit=1000,
)

In [None]:
df_hot

In [None]:
# Expand each submission object into flat columns.
# If Reddit answers with HTTP 429, running this cell a second time may be needed.

df_hot = get_posts_data(submission_df=df_hot)

In [None]:
df_hot

In [None]:
df_hot.shape

In [None]:
df_hot['datetime'].max()

In [None]:
df_hot['datetime'].min()

In [None]:
# Persist the processed frame; the file name carries the retrieval date
# (month_day_year), so a second run on the same day overwrites the file.
# Create the target directory first so saving works when ./data is missing.
os.makedirs('./data', exist_ok=True)
df_hot.to_pickle(
    f'./data/askreddit_hot_reddit_data_{time_run_hot.strftime("%m_%d_%Y")}.pkl'
)

In [None]:
# # Replace string below with appropriate file name

# reddit_data_file = './data/askreddit_hot_reddit_data_09_15_2023.pkl'

# df_hot = pd.read_pickle(reddit_data_file)

In [None]:
# df_hot.head()