# Extract Historical Reddit Data
Historical Reddit data is downloaded from the Pushshift API and saved as CSV files to aid ML model training.

In [212]:
import datetime as dt
import calendar
import configparser
import pandas as pd
import requests
import urllib
import time
import json

In [213]:
# Read passwords and secrets from config file
# The path is relative, so it is resolved against the current working directory.
config_parser = configparser.ConfigParser()
# read() returns the list of files it managed to parse; a missing file is
# skipped rather than raising, so an empty list here means nothing was loaded.
config_parser.read("src/configuration/config.cfg")

['src/configuration/config.cfg']

In [227]:
# Set parameters
# Target subreddit, taken from the [praw] section of the config file.
sub_reddit = config_parser["praw"]["subreddit"]

# Prefixes used when naming the output CSV files.
sub_file_prefix = "submissions"
comment_file_prefix = "comment"

# Extraction period: months 1 through 5 of 2021 (the range end is exclusive).
years = [2021]
months = range(1, 6)

# Submission fields kept from each API record.
sub_columns = [
    "id",
    "author_fullname",
    "title",
    "score",
    "author_premium",
    "domain",
    "over_18",
    "subreddit_id",
    "permalink",
    "parent_whitelist_status",
    "url",
    "created_utc",
    "num_comments",
    "upvote_ratio",
]

In [222]:
def get_subreddit_data(object_type, columns, username='', subreddit='', search_query='', max_time=None, min_time=1609459200):
    """Download Pushshift records created between ``min_time`` and ``max_time``.

    The Pushshift search endpoint is paged backwards (newest first, up to 500
    records per request). After each page the ``before`` cursor is moved to
    the oldest ``created_utc`` seen, so the next request continues from there.

    Args:
        object_type: Path segment of the search endpoint, e.g. "submission".
        columns: Names of the columns to keep. A column that is absent from
            a page is filled with NaN instead of raising.
        username: Optional author filter; "" disables it.
        subreddit: Optional subreddit filter; "" disables it.
        search_query: Optional full-text query; "" disables it.
        max_time: Upper bound in epoch seconds, sent to the API as ``before``.
            Defaults to the current time.
        min_time: Inclusive lower bound in epoch seconds. The default,
            1609459200, is 2021-01-01 00:00:00 UTC.

    Returns:
        A DataFrame holding exactly ``columns`` with a fresh 0..n-1 index.
        It is empty (but still has the requested columns) when nothing was
        found or when ``max_time <= min_time``.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        KeyError: If the response has no ``data`` key or the records carry
            no ``created_utc`` field.
    """
    # A bare ``import urllib`` does not guarantee the ``parse`` submodule is
    # loaded, so import the function explicitly.
    from urllib.parse import urlencode

    # Start from current time if not specified
    if max_time is None:
        max_time = int(time.time())

    # Generate filter string from the non-empty filters only
    filter_string = urlencode({
        key: value
        for key, value in (('author', username), ('subreddit', subreddit), ('q', search_query))
        if value != ""
    })

    url_format = "https://api.pushshift.io/reddit/search/{}/?size=500&sort=desc&{}&before={}"

    columns = list(columns)
    before = max_time
    pages = []

    # Loop through period to retrieve all records
    while before > min_time:
        resp = requests.get(url_format.format(object_type, filter_string, before), timeout=60)
        # Fail loudly on rate limiting / server errors instead of trying to
        # parse an error body as data.
        resp.raise_for_status()

        records = json.loads(resp.text)['data']
        if not records:
            # Nothing older is available; the window is exhausted.
            break

        dfi = pd.json_normalize(records)
        oldest = int(dfi['created_utc'].min())

        # The last page usually reaches past min_time; drop those records so
        # the result only contains the requested window.
        in_window = dfi[dfi['created_utc'] >= min_time]
        if not in_window.empty:
            # reindex keeps only the wanted columns and tolerates missing ones
            pages.append(in_window.reindex(columns=columns))

        # Guard against a cursor that does not move backwards, which would
        # otherwise repeat the same request forever.
        if oldest >= before:
            break

        # Continue before the earliest record already fetched so that no
        # duplicates are retrieved.
        before = oldest

    if not pages:
        return pd.DataFrame(columns=columns)

    # Concatenate once at the end rather than growing a frame page by page.
    return pd.concat(pages, ignore_index=True)


In [216]:
def get_daily_reddit_data(years, months, days, object_type, columns, subreddit, file_prefix, username=''):
    """Extract Reddit data one day at a time and write one CSV per day.

    For every valid (year, month, day) combination the records of that day
    are fetched with ``get_subreddit_data`` and saved to
    ``<file_prefix>_<YYYYMMDD>.csv``. A short summary is printed per day.

    Day boundaries are taken at midnight UTC so the files do not depend on
    the timezone of the machine running the extraction.

    Args:
        years: Iterable of years.
        months: Iterable of month numbers (1-12).
        days: Iterable of day numbers. Days that do not exist in a given
            month (e.g. 30 in February) are skipped.
        object_type: Forwarded to ``get_subreddit_data``.
        columns: Columns to keep; must include "id" for the duplicate check.
        subreddit: Subreddit filter forwarded to ``get_subreddit_data``.
        file_prefix: Prefix of the output file names.
        username: Optional author filter; "" (default) disables it.
    """
    one_day = dt.timedelta(days=1)

    for year in years:
        for month in months:
            days_in_month = calendar.monthrange(year, month)[1]

            for day in days:
                if day > days_in_month:
                    # Not a real calendar date in this month
                    continue

                ymd = year * 10000 + month * 100 + day

                # Set start and end date for request. Adding a timedelta
                # rolls over month and year ends correctly, unlike day + 1.
                day_start = dt.datetime(year, month, day, tzinfo=dt.timezone.utc)
                after_time = int(day_start.timestamp())
                before_time = int((day_start + one_day).timestamp())

                # Get the required Subreddit data at the daily level
                df = get_subreddit_data(
                    object_type=object_type,
                    columns=columns,
                    username=username,
                    subreddit=subreddit,
                    max_time=before_time,
                    min_time=after_time)

                # Check for duplicates and retrieve number of unique records
                dupes = df['id'].duplicated().any()
                total = df['id'].nunique()

                if dupes:
                    print("There are duplicates in the data for " + str(ymd))

                print("For " + str(ymd) + " " + str(total) + " values were extracted")

                filename = file_prefix + "_" + str(ymd) + ".csv"

                # Save Data to a CSV
                df.to_csv(filename)


In [217]:
def get_monthly_reddit_data(years, months, object_type, columns, subreddit, file_prefix, username=''):
    """Extract Reddit data one month at a time and write one CSV per month.

    For every (year, month) combination the records of that month are fetched
    with ``get_subreddit_data`` and saved to ``<file_prefix>_<YYYYMM>.csv``.
    A short summary is printed per month.

    The window runs from the first day of the month at midnight UTC up to the
    first day of the following month, so the last day of the month is
    included in full.

    Args:
        years: Iterable of years.
        months: Iterable of month numbers (1-12).
        object_type: Forwarded to ``get_subreddit_data``.
        columns: Columns to keep; must include "id" for the duplicate check.
        subreddit: Subreddit filter forwarded to ``get_subreddit_data``.
        file_prefix: Prefix of the output file names.
        username: Optional author filter; "" (default) disables it.
    """
    for year in years:
        for month in months:
            ym = year * 100 + month
            days_in_month = calendar.monthrange(year, month)[1]

            # Set start and end date for request. The end is the start of the
            # next month; the timedelta also handles December -> January.
            month_start = dt.datetime(year, month, 1, tzinfo=dt.timezone.utc)
            month_end = month_start + dt.timedelta(days=days_in_month)
            after_time = int(month_start.timestamp())
            before_time = int(month_end.timestamp())

            # Get the required Subreddit data at the monthly level
            df = get_subreddit_data(
                object_type=object_type,
                username=username,
                columns=columns,
                subreddit=subreddit,
                max_time=before_time,
                min_time=after_time)

            # Check for duplicates and retrieve number of unique records
            dupes = df['id'].duplicated().any()
            total = df['id'].nunique()

            if dupes:
                print("There are duplicates in the data for " + str(ym))

            print("For " + str(ym) + " " + str(total) + " values were extracted")

            filename = file_prefix + "_" + str(ym) + ".csv"

            # Save Data to a CSV
            df.to_csv(filename)

In [None]:
# Extract submissions for the configured period, one CSV file per month.
get_monthly_reddit_data(
    years=years,
    months=months,
    object_type="submission",
    columns=sub_columns,
    subreddit=sub_reddit,
    file_prefix=sub_file_prefix,
)
        