In [212]:
import datetime as dt
import calendar
import configparser
import pandas as pd
import requests
import urllib
import time
import json

In [213]:
# Read passwords and secrets from config file
config_parser = configparser.ConfigParser()
config_parser.read("src/configuration/config.cfg")

['src/configuration/config.cfg']

In [227]:
# Set parameters
reddit_client_id = config_parser["praw"]["client_id"]
reddit_client_secret = config_parser["praw"]["client_secret"]
reddit_password = config_parser["praw"]["password"]
reddit_username = config_parser["praw"]["username"]
reddit_agent = config_parser["praw"]["user_agent"] + reddit_username
sub_reddit = config_parser["praw"]["subreddit"]
sub_file_prefix = "submissions"
comment_file_prefix = "comment"
years = [2021]
months = range(1,6)
sub_columns = ["id",'author_fullname','title','score','author_premium','domain','over_18','subreddit_id','permalink','parent_whitelist_status','url','created_utc','num_comments','upvote_ratio']
comm_columns = ["id",'link_id']

In [222]:
def get_subreddit_data(object_type, columns, username='', subreddit='', search_query='', max_time=None, min_time=1609459200):
    # start from current time if not specified
    if max_time is None:
        max_time = int(time.time())

    # generate filter string
    filter_string = urllib.parse.urlencode(
        {k: v for k, v in zip(
            ['author', 'subreddit', 'q'],
            [username, subreddit, search_query]) if v != ""})

    url_format = "https://api.pushshift.io/reddit/search/{}/?size=500&sort=desc&{}&before={}"

    before = max_time
    df = pd.DataFrame()
    
    while before > min_time:
        print(before)
        url = url_format.format(object_type, filter_string, before)
        resp = requests.get(url)

        # convert records to dataframe
        dfi = pd.json_normalize(json.loads(resp.text)['data'])
        # filter out unwanted columns
        df = pd.concat([df, dfi[columns]])

        # set `before` to the earliest comment/post in the results
        # next time we call requests.get(...) we will only get comments/posts before
        # the earliest that we already have, thus not fetching any duplicates
        before = dfi['created_utc'].min()
        
    return df


In [216]:
def get_daily_reddit_data(years, months, days, object_type, columns, subreddit, file_prefix):
    for year in years:
        for month in months:
            for day in days:
                ymd = year * 10000 + month * 100 + day
                tomorrow = day + 1

                after_time = start_epoch=int(dt.datetime(year, month, day).timestamp())
                before_time = start_epoch=int(dt.datetime(year, month, tomorrow).timestamp())

                df = get_subreddit_data(
                    object_type=object_type,
                    columns=columns,
                    username=username,
                    subreddit=subreddit,
                    max_time=before_time,
                    min_time=after_time)

                dupes = df['id'].duplicated().any()
                total = df['id'].nunique() 

                if dupes:
                    print("There are duplicates in the data for " + str(ymd))

                print("For " + str(ymd) + " " + str(total) + " values were extracted")

                filename = file_prefix + "_" + str(ymd) + ".csv"

                df.to_csv(filename)

In [217]:
def get_monthly_reddit_data(years, months, object_type, columns, subreddit, file_prefix):
    for year in years:
        for month in months:
            ym = year * 100 + month
            last_day = calendar.monthrange(year, month)[1]
            
            after_time = start_epoch=int(dt.datetime(year, month, 1).timestamp())
            before_time = start_epoch=int(dt.datetime(year, month, last_day).timestamp())
    
            df = get_subreddit_data(
                object_type=object_type,
                username=username,
                columns=columns,
                subreddit=subreddit,
                max_time=before_time,
                min_time=after_time)

            dupes = df['id'].duplicated().any()
            total = df['id'].nunique() 
            
            if dupes:
                print("There are duplicates in the data for " + str(ymd))
                      
            print("For " + str(ym) + " " + str(total) + " values were extracted")

            filename = file_prefix + "_" + str(ym) + ".csv"

            df.to_csv(filename)

In [218]:
#data['subreddit_name_prefixed'] = submission.subreddit_name_prefixed
#data['name'] = submission.name
##data['upvote_ratio'] = submission.upvote_ratio
#data['ups'] = submission.ups
#data['created'] = submission.created
#data['url_overridden_by_dest'] = submission.url_overridden_by_dest


In [229]:
get_monthly_reddit_data(years=[2021]
                      ,months=[1,2,3,4]
                      ,object_type="submission"
                      ,columns=sub_columns
                      ,subreddit=sub_reddit
                      ,file_prefix=sub_file_prefix)
        

1612051200
1612042005
1612033817
1612024774
1612018053
1612006762
1611998837
1611987978
1611978581
1611967440
1611958717
1611952008
1611941560
1611937636
1611933087
1611927268
1611920500
1611915387
1611904968
1611895932
1611884891
1611875089
1611869181
1611861830
1611854276
1611847505
1611840418
1611833261
1611826707
1611815205
1611807529
1611797230
1611789994
1611779275
1611772718
1611768390
1611762786
1611676941
1611671233
1611665714
1611660478
1611654182
1611644333
1611637388
1611628696
1611620841
1611614603
1611606385
1611599896
1611595601
1611589656
1611584825
1611580618
1611575689
1611568239
1611562354
1611556515
1611544096
1611533471
1611518983
1611508387
1611499392
1611491632
1611480070
1611460643
1611445758
1611431390
1611420423
1611412825
1611407143
1611394632
1611378267
1611365715
1611348347
1611337037
1611330259
1611324820
1611314990
1611304109
1611295054
1611284510
1611272253
1611259777
1611251710
1611243491
1611236305
1611226654
1611219048
1611210031
1611198585
1611188337