In [194]:
import calendar
import configparser
import praw
from pushshift_py import PushshiftAPI
import datetime as dt
import pandas as pd

In [195]:
# Read passwords and secrets from config file
config_parser = configparser.ConfigParser()
config_parser.read("src/configuration/config.cfg")

['src/configuration/config.cfg']

In [196]:
# Set parameters
reddit_client_id = config_parser["praw"]["client_id"]
reddit_client_secret = config_parser["praw"]["client_secret"]
reddit_password = config_parser["praw"]["password"]
reddit_username = config_parser["praw"]["username"]
reddit_agent = config_parser["praw"]["user_agent"] + reddit_username
sub_reddit = config_parser["praw"]["subreddit"]
sub_file_prefix = "submissions"
comment_file_prefix = "comment"
years = [2021]
months = range(2,6)
sub_columns = ["id",'author_fullname','title','score','author_premium','domain','over_18','subreddit_id','permalink','parent_whitelist_status','url','created_utc']
comm_columns = ["id",'link_id']

In [197]:
# Establish connection to reddit
reddit = praw.Reddit(
    client_id=reddit_client_id,
    client_secret=reddit_client_secret,
    password=reddit_password,
    user_agent=reddit_agent,
    username=reddit_username,
)

In [198]:
def get_subreddit_submission_history(sub, year, month):
    api = PushshiftAPI()
    last_day = calendar.monthrange(year, month)[1] # last day of month
    after = start_epoch=int(dt.datetime(year, month, 1).timestamp())
    before = start_epoch=int(dt.datetime(year, month, last_day).timestamp())

    submissions = api.search_submissions(subreddit=sub,
                             before=before,
                             after=after)
    results = []
    for submission in submissions:
        results.append(submission)
    
    return results


In [199]:
def get_subreddit_comment_history(sub, year, month):
    api = PushshiftAPI()
    last_day = calendar.monthrange(year, month)[1] # last day of month
    after = start_epoch=int(dt.datetime(year, month, 1).timestamp())
    before = start_epoch=int(dt.datetime(year, month, last_day).timestamp())
    
    comments = api.search_comments(subreddit=sub,
                             before=before,
                             after=after)
    
    results = []
    for comment in comments:
        results.append(comment)
        
    return results

In [None]:
for year in years:
    for month in months:
        # Extract submissions for year and month
        yearmonth = year * 100 + month
        sub_output = get_subreddit_submission_history(sub=sub_reddit, year=year, month=month)
        print("For " + str(yearmonth) + " " + str(len(sub_output)) + " submissions were extracted")
        s_df = pd.DataFrame([c.d_ for c in sub_output])
        s_df = s_df[sub_columns]
        sub_filename = sub_file_prefix + "_" + str(yearmonth) + ".csv"
        s_df.to_csv(sub_filename)
        
        # Extract comments for year and month
        comm_output = get_subreddit_comment_history(sub=sub_reddit, year=year, month=month)
        print("For " + str(yearmonth) + " " + str(len(comm_output)) + " comments were extracted")
        c_df = pd.DataFrame([c.d_ for c in comm_output])
        c_df = c_df[comm_columns]
        comm_filename = comment_file_prefix + "_" + str(yearmonth) + ".csv"
        c_df.to_csv(comm_filename)


In [97]:
#data['subreddit_name_prefixed'] = submission.subreddit_name_prefixed
#data['name'] = submission.name
##data['upvote_ratio'] = submission.upvote_ratio
#data['ups'] = submission.ups
#data['created'] = submission.created
#data['url_overridden_by_dest'] = submission.url_overridden_by_dest


In [129]:
dt.datetime.now()

datetime.datetime(2021, 6, 10, 13, 31, 10, 510164)

In [151]:
c_df[['link_id']]

Unnamed: 0,link_id
0,t3_ei4wgq
1,t3_ei9quo
2,t3_ei1wf9
3,t3_ei5h2w
4,t3_ei5h2w
...,...
4305,t3_ei4wgq
4306,t3_ei9quo
4307,t3_ei1wf9
4308,t3_ei4wgq


In [None]:
import pandas as pd
import requests
import urllib
import time
import json

def get_data(object_type, username='', subreddit='', search_query='', max_time=None, min_time=1609459200):
    # start from current time if not specified
    if max_time is None:
        max_time = int(time.time())

    # generate filter string
    filter_string = urllib.parse.urlencode(
        {k: v for k, v in zip(
            ['author', 'subreddit', 'q'],
            [username, subreddit, search_query]) if v != ""})

    url_format = "https://api.pushshift.io/reddit/search/{}/?size=500&sort=desc&{}&before={}"

    before = max_time
    df = pd.DataFrame()
    
    while before > min_time:
        url = url_format.format(object_type, filter_string, before)
        resp = requests.get(url)

        # convert records to dataframe
        dfi = pd.json_normalize(json.loads(resp.text)['data'])
        
        if object_type == 'comment':
            #dfi = dfi.rename(columns={'created_utc': 'date', 'body': 'comment'})
            df = pd.concat([df, dfi[comm_columns]])
        elif object_type == 'submission':
            #dfi = dfi.rename(columns={'created_utc': 'date', 'selftext': 'post'})
            #dfi = dfi[dfi['post'].ne('')]
            df = pd.concat([df, dfi[sub_columns]])

        # set `before` to the earliest comment/post in the results
        # next time we call requests.get(...) we will only get comments/posts before
        # the earliest that we already have, thus not fetching any duplicates
        before = dfi['created_utc'].min()

        # if needed
        # time.sleep(1)
        
    return df
#Testing by getting the comments and checking for duplicate values (by id):

username = ""
subreddit = "worldnews"

for year in years:
    for month in months:
        last_day = calendar.monthrange(year, month)[1] # last day of month
        after = start_epoch=int(dt.datetime(year, month, 1).timestamp())
        before = start_epoch=int(dt.datetime(year, month, last_day).timestamp())
    
        df_comments = get_data(
            object_type='submission',
            username=username,
            subreddit=subreddit,
            max_time=before,
            min_time=after)

        df_comments['id'].duplicated().any()    # False
        total = df_comments['id'].nunique()             # 2200
        yearmonth = year * 100 + month
        print("For " + str(yearmonth) + " " + str(total) + " values were extracted")
        
        sub_filename = sub_file_prefix + "_" + str(yearmonth) + ".csv"

        df_comments.to_csv(sub_filename)

In [201]:
print(total)

34200


In [202]:
sub_filename = sub_file_prefix + "_" + str(yearmonth) + ".csv"

df_comments.to_csv(sub_filename)