In [159]:
import calendar
import configparser
import praw
from pushshift_py import PushshiftAPI
import datetime as dt
import pandas as pd

In [160]:
# Read passwords and secrets from config file
config_parser = configparser.ConfigParser()
config_parser.read("src/configuration/config.cfg")

['src/configuration/config.cfg']

In [171]:
# Set parameters
reddit_client_id = config_parser["praw"]["client_id"]
reddit_client_secret = config_parser["praw"]["client_secret"]
reddit_password = config_parser["praw"]["password"]
reddit_username = config_parser["praw"]["username"]
reddit_agent = config_parser["praw"]["user_agent"] + reddit_username
sub_reddit = config_parser["praw"]["subreddit"]
sub_file_prefix = "submissions"
comment_file_prefix = "comment"
years = [2021]
months = range(1,6)
sub_columns = ["id",'author_fullname','title','score','author_premium','domain','over_18','subreddit_id','permalink','parent_whitelist_status','url','created_utc']
comm_columns = ["id",'link_id']

In [172]:
# Establish connection to reddit
reddit = praw.Reddit(
    client_id=reddit_client_id,
    client_secret=reddit_client_secret,
    password=reddit_password,
    user_agent=reddit_agent,
    username=reddit_username,
)

In [173]:
def get_subreddit_submission_history(sub, year, month):
    api = PushshiftAPI()
    last_day = calendar.monthrange(year, month)[1] # last day of month
    after = start_epoch=int(dt.datetime(year, month, 1).timestamp())
    before = start_epoch=int(dt.datetime(year, month, last_day).timestamp())

    submissions = api.search_submissions(subreddit=sub,
                             before=before,
                             after=after)
    results = []
    for submission in submissions:
        results.append(submission)
    
    return results


In [174]:
def get_subreddit_comment_history(sub, year, month):
    api = PushshiftAPI()
    last_day = calendar.monthrange(year, month)[1] # last day of month
    after = start_epoch=int(dt.datetime(year, month, 1).timestamp())
    before = start_epoch=int(dt.datetime(year, month, last_day).timestamp())
    
    comments = api.search_comments(subreddit=sub,
                             before=before,
                             after=after)
    
    results = []
    for comment in comments:
        results.append(comment)
        
    return results

In [None]:
for year in years:
    for month in months:
        # Extract submissions for year and month
        yearmonth = year * 100 + month
        sub_output = get_subreddit_submission_history(sub=sub_reddit, year=year, month=month)
        print("For " + str(yearmonth) + " " + str(len(sub_output)) + " submissions were extracted")
        s_df = pd.DataFrame([c.d_ for c in sub_output])
        s_df = s_df[sub_columns]
        sub_filename = sub_file_prefix + "_" + str(yearmonth) + ".csv"
        s_df.to_csv(sub_filename)
        
        # Extract comments for year and month
        comm_output = get_subreddit_comment_history(sub=sub_reddit, year=year, month=month)
        print("For " + str(yearmonth) + " " + str(len(comm_output)) + " comments were extracted")
        c_df = pd.DataFrame([c.d_ for c in comm_output])
        c_df = c_df[comm_columns]
        comm_filename = comment_file_prefix + "_" + str(yearmonth) + ".csv"
        c_df.to_csv(comm_filename)


For 202101 4605 submissions were extracted
For 202101 4465 comments were extracted
For 202102 4984 submissions were extracted


In [97]:
#data['subreddit_name_prefixed'] = submission.subreddit_name_prefixed
#data['name'] = submission.name
##data['upvote_ratio'] = submission.upvote_ratio
#data['ups'] = submission.ups
#data['created'] = submission.created
#data['url_overridden_by_dest'] = submission.url_overridden_by_dest


In [129]:
dt.datetime.now()

datetime.datetime(2021, 6, 10, 13, 31, 10, 510164)

In [151]:
c_df[['link_id']]

Unnamed: 0,link_id
0,t3_ei4wgq
1,t3_ei9quo
2,t3_ei1wf9
3,t3_ei5h2w
4,t3_ei5h2w
...,...
4305,t3_ei4wgq
4306,t3_ei9quo
4307,t3_ei1wf9
4308,t3_ei4wgq
