In [1]:
import math
import json
import requests
import itertools
import numpy as np
import time
import pandas as pd
from datetime import datetime, timedelta, date

In [2]:
def make_request(uri, max_retries = 5, timeout = 30):
    """GET ``uri`` and return its JSON-decoded body, retrying on failure.

    Args:
        uri: Fully formatted URL to request.
        max_retries: Total number of attempts before the last error is allowed
            to propagate. Values below 1 still result in a single attempt.
        timeout: Seconds to wait on the server before an attempt counts as
            failed; without it a stalled connection would block forever.

    Returns:
        The decoded JSON payload of the first successful (HTTP 200) response.

    Raises:
        requests.HTTPError: The final attempt returned a non-200 status.
        Exception: Whatever else the final attempt raised (connection error,
            timeout, undecodable body, ...).
    """
    def fire_away(uri):
        response = requests.get(uri, timeout=timeout)
        # Explicit check instead of ``assert``: asserts are removed under
        # ``python -O``, which would hand error pages to the JSON decoder.
        if response.status_code != 200:
            raise requests.HTTPError(
                'Unexpected HTTP status {} for {}'.format(response.status_code, uri),
                response=response)
        return json.loads(response.content)

    current_tries = 1
    while current_tries < max_retries:
        try:
            time.sleep(1)  # fixed one-second pause before every retried attempt
            return fire_away(uri)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C / a
            # kernel interrupt still stops the loop instead of being retried.
            time.sleep(1)
            current_tries += 1
    # The last attempt runs outside the try block so its error reaches the caller.
    return fire_away(uri)

In [3]:
def pull_comments_for(subreddit, start_at, end_at):
    """Download the comments of ``subreddit`` between two Unix timestamps.

    The Pushshift comment-search endpoint is queried page by page. Paging
    continues until a request contributes no comment that has not been seen
    already, so the result is complete even when the server returns fewer
    rows per page than the requested ``SIZE``. The price is one extra request
    per call compared with stopping on the first short page.

    Args:
        subreddit: Subreddit name inserted into the query string.
        start_at: Lower time bound, passed as the ``after`` query parameter.
        end_at: Upper time bound, passed as the ``before`` query parameter.

    Returns:
        A list of dicts with the keys ``post_id``, ``created_utc``,
        ``comment_body``, ``comment_author`` and ``comment_id``; each
        ``comment_id`` appears at most once.
    """

    def map_comments(comments):
        # Rename the raw API fields to the column names used downstream.
        return [{
            'post_id': comment['link_id'],
            'created_utc': comment['created_utc'],
            'comment_body': comment['body'],
            'comment_author': comment['author'],
            'comment_id': comment['id']} for comment in comments]

    SIZE = 500
    URI_TEMPLATE = r'https://api.pushshift.io/reddit/comment/search/?subreddit={}&after={}&before={}&size={}&filter=author,body,created_utc,link_id,id'

    comment_collections = []
    seen_ids = set()
    next_start_at = start_at
    while True:
        page = map_comments(
            make_request(URI_TEMPLATE.format(subreddit, next_start_at, end_at, SIZE))['data'])

        # The follow-up window is rewound by 10 seconds (below), so pages
        # overlap; keep only the comments not collected yet.
        fresh = []
        for comment in page:
            if comment['comment_id'] not in seen_ids:
                seen_ids.add(comment['comment_id'])
                fresh.append(comment)
        if not fresh:
            # Empty page, or a page made only of repeats: nothing left to
            # fetch. This also guarantees termination if the cursor stalls.
            break
        comment_collections.extend(fresh)

        # NOTE(review): advancing from the last row assumes the API returns
        # rows oldest-first, as the original pagination did - confirm.
        next_start_at = page[-1]['created_utc'] - (10)
    return comment_collections

In [4]:
def give_me_intervals(start_at, number_of_days_per_interval = 2):
    """Yield consecutive ``(start, end)`` Unix-second windows from ``start_at`` to now.

    The first window is ``(start_at, start_at + period)``. Every later window
    starts one second after the previous window's end and finishes one full
    period after that previous end. Windows are produced until a window's end
    lies past the current time, so the last window may reach into the future.

    Args:
        start_at: Unix timestamp (seconds) at which the first window starts.
        number_of_days_per_interval: Length of one window in days; must be
            positive.

    Yields:
        ``(start, end)`` tuples of ints.

    Raises:
        ValueError: ``number_of_days_per_interval`` is not positive. Because
            this is a generator, the error surfaces on first iteration.
    """
    if number_of_days_per_interval <= 0:
        # A zero or negative period would make the loop below yield forever.
        raise ValueError('number_of_days_per_interval must be positive')

    # ``time.time()`` is the real epoch time. ``datetime.utcnow().timestamp()``
    # treats the naive UTC value as local time and is therefore off by the
    # machine's UTC offset.
    end_at = math.ceil(time.time())
    print('Unix end time:', end_at)

    period = (86400 * number_of_days_per_interval)
    end = start_at + period
    yield (int(start_at), int(end))

    padding = 1  # one-second gap so consecutive windows do not share a bound
    while end <= end_at:
        start_at = end + padding
        end = (start_at - padding) + period
        yield int(start_at), int(end)
        
# Collection starts at local midnight on 2021-10-21. ``timestamp()`` on a naive
# datetime interprets it in the machine's local timezone, which is the same
# value the platform-specific ``strftime("%s")`` produced, but it works on
# every platform.
start_at = int(datetime(2021, 10, 21).timestamp())
# Materialised as a list so the download cell can be re-run without silently
# looping over an already exhausted generator.
intervals = list(give_me_intervals(start_at, 1))

In [5]:
# Download every interval for one subreddit and spread the fields of each
# comment over five parallel lists (one list per future DataFrame column).
subreddit = 'Singapore'
comments_list, author_list, post_id_list = [], [], []
created_utc_list, comment_id_list = [], []

# Which key of a pulled comment feeds which accumulator list.
column_sinks = {
    'comment_body': comments_list,
    'comment_author': author_list,
    'post_id': post_id_list,
    'created_utc': created_utc_list,
    'comment_id': comment_id_list,
}

for interval in intervals:
    pulled_comments = pull_comments_for(subreddit, interval[0], interval[1])
    for field, sink in column_sinks.items():
        sink.extend(comment[field] for comment in pulled_comments)
    # Carriage return keeps the running total on a single output line.
    print('Comment count:',len(comments_list),end='\r')
    time.sleep(.100)

Unix start time: 1635487728
Comment count: 900

In [6]:
# Combine the five parallel lists into one table; the column order follows
# the order of the labels below.
df = pd.DataFrame(
    dict(
        zip(
            ('Post Id', 'Comment Author', 'Comment', 'Comment Date', 'Comment ID'),
            (post_id_list, author_list, comments_list, created_utc_list, comment_id_list),
        )
    )
)
df.head(n=20)

Unnamed: 0,Post Id,Comment Author,Comment,Comment Date,Comment ID
0,t3_qc382v,reallifeluxury,So there is no huge public outcry demanding f...,1634745622,hhdiyl2
1,t3_qc3mob,[deleted],[deleted],1634745624,hhdiyqr
2,t3_qbm3md,[deleted],[removed],1634745633,hhdizmb
3,t3_qbm3md,blackwoodsix,"Wa meng ti, wa meng ti",1634745636,hhdizvu
4,t3_qc382v,Jay-ay,Sauce? I don't know man. Just don't want this ...,1634745642,hhdj0d2
5,t3_qbyerz,No_Bend8840,But they also keep trying to prevent it from p...,1634745648,hhdj0yn
6,t3_qbyerz,khaophat,Smlj,1634745663,hhdj27s
7,t3_qbyerz,vonstirlitz,"Most of my friends have already gone, and I wo...",1634745665,hhdj2g5
8,t3_qbyerz,Human-Indication,I was hoping no background music would have br...,1634745667,hhdj2l5
9,t3_qc382v,Jay-ay,"No, but the current measures are not working t...",1634745694,hhdj52c


In [7]:
# Write the collected comments to a CSV file, leaving out the row index.
df.to_csv(path_or_buf='~/Desktop/reddit_data_raw_updated.csv', index=False)