## 1. Import libraries 

In [1]:
import pandas as pd
import requests 
import json 
import csv 
import time
import datetime
import numpy as np

## 2. Functions 

In [2]:
def sendQuery(query, after, before, subreddit, size, endpoint):
    """Fetch one page of results from the Pushshift search API.

    Parameters
    ----------
    query : value for the ``q`` URL parameter. It is inserted verbatim (no
        URL-encoding), so it must already be URL-ready, e.g. ``"art+exhibition"``.
    after, before : values for the ``after`` / ``before`` URL parameters.
    subreddit : accepted so existing callers keep working, but it is NOT added
        to the request URL -- results are therefore not limited to a subreddit.
    size : value for the ``size`` URL parameter.
    endpoint : path segment appended after ``/reddit/search/``.

    Returns
    -------
    The value stored under the ``'data'`` key of the decoded JSON response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    ValueError
        If the response body is not valid JSON.
    KeyError
        If the decoded JSON has no ``'data'`` key.
    """
    # The URL is assembled by hand on purpose: the query is inserted as-is so a
    # literal '+' between search terms is sent unchanged.
    url = (
        'https://api.pushshift.io/reddit/search/' + str(endpoint)
        + '?q=' + str(query)
        + '&size=' + str(size)
        + '&after=' + str(after)
        + '&before=' + str(before)
    )
    r = requests.get(url)

    # Progress trace: the request URL and the size of the raw response body.
    print(url)
    print(len(r.text))

    # Fail with a clear HTTP error instead of a confusing JSON/Key error when
    # the server returns an error page (rate limiting, outage, ...).
    r.raise_for_status()

    data = json.loads(r.text)
    return data['data']

In [12]:
# These functions extract the key data points from each JSON result (one for submissions, one for comments)

def extractSubmissionsFeatures(entity):
    """Flatten one submission JSON object into a single CSV-ready record.

    Parameters
    ----------
    entity : mapping for one submission. ``link_flair_text`` and ``selftext``
        are optional; every other key read below is required and raises
        ``KeyError`` when missing.

    Returns
    -------
    (submission_id, records, headings) where ``records`` is a one-element list
    holding the row tuple and ``headings`` is the list of column names in the
    same order as the tuple.
    """
    headings = ['submission_id','title','url','author','score','created','num_comments','permalink','flair', 'selftext','subreddit','subreddit_id']

    # Keys are looked up in a fixed order; optional ones fall back to a default.
    fields = {
        'title': entity['title'],
        'url': entity['url'],
        'flair': entity.get('link_flair_text', "NaN"),
        'author': entity['author'],
        'submission_id': entity['id'],
        'score': entity['score'],
        # fromtimestamp() without a tz argument yields a naive local-time datetime.
        'created': datetime.datetime.fromtimestamp(entity['created_utc']),
        'num_comments': entity['num_comments'],
        'permalink': entity['permalink'],
        'selftext': entity.get('selftext', ''),
        'subreddit': entity['subreddit'],
        'subreddit_id': entity['subreddit_id'],
    }

    # Arrange the values in column order so the row lines up with `headings`.
    row = tuple(fields[column] for column in headings)

    return fields['submission_id'], [row], headings

def extractCommentsFeatures(entity):
    """Flatten one comment JSON object into a single CSV-ready record.

    Parameters
    ----------
    entity : mapping for one comment. ``author_flair_text`` is optional; every
        other key read below is required and raises ``KeyError`` when missing.

    Returns
    -------
    (comment_id, records, headings) where ``records`` is a one-element list
    holding the row tuple and ``headings`` is the list of column names in the
    same order as the tuple.
    """
    headings = ['comment_id','author','score','created','permalink','flair','body','parent_id', 'subreddit','subreddit_id']

    # Keys are looked up in a fixed order; the flair falls back to "NaN".
    fields = {
        'flair': entity.get('author_flair_text', "NaN"),
        'author': entity['author'],
        'comment_id': entity['id'],
        'parent_id': entity['parent_id'],
        'score': entity['score'],
        # fromtimestamp() without a tz argument yields a naive local-time datetime.
        'created': datetime.datetime.fromtimestamp(entity['created_utc']),
        'permalink': entity['permalink'],
        'body': entity['body'],
        'subreddit': entity['subreddit'],
        'subreddit_id': entity['subreddit_id'],
    }

    # Arrange the values in column order so the row lines up with `headings`.
    row = tuple(fields[column] for column in headings)

    return fields['comment_id'], [row], headings

In [13]:
def get_pushshift_data(endpoint):
    """Download every result page for the hard-coded query and collect the records.

    Pages are requested through ``sendQuery``; after each page the ``after``
    bound is moved to the ``created_utc`` of the last item received, and the
    loop ends when a page comes back empty or a follow-up request fails.

    Parameters
    ----------
    endpoint : ``"submission"`` selects ``extractSubmissionsFeatures``; any
        other value selects ``extractCommentsFeatures``.

    Returns
    -------
    (all_records, headings) where ``all_records`` maps each item id to the
    record list produced by the extractor, and ``headings`` is the column-name
    list from the extractor (an empty list when nothing was downloaded).

    Raises
    ------
    Whatever ``sendQuery`` raises for the FIRST request; failures of later
    requests are reported and end the download with the records gathered so far.
    """
    all_records = {}
    # Defined up front so the return below works even when the very first page
    # is empty and no extractor ever runs.
    headings = []
    after = "1580536800"     #Submissions after this timestamp (1580536800 = 01 Feb 20)
    before = "1630126800"    #Submissions before this timestamp (1630126800 = 28 Aug 21)
    query = "art+exhibition"
    # NOTE(review): forwarded to sendQuery, which does not put it in the URL.
    subreddit = "covid19"
    size = 1000

    # Pick the extractor once instead of re-testing the endpoint for every item.
    extract = extractSubmissionsFeatures if endpoint == "submission" else extractCommentsFeatures

    data = sendQuery(query, after, before, subreddit, size, endpoint)

    while len(data) > 0:
        for d in data:
            id_, observ, headings = extract(d)
            all_records[id_] = observ

        # Continue from the last item of this page.
        after = data[-1]['created_utc']
        try:
            data = sendQuery(query, after, before, subreddit, size, endpoint)
        except Exception as err:
            # Best effort: keep what was downloaded, but say why we stopped
            # (and let KeyboardInterrupt / SystemExit propagate).
            print("Stopping download after a failed request: " + repr(err))
            break

    print(str(len(all_records)) + " submissions have been downloaded")
    return all_records, headings
    

In [14]:
def save_into_csv(endpoint):
    """Download the data for ``endpoint`` and write it to ``data_art_<endpoint>.csv``.

    The file is created in the current working directory (overwriting any
    existing file), encoded as UTF-8, with the headings as the first row
    followed by one row per downloaded record.
    """
    records_by_id, header_row = get_pushshift_data(endpoint)

    # Output file name is derived from the endpoint.
    out_path = 'data_art_' + endpoint + '.csv'

    with open(out_path, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.writer(handle, delimiter=',')
        writer.writerow(header_row)
        # Each stored value is a one-element list; its only item is the row tuple.
        writer.writerows(rows[0] for rows in records_by_id.values())

In [9]:
# Comments: download all matching comments and write them to data_art_comment.csv
save_into_csv('comment')

https://api.pushshift.io/reddit/search/comment?q=art+exhibition&size=1000&after=1580536800&before=1630126800
272082
https://api.pushshift.io/reddit/search/comment?q=art+exhibition&size=1000&after=1581097278&before=1630126800
231940
https://api.pushshift.io/reddit/search/comment?q=art+exhibition&size=1000&after=1581655428&before=1630126800
200598
https://api.pushshift.io/reddit/search/comment?q=art+exhibition&size=1000&after=1582284137&before=1630126800
232442
https://api.pushshift.io/reddit/search/comment?q=art+exhibition&size=1000&after=1582776555&before=1630126800
234824
https://api.pushshift.io/reddit/search/comment?q=art+exhibition&size=1000&after=1583337697&before=1630126800
266206
https://api.pushshift.io/reddit/search/comment?q=art+exhibition&size=1000&after=1583693599&before=1630126800
267434
https://api.pushshift.io/reddit/search/comment?q=art+exhibition&size=1000&after=1584143154&before=1630126800
301000
https://api.pushshift.io/reddit/search/comment?q=art+exhibition&size=100

In [15]:
# Submissions: download all matching submissions and write them to data_art_submission.csv
save_into_csv('submission')

https://api.pushshift.io/reddit/search/submission?q=art+exhibition&size=1000&after=1580536800&before=1630126800
950038
https://api.pushshift.io/reddit/search/submission?q=art+exhibition&size=1000&after=1581857590&before=1630126800
806322
https://api.pushshift.io/reddit/search/submission?q=art+exhibition&size=1000&after=1582851614&before=1630126800
910287
https://api.pushshift.io/reddit/search/submission?q=art+exhibition&size=1000&after=1583905464&before=1630126800
697199
https://api.pushshift.io/reddit/search/submission?q=art+exhibition&size=1000&after=1585513036&before=1630126800
753863
https://api.pushshift.io/reddit/search/submission?q=art+exhibition&size=1000&after=1586823599&before=1630126800
777331
https://api.pushshift.io/reddit/search/submission?q=art+exhibition&size=1000&after=1588509750&before=1630126800
797435
https://api.pushshift.io/reddit/search/submission?q=art+exhibition&size=1000&after=1590158114&before=1630126800
630408
https://api.pushshift.io/reddit/search/submissio