## 1. Import libraries 

In [104]:
import pandas as pd
import requests 
import json 
import csv 
import datetime
import time

# Names of the two Pushshift search endpoints used throughout this notebook.
# The value is interpolated into the request URL and into the output file name.
COMMENTS = "comment"
SUBMISSION = "submission"

## 2. Functions 

In [117]:

def send_query(query, after, before, subreddit, endpoint, size=100,
               max_retries=5, retry_wait=5, timeout=None):

    """Make one API request and return at most `size` records.

    Parameters
    ----------
    query : str
        search criteria keywords
    after : timestamp
        timestamp of the start date of searching interval
    before : timestamp
        timestamp of the end date of searching interval
    subreddit : str
        narrow the result to a specific subreddit.
    endpoint: str
        API endpoint: comment or submission.
    size: int, default=100
        maximum number of results in a query
    max_retries: int, default=5
        how many times the request is repeated when the API answers with
        HTTP 429 (too many requests) before giving up.
    retry_wait: int or float, default=5
        seconds to sleep between two attempts after an HTTP 429 answer.
    timeout: float or tuple, optional
        forwarded to ``requests.get``; ``None`` (the default) waits
        indefinitely.

    Returns
    -------
    list
        the content of the ``data`` field of the JSON response (a list of
        dicts, one per post/comment).

    Raises
    ------
    requests.HTTPError
        if the final response has an error status (including a 429 that
        persisted through all retries).
    KeyError
        if the JSON response has no ``data`` field.
    """

    # create url with search parameters
    url = f"https://api.pushshift.io/reddit/search/{endpoint}?q={query}&size={size}&after={after}&before={before}&subreddit={subreddit}&sort=asc&metadata=true"

    # request data from the API, backing off while it reports rate limiting
    for attempt in range(max_retries + 1):
        r = requests.get(url, timeout=timeout)
        if r.status_code != 429:
            break
        if attempt < max_retries:
            time.sleep(retry_wait)

    # Fail with a clear HTTP error instead of an opaque JSON/Key error when
    # the server did not return a successful response.
    r.raise_for_status()

    return json.loads(r.text)['data']

In [111]:
def extract_submissions_features(entity):

    """Filter relevant attributes from a submission

    Parameters
    ----------
    entity : dict
        Dict object that contains all information about a submission returned in the API request.
        Every key read below except ``link_flair_text`` is required; a missing
        one raises ``KeyError``.


    Returns
    -------
    submission_id: str
        Id of the submission
    records: list
        Single-element list holding one tuple of values, ordered like `attributes`
    attributes: list
        List of name of the attributes of the submission
    """

    attributes = ['submission_id','title','url','author','score','created','num_comments','permalink','flair', 'selftext','subreddit','subreddit_id']

    submission_id = entity['id']

    # 'created_utc' is an epoch in UTC: convert it in UTC rather than in the
    # machine's local timezone so the result does not depend on where this
    # runs. tzinfo is dropped to keep the naive datetime the output has always had.
    created = datetime.datetime.fromtimestamp(
        entity['created_utc'], tz=datetime.timezone.utc
    ).replace(tzinfo=None)

    # the flair is optional; the "NaN" string marks a missing key
    flair = entity.get('link_flair_text', "NaN")

    row = (
        submission_id,
        entity['title'],
        entity['url'],
        entity['author'],
        entity['score'],
        created,
        entity['num_comments'],
        entity['permalink'],
        flair,
        entity['selftext'],
        entity['subreddit'],
        entity['subreddit_id'],
    )

    return submission_id, [row], attributes

In [112]:
def extract_comment_features(entity):

    """Filter relevant attributes from a comment

    Parameters
    ----------
    entity : dict
        Dict object that contains all information about a comment returned in the API request.
        Every key read below except ``author_flair_text`` is required; a
        missing one raises ``KeyError``.


    Returns
    -------
    comment_id: str
        Id of the comment
    records: list
        Single-element list holding one tuple of values, ordered like `attributes`
    attributes: list
        List of name of the attributes of the comment
    """

    attributes = ['comment_id','author','score','created','permalink','flair','body','parent_id', 'subreddit','subreddit_id']

    comment_id = entity['id']

    # 'created_utc' is an epoch in UTC: convert it in UTC rather than in the
    # machine's local timezone so the result does not depend on where this
    # runs. tzinfo is dropped to keep the naive datetime the output has always had.
    created = datetime.datetime.fromtimestamp(
        entity['created_utc'], tz=datetime.timezone.utc
    ).replace(tzinfo=None)

    # the author flair is optional; the "NaN" string marks a missing key
    flair = entity.get('author_flair_text', "NaN")

    row = (
        comment_id,
        entity['author'],
        entity['score'],
        created,
        entity['permalink'],
        flair,
        entity['body'],
        entity['parent_id'],
        entity['subreddit'],
        entity['subreddit_id'],
    )

    return comment_id, [row], attributes

In [113]:
def get_pushshift_data(endpoint, query, after, before, subreddit):

    """Makes several API requests (~100 records each) to get all records during the period.

    Parameters
    ----------
    endpoint: str
        API endpoint: comment or submission.
    query : str
        search criteria keywords
    after : timestamp
        timestamp of the start date of searching interval
    before : timestamp
        timestamp of the end date of searching interval
    subreddit : str
        narrow the result to a specific subreddit.


    Returns
    -------
    all_records: dict
        Maps each record id to the one-element list of values produced by the
        matching extractor. A record seen twice is stored once (last wins).
    attributes: list
        List of name of the attributes of the object (comment or submission).
        Empty when the search returned no record at all.

    Raises
    ------
    ValueError
        if `endpoint` is neither COMMENTS nor SUBMISSION.
    """

    extractors = {
        SUBMISSION: extract_submissions_features,
        COMMENTS: extract_comment_features,
    }
    if endpoint not in extractors:
        raise ValueError(
            f"endpoint must be {COMMENTS!r} or {SUBMISSION!r}, got {endpoint!r}"
        )
    extract = extractors[endpoint]

    all_records = {}
    # Bound up front so an empty result set returns cleanly instead of
    # failing with UnboundLocalError at the return statement.
    attributes = []

    data = send_query(query, after, before, subreddit, endpoint)

    # make API requests iteratively to get all data during the period specified.
    while len(data) > 0:
        for d in data:
            id_, observ, attributes = extract(d)
            all_records[id_] = observ

        # Move the start of the window to the newest harvested record.
        # NOTE(review): if the API treats `after` as exclusive, records sharing
        # this exact second with the last one of a page could be skipped - confirm.
        after = data[-1]['created_utc']
        data = send_query(query, after, before, subreddit, endpoint)

    print(str(len(all_records)), endpoint +'s', "have been downloaded")
    return all_records, attributes
    

In [114]:
def harvest_reddit_data(endpoint, query, after, before, subreddit):
    """Download every matching record and dump them into a CSV file.

    Parameters
    ----------
    endpoint: str
        API endpoint: comment or submission.
    query : str
        search criteria keywords
    after : timestamp
        timestamp of the start date of searching interval
    before : timestamp
        timestamp of the end date of searching interval
    subreddit : str
        narrow the result to a specific subreddit.


    Returns
    -------
    None
        The results are written to ``data_<endpoint>.csv`` in the current
        working directory (UTF-8, comma separated): one header row with the
        attribute names followed by one row per record.
    """

    records_by_id, header = get_pushshift_data(endpoint, query, after, before, subreddit)

    out_path = 'data_' + endpoint + '.csv'  # PLACE THE NAME OF YOUR FILE HERE

    with open(out_path, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.writer(handle, delimiter=',')
        writer.writerow(header)
        # each dict value is a one-element list; its only item is the row tuple
        writer.writerows(rows[0] for rows in records_by_id.values())

In [115]:
# Search parameters shared by every harvest call in this notebook.
after = "1580536800"       # start of the interval, epoch seconds (2020-02-01 06:00 UTC)
before = "1630126800"      # end of the interval, epoch seconds (2021-08-28 05:00 UTC)
query = "covid+lockdown"   # posts/comments that include the terms 'covid' and 'lockdown'
subreddit = "covid19"      # restrict the search to the subreddit 'r/covid19'

# Harvest comments first ...
harvest_reddit_data(
    endpoint=COMMENTS,
    query=query,
    after=after,
    before=before,
    subreddit=subreddit,
)

# ... then submissions, over the same interval.
harvest_reddit_data(
    endpoint=SUBMISSION,
    query=query,
    after=after,
    before=before,
    subreddit=subreddit,
)

https://api.pushshift.io/reddit/search/comment?q=covid+lockdown&size=100&after=1580536800&before=1630126800&subreddit=covid19&sort=asc&metadata=true
https://api.pushshift.io/reddit/search/comment?q=covid+lockdown&size=100&after=1586253793&before=1630126800&subreddit=covid19&sort=asc&metadata=true
https://api.pushshift.io/reddit/search/comment?q=covid+lockdown&size=100&after=1587473586&before=1630126800&subreddit=covid19&sort=asc&metadata=true
https://api.pushshift.io/reddit/search/comment?q=covid+lockdown&size=100&after=1588363068&before=1630126800&subreddit=covid19&sort=asc&metadata=true
https://api.pushshift.io/reddit/search/comment?q=covid+lockdown&size=100&after=1589401594&before=1630126800&subreddit=covid19&sort=asc&metadata=true
https://api.pushshift.io/reddit/search/comment?q=covid+lockdown&size=100&after=1592392571&before=1630126800&subreddit=covid19&sort=asc&metadata=true
https://api.pushshift.io/reddit/search/comment?q=covid+lockdown&size=100&after=1596457477&before=163012680

In [118]:
# Re-run of the comment harvest with the search parameters defined earlier.
harvest_reddit_data(
    endpoint=COMMENTS,
    query=query,
    after=after,
    before=before,
    subreddit=subreddit,
)

https://api.pushshift.io/reddit/search/comment?q=covid+lockdown&size=100&after=1580536800&before=1630126800&subreddit=covid19&sort=asc&metadata=true
https://api.pushshift.io/reddit/search/comment?q=covid+lockdown&size=100&after=1586253793&before=1630126800&subreddit=covid19&sort=asc&metadata=true
https://api.pushshift.io/reddit/search/comment?q=covid+lockdown&size=100&after=1587473586&before=1630126800&subreddit=covid19&sort=asc&metadata=true
https://api.pushshift.io/reddit/search/comment?q=covid+lockdown&size=100&after=1588363068&before=1630126800&subreddit=covid19&sort=asc&metadata=true
https://api.pushshift.io/reddit/search/comment?q=covid+lockdown&size=100&after=1589401594&before=1630126800&subreddit=covid19&sort=asc&metadata=true
https://api.pushshift.io/reddit/search/comment?q=covid+lockdown&size=100&after=1592392571&before=1630126800&subreddit=covid19&sort=asc&metadata=true
https://api.pushshift.io/reddit/search/comment?q=covid+lockdown&size=100&after=1596457477&before=163012680

In [None]:
# Submission harvest with the search parameters defined earlier.
harvest_reddit_data(
    endpoint=SUBMISSION,
    query=query,
    after=after,
    before=before,
    subreddit=subreddit,
)