# Using Pushshift Module to extract Submissions Data from Reddit via Python

PRAW is pretty good at getting Reddit data, but it has some limitations,
including the removal of the [subreddit.submissions endpoint](https://www.reddit.com/r/changelog/comments/7tus5f/update_to_search_api/.).

So, for extracting Reddit submissions and their primary data, such as upvotes and comment counts, I put together this notebook using Pushshift.

If you still prefer PRAW for extracting submissions, I have written a code [template here](https://github.com/SeyiAgboola/Seyi_Projects/blob/master/submission_list.py).

I will also host the code on GitHub.

More info on the removal of the [subreddit.submissions endpoint](https://www.reddit.com/r/redditdev/comments/8bia9n/praw_psa_the_subredditsubmissions_method_no/).

# Import modules

In [8]:
import pandas as pd
import requests #Pushshift accesses Reddit via an url so this is needed
import json #JSON manipulation
import csv #To Convert final table into a csv file to save to your machine
import time
import datetime

# Pushshift URL Examples

In [2]:
# Pushshift needs no Reddit credentials: a search is just a URL whose query
# string carries the parameters. Following either example link generates a
# page of JSON data.
_submission_search = "https://api.pushshift.io/reddit/search/submission/"

# Keyword search restricted to one subreddit and a time window.
search_ps4_after_date = _submission_search + "?" + "&".join([
    "q=screenshot",
    "after=1514764800",
    "before=1517443200",
    "subreddit=PS4",
])

# Plain keyword search with no other restriction.
search_science = _submission_search + "?q=science"

# Parameters for your Pushshift URL
These are probably the most important parameters to consider when building your Pushshift URL:

* size — number of entries requested per call (the code below asks for 1000, although the runs recorded in this notebook returned 100 per request)
* after — where to start the search
* before — where to end the search
* title — to search only within the submission’s title
* subreddit — to narrow it down to a particular subreddit

In [37]:
#Adapted from this https://gist.github.com/dylankilkenny/3dbf6123527260165f8c5c3bc3ee331b
#This function builds an Pushshift URL, accesses the webpage and stores JSON data in a nested list
def getPushshiftData(query, after, before, subreddit):
    """Run one Pushshift comment search and return the decoded results.

    Args:
        query: Search term, sent as the ``q`` parameter.
        after: Lower bound of the search window, sent as ``after``.
        before: Upper bound of the search window, sent as ``before``.
        subreddit: Subreddit to restrict the search to.

    Returns:
        A ``(data, metadata)`` tuple taken from the ``'data'`` and
        ``'metadata'`` keys of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response lacks the ``'data'`` or ``'metadata'`` key.
    """
    from urllib.parse import urlencode

    # NOTE: to search submission titles instead of comments, point this at the
    # '/reddit/search/submission/' endpoint and send 'title' in place of 'q'.
    endpoint = 'https://api.pushshift.io/reddit/search/comment/'

    # urlencode escapes spaces and reserved characters, so multi-word queries
    # (e.g. "Cyberpunk 2077") produce a valid URL. Parameter order is kept the
    # same as before so the printed URL is unchanged for simple values.
    params = urlencode({
        'q': query,
        'size': 1000,
        'metadata': 'true',
        'after': after,
        'before': before,
        'subreddit': subreddit,
    })
    url = endpoint + '?' + params

    # Print URL to show user
    print(url)

    # A timeout stops a stalled connection from hanging the notebook forever.
    r = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    r.raise_for_status()

    data = r.json()
    return data['data'], data['metadata']


# Extract key information from Submissions

We want key data for further analysis including: 
* Submission Title
* URL 
* Flair
* Author
* Submission post ID
* Score
* Upload Time
* No. of Comments 
* Permalink.


In [4]:
#This function will be used to extract the key data points from each JSON result
def collectSubData(subm):
    """Extract the fields of interest from one JSON result and store them.

    The record is written into the module-level ``subStats`` dictionary under
    the result's ``id``, as a one-element list holding the tuple
    ``(id, author, body, score, created, permalink, flair)``.

    ``created`` is ``created_utc`` converted with
    ``datetime.datetime.fromtimestamp``. ``flair`` is the result's
    ``link_flair_text``, or the string "NaN" when that key is absent.
    """
    # Flair is not always present, so fall back to a placeholder.
    flair = subm.get('link_flair_text', "NaN")

    # Required fields: a missing key raises KeyError.
    author, sub_id, score, stamp, permalink, body = (
        subm[key]
        for key in ('author', 'id', 'score', 'created_utc', 'permalink', 'body')
    )
    created = datetime.datetime.fromtimestamp(stamp)  # e.g. 1520561700.0

    # Stored as a single-tuple list; consumers read it back as subStats[id][0].
    subStats[sub_id] = [(sub_id, author, body, score, created, permalink, flair)]

# Update your Search Settings here

- Create your timestamps and queries for your search URL
- https://www.unixtimestamp.com/index.php > Use this to create your timestamps
- after = "1577836800" #Submissions after this timestamp (1577836800 = 01 Jan 20)
- before = "1607040000" #Submissions before this timestamp (1607040000 = 04 Dec 20)
- query = "Cyberpunk 2077" #Keyword(s) to look for in submissions
- subreddit = "Games" #Which Subreddit to search in

In [145]:
# --- What to search for -----------------------------------------------------
# Alternative preset: query = "Cyberpunk 2077", subreddit "Games"
query, subreddit = "vaccine", "conspiracy"

# --- Search window (Unix timestamps, kept as strings) -----------------------
# Other windows used previously:
#   after  = "1514764800"  -> 01/01/2018 @ 12:00am (UTC)
#   before = "1515542400"  -> 01/10/2018 @ 12:00am (UTC)
#   before = "1583020800"  -> 03/01/2020 @ 12:00am (UTC)
#   before = "1607040000"  -> 04 Dec 20
after = "1577836800"   # only results after this timestamp (01 Jan 20)
before = "1607990400"  # only results before this timestamp (12/15/2020 @ 12:00am UTC)

# --- Accumulators -----------------------------------------------------------
subStats = dict()  # collected records, filled by collectSubData()
subCount = 0       # running total of results collected

In [58]:
# Fetch the first page before looping; every later request starts from the
# timestamp of the last result on the previous page.
data, metadata = getPushshiftData(query, after, before, subreddit)

# An empty page means nothing is left between 'after' and 'before'.
while data:
    for submission in data:
        collectSubData(submission)
    subCount += len(data)

    # Progress report: page size and the creation time of its last result.
    newest = data[-1]['created_utc']
    print(len(data))
    print(datetime.datetime.fromtimestamp(newest))

    # Move the window start forward, pause briefly, then ask for the next page.
    after = newest
    time.sleep(1)
    data, metadata = getPushshiftData(query, after, before, subreddit)

# The loop only exits on an empty page, so this prints 0.
print(len(data))

https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1604981775&before=1607990400&subreddit=conspiracy
100
2020-11-10 14:27:27
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1605014847&before=1607990400&subreddit=conspiracy
100
2020-11-10 20:01:47
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1605034907&before=1607990400&subreddit=conspiracy
100
2020-11-11 02:54:27
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1605059667&before=1607990400&subreddit=conspiracy
100
2020-11-11 13:07:51
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1605096471&before=1607990400&subreddit=conspiracy
100
2020-11-11 19:20:14
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1605118814&before=1607990400&subreddit=conspiracy
100
2020-11-12 00:33:43
https://api.pushshift.io/red

100
2020-11-23 07:24:25
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1606112665&before=1607990400&subreddit=conspiracy
100
2020-11-23 17:27:26
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1606148846&before=1607990400&subreddit=conspiracy
100
2020-11-23 20:54:10
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1606161250&before=1607990400&subreddit=conspiracy
100
2020-11-24 01:56:01
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1606179361&before=1607990400&subreddit=conspiracy
100
2020-11-24 07:55:15
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1606200915&before=1607990400&subreddit=conspiracy
100
2020-11-24 17:52:21
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1606236741&before=1607990400&subreddit=conspiracy
100
2020-11-24 21:21:31
http

100
2020-12-06 22:50:16
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1607291416&before=1607990400&subreddit=conspiracy
100
2020-12-07 01:23:30
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1607300610&before=1607990400&subreddit=conspiracy
100
2020-12-07 04:15:42
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1607310942&before=1607990400&subreddit=conspiracy
100
2020-12-07 10:55:03
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1607334903&before=1607990400&subreddit=conspiracy
100
2020-12-07 16:55:32
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1607356532&before=1607990400&subreddit=conspiracy
100
2020-12-07 20:11:50
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1607368310&before=1607990400&subreddit=conspiracy
100
2020-12-07 23:25:57
http

100
2020-12-13 01:05:38
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1607817938&before=1607990400&subreddit=conspiracy
100
2020-12-13 04:50:57
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1607831457&before=1607990400&subreddit=conspiracy
100
2020-12-13 11:59:16
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1607857156&before=1607990400&subreddit=conspiracy
100
2020-12-13 18:45:05
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1607881505&before=1607990400&subreddit=conspiracy
100
2020-12-13 21:06:21
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1607889981&before=1607990400&subreddit=conspiracy
100
2020-12-14 01:24:41
https://api.pushshift.io/reddit/search/comment/?q=vaccine&size=1000&metadata=true&after=1607905481&before=1607990400&subreddit=conspiracy
100
2020-12-14 04:06:51
http

# Check your Submission Extraction was successful

In [59]:
# Sanity-check the extraction: report how many records were stored and show
# the first and last one. Each subStats value is a one-element list holding
# the tuple (id, author, body, score, created, permalink, flair).
entries = list(subStats.values())  # materialise once instead of per lookup
print(str(len(subStats)) + " submissions have added to list")
if entries:
    for label, record in (("1st entry is:", entries[0][0]),
                          ("Last entry is:", entries[-1][0])):
        print(label)
        # record[4] = created, record[1] = author, record[5] = permalink
        print("On " + str(record[4]) + " " + record[1] + " created: " + str(record[5]))
else:
    # Nothing was collected; indexing [0] / [-1] would raise IndexError.
    print("No entries were collected, so there is no first or last entry to show")

55528 submissions have added to list
1st entry is:
On 2020-01-01 08:56:59 ILoveChinaxxx created: /r/conspiracy/comments/ehlnfu/the_war_on_the_family_has_been_going_on_for/fcpiw33/
Last entry is:
On 2020-12-15 00:59:47 kakemak created: /r/conspiracy/comments/kd9jym/what_will_the_long_term_effects_of_the_covid/gfv8cee/


# Save data to CSV file

In [60]:
def updateSubs_file(filename="comments_vaccine.csv", stats=None):
    """Write the collected records to a CSV file.

    Args:
        filename: Path of the CSV file to create (overwritten if it exists).
            Defaults to the name the notebook has always used.
        stats: Mapping of id -> one-element list holding a
            ``(id, author, body, score, created, permalink, flair)`` tuple.
            Defaults to the module-level ``subStats`` dictionary.

    Prints the number of rows written. Returns None.
    """
    if stats is None:
        stats = subStats

    headers = ["Post ID", "Author", "Body", "Score", "Publish Date", "Permalink", "Flair"]
    # Each value is a single-tuple list; the tuple is the CSV row.
    rows = [records[0] for records in stats.values()]

    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(headers)
        writer.writerows(rows)

    print(str(len(rows)) + " submissions have been uploaded")
updateSubs_file()

55528 submissions have been uploaded


## Using pushshift aggregations - not working

In [148]:
# def getPushshiftAggs(query, after, before, subreddit):
#     #Build URL
#     #https://api.pushshift.io/reddit/search/comment/?q=trump&after=7d&aggs=created_utc&frequency=hour&size=0
#     url_aggs = 'https://api.pushshift.io/reddit/search/comment/?q='+str(query)+'&aggs=created_utc&frequency=week&size=0&metadata=true&after='+str(after)+'&before='+str(before)+'&subreddit='+str(subreddit)
#     url = url_aggs
#     #Print URL to show user
#     print(url)
#     #Request URL
#     r = requests.get(url)
#     #Load JSON data from webpage into data variable
#     data = json.loads(r.text)
#     #return the data element which contains all the submissions data
#     return data['data'], data['metadata']

# data, metadata = getPushshiftAggs(query, after, before, subreddit)
# # Will run until all posts have been gathered i.e. When the length of data variable = 0
# # from the 'after' date up until before date
# while len(data) > 0: #The length of data is the number submissions (data[0], data[1] etc), once it hits zero (after and before vars are the same) end
#     for submission in data:
#         collectSubData(submission)
#         subCount+=1
#     # Calls getPushshiftData() with the created date of the last submission
#     print(len(data))
#     print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
#     #update after variable to last created date of submission
#     after = data[-1]['created_utc']
#     #data has changed due to the new after variable provided by above code
#     time.sleep(1)
#     data, metadata = getPushshiftAggs(query, after, before, subreddit)

# print(len(data))

https://api.pushshift.io/reddit/search/comment/?q=vaccine&aggs=created_utc&frequency=week&size=0&metadata=true&after=1585870209&before=1607990400&subreddit=conspiracy
0


---

# Using PRAW

In [14]:
import praw

# Credentials of the Reddit app that lets this script use PRAW.
# The id and secret are left blank here.
my_client_id, my_client_secret = "", ""       # client-id, secret
my_user_agent = "anti_vax_radicalisation"     # user-agent
#my_developer_name = "dreamtraitor" # developers

# Client object used by every PRAW cell below.
reddit = praw.Reddit(
    user_agent=my_user_agent,
    client_secret=my_client_secret,
    client_id=my_client_id,
)

# get 10 hot posts from the MachineLearning subreddit
# hot_posts = reddit.subreddit('MachineLearning').hot(limit=10)
# for post in hot_posts:
#     print(post.title)

In [15]:
# Look at a subreddit through PRAW.
# NOTE: this rebinds `subreddit`, which the Pushshift settings cell set to a
# plain string, to a PRAW subreddit object.
subreddit = reddit.subreddit("conspiracy")

# Print the display name, the title and the description, in that order.
for attribute in ("display_name", "title", "description"):
    print(getattr(subreddit, attribute))

conspiracy
conspiracy
#####[Youtube to censor election fraud content](https://old.reddit.com/r/conspiracy/comments/k9uavr/youtube_swings_its_big_dick_around/)

* [**Conspiracy** - a secret plan by a group to do something unlawful or harmful](http://www.oxforddictionaries.com/us/definition/american_english/conspiracy?q=conspiracy)

* [**Theory** - a supposition or a system of ideas intended to explain something](http://www.oxforddictionaries.com/us/definition/american_english/theory?q=theory)

* [**Conspiracy Theory** - a *hypothesis* that some covert but influential organization is responsible for a circumstance or event](http://www.oxforddictionaries.com/us/definition/american_english/conspiracy-theory?q=conspiracy+theory)



This is a forum for free thinking and for discussing issues which have captured your imagination. Please respect other views and opinions, and keep an open mind. Our goal is to create a fairer and more transparent world for a better future.
_____

######**[List o

In [64]:
# Count how many top submissions the listing actually yields when asked for
# up to 10000 (the run recorded below produced 990).
count = 0  # stays 0 if the listing is empty
for count, submission in enumerate(subreddit.top(limit=10000), start=1):
    pass
count

990

In [69]:
# Count how many comments the listing actually yields when asked for up to
# 10000 (the run recorded below produced 957).
count = 0  # stays 0 if the listing is empty
for count, comment in enumerate(subreddit.comments(limit=10000), start=1):
    pass
count

957

Can't use praw to get historical data
>Since Reddit limits all listings to ~1000 entries, it is currently impossible to get all posts in a subreddit using their API. However, third-party datasets with APIs exist, such as pushshift.io.<br>
>[Link](https://stackoverflow.com/questions/53988619/praw-6-get-all-submission-of-a-subreddit)

### Collect comments by iterating over all "vaccine" submissions
[praw docs](https://praw.readthedocs.io/en/latest/tutorials/comments.html)


In [21]:
# Read the "vaccine" submission IDs (first column) from the Pushshift CSV.
file_sub = "submissions_vaccine.csv"
df_sub = pd.read_csv(file_sub)
list_subID = df_sub.iloc[:, 0]

# comStats maps each submission ID to the list of its comment rows.
comStats = {}
# comCount is meant to track the total number of comments collected
# (it is initialised here but not updated in this cell).
comCount = 0

# Approach taken from https://praw.readthedocs.io/en/latest/tutorials/comments.html
for sub_id in list_subID:
    submission = reddit.submission(id=sub_id)
    # limit=None: no cap on how many "more comments" placeholders are replaced.
    submission.comments.replace_more(limit=None)

    # One row per comment:
    # (submission id, comment id, body, created, author, score, permalink)
    comments_data = [
        (
            c.submission.id,
            c.id,
            c.body,
            datetime.datetime.fromtimestamp(c.created_utc),
            c.author.name if c.author else '[deleted]',  # handle deleted comments
            c.score,
            c.permalink,
        )
        for c in submission.comments.list()
    ]
    comStats[sub_id] = comments_data

In [22]:
def updatecoms_file(filename="comments_vaccine_praw.csv", stats=None):
    """Write the comments collected through PRAW to a CSV file.

    Args:
        filename: Path of the CSV file to create (overwritten if it exists).
            Defaults to the name the notebook has always used.
        stats: Mapping of submission id -> list of comment rows, each row
            being ``(submission id, comment id, body, publish date, author,
            score, permalink)``. Defaults to the module-level ``comStats``.

    Prints the number of comment rows written. Returns None.
    """
    if stats is None:
        stats = comStats

    headers = ["Submission ID", "Comment ID", "Body", "Publish Date", "Author", "Score", "Permalink"]
    upload_count = 0

    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(headers)
        # Every submission contributes all of its comment rows, in order.
        for comments in stats.values():
            writer.writerows(comments)
            upload_count += len(comments)

    print(str(upload_count) + " comments have been uploaded")
updatecoms_file()

77366 comments have been uploaded
