In [1]:
import collections
import csv #To Convert final table into a csv file to save to your machine
import datetime
import json #JSON manipulation
import os #Portable path joining for the CSV export
import time
from urllib.parse import urlencode #URL-encodes the Pushshift query parameters

import pandas as pd
import requests #Pushshift accesses Reddit via an url so this is needed

## Build a function that builds the Pushshift URL and fetches the results

In [2]:
def getPushshiftData(query, after, before, sub):
    """Fetch one page of submissions from the Pushshift search endpoint.

    Args:
        query: Text sent as the ``title`` search parameter.
        after: Value sent as the ``after`` parameter (this notebook passes
            epoch timestamps).
        before: Value sent as the ``before`` parameter (this notebook passes
            epoch timestamps).
        sub: Subreddit name sent as the ``subreddit`` parameter.

    Returns:
        The value stored under the ``'data'`` key of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within 30 seconds.
        KeyError: If the JSON response has no ``'data'`` key.
    """
    params = {
        'title': query,
        'size': 1000,
        'after': after,
        'before': before,
        'subreddit': sub,
    }
    # urlencode() escapes spaces, '&', '#', etc., so multi-word queries still
    # produce a valid URL; plain values yield the same URL as simple concatenation.
    url = 'https://api.pushshift.io/reddit/search/submission/?' + urlencode(params)
    print(url)
    # A timeout keeps a stalled connection from hanging the cell forever.
    r = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    r.raise_for_status()
    data = json.loads(r.text)
    return data['data']

## Build function to extract key data points

In [3]:
#This function will be used to extract the key data points from each JSON result
def collectSubData(subm):
    """Pull the fields of interest out of one submission and store them in ``subStats``.

    Args:
        subm: Mapping for a single submission; must provide ``title``, ``url``,
            ``author``, ``id``, ``score``, ``created_utc``, ``num_comments``
            and ``permalink``. ``link_flair_text`` is optional.

    Side effects:
        Sets ``subStats[<id>]`` to a one-element list holding the tuple
        ``(id, title, url, author, score, created, num_comments, permalink, flair)``.
        Nothing is returned.
    """
    post_title, link = subm['title'], subm['url']
    # Submissions without a flair get the "NaN" placeholder string.
    flair_text = subm.get('link_flair_text', "NaN")
    poster, post_id, points = subm['author'], subm['id'], subm['score']
    # NOTE: fromtimestamp() without a tz argument produces a naive datetime in
    # the local timezone of the machine running the notebook.
    posted_at = datetime.datetime.fromtimestamp(subm['created_utc'])  # e.g. 1520561700.0
    comment_total, link_path = subm['num_comments'], subm['permalink']

    row = (post_id, post_title, link, poster, points, posted_at,
           comment_total, link_path, flair_text)
    # Kept as a single-item list: consumers index [0] to reach the row tuple.
    subStats[post_id] = [row]

## Where and what data will we be storing?

In [4]:
# --- Search target ---
sub = 'nba'    # subreddit whose submissions are searched
query = "nba"  # keyword(s) sent as the title search term

# --- Time window (epoch seconds; convert dates with https://www.epochconverter.com/) ---
# 2022 NBA playoffs: Apr 16, 2022 – Jun 19, 2022 (Finals: Jun 2, 2022 – Jun 16, 2022)
after, before = "1649901600", "1656550800"  # Apr 14 2022 -> Jun 30 2022

# Alternative window, 2022 NBA regular season (Oct 19, 2021 – Apr 10, 2022):
# after, before = "1634518800", "1649638800"  # Oct 18 2021 -> Apr 11 2022

# --- Accumulators filled while collecting ---
subCount = 0       # running count of processed submissions
subStats = dict()  # submission id -> one-element list holding the extracted row tuple

## Run code and loop until all submissions are collected

In [5]:
# Page through the search window: each call returns one page of results, and the
# next request starts from the creation time of the last submission received.
# A dedicated cursor (page_after) is advanced instead of overwriting the
# configured `after`, and the counter is reset, so this cell can be re-run on its
# own and still collect the full window instead of silently fetching nothing.
subCount = 0
page_after = after
data = getPushshiftData(query, page_after, before, sub)
# Runs until a request comes back empty, i.e. every post between the
# 'after' date and the 'before' date has been gathered.
while len(data) > 0:
    for submission in data:
        collectSubData(submission)
        subCount+=1
    # Progress report: page size and creation time of the page's last submission
    print(len(data))
    print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
    # Next call resumes from the created date of the last submission
    page_after = data[-1]['created_utc']
    data = getPushshiftData(query, page_after, before, sub)

# The loop only exits on an empty page, so this prints 0 and confirms completion.
print(len(data))

https://api.pushshift.io/reddit/search/submission/?title=nba&size=1000&after=1649901600&before=1656550800&subreddit=nba
250
2022-04-18 12:29:29
https://api.pushshift.io/reddit/search/submission/?title=nba&size=1000&after=1650310169&before=1656550800&subreddit=nba
249
2022-04-23 10:44:38
https://api.pushshift.io/reddit/search/submission/?title=nba&size=1000&after=1650735878&before=1656550800&subreddit=nba
250
2022-04-27 16:58:28
https://api.pushshift.io/reddit/search/submission/?title=nba&size=1000&after=1651103908&before=1656550800&subreddit=nba
250
2022-05-02 18:11:32
https://api.pushshift.io/reddit/search/submission/?title=nba&size=1000&after=1651540292&before=1656550800&subreddit=nba
247
2022-05-08 11:17:30
https://api.pushshift.io/reddit/search/submission/?title=nba&size=1000&after=1652033850&before=1656550800&subreddit=nba
249
2022-05-12 16:16:55
https://api.pushshift.io/reddit/search/submission/?title=nba&size=1000&after=1652397415&before=1656550800&subreddit=nba
250
2022-05-16 1

## Check Submission

In [6]:
# Sanity check: report how many submissions were stored, then show the title
# and creation time of the first and the last stored entry.
entries = list(subStats.values())  # each value is a one-element list wrapping the row tuple
print(f"{len(subStats)} submissions have added to list")
print("1st entry is:")
first_row = entries[0][0]
print(f"{first_row[1]} created: {first_row[5]}")  # index 1 = title, index 5 = created
print("Last entry is:")
last_row = entries[-1][0]
print(f"{last_row[1]} created: {last_row[5]}")

4853 submissions have added to list
1st entry is:
Former Dallas Maverick &amp; First Indian born player in the NBA, Satnam Singh just made his professional wrestling debut for AEW created: 2022-04-13 19:12:32
Last entry is:
With the spurs trading Dejounte Murray to the Hawks is this the most competitive the NBA has ever been? created: 2022-06-29 17:50:07


In [7]:
def update_sub_file(filename=None, location="/Users/levinadong/Desktop/"):
    """Write every row collected in ``subStats`` to a CSV file.

    Args:
        filename: Name of the CSV file to create. When omitted, the name is
            requested interactively via ``input()``.
        location: Directory the file is created in. The default is this
            notebook's original hard-coded folder; pass a different directory
            to run on another machine or non-interactively.

    Returns:
        None. Prints how many submissions were written.

    Raises:
        OSError: If the target file cannot be opened for writing.
    """
    if filename is None:
        print("input filename of submission file, please add .csv")
        filename = input()
    # Separate names for the path and the open handle (no shadowing of one by the other).
    csv_path = os.path.join(location, filename)
    headers = ["Post ID","Title","Url","Author","Score","Publish Date","Total No. of Comments","Permalink","Flair"]
    # Each subStats value is a one-element list; element 0 is the row tuple.
    rows = [entry[0] for entry in subStats.values()]
    # newline='' lets the csv module control line endings itself.
    with open(csv_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(headers)
        writer.writerows(rows)

    print(str(len(rows)) + " submissions have been uploaded into a csv file")
        
# Run the export: prompts for a filename, then writes the collected submissions to CSV.
update_sub_file()

input filename of submission file, please add .csv
nba.csv
4853 submissions have been uploaded into a csv file
