In [1]:
import pandas as pd
import requests #Pushshift accesses Reddit via an url so this is needed
import json #JSON manipulation
import csv #To Convert final table into a csv file to save to your machine
import time
import datetime

In [2]:
#Adapted from this https://gist.github.com/dylankilkenny/3dbf6123527260165f8c5c3bc3ee331b
#This function builds a Pushshift URL, requests it, and returns the list found under the JSON response's 'data' key
def getPushshiftData(query, after, before, minimum_comments):
    """Fetch one page of submissions from the Pushshift search endpoint.

    Builds the search URL, prints it, issues a GET request and returns the
    list stored under the ``data`` key of the JSON response.

    Parameters
    ----------
    query
        Currently unused; kept in the signature so existing callers keep
        working.
    after, before
        Values interpolated (via ``str``) into the ``after`` and ``before``
        query parameters of the URL.
    minimum_comments
        Value placed after ``num_comments=>`` in the URL. ``None`` or an
        empty string falls back to ``2``, the threshold that used to be
        hard-coded here.

    Returns
    -------
    The value of ``data`` in the decoded response when the HTTP status code
    is below 400, otherwise ``None``.

    Raises
    ------
    json.JSONDecodeError
        If a successful response body is not valid JSON.
    KeyError
        If the decoded body has no ``data`` key.
    """
    # Honour the caller's threshold instead of silently ignoring it.
    threshold = 2 if minimum_comments in (None, "") else minimum_comments

    url = (
        'https://api.pushshift.io/reddit/search/submission/?size=500'
        + '&after=' + str(after)
        + '&before=' + str(before)
        + '&num_comments=>' + str(threshold)
        + '&over_18=false'
        + '&fields=author,id,subreddit,subreddit_id,created_utc,num_comments'
    )
    #Print URL to show user
    print(url)

    r = requests.get(url)
    if r.status_code >= 400:
        return None

    # Parse exactly once. strict=False lets the decoder accept control
    # characters inside strings; a preceding strict parse (r.json()) would
    # reject those very payloads and only duplicated the work.
    data = json.loads(r.text, strict=False)
    return data['data']

In [3]:
#This function will be used to extract the key data points from each JSON result
def collectSubData(subm):
    """Record one submission in the module-level accumulators.

    Reads ``id``, ``author``, ``subreddit``, ``subreddit_id`` and
    ``num_comments`` from ``subm`` (each defaulting to ``""`` when absent)
    and converts ``created_utc`` with ``datetime.datetime.fromtimestamp``.

    Submissions whose author is ``"[deleted]"`` are ignored. For all others:

    * ``subStats[id]`` is set to a one-element list holding the tuple
      ``(id, author, subreddit, subreddit_id, created, num_comments)``;
    * ``author`` is appended to ``usersInSubreddit[subreddit]``;
    * ``commentsForUser[author]`` is incremented by one.

    Parameters
    ----------
    subm : dict
        One decoded submission record.

    Raises
    ------
    KeyError
        If ``subm`` has no ``created_utc`` key (raised even for deleted
        authors, because the timestamp is converted before the author check).
    """
    submission_id = subm.get('id', "")
    author = subm.get('author', "")
    subreddit = subm.get('subreddit', "")
    subreddit_id = subm.get('subreddit_id', "")
    # Appended as the last tuple element so positions 0-4 keep their meaning.
    num_comments = subm.get('num_comments', "")
    created = datetime.datetime.fromtimestamp(subm['created_utc']) #1520561700.0

    if author == "[deleted]":
        return

    subStats[submission_id] = [
        (submission_id, author, subreddit, subreddit_id, created, num_comments)
    ]
    usersInSubreddit.setdefault(subreddit, []).append(author)
    commentsForUser[author] = commentsForUser.get(author, 0) + 1

In [4]:
#Create your timestamps and queries for your search URL
#https://www.unixtimestamp.com/index.php > Use this to create your timestamps

def updateSubs_file(x):
    """Dump the collected submission rows to ``2012-09-<x>.csv``.

    The file is created (or overwritten) in the current working directory.
    A fixed header row is written first; then, for every key in the
    module-level ``subStats`` dictionary, the first element of the list stored
    under that key is written as one CSV row. A summary line with the number
    of rows written is printed at the end.

    Parameters
    ----------
    x
        Value interpolated into the output filename.
    """
    print("input filename of submission file, please add .csv")
    csv_path = f"2012-09-{x}.csv"
    print("Got Here")

    column_names = ["id", "author","subreddit","subreddit_id","created","num_comments"]
    with open(csv_path, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.writer(handle, delimiter=',')
        writer.writerow(column_names)

        rows_written = 0
        for rows_written, entries in enumerate(subStats.values(), start=1):
            writer.writerow(entries[0])

        print(str(rows_written) + " submissions have been uploaded")

def record_file(x):
    """Write subreddit-to-subreddit user-overlap edges to a CSV file.

    The file ``2012-09-<x>_subreddits.csv`` is created (or overwritten) in the
    current working directory with the header ``source,target,weight``.

    Every unordered pair of subreddits in the module-level
    ``usersInSubreddit`` dictionary is visited once, with the subreddit that
    comes first in the dictionary acting as ``source``. The weight of a pair
    is the number of entries in the source's user list (duplicates counted)
    that also occur in the target's user list. Only pairs with a weight
    greater than 1 are written.

    The summary line printed at the end reports how many source subreddits
    were scanned, not how many rows were written.

    Parameters
    ----------
    x
        Value interpolated into the output filename.
    """
    print("Got Here")
    print("input filename of submission file, please add .csv")
    filename = f"2012-09-{x}_subreddits.csv"

    subreddit_names = list(usersInSubreddit)
    # Build each membership set once so the inner test is a hash lookup
    # instead of a linear scan of the target's user list for every user.
    members = {name: set(users) for name, users in usersInSubreddit.items()}

    upload_count = 0
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file, delimiter=',')
        writer.writerow(["source", "target", "weight"])

        for position, source in enumerate(subreddit_names):
            source_users = usersInSubreddit[source]
            # Only look at subreddits after `source`, so each pair is
            # considered exactly once and never against itself.
            for destination in subreddit_names[position + 1:]:
                destination_members = members[destination]
                count = sum(1 for user in source_users if user in destination_members)
                if count > 1:
                    writer.writerow([source, destination, count])
            upload_count += 1

        print(str(upload_count) + " submissions have been uploaded")


In [5]:
# Collect submissions one 24-hour window at a time. After each window, append
# the current "of interest" users and subreddits to per-window text files.
# The accumulators below are never reset inside the loop, so every file
# reflects everything collected up to and including that window.

SECONDS_PER_DAY = 86400

after = 1346475600 #Submissions after this timestamp (1577836800 = 01 Jan 20)
before = after + SECONDS_PER_DAY #Submissions before this timestamp (1607040000 = 04 Dec 20)
query = "" #Keyword(s) to look for in submissions
minimum_comments = "2" #Forwarded to getPushshiftData as its minimum_comments argument

#subCount tracks the no. of total submissions we collect
subCount = 0
#subStats is the dictionary where we will store our data.
subStats = {}
commentsForUser = {}
usersInSubreddit = {}
subreddits_of_interest = set()
users_of_interest = set()
for x in range(1,30):

    # Page through the current window. Each successful page moves `after` to
    # the creation time of its last submission; an empty page means the
    # window is exhausted.
    while True:
        data = getPushshiftData(query, after, before, minimum_comments)
        if data is None:
            # Failed request (including the very first one of a window):
            # wait briefly and retry the same page instead of crashing on
            # len(None).
            time.sleep(1)
            continue
        if len(data) == 0:
            break

        for submission in data:
            collectSubData(submission)
            subCount+=1

        print(len(data))
        print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
        #update after variable to last created date of submission
        after = data[-1]['created_utc']

    #updateSubs_file(x)
    #record_file(x)

    # Users with more than 15 recorded submissions so far.
    for user in commentsForUser:
        user_comments = commentsForUser[user]
        if  user_comments > 15:
            users_of_interest.add(user)

    # Append mode is kept on purpose: re-running the cell adds to existing files.
    with open(f"users{x}.txt", "a") as f:
        for user in users_of_interest:
            user_comments = commentsForUser[user]
            f.write(f"{user},{user_comments}\n")

    # Subreddits whose author list (one entry per submission) exceeds 1000.
    for subreddit in usersInSubreddit:
        num_users = len(usersInSubreddit[subreddit])
        if num_users > 1000:
            subreddits_of_interest.add(subreddit)

    with open(f"subreddits{x}.txt", "a") as f:
        for subreddit in subreddits_of_interest:
            num_users = len(usersInSubreddit[subreddit])
            f.write(f"{subreddit},{num_users}\n")

    # Slide the window forward by one day.
    after = before
    before+=SECONDS_PER_DAY


https://api.pushshift.io/reddit/search/submission/?size=500&after=1346475600&before=1346562000&num_comments=>2&over_18=false&fields=author,id,subreddit,subreddit_id,created_utc,num_comments
100
2012-09-01 00:05:08
https://api.pushshift.io/reddit/search/submission/?size=500&after=1346475908&before=1346562000&num_comments=>2&over_18=false&fields=author,id,subreddit,subreddit_id,created_utc,num_comments
100
2012-09-01 00:10:15
https://api.pushshift.io/reddit/search/submission/?size=500&after=1346476215&before=1346562000&num_comments=>2&over_18=false&fields=author,id,subreddit,subreddit_id,created_utc,num_comments
100
2012-09-01 00:16:05
https://api.pushshift.io/reddit/search/submission/?size=500&after=1346476565&before=1346562000&num_comments=>2&over_18=false&fields=author,id,subreddit,subreddit_id,created_utc,num_comments
100
2012-09-01 00:23:05
https://api.pushshift.io/reddit/search/submission/?size=500&after=1346476985&before=1346562000&num_comments=>2&over_18=false&fields=author,id,su

KeyboardInterrupt: 

In [None]:
# Report how many entries the subStats dictionary currently holds.
print(f"{len(subStats)} submissions have added to list")
# print("1st entry is:")
# print(list(subStats.values())[0][0][1] + " created: " + str(list(subStats.values())[0][0][5]))
# print("Last entry is:")
# print(list(subStats.values())[-1][0][1] + " created: " + str(list(subStats.values())[-1][0][5]))
#after = before