## Web Scraping using `Pushshift`

The goal is to use `Pushshift`, an API that archives Reddit submissions and comments, to download the posts and comments of a given subreddit. More info [HERE](https://github.com/pushshift/api).

In [None]:
import requests
import time
import pandas as pd
import json
from ast import literal_eval

Extract posts and comments from the specified subreddit.

In [None]:
# Run-wide settings read by the scraping cells below.
subreddit = 'offmychest'  # subreddit to scrape; also the prefix of the output file names
maxThings = -1  # max items to save per kind; a negative value means "use the total reported by the API"
printWait = 2  # minimum number of seconds between two progress lines
requestSize = 100  # value sent as the `size` parameter of each search request

In [None]:
# make a request from url to extract json
def requestJSON(url, timeout=30, retryWait=5):
    """Fetch ``url`` and return its decoded JSON body, retrying until it succeeds.

    Parameters
    ----------
    url : str
        Address to GET.
    timeout : float, optional
        Seconds ``requests.get`` may wait on the server before giving up on
        one attempt. Without a timeout a stalled connection would block the
        whole scrape forever.
    retryWait : float, optional
        Seconds to sleep before the next attempt after any failure.

    Returns
    -------
    The parsed JSON document of the first successful (HTTP 200) response.

    Notes
    -----
    The call retries indefinitely on network errors, non-200 status codes and
    bodies that are not valid JSON, printing the reason each time. Any other
    exception (i.e. a programming error) is allowed to propagate instead of
    being retried forever.
    """
    while True:
        try:
            r = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            # connection problems, timeouts, ... -> wait and try again
            print(e)
            time.sleep(retryWait)
            continue

        if r.status_code != 200:
            # e.g. 429 when the server rate limit is hit
            print('error code', r.status_code)
            time.sleep(retryWait)
            continue

        try:
            return r.json()
        except ValueError as e:
            # JSON decode errors are ValueError subclasses; a 200 response
            # with a broken body is treated like any other failed attempt
            print(e)
            time.sleep(retryWait)

In [None]:
# request
# Ask the Pushshift meta endpoint for the advertised rate limit and derive
# `requestWait`, the pause in seconds that get_data sleeps between two
# consecutive search requests.
# NOTE(review): the recorded run below still shows occasional "error code 429"
# lines at this pace, so the advertised limit may be optimistic.
meta = requestJSON('https://api.pushshift.io/meta')
limitPerMinute = meta['server_ratelimit_per_minute']
requestWait = 60 / limitPerMinute
print('server_ratelimit_per_minute', limitPerMinute)

server_ratelimit_per_minute 120


In [None]:
def get_data(thing, subreddit, target_cols):
    """Download items of one kind from a subreddit and save them to a text file.

    Each saved item is written as one line containing the ``str()`` of a dict
    restricted to ``target_cols``. The output file is named
    ``<subreddit>_<thing>.txt`` and is overwritten on every call.

    Parameters
    ----------
    thing : str
        Kind of item to search for; it is inserted into the search URL and
        the file name (the notebook uses 'submission' and 'comment').
    subreddit : str
        Subreddit name used in the query and as the file-name prefix.
    target_cols : iterable of str
        Keys to keep from every returned item. Items lacking any of these
        keys are skipped.

    Notes
    -----
    Reads the module-level settings ``maxThings``, ``requestSize``,
    ``printWait`` and ``requestWait`` and fetches pages through
    ``requestJSON``. When ``maxThings`` is negative, the number of items to
    save is the total reported by the API; otherwise at most ``maxThings``
    items are saved.
    """
    searchUrl = 'https://api.pushshift.io/reddit/search/'\
                + thing + '/?subreddit=' + subreddit
    written = 0

    with open(subreddit + '_' + thing + '.txt', 'w') as f:
        print('\n[starting', thing + 's]')

        if maxThings < 0:
            # size=0 returns no items, only the metadata with the total count
            summary = requestJSON(searchUrl + '&metadata=true&size=0')
            totalResults = summary['metadata']['total_results']
            print('total ' + thing + 's', 'in', subreddit, ':', totalResults)
        else:
            totalResults = maxThings
            print('downloading most recent', maxThings)

        if totalResults <= 0:
            # nothing requested / nothing available: leave the file empty and
            # avoid dividing by zero in the progress report
            return

        # Paging cursor: each request asks for items before the created_utc
        # of the last item seen; empty on the first request.
        created_utc = ''

        startTime = time.time()
        timePrint = startTime
        while written < totalResults:
            page = requestJSON(searchUrl
                               + '&size=' + str(requestSize)
                               + '&before=' + str(created_utc))

            posts = page['data']
            if len(posts) == 0:
                break

            for post in posts:
                # advance the cursor even when the item itself is skipped
                created_utc = post["created_utc"]
                try:
                    row = {k: post[k] for k in target_cols}
                except KeyError:
                    # item lacks one of the requested columns -> skip it;
                    # any other error (e.g. a failed write) must surface
                    continue
                f.write(str(row) + '\n')
                written += 1
                if written >= totalResults:
                    break

            if written >= totalResults:
                break

            now = time.time()
            if now - timePrint > printWait:
                timePrint = now
                percent = written / totalResults * 100
                timePassed = now - startTime
                print('{:.2f}'.format(percent) + '%', '|',
                      time.strftime("%H:%M:%S", time.gmtime(timePassed)))

            # respect the server rate limit between requests
            time.sleep(requestWait)

In [None]:
# Columns kept for each kind of item.
submission_cols = ('id', 'title', 'author', 'subreddit', 'num_comments', 'selftext')
comment_cols = ('id', 'author', 'link_id', 'parent_id', 'body')

# Scrape submissions first, then comments; each call writes its own text file.
get_data('submission', subreddit, submission_cols)
get_data('comment', subreddit, comment_cols)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
65.31% | 07:02:32
65.31% | 07:02:35
65.32% | 07:02:37
65.33% | 07:02:39
65.33% | 07:02:42
65.34% | 07:02:44
65.35% | 07:02:47
65.36% | 07:02:49
65.37% | 07:02:51
65.38% | 07:02:53
error code 429
65.38% | 07:03:01
65.39% | 07:03:03
65.40% | 07:03:05
65.41% | 07:03:08
65.42% | 07:03:10
65.42% | 07:03:13
65.43% | 07:03:16
65.44% | 07:03:18
65.44% | 07:03:21
65.45% | 07:03:23
65.46% | 07:03:26
65.46% | 07:03:28
65.47% | 07:03:31
65.48% | 07:03:33
65.49% | 07:03:35
65.50% | 07:03:38
error code 429
65.50% | 07:03:44
65.51% | 07:03:47
65.52% | 07:03:49
65.53% | 07:03:51
65.54% | 07:03:54
65.54% | 07:03:56
65.55% | 07:03:59
65.55% | 07:04:02
65.55% | 07:04:06
65.56% | 07:04:10
65.57% | 07:04:13
65.57% | 07:04:15
65.58% | 07:04:17
65.59% | 07:04:20
65.60% | 07:04:22
65.60% | 07:04:25
65.61% | 07:04:27
65.62% | 07:04:30
65.62% | 07:04:33
65.63% | 07:04:35
65.64% | 07:04:38
65.65% | 07:04:41
65.65% | 07:04:43
65.66% | 07:04:45
65.67

In [None]:
def _load_records(path):
    """Read a file holding one Python dict literal per line into a list of dicts.

    This is the format get_data writes (``str(dict)`` followed by a newline).
    Blank lines are ignored; any other line that is not a valid literal makes
    ``literal_eval`` raise, so a corrupted file is not loaded silently.
    """
    records = []
    with open(path) as file:
        for line in file:
            if line.strip():
                records.append(literal_eval(line))
    return records


# File names are derived from `subreddit` so they always match what get_data wrote.

# load submissions data
subs = _load_records(subreddit + '_submission.txt')

# load comment data
comments = _load_records(subreddit + '_comment.txt')

In [None]:
# Build the submissions dataframe, preview the first rows and report the row count.
posts_df = pd.DataFrame(data=subs)
display(posts_df.head(n=5))
print(posts_df.shape[0])

Unnamed: 0,id,title,author,subreddit,num_comments,selftext
0,j8q3h7,"You've gone and done it humanity, you've mad D...",-The-Goat,offmychest,0,Good job.
1,j8q3bd,I don't know how to have friends,biggestyikesofall,offmychest,0,Pandemic life only makes it worse. \n\nPast ex...
2,j8q33q,I'm not happy!,jake51551,offmychest,0,And that's perfectly fine. Because I know how ...
3,j8q2un,Why cant the world fucking kill me already.,anotherhuman000,offmychest,0,im too much of a wuss to kill myself.
4,j8q2cn,I miss my parents,song_4_that,offmychest,0,I haven't seen my parents in months because my...


965036


In [None]:
# Build the comments dataframe, preview the first rows and report the row count.
comments_df = pd.DataFrame(data=comments)
display(comments_df.head(n=5))
print(comments_df.shape[0])

Unnamed: 0,id,author,link_id,parent_id,body
0,g8g11pu,CharlieRakes,t3_j8vg56,t3_j8vg56,Heck yes! ✊ You can say stuff like I love eati...
1,g8g0zxs,ihaveawoken613,t3_j90dng,t3_j90dng,Yea for sure. I can imagine putting all that e...
2,g8g0m02,Lil_Cheets,t3_j8zz6x,t3_j8zz6x,Well in my opinion I wouldn’t confess since I ...
3,g8g0ck7,[deleted],t3_j8xzit,t3_j8xzit,[removed]
4,g8g0c3j,KangarooInAZoo,t3_j8q8u6,t1_g8e20mv,Horrible advice. You're throwing all this extr...


3685744


For the difference between `id` and `link_id`, and how to use them to link comments to their submissions, see: https://www.reddit.com/r/pushshift/comments/ayvut7/how_do_you_link_the_comments_with_their/?utm_source=share&utm_medium=web2x&context=3