In [24]:
# Standards
import pandas as pd
import numpy as np

# API
import requests

# Automating
import time
import datetime
import warnings
import sys
import datetime
import os

class reddit_scraper:
    '''
    Page backwards through a subreddit's submissions using the Pushshift API
    and save every scraping session as a CSV in ./data/reddit/<subreddit>/.

    Instance state
    --------------
    subreddit      : name of the subreddit being scraped
    n_iter         : number of API requests (pages) made per session
    epoch_start    : epoch the next session pages backwards from
    epoch_earliest : smallest created_utc seen so far (0 until a page is fetched)
    '''

    # Pushshift submission-search endpoint. Every query parameter is passed
    # through `params`, so the URL itself carries no query string.
    BASE_URL = 'https://api.pushshift.io/reddit/search/submission/'
    # Number of posts requested per API call.
    PAGE_SIZE = 100
    # Seconds to wait on a single request before giving up.
    REQUEST_TIMEOUT = 30
    # Columns kept from each page, in output order.
    COLUMNS = ['title', 'created_utc', 'selftext', 'subreddit', 'media_only', 'author', 'permalink']
    # Columns that may be absent from a page without it being an error.
    OPTIONAL_COLUMNS = frozenset(['media_only'])

    def __init__(self, subreddit, n_iter, epoch_right_now):
        '''
        I recommend n_iter to be low, no higher than 10

        subreddit       = name of subreddit in a string
        n_iter          = how many times you want to scrape 100 reddit posts per session
        epoch_right_now = epoch timestamp to start from; scraping moves backwards in time

        Creates ./data/reddit/<subreddit>/ if it does not exist yet.
        Raises OSError if the folder cannot be created.
        '''
        self.subreddit = subreddit
        self.n_iter = n_iter
        self.epoch_start = epoch_right_now
        self.epoch_earliest = 0

        # exist_ok=True already covers the "folder is there" case, so any
        # OSError raised here is a real problem and is allowed to propagate.
        os.makedirs(f'./data/reddit/{self.subreddit}', exist_ok=True)

    def _fetch_page(self, subreddit, before):
        '''
        Request one page of submissions created before `before`.

        Returns the list of post dicts found under the response's 'data' key
        (an empty list when the key is missing or there are no more posts).
        Raises requests.HTTPError on a non-2xx response.
        '''
        res = requests.get(
            self.BASE_URL,
            params={
                # specify subreddit
                'subreddit': subreddit,
                # specify number of posts to pull
                'size': self.PAGE_SIZE,
                # NOTE(review): the purpose of 'lang' was unknown to the original
                # author; it is kept so the request is unchanged -- confirm it is needed.
                'lang': True,
                # pull everything from this epoch backward
                'before': before,
            },
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail loudly on rate limiting / server errors instead of trying to parse them.
        res.raise_for_status()
        return res.json().get('data', [])

    def get_comments(self, subreddit, n_iter, epoch_right_now): # subreddit name and number of times function should run
        '''
        Fetch up to n_iter pages of posts from `subreddit`, newest first,
        starting just before epoch_right_now.

        Updates self.epoch_earliest to the smallest created_utc seen.
        Stops early when the API returns an empty page.

        Returns one DataFrame holding every fetched post. Its 'index' column
        is the position of the post within its page. The frame is empty when
        nothing was fetched.
        Raises KeyError if a page lacks one of the required columns.
        '''
        # Adapted from Tim Book's Lesson Example
        frames = []
        current_time = epoch_right_now

        for _ in range(n_iter):
            posts = self._fetch_page(subreddit, current_time)
            if not posts:
                # Nothing older is available; further requests would be wasted.
                break

            page = pd.DataFrame(posts)

            # 'media_only' is not always present; every other column is required.
            keep = [col for col in self.COLUMNS
                    if col in page.columns or col not in self.OPTIONAL_COLUMNS]
            frames.append(page[keep])

            # The next request continues from the oldest post on this page.
            current_time = int(page['created_utc'].min())
            self.epoch_earliest = current_time

        if not frames:
            required = [col for col in self.COLUMNS if col not in self.OPTIONAL_COLUMNS]
            return pd.DataFrame(columns=['index'] + required)

        # return one dataframe for all requests
        return pd.concat(frames).reset_index()

    def reddit_getter(self, sessions, wait_seconds=5):
        '''
        Run `sessions` scraping sessions back to back, each making n_iter
        requests of 100 posts, and write each session to its own CSV.

        Up to sessions * n_iter * 100 posts are requested in total.

        Eg: n_iter = 10, sessions = 1
        Total Scraped Posts = 10 * 100 * 1 = 1000

        n_iter = 10, sessions = 100
        Total Scraped Posts = 10 * 100 * 100 = 100000

        Use this function to scrape continuously: it waits wait_seconds
        (default 5) before every session so the API is not overloaded.
        Scraping stops early if a session comes back empty.

        Returns a summary string with the number of posts actually collected.
        '''
        requested = sessions * self.n_iter * self.PAGE_SIZE
        collected = 0

        for _ in range(sessions):
            print('earliest', self.epoch_earliest, 'starting', self.epoch_start)
            time.sleep(wait_seconds)

            # scrape reddit based on the parameters given when instantiating the class
            scraped = self.get_comments(subreddit=self.subreddit, n_iter=self.n_iter, epoch_right_now=self.epoch_start)

            if scraped.empty:
                # No older posts exist; do not write an empty file or keep polling.
                print(f'No more posts available from subreddit {self.subreddit}, stopping early')
                break

            # after posts have been scraped, store the session in a single csv
            scraped.to_csv(f'./data/reddit/{self.subreddit}/comments_epoch_{self.epoch_earliest}-{self.epoch_start}.csv', index=False)

            # the next session starts from the oldest post seen and goes backwards again
            self.epoch_start = self.epoch_earliest
            collected += len(scraped)

            # print out log for user to monitor amount collected
            print(f'Scraped {collected} of {requested} requested comments from subreddit {self.subreddit}')
        return f'Scraping complete, collected {collected} of reddit comments from subreddit {self.subreddit}'
        

In [25]:
# Scraper for r/askengineers: 10 requests per session, paging backwards
# from epoch 1503598550.
rs = reddit_scraper(
    subreddit='askengineers',
    n_iter=10,
    epoch_right_now=1503598550,
)

In [26]:
#test = rs.get_comments(subreddit='askengineers', n_iter=rs.n_iter, epoch_right_now=rs.epoch_start)

In [27]:
#test.to_csv(f'./data/reddit/comments_{rs.subreddit}_{datetime.datetime.now()}_epoch_{rs.epoch_earliest}-{rs.epoch_start}.csv')

In [28]:
# Run 50 scraping sessions; each one writes its own CSV and the next
# session continues from the oldest post of the previous one.
# NOTE(review): the saved output below reports 8000 requested comments
# (sessions=8 at n_iter=10), so it was produced by a different call than this one.
rs.reddit_getter(sessions=50)

earliest 0 starting 1526260045
Scraped 1000 of 8000 requested comments from subreddit askengineers
earliest 1523485563 starting 1523485563
Scraped 2000 of 8000 requested comments from subreddit askengineers
earliest 1520915593 starting 1520915593
Scraped 3000 of 8000 requested comments from subreddit askengineers
earliest 1518096253 starting 1518096253
Scraped 4000 of 8000 requested comments from subreddit askengineers
earliest 1514889337 starting 1514889337
Scraped 5000 of 8000 requested comments from subreddit askengineers
earliest 1510759793 starting 1510759793
Scraped 6000 of 8000 requested comments from subreddit askengineers
earliest 1507430977 starting 1507430977
Scraped 7000 of 8000 requested comments from subreddit askengineers
earliest 1503598550 starting 1503598550
Scraped 8000 of 8000 requested comments from subreddit askengineers


'Scraping complete, collected 8000 of reddit comments from subreddit askengineers'

In [None]:
# Load one previously saved scrape, using the first CSV column as the index.
# NOTE(review): reddit_getter writes files named
# comments_epoch_<earliest>-<start>.csv; this timestamped filename does not
# match that pattern, so it presumably comes from an earlier version of the
# scraper -- confirm the file still exists before relying on this cell.
uwu = pd.read_csv('./data/reddit/askengineers/comments_2020-10-01 18:19:16.028963_epoch_1600265449-1601587311.csv', index_col=0)

In [None]:
# Gap, in epoch seconds, between the current start point and the earliest post seen.
# reddit_getter sets epoch_start = epoch_earliest at the end of every completed
# session, so this is 0 immediately after a successful reddit_getter run.
rs.epoch_start - rs.epoch_earliest

In [None]:
# Epoch the next reddit_getter session will page backwards from.
rs.epoch_start

In [None]:
# Smallest created_utc seen by the most recent get_comments call
# (still 0 if no page has been fetched yet).
rs.epoch_earliest