## This notebook contains the pipeline used to scrape posts from Reddit. The custom `scrape` function takes a topic (the subreddit name) as its argument and can easily be applied to, or modified for, other subreddits.

In [1]:
import requests
import pandas as pd
import time
import random
import string
import numpy as np

In [2]:
# custom helper: random ASCII-letter string whose length is itself random
def get_random_string(min_len, max_len):
    """Return a random string made of upper- and lower-case ASCII letters.

    The length is chosen at random between ``min_len`` and ``max_len``
    (both inclusive); each character is then drawn separately from
    ``string.ascii_letters``.
    """
    n_chars = random.randint(min_len, max_len)
    letters = []
    for _ in range(n_chars):
        # one draw per character, mixing both cases
        letters.append(random.choice(string.ascii_letters))
    return ''.join(letters)

In [3]:
# quick check of the custom function: expect a string of 5 to 10 random ASCII letters
get_random_string(5,10)

'izIWCUp'

In [4]:
# custom function adapted from the Reddit API tutorial: a single pipeline that scrapes the
# listings, de-duplicates the posts and then writes them to csv
def scrape(topic):
    """Scrape posts from one subreddit and save them to a CSV file.

    For each of the 'hot', 'controversial', 'new' and 'top' listings of
    ``r/<topic>``, up to 40 pages are requested from the listing's ``.json``
    URL. The posts of all listings are pooled, de-duplicated on ``selftext``
    and written to ``./datasets/scrape_<topic>.csv``.

    Parameters
    ----------
    topic : str
        Subreddit name, e.g. ``'Netflix'``.

    Returns
    -------
    None
        Progress is printed; the result is written to disk. Nothing is
        written when no post could be collected.
    """
    import os  # local import: only needed to make sure the output folder exists

    options = ['hot', 'controversial', 'new', 'top']
    # one output file per topic, shared by all listings
    filename = 'scrape_' + topic + '.csv'
    output_dir = './datasets/'
    posts = []

    for option in options:

        url = 'https://www.reddit.com/r/' + topic + '/' + option + '.json'
        #filename = topic+'_'+option+'.csv'

        after = None

        for a in range(40):

            # vary user-agent name to prevent jamming by Reddit
            agent = get_random_string(5, 10)

            if after is None:
                current_url = url
            else:
                current_url = url + '?after=' + after
            print(current_url)

            try:
                # the timeout keeps one stalled connection from hanging the whole run
                res = requests.get(current_url, headers={'User-agent': agent}, timeout=30)
            except requests.exceptions.RequestException as err:
                # treat a network failure like a bad status: give up on this
                # listing but keep the posts collected so far
                print('Request error', err)
                break

            if res.status_code != 200:
                print('Status error', res.status_code)
                break

            current_dict = res.json()
            current_posts = [p['data'] for p in current_dict['data']['children']]
            posts.extend(current_posts)
            after = current_dict['data']['after']

            # generate a random sleep duration to look more 'natural'
            sleep_duration = random.randint(2, 10)
            print(a)
            time.sleep(sleep_duration)

            # Without an 'after' token the next request would fall back to the
            # first page of this listing and re-fetch posts already collected,
            # so stop paginating here.
            if after is None:
                break

    if not posts:
        # nothing was collected (e.g. the very first request failed): there is
        # no 'selftext' column to de-duplicate on and nothing to write
        print('no posts scraped for', topic)
        return

    # convert posts to dataframe
    df = pd.DataFrame(posts)
    # only the first post for each distinct selftext is kept, so posts with
    # identical (e.g. empty) text collapse into a single row
    df.drop_duplicates(subset=['selftext'], inplace=True)
    # create the output folder if needed so a long scrape is not lost at the last step
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_dir + filename, index=False)

    print('completed')

In [5]:
# subreddits to scrape; each name is handed to scrape() as its topic
reddits = [
    'DisneyPlus',
    'Netflix',
]

In [None]:
# run the scraping pipeline once for every subreddit listed in `reddits`
for subred in reddits:
    scrape(subred)

https://www.reddit.com/r/DisneyPlus/hot.json
0
https://www.reddit.com/r/DisneyPlus/hot.json?after=t3_m8d63f
1
https://www.reddit.com/r/DisneyPlus/hot.json?after=t3_m86xbb
2
https://www.reddit.com/r/DisneyPlus/hot.json?after=t3_m6f7rm
3
https://www.reddit.com/r/DisneyPlus/hot.json?after=t3_m532ym
4
https://www.reddit.com/r/DisneyPlus/hot.json?after=t3_m3v2d9
5
https://www.reddit.com/r/DisneyPlus/hot.json?after=t3_m1oo4o
6
https://www.reddit.com/r/DisneyPlus/hot.json?after=t3_m1ppum
7
https://www.reddit.com/r/DisneyPlus/hot.json?after=t3_m0rc36
8
https://www.reddit.com/r/DisneyPlus/hot.json?after=t3_lzjl9r
9
https://www.reddit.com/r/DisneyPlus/hot.json?after=t3_lykxg7
10
https://www.reddit.com/r/DisneyPlus/hot.json?after=t3_ly6d93
11
https://www.reddit.com/r/DisneyPlus/hot.json?after=t3_lxr5h2
12
https://www.reddit.com/r/DisneyPlus/hot.json?after=t3_lvzg67
13
https://www.reddit.com/r/DisneyPlus/hot.json?after=t3_lvw694
14
https://www.reddit.com/r/DisneyPlus/hot.json?after=t3_lusax4
15
ht