# Scraping Reddit data

Using [Pushshift's](https://github.com/pushshift/api) API, more than 2000 posts created before 1 September 2021 were scraped from each of the following subreddits:

- r/languagelearning
- r/linguistics

In [69]:
#Imports

import requests
import pandas as pd
import numpy as np
import time

### Helper Functions

In [62]:
def scrape_subreddit_posts(subreddit, size, before):
    '''
    Fetch one page of submissions from the Pushshift submission-search endpoint.

    Parameters
    ----------
    subreddit : subreddit name sent as the 'subreddit' query parameter
    size : number of posts requested (size <= 100)
    before : epoch time; sent as the 'before' query parameter so that only
        posts created before it are requested

    Returns
    -------
    DataFrame with the columns 'subreddit', 'title', 'selftext',
    'created_utc', 'author' and 'num_comments', one row per returned post.
    When the API returns no posts the DataFrame is empty but still has
    these columns.

    Raises
    ------
    requests.HTTPError if the API answers with an error status code.
    requests.Timeout if the API does not respond within the timeout.
    KeyError if the JSON payload has no 'data' entry.
    '''

    columns = ['subreddit', 'title', 'selftext', 'created_utc', 'author', 'num_comments']

    url = 'https://api.pushshift.io/reddit/search/submission/'
    params = {
        'subreddit': subreddit,
        'size': size,
        'before': before,
    }

    # Without a timeout a stalled connection would hang the scrape forever.
    res = requests.get(url, params=params, timeout=30)
    # Fail loudly on rate limiting / server errors instead of trying to parse
    # an error page as JSON.
    res.raise_for_status()

    # Passing columns= (rather than indexing afterwards) keeps the expected
    # columns even when the page is empty, so callers can check df.empty.
    return pd.DataFrame(res.json()['data'], columns=columns)

### Data Collection

In [93]:
# Collect posts from r/languagelearning created before 1 Sep 2021, paging
# backwards 100 posts at a time until at least 2000 of them have usable self text.
# Removed posts ('[removed]') and title-only posts (empty selftext) are kept in
# the scraped data; they just do not count towards the 2000 target.

languagelearning_posts = pd.DataFrame()
languagelearning_pages = []  # one DataFrame per API page; concatenated once at the end
num_posts_target = 2000
num_posts = 0
before = 1630454400 # epoch time 1 Sep 2021 12:00 AM (UTC)
url = 'https://api.pushshift.io/reddit/search/submission/'  # not used by the loop; the helper builds its own URL
while num_posts < num_posts_target:
    df = scrape_subreddit_posts('languagelearning', 100, before)
    if df.empty:
        # No older posts left: stop instead of looping forever with before = NaN.
        break
    time.sleep(20)  # pause between requests
    languagelearning_pages.append(df)
    num_removed_posts = len(df[df['selftext'] == '[removed]'])
    num_only_title_posts = len(df[df['selftext'] == ''])
    # The next page starts just before the oldest post seen on this one.
    before = df['created_utc'].min()
    print(before, num_posts)
    num_posts += len(df)-num_removed_posts-num_only_title_posts

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
if languagelearning_pages:
    languagelearning_posts = pd.concat(languagelearning_pages, ignore_index=True)

1630314389 0
1630163899 52
1629974923 108
1629807499 190
1629671168 246
1629518211 295
1629376965 345
1629223016 397
1629094698 457
1628966529 500
1628803197 548
1628671257 614
1628531549 652
1628422247 701
1628269201 749
1628134542 793
1628009868 842
1627907759 894
1627766004 946
1627663668 1001
1627554437 1047
1627419486 1093
1627291267 1152
1627150703 1200
1627024031 1252
1626895296 1303
1626778685 1368
1626647039 1437
1626509321 1501
1626345303 1562
1626203936 1616
1626083798 1691
1625942061 1757
1625814793 1822
1625682893 1890
1625535612 1961


In [97]:
# Write the scraped r/languagelearning posts to disk as CSV (row index omitted)

languagelearning_posts.to_csv(path_or_buf='../Data/languagelearning.csv', index=False)

In [99]:
# Collect posts from r/linguistics created before 1 Sep 2021, paging
# backwards 100 posts at a time until at least 2000 of them have usable self text.
# Removed posts ('[removed]') and title-only posts (empty selftext) are kept in
# the scraped data; they just do not count towards the 2000 target.

linguistics_posts = pd.DataFrame()
linguistics_pages = []  # one DataFrame per API page; concatenated once at the end
num_posts_target = 2000
num_posts = 0
before = 1630454400 # epoch time 1 Sep 2021 12:00 AM (UTC)
url = 'https://api.pushshift.io/reddit/search/submission/'  # not used by the loop; the helper builds its own URL
while num_posts < num_posts_target:
    df = scrape_subreddit_posts('linguistics', 100, before)
    if df.empty:
        # No older posts left: stop instead of looping forever with before = NaN.
        break
    time.sleep(20)  # pause between requests
    linguistics_pages.append(df)
    num_removed_posts = len(df[df['selftext'] == '[removed]'])
    num_only_title_posts = len(df[df['selftext'] == ''])
    # The next page starts just before the oldest post seen on this one.
    before = df['created_utc'].min()
    print(before, num_posts)
    num_posts += len(df)-num_removed_posts-num_only_title_posts

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
if linguistics_pages:
    linguistics_posts = pd.concat(linguistics_pages, ignore_index=True)

1629997573 0
1629505503 75
1629071769 146
1628615216 219
1628176613 299
1627764834 371
1627321642 452
1626872310 523
1626394923 597
1625943876 668
1625564249 737
1625176336 810
1624768602 879
1624316732 948
1623918746 1029
1623570350 1087
1623240539 1130
1622901231 1199
1622558199 1265
1622146948 1337
1621774828 1415
1621383273 1475
1620938385 1546
1620616384 1617
1620247251 1678
1619817017 1750
1619565511 1814
1619207962 1881
1618933648 1947


In [104]:
# Write the scraped r/linguistics posts to disk as CSV (row index omitted)

linguistics_posts.to_csv(path_or_buf='../Data/linguistics.csv', index=False)