In [1]:
#Imports

import requests
import pandas as pd
import numpy as np
import time

### Helper Functions

In [2]:
def scrape_subreddit_posts(subreddit, size, before):    

    '''
    returns a DataFrame with information about a given number (size <= 100) of posts 
    posted on subreddit before a certain time (before in epoch time)
    '''
    
    url = 'https://api.pushshift.io/reddit/search/submission/'
    params = {
        'subreddit' : subreddit,
        'size' : size,
        'before': before    
    }
    res = requests.get(url, params)
    df = pd.DataFrame(res.json()['data'])[['subreddit','title','selftext','created_utc', 'author','num_comments']]
    return df

### Data Collection

In [3]:
# Collecting most recent 2000 posts of the subreddit r/lanuguagelearning which haven't 
#been removed and were posted before 1 Sep 2021 and have some self text
#Removed posts and no selftext posts are also included in this scrape

languagelearning_posts = pd.DataFrame()
num_posts_target = 5000
num_posts = 0
before = 1625535612 # last post in scraping 1
url = 'https://api.pushshift.io/reddit/search/submission/'
while num_posts < num_posts_target:
    df = scrape_subreddit_posts('languagelearning', 100, before)
    time.sleep(20)
    languagelearning_posts = languagelearning_posts.append(df, ignore_index=True)
    num_removed_posts = len(df[df['selftext'] == '\[removed\]'])
    num_only_title_posts = len(df[df['selftext'] == ''])
    before = df['created_utc'].min()
    print(before, num_posts)
    num_posts += len(df)-num_removed_posts-num_only_title_posts

1625398787 0
1625241627 81
1625103150 159
1624969772 249
1624834501 327
1624724978 409
1624608985 490
1624463865 568
1624313874 648
1624190606 727
1624033605 812
1623926935 894
1623830615 944
1623672490 1002
1623509098 1087
1623359110 1164
1623216473 1240
1623066032 1316
1622910434 1390
1622762365 1467
1622626943 1546
1622485614 1619
1622324508 1697
1622196945 1775
1622064278 1849
1621900832 1923
1621772989 2000
1621633932 2065
1621518655 2151
1621367554 2222
1621255859 2300
1621101279 2376
1620951568 2449
1620819696 2525
1620703914 2600
1620597872 2676
1620481886 2748
1620305286 2828
1620174264 2909
1620044273 2982
1619884081 3065
1619729066 3143
1619578854 3225
1619452547 3303
1619338257 3377
1619202283 3457
1619064036 3535
1618937996 3609
1618810495 3691
1618681574 3772
1618516842 3855
1618411097 3940
1617916522 4021
1617808172 4099
1617659342 4182
1617548217 4259
1617395586 4335
1617242730 4421
1617128909 4498
1616990167 4574
1615985440 4646
1615843158 4727
1615741124 4809
16155960

In [4]:
#saving collected data

languagelearning_posts.to_csv('../Data/languagelearning2.csv', index=False)

In [5]:
# Collecting most recent 2000 posts of the subreddit r/linguistics which haven't 
#been removed and were posted before 1 Sep 2021 and have some self text
#Removed posts and no selftext posts are also included in this scrape

linguistics_posts = pd.DataFrame()
num_posts_target = 5000
num_posts = 0
before = 1618933648 # last post in scraping 1
url = 'https://api.pushshift.io/reddit/search/submission/'
while num_posts < num_posts_target:
    df = scrape_subreddit_posts('linguistics', 100, before)
    time.sleep(20)
    linguistics_posts = linguistics_posts.append(df, ignore_index=True)
    num_removed_posts = len(df[df['selftext'] == '[removed]'])
    num_only_title_posts = len(df[df['selftext'] == ''])
    before = df['created_utc'].min()
    print(before, num_posts)
    num_posts += len(df)-num_removed_posts-num_only_title_posts

1618596204 0
1617897238 67
1617579527 122
1617194912 184
1615978816 250
1615625575 318
1615267758 385
1614406753 445
1614026265 504
1613675067 560
1613230494 622
1612816166 691
1612188658 750
1611825118 800
1611413725 862
1611075438 929
1610744666 1000
1610469544 1069
1610120196 1134
1609785472 1203
1609407937 1281
1609017358 1356
1608625396 1436
1608244538 1504
1607893330 1565
1607547129 1636
1607204012 1714
1606862358 1790
1606634137 1865
1606314054 1941
1605966188 2020
1605639643 2092
1605268184 2163
1604952849 2223
1604349810 2310
1603846197 2378
1603467947 2461
1603019756 2534
1602570438 2608
1602140592 2680
1601754633 2752
1601375402 2824
1600860787 2906
1600333565 2975
1599821420 3059
1599356346 3129
1598889371 3199
1598424434 3277
1597801511 3349
1597327614 3429
1596796474 3507
1596299123 3584
1595824121 3663
1595382503 3736
1594940740 3802
1594550774 3856
1594107801 3922
1593630518 3997
1593201496 4063
1592805485 4132
1592418105 4196
1591953755 4267
1591516715 4333
1591030447 

In [6]:
# saving collected data

linguistics_posts.to_csv('../Data/linguistics2.csv', index=False)