## Webscraping via API

In [None]:
import json
import requests
from tqdm import tqdm
from nltk import defaultdict
from time import sleep


### 1. SUBMISSIONS

Use Pushshift's API to retrieve at least 1000 submissions from each of r/crossfit and r/bjj.

#### a. Crossfit Submissions

In [15]:
# Scrape settings for the r/crossfit submissions run.

# Subreddit to query and the JSON file the results are written to.
SUB = "crossfit"
SINK = "crossfit_reddit-sub-data.json"

# Day offsets walked by the scraper: one request per INC-day step, starting
# START days back and continuing up to and including STOP days back
# (so 0..120 in steps of 1 means 121 requests).
START = 0
STOP = 120
INC = 1

# Sent as the `size` query parameter: how many submissions to ask for per request.
NUM = 1000



In [16]:
def main():
    """Scrape submissions from Pushshift window by window and save them as JSON.

    Reads the module-level settings SUB, START, STOP, INC and NUM, prints the
    size of every batch and the final total, then writes the combined list of
    submissions to SINK. If the API keeps failing, the scrape stops early and
    whatever was collected so far is still written.
    """
    max_retries = 10    # consecutive failures tolerated for a single window
    retry_delay = 10    # seconds to wait before retrying a failed request
    timeout = 30        # seconds before a stalled HTTP request is abandoned

    # define an iterator using the Pushshift API for the scrape
    def yield_request(sub, start, stop, inc, num):
        """Yield one decoded JSON response per window, from start to stop inclusive."""
        failures = 0
        while start <= stop:
            try:
                r = requests.get(f'https://api.pushshift.io/reddit/search/submission/?subreddit={sub}&size={num}&before={start}d&after={start+inc}d', timeout=timeout)
                r.raise_for_status()
                payload = r.json()
                # an error payload without 'data' is treated like any other bad response
                if not isinstance(payload, dict) or 'data' not in payload:
                    raise ValueError("response has no 'data' field")
            except (requests.exceptions.RequestException, ValueError) as err:
                # RequestException covers network/HTTP failures; ValueError covers
                # undecodable JSON (json.JSONDecodeError is a ValueError subclass).
                failures += 1
                if failures > max_retries:
                    print(f"Giving up at offset {start}d after {max_retries} retries: {err}")
                    return
                sleep(retry_delay)
                continue
            failures = 0
            yield payload
            start += inc

    # collect the submissions
    data = []
    for scrape in tqdm(yield_request(SUB, START, STOP, INC, NUM)):
        print(len(scrape['data']))
        data.extend(scrape['data'])
    print(f"{len(data)} subs collected")

    # write to disk
    with open(SINK, "w", encoding="utf-8") as sink:
        json.dump(data, sink)
    print(f"Subs written to {SINK}")


if __name__ == "__main__":
    main()

1it [00:01,  1.55s/it]

26


2it [00:03,  1.83s/it]

27


3it [00:05,  1.95s/it]

45


4it [00:07,  1.94s/it]

21


5it [00:09,  1.91s/it]

33


6it [00:11,  1.96s/it]

32


7it [00:13,  1.97s/it]

32


8it [00:15,  2.01s/it]

28


9it [00:16,  1.82s/it]

22


10it [00:18,  1.76s/it]

38


11it [00:20,  1.66s/it]

25


12it [00:22,  1.76s/it]

24


13it [00:23,  1.71s/it]

38


14it [00:25,  1.84s/it]

27


15it [00:27,  1.76s/it]

29


16it [00:28,  1.66s/it]

23


17it [00:31,  1.93s/it]

39


18it [00:33,  1.97s/it]

31


19it [00:35,  2.09s/it]

29


20it [00:37,  2.10s/it]

24


21it [00:39,  2.08s/it]

23


22it [00:41,  2.04s/it]

18


23it [00:43,  2.05s/it]

21


24it [00:45,  1.98s/it]

27


25it [00:47,  1.99s/it]

17


26it [00:49,  2.02s/it]

24


27it [00:51,  1.88s/it]

22


28it [00:52,  1.73s/it]

25


29it [00:54,  1.77s/it]

14


30it [00:56,  1.86s/it]

23


31it [00:59,  2.01s/it]

27


32it [01:01,  2.08s/it]

26


33it [01:03,  2.08s/it]

25


34it [01:05,  2.03s/it]

27


35it [01:07,  2.07s/it]

20


36it [01:09,  2.09s/it]

16


37it [01:11,  2.14s/it]

25


38it [01:13,  2.13s/it]

28


39it [01:16,  2.12s/it]

25


40it [01:17,  2.02s/it]

16


41it [01:20,  2.07s/it]

17


42it [01:22,  2.07s/it]

20


43it [01:24,  2.06s/it]

21


44it [01:25,  1.91s/it]

24


45it [01:27,  1.89s/it]

15


46it [01:29,  1.82s/it]

14


47it [01:31,  1.94s/it]

16


48it [01:33,  2.00s/it]

27


49it [01:35,  1.82s/it]

18


50it [01:37,  1.93s/it]

24


51it [01:39,  1.97s/it]

22


52it [01:40,  1.87s/it]

30


53it [01:43,  1.95s/it]

24


54it [01:45,  1.98s/it]

21


55it [01:46,  1.82s/it]

24


56it [01:48,  1.89s/it]

12


57it [01:50,  1.99s/it]

22


58it [01:52,  2.05s/it]

23


59it [01:55,  2.14s/it]

18


60it [01:56,  1.99s/it]

16


61it [01:58,  2.01s/it]

14


62it [02:01,  2.06s/it]

21


63it [02:02,  1.86s/it]

21


64it [02:04,  1.77s/it]

21


65it [02:06,  1.92s/it]

15


66it [02:08,  1.98s/it]

19


67it [02:10,  1.94s/it]

11


68it [02:12,  2.00s/it]

17


69it [02:14,  1.95s/it]

10


70it [02:16,  1.92s/it]

13


71it [02:18,  1.93s/it]

13


72it [02:19,  1.77s/it]

22


73it [02:21,  1.80s/it]

13


74it [02:23,  1.91s/it]

20


75it [02:25,  1.95s/it]

19


76it [02:27,  2.04s/it]

16


77it [02:29,  2.04s/it]

17


78it [02:31,  1.83s/it]

9


79it [02:32,  1.68s/it]

21


80it [02:34,  1.73s/it]

20


81it [02:36,  1.79s/it]

13


82it [02:38,  1.79s/it]

20


83it [02:40,  1.89s/it]

23


84it [02:42,  1.96s/it]

16


85it [02:44,  1.94s/it]

17


86it [02:46,  1.90s/it]

13


87it [02:48,  2.00s/it]

18


88it [02:50,  2.05s/it]

17


89it [02:52,  2.05s/it]

25


90it [02:54,  2.02s/it]

24


91it [02:56,  2.01s/it]

20


92it [02:58,  2.04s/it]

16


93it [03:00,  1.97s/it]

17


94it [03:02,  1.94s/it]

15


95it [03:04,  2.00s/it]

16


96it [03:06,  2.05s/it]

22


97it [03:08,  2.04s/it]

19


98it [03:10,  2.08s/it]

17


99it [03:12,  1.98s/it]

11


100it [03:14,  2.00s/it]

14


101it [03:16,  1.95s/it]

11


102it [03:17,  1.74s/it]

18


103it [03:19,  1.68s/it]

33


104it [03:21,  1.85s/it]

22


105it [03:22,  1.64s/it]

16


106it [03:24,  1.82s/it]

15


107it [03:26,  1.70s/it]

14


108it [03:28,  1.84s/it]

15


109it [03:30,  1.93s/it]

18


110it [03:32,  2.03s/it]

24


111it [03:35,  2.09s/it]

16


112it [03:36,  2.02s/it]

11


113it [03:39,  2.12s/it]

23


114it [03:41,  2.10s/it]

16


115it [03:43,  2.05s/it]

21


116it [03:45,  2.08s/it]

21


117it [03:47,  2.10s/it]

21


118it [03:49,  1.93s/it]

29


119it [03:51,  1.98s/it]

20


120it [03:53,  1.99s/it]

11


121it [03:54,  1.94s/it]

9
2519 subs collected





Subs written to crossfit_reddit-sub-data.json


-------

#### b. BJJ Submissions

In [17]:
# Scrape settings for the r/bjj submissions run.

# Subreddit to query and the JSON file the results are written to.
SUB = "bjj"
SINK = "bjj_reddit-sub-data.json"

# Day offsets walked by the scraper: one request per INC-day step, starting
# START days back and continuing up to and including STOP days back
# (so 0..120 in steps of 1 means 121 requests).
START = 0
STOP = 120
INC = 1

# Sent as the `size` query parameter: how many submissions to ask for per request.
# NOTE(review): the recorded run returned exactly 25 items for almost every
# day, which suggests days are being cut off at this limit - confirm whether
# a larger value is wanted.
NUM = 25



In [18]:
def main():
    """Scrape submissions from Pushshift window by window and save them as JSON.

    Reads the module-level settings SUB, START, STOP, INC and NUM, prints the
    size of every batch and the final total, then writes the combined list of
    submissions to SINK. If the API keeps failing, the scrape stops early and
    whatever was collected so far is still written.
    """
    max_retries = 10    # consecutive failures tolerated for a single window
    retry_delay = 10    # seconds to wait before retrying a failed request
    timeout = 30        # seconds before a stalled HTTP request is abandoned

    # define an iterator using the Pushshift API for the scrape
    def yield_request(sub, start, stop, inc, num):
        """Yield one decoded JSON response per window, from start to stop inclusive."""
        failures = 0
        while start <= stop:
            try:
                r = requests.get(f'https://api.pushshift.io/reddit/search/submission/?subreddit={sub}&size={num}&before={start}d&after={start+inc}d', timeout=timeout)
                r.raise_for_status()
                payload = r.json()
                # an error payload without 'data' is treated like any other bad response
                if not isinstance(payload, dict) or 'data' not in payload:
                    raise ValueError("response has no 'data' field")
            except (requests.exceptions.RequestException, ValueError) as err:
                # RequestException covers network/HTTP failures; ValueError covers
                # undecodable JSON (json.JSONDecodeError is a ValueError subclass).
                failures += 1
                if failures > max_retries:
                    print(f"Giving up at offset {start}d after {max_retries} retries: {err}")
                    return
                sleep(retry_delay)
                continue
            failures = 0
            yield payload
            start += inc

    # collect the submissions
    data = []
    for scrape in tqdm(yield_request(SUB, START, STOP, INC, NUM)):
        print(len(scrape['data']))
        data.extend(scrape['data'])
    print(f"{len(data)} subs collected")

    # write to disk
    with open(SINK, "w", encoding="utf-8") as sink:
        json.dump(data, sink)
    print(f"Subs written to {SINK}")


if __name__ == "__main__":
    main()

1it [00:02,  2.04s/it]

25


2it [00:04,  2.12s/it]

25


3it [00:05,  1.93s/it]

25


4it [00:07,  1.98s/it]

25


5it [00:10,  2.04s/it]

25


6it [00:12,  2.06s/it]

25


7it [00:13,  1.80s/it]

25


8it [00:15,  1.88s/it]

25


9it [00:17,  1.93s/it]

25


10it [00:19,  1.96s/it]

25


11it [00:21,  2.00s/it]

25


12it [00:22,  1.76s/it]

25


13it [00:25,  1.91s/it]

25


14it [00:27,  1.98s/it]

25


15it [00:29,  2.00s/it]

25


16it [00:31,  2.08s/it]

25


17it [00:33,  2.13s/it]

25


18it [00:35,  2.02s/it]

25


19it [00:37,  2.05s/it]

25


20it [00:40,  2.18s/it]

25


21it [00:42,  2.17s/it]

25


22it [00:44,  2.13s/it]

25


23it [00:46,  2.14s/it]

25


24it [00:48,  1.93s/it]

25


25it [00:50,  1.99s/it]

25


26it [00:51,  1.80s/it]

25


27it [00:53,  1.93s/it]

24


28it [00:55,  1.81s/it]

25


29it [00:57,  1.89s/it]

25


30it [00:59,  1.93s/it]

25


31it [01:01,  1.94s/it]

25


32it [01:03,  2.00s/it]

25


33it [01:05,  2.05s/it]

25


34it [01:07,  2.05s/it]

25


35it [01:09,  2.00s/it]

25


36it [01:11,  2.03s/it]

25


37it [01:13,  2.07s/it]

25


38it [01:15,  2.06s/it]

25


39it [01:18,  2.12s/it]

25


40it [01:20,  2.16s/it]

25


41it [01:21,  2.00s/it]

25


42it [01:23,  1.98s/it]

25


43it [01:25,  1.82s/it]

25


44it [01:27,  1.92s/it]

25


45it [01:29,  1.80s/it]

25


46it [01:31,  1.91s/it]

25


47it [01:33,  1.89s/it]

25


48it [01:35,  1.97s/it]

25


49it [01:37,  2.02s/it]

25


50it [01:39,  2.08s/it]

25


51it [01:41,  2.08s/it]

25


52it [01:43,  1.92s/it]

25


53it [01:44,  1.80s/it]

25


54it [01:46,  1.88s/it]

25


55it [01:49,  1.99s/it]

25


56it [01:50,  1.95s/it]

25


57it [01:52,  1.98s/it]

25


58it [01:54,  1.84s/it]

25


59it [01:56,  2.00s/it]

25


60it [01:58,  2.04s/it]

25


61it [02:01,  2.08s/it]

25


62it [02:03,  2.16s/it]

25


63it [02:05,  2.16s/it]

25


64it [02:07,  2.09s/it]

25


65it [02:09,  1.90s/it]

25


66it [02:11,  2.06s/it]

25


67it [02:13,  2.15s/it]

25


68it [02:15,  1.97s/it]

25


69it [02:17,  1.99s/it]

25


70it [02:19,  2.01s/it]

25


71it [02:21,  2.05s/it]

25


72it [02:23,  1.93s/it]

25


73it [02:25,  2.01s/it]

25


74it [02:27,  1.88s/it]

25


75it [02:29,  1.96s/it]

25


76it [02:30,  1.83s/it]

25


77it [02:32,  1.96s/it]

25


78it [02:35,  2.05s/it]

25


79it [02:37,  2.02s/it]

25


80it [02:39,  2.06s/it]

25


81it [02:41,  2.15s/it]

25


82it [02:43,  2.14s/it]

25


83it [02:45,  2.15s/it]

25


84it [02:48,  2.18s/it]

25


85it [02:50,  2.21s/it]

25


86it [02:52,  2.22s/it]

25


87it [02:54,  2.21s/it]

25


88it [02:56,  2.05s/it]

25


89it [02:58,  2.05s/it]

25


90it [03:00,  2.08s/it]

25


91it [03:02,  1.92s/it]

25


92it [03:04,  1.99s/it]

25


93it [03:06,  2.04s/it]

25


94it [03:08,  1.98s/it]

25


95it [03:10,  2.12s/it]

25


96it [03:13,  2.13s/it]

25


97it [03:15,  2.07s/it]

25


98it [03:17,  2.16s/it]

25


99it [03:19,  2.19s/it]

25


100it [03:21,  2.12s/it]

25


101it [03:23,  2.16s/it]

25


102it [03:26,  2.19s/it]

25


103it [03:28,  2.17s/it]

25


104it [03:29,  1.98s/it]

25


105it [03:31,  2.03s/it]

25


106it [03:34,  2.10s/it]

25


107it [03:36,  2.21s/it]

25


108it [03:38,  2.19s/it]

25


109it [03:40,  1.99s/it]

25


110it [03:42,  2.04s/it]

25


111it [03:43,  1.82s/it]

25


112it [03:46,  1.99s/it]

25


113it [03:48,  2.04s/it]

25


114it [03:50,  2.10s/it]

25


115it [03:52,  2.15s/it]

25


116it [03:54,  2.15s/it]

25


117it [03:57,  2.15s/it]

25


118it [03:59,  2.12s/it]

25


119it [04:01,  2.10s/it]

25


120it [04:02,  1.93s/it]

25


121it [04:04,  2.02s/it]

25
3024 subs collected





Subs written to bjj_reddit-sub-data.json


### 2. COMMENTS

#### a. Crossfit comments

In [2]:
# Scrape settings for the r/crossfit comments run.

# Subreddit to query and the JSON file the results are written to.
SUB = "crossfit"
SINK = "crossfit_reddit-comment-data.json"

# Day offsets walked by the scraper: one request per INC-day step, starting
# START days back and continuing up to and including STOP days back
# (so 0..14 in steps of 1 means 15 requests).
START = 0
STOP = 14
INC = 1

# Sent as the `size` query parameter: how many comments to ask for per request.
NUM = 1000

In [3]:
def main():
    """Scrape comments from Pushshift window by window and save them as JSON.

    Reads the module-level settings SUB, START, STOP, INC and NUM, prints the
    size of every batch and the final total, then writes the combined list of
    comments to SINK. If the API keeps failing, the scrape stops early and
    whatever was collected so far is still written.
    """
    max_retries = 10    # consecutive failures tolerated for a single window
    retry_delay = 10    # seconds to wait before retrying a failed request
    timeout = 30        # seconds before a stalled HTTP request is abandoned

    # define an iterator using the Pushshift API for the scrape
    def yield_request(sub, start, stop, inc, num):
        """Yield one decoded JSON response per window, from start to stop inclusive."""
        failures = 0
        while start <= stop:
            try:
                r = requests.get(f'https://api.pushshift.io/reddit/search/comment/?subreddit={sub}&size={num}&before={start}d&after={start+inc}d', timeout=timeout)
                r.raise_for_status()
                payload = r.json()
                # an error payload without 'data' is treated like any other bad response
                if not isinstance(payload, dict) or 'data' not in payload:
                    raise ValueError("response has no 'data' field")
            except (requests.exceptions.RequestException, ValueError) as err:
                # RequestException covers network/HTTP failures; ValueError covers
                # undecodable JSON (json.JSONDecodeError is a ValueError subclass).
                failures += 1
                if failures > max_retries:
                    print(f"Giving up at offset {start}d after {max_retries} retries: {err}")
                    return
                sleep(retry_delay)
                continue
            failures = 0
            yield payload
            start += inc

    # collect the comments
    data = []
    for scrape in tqdm(yield_request(SUB, START, STOP, INC, NUM)):
        print(len(scrape['data']))
        data.extend(scrape['data'])
    print(f"{len(data)} comments collected")

    # write to disk
    with open(SINK, "w", encoding="utf-8") as sink:
        json.dump(data, sink)
    print(f"Comments written to {SINK}")


if __name__ == "__main__":
    main()

1it [00:02,  2.92s/it]

849


2it [00:07,  3.75s/it]

995


3it [00:10,  3.52s/it]

820


4it [00:14,  3.70s/it]

889


5it [00:17,  3.32s/it]

716


6it [00:20,  3.39s/it]

692


7it [00:24,  3.42s/it]

620


8it [00:27,  3.46s/it]

799


9it [00:31,  3.51s/it]

991


10it [00:34,  3.49s/it]

910


11it [00:39,  3.78s/it]

579


12it [00:42,  3.68s/it]

759


13it [00:45,  3.33s/it]

459


14it [00:48,  3.36s/it]

698


15it [00:52,  3.47s/it]

830
11606 comments collected





Comments written to crossfit_reddit-comment-data.json


#### b. BJJ Comments

In [6]:
import json
import requests
from tqdm import tqdm
from nltk import defaultdict
from time import sleep

# Scrape settings for the r/bjj comments run.

# Subreddit to query and the JSON file the results are written to.
SUB = "bjj"
SINK = "bjj_reddit-comment-data.json"

# Day offsets walked by the scraper: one request per INC-day step, starting
# START days back and continuing up to and including STOP days back
# (so 0..14 in steps of 1 means 15 requests).
START = 0
STOP = 14
INC = 1

# Sent as the `size` query parameter: how many comments to ask for per request.
# NOTE(review): the recorded run returned exactly 500 items for almost every
# day, which suggests days are being cut off at this limit - confirm whether
# a larger value is wanted.
NUM = 500



In [7]:
def main():
    """Scrape comments from Pushshift window by window and save them as JSON.

    Reads the module-level settings SUB, START, STOP, INC and NUM, prints the
    size of every batch and the final total, then writes the combined list of
    comments to SINK. If the API keeps failing, the scrape stops early and
    whatever was collected so far is still written.
    """
    max_retries = 10    # consecutive failures tolerated for a single window
    retry_delay = 10    # seconds to wait before retrying a failed request
    timeout = 30        # seconds before a stalled HTTP request is abandoned

    # define an iterator using the Pushshift API for the scrape
    def yield_request(sub, start, stop, inc, num):
        """Yield one decoded JSON response per window, from start to stop inclusive."""
        failures = 0
        while start <= stop:
            try:
                r = requests.get(f'https://api.pushshift.io/reddit/search/comment/?subreddit={sub}&size={num}&before={start}d&after={start+inc}d', timeout=timeout)
                r.raise_for_status()
                payload = r.json()
                # an error payload without 'data' is treated like any other bad response
                if not isinstance(payload, dict) or 'data' not in payload:
                    raise ValueError("response has no 'data' field")
            except (requests.exceptions.RequestException, ValueError) as err:
                # RequestException covers network/HTTP failures; ValueError covers
                # undecodable JSON (json.JSONDecodeError is a ValueError subclass).
                failures += 1
                if failures > max_retries:
                    print(f"Giving up at offset {start}d after {max_retries} retries: {err}")
                    return
                sleep(retry_delay)
                continue
            failures = 0
            yield payload
            start += inc

    # collect the comments
    data = []
    for scrape in tqdm(yield_request(SUB, START, STOP, INC, NUM)):
        print(len(scrape['data']))
        data.extend(scrape['data'])
    print(f"{len(data)} comments collected")

    # write to disk
    with open(SINK, "w", encoding="utf-8") as sink:
        json.dump(data, sink)
    print(f"Comments written to {SINK}")


if __name__ == "__main__":
    main()

1it [00:02,  2.70s/it]

500


2it [00:06,  3.08s/it]

500


3it [00:08,  2.81s/it]

500


4it [00:11,  2.79s/it]

500


5it [00:14,  2.89s/it]

500


6it [00:17,  2.95s/it]

500


7it [00:19,  2.70s/it]

500


8it [00:22,  2.61s/it]

499


9it [00:24,  2.51s/it]

500


10it [00:26,  2.56s/it]

500


11it [00:29,  2.68s/it]

500


12it [00:33,  2.83s/it]

500


13it [00:36,  2.90s/it]

500


14it [00:39,  3.00s/it]

500


15it [00:42,  2.84s/it]

500
7499 comments collected





Comments written to bjj_reddit-comment-data.json
