In [1]:
import os
import time
import json
import random
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from datetime import timezone

# Crawl Submissions from r/TodayIamHappy

In [58]:
# Upper bound of the crawl window: midnight on 2020-01-01, read as UTC.
end_date = datetime(year = 2020, month = 1, day = 1)
end_date_utc = end_date.replace(tzinfo = timezone.utc)
end_timestamp = datetime.timestamp(end_date_utc)
print(end_date, end_timestamp)

2020-01-01 00:00:00 1577836800.0


In [59]:
# Lower bound of the crawl window: midnight on 2019-02-17, read as UTC.
start_date = datetime(year = 2019, month = 2, day = 17)
start_date_utc = start_date.replace(tzinfo = timezone.utc)
start_timestamp = datetime.timestamp(start_date_utc)
print(start_date, start_timestamp)

2019-02-17 00:00:00 1550361600.0


In [60]:
# Fields a submission must contain to be kept by the crawl loop.
submission_cols = [
    'id',
    'created_utc',
    'title',
    'selftext',
    'num_comments',
    'subreddit',
    'subreddit_id',
]
# Fields recorded when present and stored as None otherwise.
submission_cols_opt = [
    'score',
    'author_fullname',
    'link_flair_text',
]

In [61]:
# Column-oriented accumulator: one independent, initially empty list per field.
submission_dict = {}
for col in submission_cols + submission_cols_opt:
    submission_dict[col] = []

In [62]:
def is_json(json_data):
    """Return True if ``json_data`` can be parsed by ``json.loads``, else False.

    Args:
        json_data: The text to check (``str``, ``bytes`` or ``bytearray``).

    Returns:
        bool: True when parsing succeeds. False when ``json.loads`` rejects the
        input, either because it is malformed (``ValueError``, which includes
        ``json.JSONDecodeError``) or because it is not a text type
        (``TypeError``, e.g. ``None``).
    """
    try:
        json.loads(json_data)
    except (TypeError, ValueError):
        # Only parse failures are treated as "not JSON". A bare ``except``
        # would also swallow KeyboardInterrupt/SystemExit, which makes a
        # long-running crawl impossible to stop cleanly.
        return False
    return True

In [63]:
# Crawl submissions backwards in time, one hour-long window per request, from
# end_timestamp down to start_timestamp. Every kept submission contributes one
# value to each list in submission_dict.
current_timestamp = end_timestamp
url = 'https://api.pushshift.io/reddit/search/submission/'
max_attempts = 3       # tries per window before the window is given up on
request_timeout = 30   # seconds; without a timeout a stalled connection hangs the crawl
while current_timestamp > start_timestamp:
    # Report progress once per crawled day (every 24th one-hour window).
    if (current_timestamp - end_timestamp) % 86400 == 0:
        date_1 = datetime.fromtimestamp(current_timestamp - 86400, timezone.utc)
        date_2 = datetime.fromtimestamp(current_timestamp, timezone.utc)
        print('Crawling from {} to {}...'.format(date_1, date_2))
    params = {
        'subreddit': 'TodayIamHappy',
        'sort': 'desc',
        'sort_type': 'created_utc',
        # The lower bound reaches one second further back than the hour.
        # NOTE(review): presumably 'after'/'before' are exclusive bounds, so
        # consecutive windows neither overlap nor leave a gap -- confirm.
        'after': str(int(current_timestamp - 3600 - 1)),
        'before': str(int(current_timestamp)),
        'size': '100'
    }

    # Retry transient failures (network errors, non-JSON error pages) instead of
    # silently dropping the whole hour of data.
    submissions = None
    for attempt in range(max_attempts):
        try:
            res = requests.get(url, params = params, timeout = request_timeout)
        except requests.exceptions.RequestException as err:
            print('Request failed ({}); attempt {}/{}'.format(err, attempt + 1, max_attempts))
        else:
            if is_json(res.text):
                payload = res.json()
                # A JSON error payload may lack 'data' (or carry null); treat
                # that as "no submissions" rather than raising KeyError.
                submissions = (payload.get('data') or []) if isinstance(payload, dict) else []
                break
            print('Non-JSON response (HTTP {}); attempt {}/{}'.format(res.status_code, attempt + 1, max_attempts))
        if attempt + 1 < max_attempts:
            time.sleep(2 ** attempt)   # 1s, 2s, ... back-off between attempts

    if submissions is None:
        # Still best-effort: move on, but make the gap visible in the output.
        print('Skipping window ending at {}: no usable response'.format(
            datetime.fromtimestamp(current_timestamp, timezone.utc)))
        submissions = []

    for submission in submissions:
        # Keep a submission only if every required field is present, so all
        # lists in submission_dict stay the same length.
        if not all(col in submission for col in submission_cols):
            continue

        for col in submission_cols:
            submission_dict[col].append(submission[col])

        # Optional fields are stored as None when missing.
        for col in submission_cols_opt:
            submission_dict[col].append(submission.get(col))

    current_timestamp = current_timestamp - 3600

Crawling from 2019-12-31 00:00:00+00:00 to 2020-01-01 00:00:00+00:00...
Crawling from 2019-12-30 00:00:00+00:00 to 2019-12-31 00:00:00+00:00...
Crawling from 2019-12-29 00:00:00+00:00 to 2019-12-30 00:00:00+00:00...
Crawling from 2019-12-28 00:00:00+00:00 to 2019-12-29 00:00:00+00:00...
Crawling from 2019-12-27 00:00:00+00:00 to 2019-12-28 00:00:00+00:00...
Crawling from 2019-12-26 00:00:00+00:00 to 2019-12-27 00:00:00+00:00...
Crawling from 2019-12-25 00:00:00+00:00 to 2019-12-26 00:00:00+00:00...
Crawling from 2019-12-24 00:00:00+00:00 to 2019-12-25 00:00:00+00:00...
Crawling from 2019-12-23 00:00:00+00:00 to 2019-12-24 00:00:00+00:00...
Crawling from 2019-12-22 00:00:00+00:00 to 2019-12-23 00:00:00+00:00...
Crawling from 2019-12-21 00:00:00+00:00 to 2019-12-22 00:00:00+00:00...
Crawling from 2019-12-20 00:00:00+00:00 to 2019-12-21 00:00:00+00:00...
Crawling from 2019-12-19 00:00:00+00:00 to 2019-12-20 00:00:00+00:00...
Crawling from 2019-12-18 00:00:00+00:00 to 2019-12-19 00:00:00+0

Crawling from 2019-09-08 00:00:00+00:00 to 2019-09-09 00:00:00+00:00...
Crawling from 2019-09-07 00:00:00+00:00 to 2019-09-08 00:00:00+00:00...
Crawling from 2019-09-06 00:00:00+00:00 to 2019-09-07 00:00:00+00:00...
Crawling from 2019-09-05 00:00:00+00:00 to 2019-09-06 00:00:00+00:00...
Crawling from 2019-09-04 00:00:00+00:00 to 2019-09-05 00:00:00+00:00...
Crawling from 2019-09-03 00:00:00+00:00 to 2019-09-04 00:00:00+00:00...
Crawling from 2019-09-02 00:00:00+00:00 to 2019-09-03 00:00:00+00:00...
Crawling from 2019-09-01 00:00:00+00:00 to 2019-09-02 00:00:00+00:00...
Crawling from 2019-08-31 00:00:00+00:00 to 2019-09-01 00:00:00+00:00...
Crawling from 2019-08-30 00:00:00+00:00 to 2019-08-31 00:00:00+00:00...
Crawling from 2019-08-29 00:00:00+00:00 to 2019-08-30 00:00:00+00:00...
Crawling from 2019-08-28 00:00:00+00:00 to 2019-08-29 00:00:00+00:00...
Crawling from 2019-08-27 00:00:00+00:00 to 2019-08-28 00:00:00+00:00...
Crawling from 2019-08-26 00:00:00+00:00 to 2019-08-27 00:00:00+0

Crawling from 2019-05-17 00:00:00+00:00 to 2019-05-18 00:00:00+00:00...
Crawling from 2019-05-16 00:00:00+00:00 to 2019-05-17 00:00:00+00:00...
Crawling from 2019-05-15 00:00:00+00:00 to 2019-05-16 00:00:00+00:00...
Crawling from 2019-05-14 00:00:00+00:00 to 2019-05-15 00:00:00+00:00...
Crawling from 2019-05-13 00:00:00+00:00 to 2019-05-14 00:00:00+00:00...
Crawling from 2019-05-12 00:00:00+00:00 to 2019-05-13 00:00:00+00:00...
Crawling from 2019-05-11 00:00:00+00:00 to 2019-05-12 00:00:00+00:00...
Crawling from 2019-05-10 00:00:00+00:00 to 2019-05-11 00:00:00+00:00...
Crawling from 2019-05-09 00:00:00+00:00 to 2019-05-10 00:00:00+00:00...
Crawling from 2019-05-08 00:00:00+00:00 to 2019-05-09 00:00:00+00:00...
Crawling from 2019-05-07 00:00:00+00:00 to 2019-05-08 00:00:00+00:00...
Crawling from 2019-05-06 00:00:00+00:00 to 2019-05-07 00:00:00+00:00...
Crawling from 2019-05-05 00:00:00+00:00 to 2019-05-06 00:00:00+00:00...
Crawling from 2019-05-04 00:00:00+00:00 to 2019-05-05 00:00:00+0

In [64]:
# Sanity check: number of collected ids vs. number of distinct ids.
# Equal counts mean no submission was stored twice.
print(len(submission_dict['id']), len(set(submission_dict['id'])))

367 367


In [65]:
# Build the submissions table: each key of submission_dict becomes a column.
submission_df = pd.DataFrame(submission_dict)

In [66]:
submission_df

Unnamed: 0,id,created_utc,title,selftext,num_comments,subreddit,subreddit_id,score,author_fullname,link_flair_text
0,ei81i5,1577822644,TIAH because I did the things,"I ended the year not quite as I wanted, but I ...",6,TodayIamHappy,t5_wpspv,1,t2_51bsneye,S
1,egvfeg,1577563391,TIAH because I cleaned my room,"Got up at four this morning, looked at the tim...",6,TodayIamHappy,t5_wpspv,1,t2_11m7hi,S
2,egtsrb,1577555820,TIAH because I finally moved into a house by m...,After years of living in flat shares and with ...,5,TodayIamHappy,t5_wpspv,1,t2_5azviqqo,S
3,egjt92,1577494779,TIAH because I ordered all of my new computers...,"I have been saving for 6 months now, and with ...",5,TodayIamHappy,t5_wpspv,1,t2_kpxl5yo,
4,eg4br3,1577410404,TIAH because I just took my first real shower ...,"I was treated for cancer this year, and since ...",8,TodayIamHappy,t5_wpspv,1,t2_1i4vkuhe,M
...,...,...,...,...,...,...,...,...,...,...
362,aruiw3,1550473579,Today my cat was stretchy and it made me happy.,,0,TodayIamHappy,t5_wpspv,1,t2_2uhdzwod,
363,arnxir,1550431269,TIAH because i had a wonderful hike up the hil...,Spending my time in the nature always makes me...,5,TodayIamHappy,t5_wpspv,15,t2_33zpmhq8,
364,arnkvr,1550429358,TIAH because I found this subreddit.,Is it okay to use this as sort of a gratitude ...,2,TodayIamHappy,t5_wpspv,28,t2_2xa30jei,
365,arjz1k,1550404992,TIAH because I started a new subreddit and hop...,It was not planned. It just struck my mind and...,0,TodayIamHappy,t5_wpspv,50,t2_38ppbu69,


In [67]:
# Persist the crawled submissions. The output directory is created first so
# that a fresh checkout without a 'pushshift/' folder does not fail at the
# save step after a long crawl.
os.makedirs('pushshift', exist_ok = True)
submission_df.to_csv('pushshift/tiah_submissions_20190217_20191231.csv', index = False)

# Crawl Comments According to the Submissions

In [68]:
# Reload the submissions from the CSV written earlier in this notebook.
# NOTE(review): presumably so the comment crawl can start from the saved file
# without repeating the submission crawl -- confirm.
submission_df = pd.read_csv('pushshift/tiah_submissions_20190217_20191231.csv')

In [69]:
submission_df

Unnamed: 0,id,created_utc,title,selftext,num_comments,subreddit,subreddit_id,score,author_fullname,link_flair_text
0,ei81i5,1577822644,TIAH because I did the things,"I ended the year not quite as I wanted, but I ...",6,TodayIamHappy,t5_wpspv,1,t2_51bsneye,S
1,egvfeg,1577563391,TIAH because I cleaned my room,"Got up at four this morning, looked at the tim...",6,TodayIamHappy,t5_wpspv,1,t2_11m7hi,S
2,egtsrb,1577555820,TIAH because I finally moved into a house by m...,After years of living in flat shares and with ...,5,TodayIamHappy,t5_wpspv,1,t2_5azviqqo,S
3,egjt92,1577494779,TIAH because I ordered all of my new computers...,"I have been saving for 6 months now, and with ...",5,TodayIamHappy,t5_wpspv,1,t2_kpxl5yo,
4,eg4br3,1577410404,TIAH because I just took my first real shower ...,"I was treated for cancer this year, and since ...",8,TodayIamHappy,t5_wpspv,1,t2_1i4vkuhe,M
...,...,...,...,...,...,...,...,...,...,...
362,aruiw3,1550473579,Today my cat was stretchy and it made me happy.,,0,TodayIamHappy,t5_wpspv,1,t2_2uhdzwod,
363,arnxir,1550431269,TIAH because i had a wonderful hike up the hil...,Spending my time in the nature always makes me...,5,TodayIamHappy,t5_wpspv,15,t2_33zpmhq8,
364,arnkvr,1550429358,TIAH because I found this subreddit.,Is it okay to use this as sort of a gratitude ...,2,TodayIamHappy,t5_wpspv,28,t2_2xa30jei,
365,arjz1k,1550404992,TIAH because I started a new subreddit and hop...,It was not planned. It just struck my mind and...,0,TodayIamHappy,t5_wpspv,50,t2_38ppbu69,


In [70]:
def is_json(json_data):
    """Return True if ``json_data`` can be parsed by ``json.loads``, else False.

    NOTE(review): an ``is_json`` definition also appears earlier in this
    notebook; consider keeping a single copy.

    Args:
        json_data: The text to check (``str``, ``bytes`` or ``bytearray``).

    Returns:
        bool: True when parsing succeeds. False when ``json.loads`` rejects the
        input, either because it is malformed (``ValueError``, which includes
        ``json.JSONDecodeError``) or because it is not a text type
        (``TypeError``, e.g. ``None``).
    """
    try:
        json.loads(json_data)
    except (TypeError, ValueError):
        # Only parse failures are treated as "not JSON". A bare ``except``
        # would also swallow KeyboardInterrupt/SystemExit, which makes a
        # long-running crawl impossible to stop cleanly.
        return False
    return True

In [71]:
# Flat list of comment ids; the crawl loops below extend it in place with `+=`.
comment_ids = []

In [72]:
# Collect the comment ids of every crawled submission into comment_ids.
# NOTE: this cell extends comment_ids in place; re-running it without first
# resetting the list stores every id twice.
failed_submission_ids = []
for submission_id in tqdm(submission_df['id']):
    url = 'https://api.pushshift.io/reddit/submission/comment_ids/{}'.format(submission_id)
    try:
        # The timeout keeps one stalled connection from hanging the whole crawl.
        res = requests.get(url, timeout = 30)
    except requests.exceptions.RequestException:
        failed_submission_ids.append(submission_id)
        continue
    payload = res.json() if is_json(res.text) else None
    if isinstance(payload, dict):
        # An error payload may lack 'data' (or carry null): nothing to add.
        comment_ids += payload.get('data') or []
    else:
        failed_submission_ids.append(submission_id)

# Report skipped submissions after the loop so the progress bar stays intact.
if failed_submission_ids:
    print('No comment ids retrieved for {} submission(s): {}'.format(
        len(failed_submission_ids), failed_submission_ids))

100%|███████████████████████████████████████████████████████████████| 367/367 [09:56<00:00,  1.63s/it]


In [73]:
len(comment_ids)

1105

In [74]:
# Persist the collected comment ids as a one-column CSV. The output directory
# is created first so a missing 'pushshift/' folder does not fail the save.
os.makedirs('pushshift', exist_ok = True)
pd.DataFrame({'id': comment_ids}).to_csv('pushshift/tiah_comment_ids_20190217_20191231.csv', index = False)

### Continue crawling the comment ids

In [50]:
len(comment_ids)

824

In [169]:
comment_ids[-10:]

['dc76tfw',
 'dd1cdhv',
 'dbx20a3',
 'dc6qxsl',
 'dcosxo6',
 'dcow3f7',
 'dc5dmfn',
 'dcq5plm',
 'dc08ifx',
 'dbz8074']

In [138]:
# Probe the submission at the resume position: print its comment ids, then
# print True for each one that is already in comment_ids.
# NOTE(review): the execution counts of this section are out of sequence and
# the hard-coded position is far beyond the 367 submissions crawled above, so
# this looks like a leftover from a different crawl -- confirm it is needed.
resume_index = 26182+9702
if resume_index < submission_df.shape[0]:
    url = 'https://api.pushshift.io/reddit/submission/comment_ids/{}'.format(submission_df.iloc[resume_index]['id'])
    res = requests.get(url)
    resume_comment_ids = res.json()['data']
    print(resume_comment_ids)
    # A set makes each membership test O(1) instead of scanning the whole list.
    known_comment_ids = set(comment_ids)
    for i in resume_comment_ids:
        if i in known_comment_ids:
            print(True)
else:
    # Guard: an out-of-range positional index would raise IndexError and stop
    # a top-to-bottom run of the notebook.
    print('Resume index {} is out of range for {} submissions; nothing to check'.format(
        resume_index, submission_df.shape[0]))

['djp74h6', 'djp7bjg', 'djp7pzm', 'djp8srl', 'djp9wz0', 'djpac3w', 'djpag5e', 'djpakig', 'djpasvb', 'djpax12', 'djpb54f', 'djpb6rz', 'djpb7h7', 'djpbcl5', 'djpbqrm', 'djpbyrl', 'djpcceq', 'djpd9e4']


In [140]:
# Show the comment ids of the submission one position before the resume point.
# NOTE(review): the hard-coded position is far beyond the 367 submissions
# crawled above; this looks like a leftover from a different crawl -- confirm.
previous_index = 26182+9701
if previous_index < submission_df.shape[0]:
    url = 'https://api.pushshift.io/reddit/submission/comment_ids/{}'.format(submission_df.iloc[previous_index]['id'])
    res = requests.get(url)
    previous_comment_ids = res.json()['data']
else:
    # Guard: an out-of-range positional index would raise IndexError and stop
    # a top-to-bottom run of the notebook; show an empty list instead.
    previous_comment_ids = []
previous_comment_ids

['djp7nh2',
 'djp7qbn',
 'djp7t1v',
 'djp7xyd',
 'djp7yfs',
 'djp81zg',
 'djp82el',
 'djp83ct',
 'djp8xdp',
 'djp9hkd',
 'djpa0z3',
 'djpeu2c']

In [141]:
# Resume collecting comment ids from a fixed position in submission_df.
# NOTE(review): with the 367 submissions crawled above the slice below is
# empty, so this loop does nothing; the start position looks like a leftover
# from a different crawl -- confirm.
# NOTE: comment_ids is extended in place; re-running this cell stores ids twice.
resume_failed_submission_ids = []
for submission_id in tqdm(submission_df['id'].iloc[26182+9702:]):
    url = 'https://api.pushshift.io/reddit/submission/comment_ids/{}'.format(submission_id)
    try:
        # The timeout keeps one stalled connection from hanging the whole crawl.
        res = requests.get(url, timeout = 30)
    except requests.exceptions.RequestException:
        resume_failed_submission_ids.append(submission_id)
        continue
    payload = res.json() if is_json(res.text) else None
    if isinstance(payload, dict):
        # An error payload may lack 'data' (or carry null): nothing to add.
        comment_ids += payload.get('data') or []
    else:
        resume_failed_submission_ids.append(submission_id)

# Report skipped submissions after the loop so the progress bar stays intact.
if resume_failed_submission_ids:
    print('No comment ids retrieved for {} submission(s): {}'.format(
        len(resume_failed_submission_ids), resume_failed_submission_ids))

100%|████████████████████████████████████████████████████████| 19659/19659 [12:21:52<00:00,  2.26s/it]


In [142]:
len(comment_ids)

1017407

In [143]:
# Write the current comment_ids list to a one-column CSV.
# NOTE(review): the file name ('casual_conv', 20170101-20171231) does not match
# the 'tiah_..._20190217_20191231' files used everywhere else in this notebook.
# It looks like a leftover from a different crawl -- confirm before running, as
# it would store this notebook's ids under that name.
pd.DataFrame({'id': comment_ids}).to_csv('pushshift/casual_conv_comment_ids_20170101_20171231.csv', index = False)

--------------------

In [75]:
# Fields a comment must contain to be kept by crawl_comments.
comment_cols = [
    'id',
    'link_id',
    'parent_id',
    'created_utc',
    'body',
    'subreddit',
    'subreddit_id',
]
# Fields recorded when present and stored as None otherwise.
comment_cols_opt = [
    'score',
    'author_fullname',
]

In [76]:
# Column-oriented accumulator: one independent, initially empty list per field.
comment_dict = {}
for col in comment_cols + comment_cols_opt:
    comment_dict[col] = []

In [77]:
def crawl_comments(comment_query):
    """Fetch the comments named in ``comment_query`` and append them to ``comment_dict``.

    Args:
        comment_query: Comma-separated comment ids, inserted verbatim into the
            ``ids`` query parameter of the comment search URL.

    Returns:
        None. Results accumulate in the module-level ``comment_dict``. A
        comment is kept only if it has every field in ``comment_cols``; fields
        in ``comment_cols_opt`` are stored as None when absent.

    A failed request or an unusable response is reported with a printed message
    and the batch is skipped, so one bad batch does not abort the whole crawl.
    """
    # An empty id list must not be sent: the request would no longer be
    # restricted to the ids collected by this notebook.
    if not comment_query:
        return

    url = 'https://api.pushshift.io/reddit/comment/search?ids={}'.format(comment_query)
    try:
        # The timeout keeps one stalled connection from hanging the crawl.
        res = requests.get(url, timeout = 30)
    except requests.exceptions.RequestException as err:
        print('Comment request failed, batch skipped: {}'.format(err))
        return
    if not is_json(res.text):
        print('Non-JSON response (HTTP {}), batch skipped'.format(res.status_code))
        return

    payload = res.json()
    # An error payload may lack 'data' (or carry null); treat it as no comments.
    comments = (payload.get('data') or []) if isinstance(payload, dict) else []

    for comment in comments:
        # Require every mandatory field so all lists in comment_dict stay the
        # same length.
        if not all(col in comment for col in comment_cols):
            continue

        for col in comment_cols:
            comment_dict[col].append(comment[col])

        # Optional fields are stored as None when missing.
        for col in comment_cols_opt:
            comment_dict[col].append(comment.get(col))

In [78]:
# Fetch the comments in batches of batch_size ids per request.
N = len(comment_ids)
batch_size = 1000
# Ceiling division: the final, partial batch is handled by the same loop (and
# counted by the progress bar) instead of a duplicated remainder branch.
num_batches = (N + batch_size - 1) // batch_size

for batch in tqdm(range(num_batches)):
    s = batch * batch_size
    t = s + batch_size
    # Slicing past the end of the list is safe; the last batch is just shorter.
    comment_query = ','.join(comment_ids[s:t])
    crawl_comments(comment_query)

100%|███████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.48s/it]


In [79]:
# Build the comments table: each key of comment_dict becomes a column.
comment_df = pd.DataFrame(comment_dict)

In [80]:
comment_df

Unnamed: 0,id,link_id,parent_id,created_utc,body,subreddit,subreddit_id,score,author_fullname
0,egt9ff5,t3_ascdt2,t3_ascdt2,1550595672,Well if you want someone to talk with then a u...,TodayIamHappy,t5_wpspv,1,t2_38ppbu69
1,egtarex,t3_ascdt2,t3_ascdt2,1550596566,"That’s amazing, I’m happy to hear that!\n\nPer...",TodayIamHappy,t5_wpspv,1,t2_2zzephhe
2,egtdz98,t3_ascxas,t3_ascxas,1550598658,"Thanks for your appreciation. But honestly, al...",TodayIamHappy,t5_wpspv,1,t2_38ppbu69
3,egthkeq,t3_asdma1,t3_asdma1,1550600982,Thanks for submitting to r/TodayIamHappy /u/TI...,TodayIamHappy,t5_wpspv,1,t2_6l4z3
4,egtiuck,t3_asdp8t,t3_asdp8t,1550601816,That is crazy! Let’s hope that some of them be...,TodayIamHappy,t5_wpspv,1,t2_1k6wswle
...,...,...,...,...,...,...,...,...,...
1100,eguf1om,t3_as7flg,t1_egsc4ww,1550624408,Too bad its not a boy,TodayIamHappy,t5_wpspv,1,t2_aj9tc
1101,eguwrbv,t3_as71cl,t3_as71cl,1550638797,TIAH because this subreddit exists and people ...,TodayIamHappy,t5_wpspv,1,t2_yjisnr0
1102,egux3y5,t3_as71cl,t1_eguwrbv,1550639145,It really makes me feel happy that people are ...,TodayIamHappy,t5_wpspv,1,t2_38ppbu69
1103,egv5m5a,t3_as2bg0,t1_egsct01,1550649703,Thank you very much!\n\nSounds like it's simpl...,TodayIamHappy,t5_wpspv,1,t2_33zpmhq8


In [81]:
# Persist the crawled comments. The output directory is created first so a
# missing 'pushshift/' folder does not fail the save after the crawl.
os.makedirs('pushshift', exist_ok = True)
comment_df.to_csv('pushshift/tiah_comments_20190217_20191231.csv', index = False)