In [1]:
import os
import time
import json
import random
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from datetime import timezone

# Test the Pushshift API

In [2]:
# Midnight on 2021-05-01, read as UTC and converted to epoch seconds.
dt = datetime(year = 2021, month = 5, day = 1)
timestamp = datetime.timestamp(dt.replace(tzinfo = timezone.utc))
print(timestamp)

1619827200.0


In [3]:
# Pushshift endpoint used to search Reddit submissions.
url = 'https://api.pushshift.io/reddit/search/submission/'

In [4]:
# Query a single one-hour window starting at `timestamp`, sorted by creation time, newest first.
params = dict(
    subreddit = 'CasualConversation',
    sort = 'desc',
    sort_type = 'created_utc',
    after = str(int(timestamp)),  # Exclusive!
    before = str(int(timestamp + 3600)),  # Exclusive!
    size = '1000',
)

In [5]:
# Send the test query; `params` is passed as the URL query string.
res = requests.get(url, params = params)

In [6]:
# Number of records in the `data` list of the last response.
len(res.json()['data'])

8

In [None]:
# Smoke test: gather the comment ids of the first 200 submissions.
# Requires `submission_df`, which is only created further down in the notebook.
comment_ids = []
for i in tqdm(range(200)):
    submission_id = submission_df['id'].iloc[i]
    url = 'https://api.pushshift.io/reddit/submission/comment_ids/{}'.format(submission_id)
    res = requests.get(url)
    comment_ids.extend(res.json()['data'])
print(len(comment_ids))

In [58]:
# Look up the first 1000 collected comment ids in a single request and
# print the response object (its repr shows the HTTP status code).
url = 'https://api.pushshift.io/reddit/comment/search?ids={}'.format(','.join(comment_ids[:1000]))
res = requests.get(url)
print(res)

<Response [200]>


In [59]:
# Number of records in the `data` list of the last response.
len(res.json()['data'])

1000

# Crawl Submissions from r/CasualConversation

In [2]:
# Upper bound of the crawl: 2022-01-01 00:00:00 read as UTC, in epoch seconds.
end_date = datetime(year = 2022, month = 1, day = 1)
end_timestamp = datetime.timestamp(end_date.replace(tzinfo = timezone.utc))
print(end_date, end_timestamp)

2022-01-01 00:00:00 1640995200.0


In [3]:
# Lower bound of the crawl: 2021-05-01 00:00:00 read as UTC, in epoch seconds.
start_date = datetime(year = 2021, month = 5, day = 1)
start_timestamp = datetime.timestamp(start_date.replace(tzinfo = timezone.utc))
print(start_date, start_timestamp)

2021-05-01 00:00:00 1619827200.0


In [4]:
# Required fields: a submission lacking any of these is dropped by the crawl loop.
submission_cols = [
    'id',
    'created_utc',
    'title',
    'selftext',
    'num_comments',
    'subreddit',
    'subreddit_id',
]
# Optional fields: stored as None when a submission does not carry them.
submission_cols_opt = [
    'score',
    'author_fullname',
    'link_flair_text',
]

In [5]:
# Column-oriented accumulator: one empty list per required and optional column.
# Re-running this cell discards everything collected so far.
submission_dict = {col: [] for col in submission_cols + submission_cols_opt}

In [6]:
def is_json(json_data):
    """Return True if ``json_data`` can be parsed as JSON, otherwise False.

    Args:
        json_data: Text (``str``, ``bytes`` or ``bytearray``) to validate.

    Returns:
        bool: True when ``json.loads`` accepts the input; False when the input
        is malformed JSON or is not something ``json.loads`` can parse
        (for example ``None``).
    """
    try:
        json.loads(json_data)
    except (TypeError, ValueError, RecursionError):
        # ValueError covers json.JSONDecodeError and undecodable bytes,
        # TypeError covers non-text input, RecursionError covers absurdly
        # deep nesting. KeyboardInterrupt/SystemExit are deliberately NOT
        # caught so a long-running crawl can still be interrupted.
        return False
    return True

In [9]:
# Show the crawl cursor (epoch seconds).
# NOTE(review): no earlier cell visibly assigns current_timestamp, so on a
# fresh kernel this cell raises NameError.
current_timestamp

1622390400.0

In [10]:
# Resume support: start from end_timestamp on a fresh kernel, otherwise continue
# from wherever a previous (possibly interrupted) run of this cell left off.
current_timestamp = globals().get('current_timestamp', end_timestamp)
# Bind the endpoint explicitly: other cells rebind `url` to different Pushshift endpoints.
url = 'https://api.pushshift.io/reddit/search/submission/'

# Walk backwards from current_timestamp to start_timestamp in one-hour windows.
while current_timestamp > start_timestamp:
    # Log progress whenever the cursor is a whole number of days away from end_timestamp.
    if (current_timestamp - end_timestamp) % 86400 == 0:
        date_1 = datetime.fromtimestamp(current_timestamp - 86400, timezone.utc)
        date_2 = datetime.fromtimestamp(current_timestamp, timezone.utc)
        print('Crawling from {} to {}...'.format(date_1, date_2))
    params = {
        'subreddit': 'CasualConversation',
        'sort': 'desc',
        'sort_type': 'created_utc',
        # Assuming both bounds are exclusive, the extra -1 makes the window
        # cover created_utc in [current_timestamp - 3600, current_timestamp - 1],
        # so consecutive windows neither overlap nor leave a gap.
        'after': str(int(current_timestamp - 3600 - 1)),
        'before': str(int(current_timestamp)),
        # NOTE(review): an hour with more than 100 submissions would presumably be
        # truncated without notice, since no pagination is done — confirm.
        'size': '100'
    }
    res = requests.get(url, params = params)

    # Parse the body once; requests raises a ValueError subclass for non-JSON bodies.
    try:
        payload = res.json()
    except ValueError:
        payload = None
        print('Skipping window ending at {}: HTTP {} without a JSON body'.format(
            int(current_timestamp), res.status_code))
    # Tolerate JSON error payloads that are not objects or carry no 'data' list.
    submissions = (payload.get('data') or []) if isinstance(payload, dict) else []

    for submission in submissions:
        # Drop records that lack any required column.
        if not all(col in submission for col in submission_cols):
            continue

        for col in submission_cols:
            submission_dict[col].append(submission[col])

        # Optional columns default to None when absent.
        for col in submission_cols_opt:
            submission_dict[col].append(submission.get(col))

    current_timestamp = current_timestamp - 3600

Crawling from 2021-05-29 00:00:00+00:00 to 2021-05-30 00:00:00+00:00...
Crawling from 2021-05-28 00:00:00+00:00 to 2021-05-29 00:00:00+00:00...
Crawling from 2021-05-27 00:00:00+00:00 to 2021-05-28 00:00:00+00:00...
Crawling from 2021-05-26 00:00:00+00:00 to 2021-05-27 00:00:00+00:00...
Crawling from 2021-05-25 00:00:00+00:00 to 2021-05-26 00:00:00+00:00...
Crawling from 2021-05-24 00:00:00+00:00 to 2021-05-25 00:00:00+00:00...
Crawling from 2021-05-23 00:00:00+00:00 to 2021-05-24 00:00:00+00:00...
Crawling from 2021-05-22 00:00:00+00:00 to 2021-05-23 00:00:00+00:00...
Crawling from 2021-05-21 00:00:00+00:00 to 2021-05-22 00:00:00+00:00...
Crawling from 2021-05-20 00:00:00+00:00 to 2021-05-21 00:00:00+00:00...
Crawling from 2021-05-19 00:00:00+00:00 to 2021-05-20 00:00:00+00:00...
Crawling from 2021-05-18 00:00:00+00:00 to 2021-05-19 00:00:00+00:00...
Crawling from 2021-05-17 00:00:00+00:00 to 2021-05-18 00:00:00+00:00...
Crawling from 2021-05-16 00:00:00+00:00 to 2021-05-17 00:00:00+0

In [11]:
# Total ids collected vs. distinct ids; equal numbers mean no submission was stored twice.
print(len(submission_dict['id']), len(set(submission_dict['id'])))

33428 33428


In [12]:
# Turn the accumulated columns into a DataFrame (one row per stored submission).
submission_df = pd.DataFrame(submission_dict)

In [13]:
# Preview the crawled submissions.
submission_df

Unnamed: 0,id,created_utc,title,selftext,num_comments,subreddit,subreddit_id,score,author_fullname,link_flair_text
0,rt6lc0,1640994901,"Utterly alone on New Year’s Eve, as has been t...",The last time I celebrated New Year’s I went t...,0,CasualConversation,t5_323oy,1,t2_6ie2sdjz,:chat: Just Chatting
1,rt6km3,1640994833,Rest in Peace Betty White,[removed],1,CasualConversation,t5_323oy,1,t2_d7ujoy14,:chat: Just Chatting
2,rt6igm,1640994650,RIP Betty White,[removed],1,CasualConversation,t5_323oy,1,t2_d7ujoy14,no context⇢
3,rt6hfy,1640994559,Off to a fresh start!,[removed],0,CasualConversation,t5_323oy,1,t2_i4drbzzc,:chat: Just Chatting
4,rt6hal,1640994543,Does anyone remember the name of this multi-co...,"When I was younger, I used this cookie dough s...",0,CasualConversation,t5_323oy,2,t2_408k9r0a,:food: Food &amp; Drinks
...,...,...,...,...,...,...,...,...,...,...
33423,n28al4,1619832785,I was thrown a really nice birthday party but ...,It’s really hard for me to like things. It’s n...,1,CasualConversation,t5_323oy,1,t2_4fcntziq,:chat: Just Chatting
33424,n289pw,1619832696,I'm a Christian let's talk I'm bored,[removed],1,CasualConversation,t5_323oy,1,t2_3qx10vn1,:chat: Just Chatting
33425,n289b2,1619832652,My bus driver calls me big guy almost every ti...,It's nice. I was a real scrawny kid and I've o...,167,CasualConversation,t5_323oy,1,t2_42tqbxjo,
33426,n285ek,1619832240,Ever notice youre a human being and suddenly e...,Im not sure if a fish knows its a fish and is ...,9,CasualConversation,t5_323oy,1,t2_ab978iz7,


In [14]:
# Persist the raw submissions; index = False keeps the row index out of the CSV.
submission_df.to_csv('../data/reddit/raw/casual_conv_submissions_20210501_20211231.csv', index = False)

# Crawl Comments for the Crawled Submissions

In [15]:
# Reload the submissions CSV written by the previous section.
submission_df = pd.read_csv('../data/reddit/raw/casual_conv_submissions_20210501_20211231.csv')

In [16]:
# Preview the reloaded submissions.
submission_df

Unnamed: 0,id,created_utc,title,selftext,num_comments,subreddit,subreddit_id,score,author_fullname,link_flair_text
0,rt6lc0,1640994901,"Utterly alone on New Year’s Eve, as has been t...",The last time I celebrated New Year’s I went t...,0,CasualConversation,t5_323oy,1,t2_6ie2sdjz,:chat: Just Chatting
1,rt6km3,1640994833,Rest in Peace Betty White,[removed],1,CasualConversation,t5_323oy,1,t2_d7ujoy14,:chat: Just Chatting
2,rt6igm,1640994650,RIP Betty White,[removed],1,CasualConversation,t5_323oy,1,t2_d7ujoy14,no context⇢
3,rt6hfy,1640994559,Off to a fresh start!,[removed],0,CasualConversation,t5_323oy,1,t2_i4drbzzc,:chat: Just Chatting
4,rt6hal,1640994543,Does anyone remember the name of this multi-co...,"When I was younger, I used this cookie dough s...",0,CasualConversation,t5_323oy,2,t2_408k9r0a,:food: Food &amp; Drinks
...,...,...,...,...,...,...,...,...,...,...
33423,n28al4,1619832785,I was thrown a really nice birthday party but ...,It’s really hard for me to like things. It’s n...,1,CasualConversation,t5_323oy,1,t2_4fcntziq,:chat: Just Chatting
33424,n289pw,1619832696,I'm a Christian let's talk I'm bored,[removed],1,CasualConversation,t5_323oy,1,t2_3qx10vn1,:chat: Just Chatting
33425,n289b2,1619832652,My bus driver calls me big guy almost every ti...,It's nice. I was a real scrawny kid and I've o...,167,CasualConversation,t5_323oy,1,t2_42tqbxjo,
33426,n285ek,1619832240,Ever notice youre a human being and suddenly e...,Im not sure if a fish knows its a fish and is ...,9,CasualConversation,t5_323oy,1,t2_ab978iz7,


In [17]:
# NOTE: is_json is also defined earlier in this notebook; running this cell
# rebinds the name to the definition below.
def is_json(json_data):
    """Return True if ``json_data`` can be parsed as JSON, otherwise False.

    Args:
        json_data: Text (``str``, ``bytes`` or ``bytearray``) to validate.

    Returns:
        bool: True when ``json.loads`` accepts the input; False when the input
        is malformed JSON or is not something ``json.loads`` can parse
        (for example ``None``).
    """
    try:
        json.loads(json_data)
    except (TypeError, ValueError, RecursionError):
        # ValueError covers json.JSONDecodeError and undecodable bytes,
        # TypeError covers non-text input, RecursionError covers absurdly
        # deep nesting. KeyboardInterrupt/SystemExit are deliberately NOT
        # caught so a long-running crawl can still be interrupted.
        return False
    return True

In [18]:
# Accumulator for the comment ids of all submissions.
# Re-running this cell discards the ids collected so far.
comment_ids = []

In [None]:
# Spot-check the comment_ids endpoint for a single submission (row 33426)
# and show the raw response body.
url = 'https://api.pushshift.io/reddit/submission/comment_ids/{}'.format(submission_df.iloc[33426]['id'])
res = requests.get(url)
res.text

In [33]:
# Collect the comment ids of every submission from row 1956 onwards.
# NOTE(review): 1956 looks like a manual resume offset from an earlier,
# interrupted run — confirm before re-running from a fresh kernel.
for i in tqdm(range(1956, len(submission_df))):
    submission_id = submission_df['id'].iloc[i]
    url = 'https://api.pushshift.io/reddit/submission/comment_ids/{}'.format(submission_id)
    res = requests.get(url)
    # Bodies that are not JSON are skipped without a retry.
    if not is_json(res.text):
        continue
    comment_ids.extend(res.json()['data'])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 31472/31472 [25:16:57<00:00,  2.89s/it]


In [34]:
# Number of comment ids collected so far.
len(comment_ids)

313366

In [35]:
# Persist the collected comment ids as a single-column ('id') CSV.
pd.DataFrame({'id': comment_ids}).to_csv('../data/reddit/raw/casual_conv_comment_ids_20210501_20211231.csv', index = False)

### Continue crawling the comment ids

In [20]:
# Number of comment ids currently held in memory.
len(comment_ids)

0

In [169]:
# Peek at the last ten entries of comment_ids.
comment_ids[-10:]

['dc76tfw',
 'dd1cdhv',
 'dbx20a3',
 'dc6qxsl',
 'dcosxo6',
 'dcow3f7',
 'dc5dmfn',
 'dcq5plm',
 'dc08ifx',
 'dbz8074']

In [138]:
# Fetch the comment ids of row 26182+9702 and print True once for every id
# that is already present in comment_ids.
submission_id = submission_df['id'].iloc[26182+9702]
url = 'https://api.pushshift.io/reddit/submission/comment_ids/{}'.format(submission_id)
res = requests.get(url)
fetched_ids = res.json()['data']
print(fetched_ids)
for i in fetched_ids:
    if i in comment_ids:
        print(True)

['djp74h6', 'djp7bjg', 'djp7pzm', 'djp8srl', 'djp9wz0', 'djpac3w', 'djpag5e', 'djpakig', 'djpasvb', 'djpax12', 'djpb54f', 'djpb6rz', 'djpb7h7', 'djpbcl5', 'djpbqrm', 'djpbyrl', 'djpcceq', 'djpd9e4']


In [140]:
# Show the comment ids of row 26182+9701, the row immediately before the one
# the loop below starts from.
url = 'https://api.pushshift.io/reddit/submission/comment_ids/{}'.format(submission_df.iloc[26182+9701]['id'])
res = requests.get(url)
res.json()['data']

['djp7nh2',
 'djp7qbn',
 'djp7t1v',
 'djp7xyd',
 'djp7yfs',
 'djp81zg',
 'djp82el',
 'djp83ct',
 'djp8xdp',
 'djp9hkd',
 'djpa0z3',
 'djpeu2c']

In [141]:
# Continue collecting comment ids, starting at row 26182+9702.
# NOTE(review): the start offset is a hand-written resume point — confirm it
# still matches the contents of comment_ids before re-running.
for i in tqdm(range(26182+9702, len(submission_df))):
    submission_id = submission_df['id'].iloc[i]
    url = 'https://api.pushshift.io/reddit/submission/comment_ids/{}'.format(submission_id)
    res = requests.get(url)
    # Bodies that are not JSON are skipped without a retry.
    if not is_json(res.text):
        continue
    comment_ids.extend(res.json()['data'])

100%|████████████████████████████████████████████████████████| 19659/19659 [12:21:52<00:00,  2.26s/it]


In [142]:
# Number of comment ids collected so far.
len(comment_ids)

1017407

In [143]:
# Persist the collected comment ids as a single-column ('id') CSV.
# NOTE(review): this path (pushshift/, 20170101_20171231) differs from the
# ../data/reddit/raw/*_20210501_20211231 files used elsewhere in this notebook —
# confirm it is intended and not a leftover from another crawl.
pd.DataFrame({'id': comment_ids}).to_csv('pushshift/casual_conv_comment_ids_20170101_20171231.csv', index = False)

--------------------

In [59]:
# Required fields: a comment lacking any of these is dropped by crawl_comments.
comment_cols = [
    'id',
    'link_id',
    'parent_id',
    'created_utc',
    'body',
    'subreddit',
    'subreddit_id',
]
# Optional fields: stored as None when a comment does not carry them.
comment_cols_opt = [
    'score',
    'author_fullname',
]

In [60]:
# Column-oriented accumulator: one empty list per required and optional column.
# Re-running this cell discards everything collected so far.
comment_dict = {col: [] for col in comment_cols + comment_cols_opt}

In [61]:
def crawl_comments(comment_query):
    """Fetch one batch of comments from Pushshift and append them to ``comment_dict``.

    Args:
        comment_query: Comma-separated comment ids, inserted verbatim into the
            ``ids`` query parameter of the comment search endpoint.

    Returns:
        None. Rows are appended in place to the module-level ``comment_dict``.
        Nothing is appended when the query is empty, when the response body is
        not JSON, or when the JSON carries no ``data`` list. Comments missing
        any column listed in ``comment_cols`` are skipped.
    """
    # Nothing to look up: avoid a pointless request.
    if not comment_query:
        return

    url = 'https://api.pushshift.io/reddit/comment/search?ids={}'.format(comment_query)
    res = requests.get(url)

    # Parse the body once; requests raises a ValueError subclass for non-JSON bodies.
    try:
        payload = res.json()
    except ValueError:
        return
    # Tolerate JSON error payloads that are not objects or carry no 'data' list.
    comments = payload.get('data') if isinstance(payload, dict) else None
    if not comments:
        return

    for comment in comments:
        # Drop records that lack any required column.
        if not all(col in comment for col in comment_cols):
            continue

        for col in comment_cols:
            comment_dict[col].append(comment[col])

        # Optional columns default to None when absent.
        for col in comment_cols_opt:
            comment_dict[col].append(comment.get(col))

In [62]:
# Look the collected comment ids up in batches of `batch_size` ids per request.
N = len(comment_ids)
batch_size = 500

# Stepping the start offset by batch_size visits every full batch and the final
# partial one (slicing past the end of a list is safe), so no separate tail
# handling is needed and the progress bar counts the last batch too.
for s in tqdm(range(0, N, batch_size)):
    comment_query = ','.join(comment_ids[s:s + batch_size])
    crawl_comments(comment_query)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 626/626 [1:17:43<00:00,  7.45s/it]


In [63]:
# Turn the accumulated columns into a DataFrame (one row per stored comment).
comment_df = pd.DataFrame(comment_dict)

In [64]:
# Preview the crawled comments.
comment_df

Unnamed: 0,id,link_id,parent_id,created_utc,body,subreddit,subreddit_id,score,author_fullname
0,hm5iqwj,t3_r2nmvy,t3_r2nmvy,1637933885,"Yeah, they’re fun! Conscious Club is campy in ...",CasualConversation,t5_323oy,1,t2_5yhjayls
1,hm5iup9,t3_r2nlu7,t3_r2nlu7,1637933947,"Sounds like fate, but also disgusting. I hope ...",CasualConversation,t5_323oy,62,t2_g7sdcvpm
2,hm5iuth,t3_r2nmvy,t1_hm5iqwj,1637933949,I know!,CasualConversation,t5_323oy,1,t2_dvd2sbpi
3,hm5iw7v,t3_r2nlu7,t3_r2nlu7,1637933973,That is an amazing story and even bigger coinc...,CasualConversation,t5_323oy,18,t2_xeg58lg
4,hm5j61n,t3_r2nlu7,t1_hm5iup9,1637934133,He’s a doctor now so I hope he’s grown out of ...,CasualConversation,t5_323oy,66,t2_5t1dsc17
...,...,...,...,...,...,...,...,...,...
311179,gxy5eus,t3_n8vd4o,t3_n8vd4o,1620885281,"Happy birthday, love! Sending you lots of hugs...",CasualConversation,t5_323oy,1,t2_52mchyz5
311180,gxyb84x,t3_n8vd4o,t3_n8vd4o,1620890228,Aw a little late but Happy Birthday! 🎉 Sorry t...,CasualConversation,t5_323oy,1,t2_4dpz39jn
311181,gyf3af4,t3_n8tdxi,t1_gxleuvl,1621235314,Wow...I don't even know what to say. I meet os...,CasualConversation,t5_323oy,1,t2_800dwtbb
311182,h8yfw7r,t3_n8uq0y,t1_gxkjuf1,1628975848,This is too relatable. My parents divorced whe...,CasualConversation,t5_323oy,1,t2_6bxxuxxo


In [65]:
# Persist the raw comments; index = False keeps the row index out of the CSV.
comment_df.to_csv('../data/reddit/raw/casual_conv_comments_20210501_20211231.csv', index = False)