In [1]:
import snscrape.modules.reddit as snreddit
import pandas as pd
import json
from typing import Any
from datetime import datetime

In [2]:
DATE_FORMAT = '%Y-%m-%dT%H:%M:%S+00:00'
JSON_FORMAT = dict[str, Any]
MAX_POSTS_PER_CHUNK = 1000
query = 'musk'

invalid_authors = (
    'moderator',
    'mod',
    'bot',
)

In [3]:
scrapes: list[JSON_FORMAT] = []

In [4]:
def check_invalid_author(author: str) -> bool:
    return any(map(author.lower().__contains__, invalid_authors))

In [5]:
keys_to_pop = (
    'date',
    'id',
    'parentId',
    'subreddit',
    'url',
    'link',
    'title',
)

def process_json(scrape: str) -> JSON_FORMAT:
    scrape_dict: JSON_FORMAT = json.loads(scrape)
    for key_to_pop in keys_to_pop:
        scrape_dict.pop(key_to_pop, None)
    return scrape_dict

In [6]:
def scrape_chunk(scrapper: snreddit.RedditSearchScraper, max_posts: int) -> None:
    for i, scrape in enumerate(scrapper.get_items()):
        if i > max_posts:
            break
        if check_invalid_author(scrape.author):
            continue 
        scrape_dict = process_json(scrape.json())
        scrapes.append(scrape_dict)

In [7]:
def get_earliest_date() -> datetime:
    return min(datetime.strptime(scrape.get('created'), DATE_FORMAT) for scrape in scrapes)

In [8]:
def scrape(n_times: int, max_posts: int) -> None:
    for scrape_num in range(n_times):
        after = n_times - scrape_num
        before = after - 1
        print(f'{scrape_num=} between {before}-{after} days ago with {max_posts=}, scrapes count={len(scrapes)}')
        try:
            scrapper = snreddit.RedditSearchScraper(
                query,
                before=f'{before}d',
                after=f'{after}d'
            )
            scrape_chunk(scrapper, max_posts)
        except:
            print('error, going to the next scrape')
            continue

In [9]:
scrape(100, MAX_POSTS_PER_CHUNK)

scrape_num=0 between 99-100 days ago with max_posts=1000, scrapes count=0
scrape_num=1 between 98-99 days ago with max_posts=1000, scrapes count=979
scrape_num=2 between 97-98 days ago with max_posts=1000, scrapes count=1965
scrape_num=3 between 96-97 days ago with max_posts=1000, scrapes count=2951
scrape_num=4 between 95-96 days ago with max_posts=1000, scrapes count=3919
scrape_num=5 between 94-95 days ago with max_posts=1000, scrapes count=4906
scrape_num=6 between 93-94 days ago with max_posts=1000, scrapes count=5882
scrape_num=7 between 92-93 days ago with max_posts=1000, scrapes count=6866
scrape_num=8 between 91-92 days ago with max_posts=1000, scrapes count=7851
scrape_num=9 between 90-91 days ago with max_posts=1000, scrapes count=8827
scrape_num=10 between 89-90 days ago with max_posts=1000, scrapes count=9803
scrape_num=11 between 88-89 days ago with max_posts=1000, scrapes count=10791
scrape_num=12 between 87-88 days ago with max_posts=1000, scrapes count=11780


Error retrieving https://api.pushshift.io/reddit/search/comment?q=musk&limit=1000&until=87d&since=88d: ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)"))
4 requests to https://api.pushshift.io/reddit/search/comment?q=musk&limit=1000&until=87d&since=88d failed, giving up.
Errors: ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)")), ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)")), ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)")), ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)"))


error, going to the next scrape
scrape_num=13 between 86-87 days ago with max_posts=1000, scrapes count=11780
scrape_num=14 between 85-86 days ago with max_posts=1000, scrapes count=12754
scrape_num=15 between 84-85 days ago with max_posts=1000, scrapes count=13734
scrape_num=16 between 83-84 days ago with max_posts=1000, scrapes count=14708
scrape_num=17 between 82-83 days ago with max_posts=1000, scrapes count=15683
scrape_num=18 between 81-82 days ago with max_posts=1000, scrapes count=16671
scrape_num=19 between 80-81 days ago with max_posts=1000, scrapes count=17660
scrape_num=20 between 79-80 days ago with max_posts=1000, scrapes count=18651


Error retrieving https://api.pushshift.io/reddit/search/comment?q=musk&limit=1000&until=79d&since=80d: ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)"))
4 requests to https://api.pushshift.io/reddit/search/comment?q=musk&limit=1000&until=79d&since=80d failed, giving up.
Errors: ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)")), ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)")), ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)")), ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)"))


error, going to the next scrape
scrape_num=21 between 78-79 days ago with max_posts=1000, scrapes count=18651
scrape_num=22 between 77-78 days ago with max_posts=1000, scrapes count=19643
scrape_num=23 between 76-77 days ago with max_posts=1000, scrapes count=20629
scrape_num=24 between 75-76 days ago with max_posts=1000, scrapes count=21617
scrape_num=25 between 74-75 days ago with max_posts=1000, scrapes count=22604
scrape_num=26 between 73-74 days ago with max_posts=1000, scrapes count=23584
scrape_num=27 between 72-73 days ago with max_posts=1000, scrapes count=24576
scrape_num=28 between 71-72 days ago with max_posts=1000, scrapes count=25566
scrape_num=29 between 70-71 days ago with max_posts=1000, scrapes count=26544
scrape_num=30 between 69-70 days ago with max_posts=1000, scrapes count=27529
scrape_num=31 between 68-69 days ago with max_posts=1000, scrapes count=28488
scrape_num=32 between 67-68 days ago with max_posts=1000, scrapes count=29464
scrape_num=33 between 66-67 days

Error retrieving https://api.pushshift.io/reddit/search/submission?q=musk&limit=1000&until=51d&since=52d: ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)"))
4 requests to https://api.pushshift.io/reddit/search/submission?q=musk&limit=1000&until=51d&since=52d failed, giving up.
Errors: ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)")), ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)")), ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)")), ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)"))


error, going to the next scrape
scrape_num=49 between 50-51 days ago with max_posts=1000, scrapes count=45051


Error retrieving https://api.pushshift.io/reddit/search/submission?q=musk&limit=1000&until=50d&since=51d: ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)"))
4 requests to https://api.pushshift.io/reddit/search/submission?q=musk&limit=1000&until=50d&since=51d failed, giving up.
Errors: ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)")), ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)")), ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)")), ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)"))


error, going to the next scrape
scrape_num=50 between 49-50 days ago with max_posts=1000, scrapes count=45051
scrape_num=51 between 48-49 days ago with max_posts=1000, scrapes count=46026
scrape_num=52 between 47-48 days ago with max_posts=1000, scrapes count=47009
scrape_num=53 between 46-47 days ago with max_posts=1000, scrapes count=47988
scrape_num=54 between 45-46 days ago with max_posts=1000, scrapes count=48970
scrape_num=55 between 44-45 days ago with max_posts=1000, scrapes count=49954
scrape_num=56 between 43-44 days ago with max_posts=1000, scrapes count=50925
scrape_num=57 between 42-43 days ago with max_posts=1000, scrapes count=51910
scrape_num=58 between 41-42 days ago with max_posts=1000, scrapes count=52893
scrape_num=59 between 40-41 days ago with max_posts=1000, scrapes count=53868
scrape_num=60 between 39-40 days ago with max_posts=1000, scrapes count=54844
scrape_num=61 between 38-39 days ago with max_posts=1000, scrapes count=55822
scrape_num=62 between 37-38 days

Error retrieving https://api.pushshift.io/reddit/search/submission?q=musk&limit=1000&until=15d&since=16d: ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)"))
4 requests to https://api.pushshift.io/reddit/search/submission?q=musk&limit=1000&until=15d&since=16d failed, giving up.
Errors: ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)")), ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)")), ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)")), ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)"))


error, going to the next scrape
scrape_num=85 between 14-15 days ago with max_posts=1000, scrapes count=78340
scrape_num=86 between 13-14 days ago with max_posts=1000, scrapes count=79329
scrape_num=87 between 12-13 days ago with max_posts=1000, scrapes count=80311
scrape_num=88 between 11-12 days ago with max_posts=1000, scrapes count=81285
scrape_num=89 between 10-11 days ago with max_posts=1000, scrapes count=82270
scrape_num=90 between 9-10 days ago with max_posts=1000, scrapes count=83256
scrape_num=91 between 8-9 days ago with max_posts=1000, scrapes count=84242
scrape_num=92 between 7-8 days ago with max_posts=1000, scrapes count=85226
scrape_num=93 between 6-7 days ago with max_posts=1000, scrapes count=85226
scrape_num=94 between 5-6 days ago with max_posts=1000, scrapes count=85226
scrape_num=95 between 4-5 days ago with max_posts=1000, scrapes count=85226


Error retrieving https://api.pushshift.io/reddit/search/comment?q=musk&limit=1000&until=4d&since=5d: ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)"))
4 requests to https://api.pushshift.io/reddit/search/comment?q=musk&limit=1000&until=4d&since=5d failed, giving up.
Errors: ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)")), ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)")), ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)")), ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)"))


error, going to the next scrape
scrape_num=96 between 3-4 days ago with max_posts=1000, scrapes count=85226


Error retrieving https://api.pushshift.io/reddit/search/comment?q=musk&limit=1000&until=3d&since=4d: ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)"))
4 requests to https://api.pushshift.io/reddit/search/comment?q=musk&limit=1000&until=3d&since=4d failed, giving up.
Errors: ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)")), ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)")), ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)")), ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)"))


error, going to the next scrape
scrape_num=97 between 2-3 days ago with max_posts=1000, scrapes count=85226
scrape_num=98 between 1-2 days ago with max_posts=1000, scrapes count=85226
scrape_num=99 between 0-1 days ago with max_posts=1000, scrapes count=85226


In [10]:
scrapes

[{'_type': 'snscrape.modules.reddit.Comment',
  'author': 'UNCwesRPh',
  'body': 'I had been on twitter prior to the musk takeover. But I’m talking more about directly to the news source. Shouting on Twitter/Facebook/Reddit gets notice inside the industry, but until a major media outlet picks up the story (which we all have plenty of examples off) we are going nowhere with this. \n\nOf note, I’m still on regular contact with the NYT about the issues and will bring up to them at next discussion.',
  'created': '2023-01-30T18:49:31+00:00'},
 {'_type': 'snscrape.modules.reddit.Submission',
  'author': 'orangybara3',
  'selftext': None,
  'created': '2023-01-30T18:49:30+00:00'},
 {'_type': 'snscrape.modules.reddit.Comment',
  'author': 'wgp3',
  'body': "That article does not say what you imply at all. It only states that the feature turns off before crashes. Which tesla admits is default behavior and has never tried to hide. Hence the whole counting as a crash if disengaged within 5 secon

In [16]:
scrapes_df = pd.DataFrame(scrapes)
scrapes_df.sample(5)

Unnamed: 0,_type,author,body,created,selftext
13126,snscrape.modules.reddit.Comment,Grogosh,You do realize that South African apartheid El...,2023-02-13T17:01:41+00:00,
80490,snscrape.modules.reddit.Comment,enkidu_johnson,&gt; is there something wrong with Twitter?\n...,2023-04-27T17:51:07+00:00,
33712,snscrape.modules.reddit.Comment,Jimbobgixxer,I'm afraid Musk is slowly destroying his image...,2023-03-07T17:49:00+00:00,
35269,snscrape.modules.reddit.Comment,Invaderk2,It says that before Musk took over it seemed m...,2023-03-08T15:59:32+00:00,
49804,snscrape.modules.reddit.Comment,McGeeze,The resort looks like the beginning of a post-...,2023-03-25T06:26:11+00:00,


In [17]:
scrapes_df['created'] = pd.to_datetime(scrapes_df['created'])

In [18]:
def fill_body(row_id: int, body: str, selftext: str) -> None:
    if body is pd.NA:
        scrapes_df.iloc[[row_id]]['body'] = selftext


for row_id, row in scrapes_df.iterrows():
    fill_body(row_id, row['body'], row['selftext'])

scrapes_df.drop('selftext', axis=1, inplace=True)
scrapes_df.rename({'body': 'text', '_type': 'type'}, axis=1, inplace=True)

In [19]:
scrapes_df['type'] = scrapes_df['type'].apply(lambda x: x.replace('snscrape.modules.reddit.', ''))
scrapes_df.sample(10)

Unnamed: 0,type,author,text,created
40488,Comment,firesquasher,You forget reddit rode Musk dick HARD during t...,2023-03-14 16:18:22+00:00
27903,Submission,ExotiXa,,2023-03-01 16:24:50+00:00
6773,Submission,WhatsNewWorldDotInfo,,2023-02-05 10:41:02+00:00
52692,Comment,luxusbuerg,Bruh totally missing the point. We're talking ...,2023-03-28 14:36:44+00:00
52336,Submission,Ancoragae45,,2023-03-28 16:30:48+00:00
81551,Submission,iDevice_Help,,2023-04-28 17:08:43+00:00
1818,Submission,PsychologicalSite917,,2023-01-31 13:41:30+00:00
57859,Submission,cjcave,,2023-04-03 18:37:07+00:00
20638,Comment,banneryear1868,The musk of fellow comrades,2023-02-22 18:45:40+00:00
57561,Comment,Dangling_chains7689,"Hehe lmao\n\nIn Pune, we had something similar...",2023-04-02 14:53:58+00:00


In [20]:
print(scrapes_df.shape)
scrapes_df.drop_duplicates(subset='text', inplace=True)
scrapes_df.dropna(axis=0, subset='text', inplace=True)
scrapes_df.shape

(85226, 4)


(61620, 4)

In [21]:
scrapes_df.to_csv(query + '.csv')