In [326]:
import snscrape.modules.reddit as snreddit
import pandas as pd
import numpy as np
import json
import typing
from datetime import datetime

In [327]:
# Subreddit name handed to the scraper; also reused as the CSV file stem.
SUBREDDIT_NAME = 'MechanicalKeyboards'

# Upper bound on items taken per scraping pass (passed to scrape() as max_posts).
MAX_POSTS_PER_CHUNK = 1_000

# Layout used both to parse each item's 'created' value and to format cut-off
# dates. The "+00:00" suffix is literal text here, not a parsed UTC offset.
DATE_FORMAT = '%Y-%m-%dT%H:%M:%S+00:00'

# Items written by these authors are skipped while scraping.
invalid_authors = ('Estebansg7', 'AutoModerator', 'Deadbolt11')

In [328]:
# Global accumulator: scrape_chunk() appends one processed dict per kept item.
scrapes: list[dict[str, str]] = list()

In [329]:
def check_invalid_author(author: typing.Optional[str]) -> bool:
    """Tell whether an item's author is on the ``invalid_authors`` list.

    Matching is by substring: the author is rejected as soon as any configured
    name occurs anywhere inside it.
    NOTE(review): substring (not equality) matching also rejects longer names
    that merely contain a listed one -- confirm this is intended.

    Args:
        author: Author name of a scraped item. ``None`` (no author available)
            is accepted and never counts as invalid.

    Returns:
        True if the item should be skipped, False otherwise.
    """
    # Without this guard a missing author raises AttributeError/TypeError.
    if author is None:
        return False
    return any(blocked in author for blocked in invalid_authors)

In [330]:
# Fields of a scraped item that are discarded before it is stored.
keys_to_pop = ('date', 'id', 'parentId', 'subreddit', 'url', 'link', 'title')


def process_json(scrape: str) -> dict[str, typing.Any]:
    """Decode one item's JSON text and strip the unwanted fields.

    Args:
        scrape: JSON document (an object) describing a single scraped item.

    Returns:
        The decoded mapping without any key listed in ``keys_to_pop``; the
        remaining keys keep their original order.
    """
    decoded = json.loads(scrape)
    return {
        field: value
        for field, value in decoded.items()
        if field not in keys_to_pop
    }

In [331]:
def scrape_chunk(scrapper: snreddit.RedditSearchScraper, max_posts: int) -> None:
    """Consume up to ``max_posts`` items from ``scrapper`` into ``scrapes``.

    Each item whose author passes ``check_invalid_author`` is converted with
    ``process_json`` and appended to the module-level ``scrapes`` list. Items
    from rejected authors still count towards the ``max_posts`` limit.

    Args:
        scrapper: Scraper whose ``get_items()`` yields the items to process.
        max_posts: Maximum number of items to examine in this chunk.
    """
    for i, item in enumerate(scrapper.get_items()):
        # The index is zero-based, so ``>=`` caps the chunk at exactly
        # ``max_posts`` items (``>`` would let one extra item through).
        if i >= max_posts:
            break
        if check_invalid_author(item.author):
            continue
        scrapes.append(process_json(item.json()))

In [332]:
def get_earliest_date() -> datetime:
    """Return the oldest ``created`` timestamp among the collected ``scrapes``.

    Every entry's ``created`` string is parsed with ``DATE_FORMAT``; the
    smallest resulting datetime is returned.

    Raises:
        ValueError: if ``scrapes`` is empty or a value does not match the format.
    """
    created_dates = [
        datetime.strptime(entry.get('created'), DATE_FORMAT)
        for entry in scrapes
    ]
    return min(created_dates)

In [333]:
def format_date(date: datetime) -> str:
    """Render ``date`` as text using the notebook-wide ``DATE_FORMAT``.

    Args:
        date: The datetime to render.

    Returns:
        The formatted timestamp string.
    """
    formatted = datetime.strftime(date, DATE_FORMAT)
    return formatted

In [334]:
def scrape(n_times: int, max_posts: int) -> None:
    """Run ``n_times`` scraping passes of at most ``max_posts`` items each.

    Every pass builds a fresh ``RedditSearchScraper`` for ``SUBREDDIT_NAME``
    and hands it to ``scrape_chunk``, which appends to the global ``scrapes``.

    NOTE(review): the ``before=date_before`` argument is disabled, so
    ``date_before`` is computed but never sent to the scraper; presumably each
    pass therefore requests the same window -- confirm before relying on
    ``n_times > 1``. Also, ``datetime.now()`` is local time while
    ``DATE_FORMAT`` labels the value "+00:00".

    Args:
        n_times: Number of scraping passes to run.
        max_posts: Per-pass item limit forwarded to ``scrape_chunk``.
    """
    date_before = format_date(datetime.now())
    for _ in range(n_times):
        # Disabled keyword argument, kept for reference: before=date_before
        scrapper = snreddit.RedditSearchScraper(SUBREDDIT_NAME)
        scrape_chunk(scrapper, max_posts)
        # Oldest timestamp collected so far (currently unused, see NOTE above).
        date_before = format_date(get_earliest_date())

In [335]:
# Two scraping passes, each capped at MAX_POSTS_PER_CHUNK items.
scrape(n_times=2, max_posts=MAX_POSTS_PER_CHUNK)

Error retrieving https://api.pushshift.io/reddit/search/submission?q=MechanicalKeyboards&limit=1000&until=2023-05-09T19%3A51%3A32%2B00%3A00: non-200 status code
4 requests to https://api.pushshift.io/reddit/search/submission?q=MechanicalKeyboards&limit=1000&until=2023-05-09T19%3A51%3A32%2B00%3A00 failed, giving up.
Errors: non-200 status code, non-200 status code, ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.pushshift.io', port=443): Read timed out. (read timeout=10)")), non-200 status code


ScraperException: 4 requests to https://api.pushshift.io/reddit/search/submission?q=MechanicalKeyboards&limit=1000&until=2023-05-09T19%3A51%3A32%2B00%3A00 failed, giving up.

In [None]:
# Preview only the first few records; displaying the whole list floods the
# cell output and bloats the saved notebook.
scrapes[:5]

[{'_type': 'snscrape.modules.reddit.Submission',
  'author': 'IDontHaveNameHELP',
  'selftext': 'so i dont have much money and i am trying to find a good mechanical keyboard cuz my keyboard is glitching very often. Should i buy any keyboards from ***Silentium PC***?',
  'created': '2023-05-01T16:20:13+00:00'},
 {'_type': 'snscrape.modules.reddit.Comment',
  'author': 'sterlinghawktech',
  'body': "Since my previous post about the product I've been working on, I made a few improvements.\n\nFirst of all, this is SterlingKey™, a device which can turn your mechanical keyboard into bluetooth! It can be attached to keyboards with detachable cables, both Micro USB and Type C versions will be made, as well as non-detachable keyboards using a small adapter, as seen in the photos.\n\nIt can be connected to 3 devices and using a keyboard shortcut I can seamlessly switch between them. The battery lasts around 15 hours in this prototype version, but my goal is to achieve 50 hours of continuous use.

In [None]:
# One row per scraped item; keys absent from an item become missing values.
scrapes_df = pd.DataFrame(data=scrapes)
scrapes_df.sample(n=5)

Unnamed: 0,_type,author,selftext,created,body
29688,snscrape.modules.reddit.Comment,FreshCheekiBreeki,,2023-04-27T23:54:01+00:00,do whatever you find reasonable to put monitor...
318,snscrape.modules.reddit.Comment,FansForFlorida,,2023-05-01T10:04:55+00:00,See [this comment](https://reddit.com/r/Mechan...
25564,snscrape.modules.reddit.Comment,night-tide,,2023-04-30T06:10:16+00:00,My guess is you’re not likely to find a mechan...
20379,snscrape.modules.reddit.Comment,troyoz_,,2023-04-29T16:34:46+00:00,Mechanical keyboard reviews on YT really helpe...
23962,snscrape.modules.reddit.Comment,LevanderFela,,2023-05-01T06:23:59+00:00,You start by some [reading](https://www.keyboa...


In [None]:
# Turn the 'created' strings into pandas datetimes.
scrapes_df['created'] = scrapes_df['created'].pipe(pd.to_datetime)

In [None]:
def fill_body(row_id: int, body: str, selftext: str) -> None:
    """Copy ``selftext`` into the ``body`` cell of one row when ``body`` is missing.

    Single-row helper; the statement below this function applies the same rule
    to the whole frame at once.

    Args:
        row_id: Index label of the row in ``scrapes_df``.
        body: Current ``body`` value of that row.
        selftext: Value to store when ``body`` is missing.
    """
    # pd.isna covers NaN, None and pd.NA. Keys absent from a record show up as
    # NaN in the frame, which an identity test against pd.NA never matches.
    if pd.isna(body):
        # A single .loc assignment writes into the frame itself, whereas
        # chained indexing (.iloc[[i]]['body'] = ...) only changes a copy.
        scrapes_df.loc[row_id, 'body'] = selftext


# Vectorised fill: rows without a 'body' take their text from 'selftext'.
scrapes_df['body'] = scrapes_df['body'].fillna(scrapes_df['selftext'])

# Collapse to a single 'text' column and shorten '_type' to 'type'.
scrapes_df = (
    scrapes_df
    .drop(columns='selftext')
    .rename(columns={'body': 'text', '_type': 'type'})
)

In [None]:
# Drop the module prefix from every 'type' value. The vectorised .str accessor
# replaces the per-element lambda and leaves missing values untouched;
# regex=False keeps the dots in the prefix literal.
scrapes_df['type'] = scrapes_df['type'].str.replace('snscrape.modules.reddit.', '', regex=False)
scrapes_df.sample(10)

Unnamed: 0,type,author,created,text
25569,Comment,cheat_bot,2023-04-30 01:58:42+00:00,I'm looking for a budget (under $100) mechanic...
25792,Comment,agonzal7,2023-05-01 15:21:12+00:00,He said he’s “not” super into mechanical keybo...
28571,Comment,TheBeardedDave,2023-05-01 03:38:19+00:00,TBH I don't really get the whole obsession wit...
25862,Comment,8BallBrad,2023-04-30 07:46:04+00:00,Mechanical keyboards is missing
14564,Comment,EternityinHeaven,2023-04-29 08:29:14+00:00,"Hello, MechanicalKeyboards community on Reddit..."
22605,Submission,olaamapolox,2023-04-28 10:03:25+00:00,
12384,Submission,dreamerr666,2023-04-29 19:50:14+00:00,
24331,Comment,Japetheone,2023-04-30 07:00:02+00:00,Not sure where my description went but I found...
24260,Comment,mrmivo,2023-05-01 12:44:59+00:00,"It always comes down to preference, even more ..."
22330,Comment,ThePsychedelicSeal,2023-04-27 21:38:07+00:00,Store: [https://www.etsy.com/shop/SynthandKeys...


In [None]:
# Write the cleaned frame to "<SUBREDDIT_NAME>.csv" in the working directory.
scrapes_df.to_csv(f'{SUBREDDIT_NAME}.csv')