In [None]:
import random
import time
from urllib.parse import quote

import pandas as pd
import requests

## Scraping functions

In [None]:
def subreddit_post_scraping(url, post_count, session=None, sleep_range=(2, 60)):
    """Scrape posts from a Reddit JSON listing, following ``after`` pagination.

    Parameters
    ----------
    url : str
        Listing endpoint returning Reddit-style JSON. It may or may not
        already contain a query string.
    post_count : int
        Approximate number of posts wanted. It is converted to a number of
        page requests assuming 25 posts per page, rounded up. Whole pages are
        returned, so the result can be longer or shorter than ``post_count``.
    session : object, optional
        Any object exposing ``get(url, headers=..., timeout=...)`` (for
        example a ``requests.Session``). Defaults to the ``requests`` module.
    sleep_range : tuple of int, optional
        Inclusive ``(low, high)`` bounds, in seconds, of the random pause
        taken between two consecutive requests.

    Returns
    -------
    list of dict
        The ``data`` payload of every child collected before the listing was
        exhausted, the page budget was used up, or a request failed.
    """
    posts_per_page = 25
    request_timeout = 30  # seconds; without it a stalled connection hangs forever

    http = requests if session is None else session
    # Use '&' when the URL already has a query string, '?' otherwise.
    separator = '&' if '?' in url else '?'
    # Ceiling division: a small post_count must still trigger one request.
    pages = int(-(-post_count // posts_per_page))

    posts = []
    after = None
    for page_index in range(pages):
        if page_index:
            # Random pause to look more 'natural'. It happens only between
            # requests, so no time is wasted after the final page.
            sleep_duration = random.randint(*sleep_range)
            print(sleep_duration)
            time.sleep(sleep_duration)

        current_url = url if after is None else url + separator + 'after=' + after
        print(current_url)
        try:
            res = http.get(
                current_url,
                headers={'User-agent': 'Pony Inc 1.0'},
                timeout=request_timeout,
            )
        except requests.RequestException as err:
            # Best effort, like the status check below: keep what we have.
            print('Request error', err)
            break

        if res.status_code != 200:
            print('Status error', res.status_code)
            break

        listing = res.json()['data']
        posts.extend(child['data'] for child in listing['children'])

        after = listing.get('after')
        if not after:
            # No cursor means the listing is exhausted. Continuing would
            # request the first page again and append duplicate posts.
            break

    return posts

In [None]:
def subreddit_post_scraping_flair_filter(url, post_count, flair_name, session=None, sleep_range=(2, 15)):
    """Scrape posts from a flair-filtered Reddit search and tag each post.

    Parameters
    ----------
    url : str
        Search endpoint returning Reddit-style JSON. It may or may not
        already contain a query string.
    post_count : int
        Approximate number of posts wanted. It is converted to a number of
        page requests assuming 25 posts per page, rounded up. Whole pages are
        returned, so the result can be longer or shorter than ``post_count``.
    flair_name : str
        Value stored under the ``'tag'`` key of every scraped post.
    session : object, optional
        Any object exposing ``get(url, headers=..., timeout=...)`` (for
        example a ``requests.Session``). Defaults to the ``requests`` module.
    sleep_range : tuple of int, optional
        Inclusive ``(low, high)`` bounds, in seconds, of the random pause
        taken between two consecutive requests.

    Returns
    -------
    list of dict
        The ``data`` payload of every child, each with an added ``'tag'``
        entry, collected before the results were exhausted, the page budget
        was used up, or a request failed.
    """
    posts_per_page = 25
    request_timeout = 30  # seconds; without it a stalled connection hangs forever

    http = requests if session is None else session
    # Use '&' when the URL already has a query string, '?' otherwise.
    separator = '&' if '?' in url else '?'
    # Ceiling division: a small post_count must still trigger one request.
    pages = int(-(-post_count // posts_per_page))

    posts = []
    after = None
    for page_index in range(pages):
        if page_index:
            # Random pause to look more 'natural'. It happens only between
            # requests, so no time is wasted after the final page.
            sleep_duration = random.randint(*sleep_range)
            print(sleep_duration)
            time.sleep(sleep_duration)

        current_url = url if after is None else url + separator + 'after=' + after
        print(current_url)
        try:
            res = http.get(
                current_url,
                headers={'User-agent': 'Pony Inc 1.0'},
                timeout=request_timeout,
            )
        except requests.RequestException as err:
            # Best effort, like the status check below: keep what we have.
            print('Request error', err)
            break

        if res.status_code != 200:
            print('Status error', res.status_code)
            break

        listing = res.json()['data']
        for child in listing['children']:
            post = child['data']
            # Record which flair search produced this post.
            post['tag'] = flair_name
            posts.append(post)

        after = listing.get('after')
        if not after:
            # No cursor means the results are exhausted. Continuing would
            # request the first page again and append duplicate posts.
            break

    return posts

## Looping through the posts, 25 posts at a time (total of ~1000 posts)

### Scrape main subreddit page

In [None]:
# Scrape the main r/SteamDeck listing.
# NOTE(review): the section heading mentions ~1000 posts while 5000 are
# requested here — confirm which figure is intended.
steam_deck_posts = subreddit_post_scraping(url='https://www.reddit.com/r/SteamDeck.json', post_count=5000)

# Build the frame once and write it out in both formats.
steam_deck_posts_df = pd.DataFrame(steam_deck_posts)
steam_deck_posts_df.to_csv('../data/steam_deck_reddit_posts.csv', index=False)
# orient='records' never writes the index, so no `index` argument is needed;
# passing index=False here raises ValueError on pandas versions before 2.1.
steam_deck_posts_df.to_json('../data/steam_deck_reddit_posts.json', orient='records')

### Scrape flair-filtered search pages

In [None]:
# Maps each flair name to its r/SteamDeck search URL. The query is
# `flair:"<name>"`, percent-encoded with no safe characters so that spaces
# become %20 and '/' becomes %2F.
flair_url_list = {
    flair: 'https://www.reddit.com/r/SteamDeck/search.json?q=' + quote('flair:"' + flair + '"', safe='')
    for flair in (
        'Picture',
        'Configuration',
        'Video',
        'Guide',
        'Meme / Shitpost',
        'News',
        'MEGATHREAD',
        'Feature Request',
        'Hot Wasabi',
        'Meta',
    )
}

In [None]:
# Scrape every flair-filtered search and collect the tagged posts in one list.
steam_deck_posts_flair = []
for flair, flair_url in flair_url_list.items():
    steam_deck_posts_flair.extend(
        subreddit_post_scraping_flair_filter(url=flair_url, post_count=500, flair_name=flair)
    )

# Build the frame once and write it out in both formats.
steam_deck_posts_flair_df = pd.DataFrame(steam_deck_posts_flair)
steam_deck_posts_flair_df.to_csv('../data/steam_deck_reddit_posts_flair.csv', index=False)
# orient='records' never writes the index, so no `index` argument is needed;
# passing index=False here raises ValueError on pandas versions before 2.1.
steam_deck_posts_flair_df.to_json('../data/steam_deck_reddit_posts_flair.json', orient='records')