In [1]:
import praw
import pandas as pd
import re
from dotenv import load_dotenv
import os
import numpy as np
from datetime import date

In [2]:
# Load variables from a .env file (if present) into the process environment
load_dotenv()

# Credentials are read from the environment so that nothing secret is
# hard-coded in the notebook.
REDDIT_CLIENT_ID = os.getenv("REDDIT_CLIENT_ID")
REDDIT_CLIENT_SECRET = os.getenv("REDDIT_CLIENT_SECRET")
REDDIT_USER_AGENT = os.getenv("REDDIT_USER_AGENT")
# Optional account credentials; they are not validated here.
REDDIT_USER_NAME = os.getenv("REDDIT_USER_NAME")
REDDIT_USER_PASSWORD = os.getenv("REDDIT_USER_PASSWORD")

# os.getenv returns None (not "") for an unset variable, so a `!= ""` check
# would pass for missing credentials. Test truthiness to reject both None
# and the empty string.
assert REDDIT_CLIENT_ID, "REDDIT_CLIENT_ID is missing or empty (environment / .env)"
assert REDDIT_CLIENT_SECRET, "REDDIT_CLIENT_SECRET is missing or empty (environment / .env)"
assert REDDIT_USER_AGENT, "REDDIT_USER_AGENT is missing or empty (environment / .env)"

In [3]:
# Instantiate the PRAW client used by the rest of the notebook.
# Only the app credentials and user agent are supplied; no username or
# password is passed.
reddit = praw.Reddit(
    user_agent=REDDIT_USER_AGENT,
    client_secret=REDDIT_CLIENT_SECRET,
    client_id=REDDIT_CLIENT_ID,
)

In [4]:
# Function to extract posts from a subreddit
def extract_subreddit_data(
    subreddit,
    limit=100,
    sorts=("controversial", "hot", "new", "rising", "top"),
    reddit_client=None,
):
    """Collect submission metadata from one or more listings of a subreddit.

    Parameters
    ----------
    subreddit : str
        Name handed to ``<client>.subreddit(...)``.
    limit : int or None, default 100
        Forwarded unchanged as ``limit=`` to every listing call.
    sorts : str or iterable of str
        Names of the listing methods to call on the subreddit object
        (each is looked up with ``getattr``). A single string is treated
        as one listing name rather than being iterated per character.
    reddit_client : object, optional
        Any object exposing ``subreddit(name)``. When omitted, the
        module-level ``reddit`` instance is used, as before.

    Returns
    -------
    list of dict
        One dict per submission per listing, in listing order. A submission
        returned by several listings appears several times; no
        de-duplication is done here.

    Raises
    ------
    AttributeError
        If a name in ``sorts`` is not an attribute of the subreddit object.
    """
    # Attributes copied verbatim from each submission; the order fixes the
    # key order of every returned dict.
    fields = (
        'author',
        'author_flair_text',
        'created_utc',
        'is_self',
        'link_flair_text',
        'name',
        'num_comments',
        'score',
        'selftext',
        'stickied',
        'title',
        'upvote_ratio',
        'url',
    )

    # A bare string would otherwise be iterated character by character.
    if isinstance(sorts, str):
        sorts = (sorts,)

    # Fall back to the notebook-level client only when none was injected,
    # so the global is not required when a client is passed explicitly.
    client = reddit_client if reddit_client is not None else reddit
    subreddit_instance = client.subreddit(subreddit)

    posts = []
    for sort_name in sorts:
        listing = getattr(subreddit_instance, sort_name)
        for submission in listing(limit=limit):
            # Extract relevant data from each post
            posts.append({field: getattr(submission, field) for field in fields})

    return posts

In [5]:
# Scrape r/collegeresults; limit=None is forwarded to every listing call
posts = extract_subreddit_data(subreddit="collegeresults", limit=None)

In [6]:
# Build the frame in one chain:
#   1. one row per scraped post dict
#   2. drop rows whose `stickied` flag is set
#   3. keep the first row for each submission full name and renumber the index
df = (
    pd.DataFrame(posts)
    .loc[lambda frame: ~frame.stickied]
    .drop_duplicates(subset="name", keep="first", ignore_index=True)
)

# (rows, columns) of the cleaned frame
df.shape

(2119, 13)

In [8]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,author,author_flair_text,created_utc,is_self,link_flair_text,name,num_comments,score,selftext,stickied,title,upvote_ratio,url
0,jbut9524,,1651963000.0,True,3.8+|1500+/34+|STEM,t3_uko4vp,40,0,"**Gender/Race:** Asian, Male\n\n**Residence:**...",False,Asian Male in CS gets rejected from everywhere...,0.47,https://www.reddit.com/r/collegeresults/commen...
1,rowale1,,1697767000.0,True,3.2+|1300+/28+|Art/Hum,t3_17bztn3,212,6,**Demographics**\n\n**Gender**: Male\n\n**Race...,False,"RESULTS FOR NYU ADMIT, CLASS OF 2023",0.52,https://www.reddit.com/r/collegeresults/commen...
2,Minute_Champion_3188,,1703573000.0,True,3.8+|1500+/34+|STEM,t3_18r264u,97,69,"female, private school, first gen, NY\n\n- Ran...",False,Grades weren’t the best but accepted to Stanfo...,0.6,https://www.reddit.com/r/collegeresults/commen...
3,Known_Car4289,,1649100000.0,True,3.6+|1300+/28+|STEM,t3_twagph,61,20,**Note:** Using throwaway account to post so I...,False,asian + male + cs + IVY + crazy story,0.56,https://www.reddit.com/r/collegeresults/commen...
4,dumbasscorgi1,,1680219000.0,True,3.8+|1500+/34+|Art/Hum,t3_1272uf8,17,2,"Demographics: White, gay, non-binary, Jewish, ...",False,Turns out I’m a failure :),0.52,https://www.reddit.com/r/collegeresults/commen...


In [9]:
# save dataset
# The file name embeds today's date (MM-DD-YYYY) so each scrape is kept separately.
filename = "../data/" + "collegeresults-scraped-" + date.today().strftime("%m-%d-%Y") + ".csv"
print(filename)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists (e.g. on a fresh checkout) before writing.
os.makedirs(os.path.dirname(filename), exist_ok=True)
df.to_csv(filename, index=False)

../data/collegeresults-scraped-01-07-2024.csv


In [14]:
# what happens if I get random posts using subreddit.random()?
# will it yield new posts?
post_fullnames = set(df["name"])

subreddit = reddit.subreddit("collegeresults")
random_submission_fullnames = []

for _ in range(1000):
    try:
        random_submission = subreddit.random()
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl-C still interrupts the
        # cell, and report what actually went wrong instead of assuming a
        # rate-limit response.
        print(
            f"Ending random post retrieval after {len(random_submission_fullnames)} posts "
            f"({type(exc).__name__}: {exc})"
        )
        break

    # NOTE(review): PRAW documents that random() can return None when the
    # subreddit does not support random submissions; stop cleanly then.
    if random_submission is None:
        print("subreddit.random() returned no submission... Ending random post retrieval")
        break

    random_submission_fullnames.append(random_submission.name)

# Fraction of randomly drawn posts that were already collected by the scrape.
# dtype=bool keeps the array boolean even when no random posts were retrieved.
submission_exists = np.array(
    [fullname in post_fullnames for fullname in random_submission_fullnames],
    dtype=bool,
)
submission_exists.mean()


Too many requests (Received 429 HTTP response)... Ending random post retrieval


1.0

In [15]:
# Number of random submissions that were retrieved (1-D array, so size == len)
submission_exists.size

505