In [1]:
import praw
import pandas as pd
import re
from dotenv import load_dotenv
import os

In [2]:
load_dotenv()

REDDIT_CLIENT_ID = os.getenv("REDDIT_CLIENT_ID",)
REDDIT_CLIENT_SECRET = os.getenv("REDDIT_CLIENT_SECRET")
REDDIT_USER_AGENT = os.getenv("REDDIT_USER_AGENT")
REDDIT_USER_NAME = os.getenv("REDDIT_USER_NAME")
REDDIT_USER_PASSWORD = os.getenv("REDDIT_USER_PASSWORD")

assert REDDIT_CLIENT_ID != ""
assert REDDIT_CLIENT_SECRET != ""
assert REDDIT_USER_AGENT != ""

In [3]:
reddit = praw.Reddit(
    client_id = REDDIT_CLIENT_ID,
    client_secret = REDDIT_CLIENT_SECRET,
    user_agent = REDDIT_USER_AGENT
)

In [28]:
# Function to extract posts from a subreddit
def extract_subreddit_data(subreddit, limit=100):
    posts = []
    for submission in reddit.subreddit(subreddit).hot(limit=limit):
        # Extract relevant data from each post
        posts.append({
            'author':submission.author,
            'author_flair_text':submission.author_flair_text,
            'created_utc': submission.created_utc,
            'is_self' : submission.is_self,
            'link_flair_text':submission.link_flair_text,
            'name':submission.name,
            'num_comments':submission.num_comments,
            'score':submission.score,
            'selftext':submission.selftext,
            'stickied':submission.stickied,
            'title':submission.title,
            'upvote_ratio':submission.upvote_ratio,
            'url': submission.url
        })
    return posts

In [29]:
# Extract data from r/collegeresults
data = extract_subreddit_data('collegeresults')

# convert to pd dataframe
df = pd.DataFrame(data)

# remove stickied 
df = df[~df.stickied]

In [66]:
# check link flair text
df.link_flair_text.apply(lambda x: re.sub('\+',"",x.split('|')[0]) if x!= 'Other' else 'a')
df.link_flair_text.loc[95]

'Other|1400+/31+|Art/Hum'

In [68]:
# extract demographics from link flair text
df['gpa'] = df.link_flair_text.apply(lambda x: re.sub('\+',"",x.split('|')[0]) if x.split('|')[0]!= 'Other' else None)

df['test_scores'] = df.link_flair_text.apply(lambda x: (x.split('|'))[1])

df['major'] = df.link_flair_text.apply(lambda x: (x.split('|'))[2])

In [62]:
def extract_demographics(text):
    demographics = {}

    # Define patterns for each demographic category
    patterns = {
        'Gender': r'Gender\s*:\s*([^\n]+)',
        'Race/ethnicity': r'Race/ethnicity\s*:\s*([^\n]+)',
        'Residence': r'Residence\s*:\s*([^\n]+)',
        'Income Bracket': r'Income Bracket\s*:\s*([^\n]+)',
        'Type of School': r'Type of School\s*:\s*([^\n]+)',
        'Hooks': r'Hooks\s*:\s*([^\n]+)'
    }

    # Extract information for each category
    for category, pattern in patterns.items():
        match = re.search(pattern, text)
        demographics[category] = match.group(1).strip() if match else None

    return demographics

# Example usage
demographics_text = "* Gender: Male\n* Race/ethnicity: Asian (chinese)\n* Residence: NY (luckily not nyc)\n* Income Bracket: 200k+\n* Type of School: kinda competitive public\n* Hooks (Recruited Athlete, URM, First-Gen, Geographic, Legacy, etc.): none!"
demographics_info = extract_demographics(demographics_text)
print(demographics_info)

{'Gender': 'Male', 'Race/ethnicity': 'Asian (chinese)', 'Residence': 'NY (luckily not nyc)', 'Income Bracket': '200k+', 'Type of School': 'kinda competitive public', 'Hooks': None}


In [63]:

def extract_section(post, section_title):
    pattern = r"\*\*{}.*?\*\*(.*?)(?=\*\*[A-Z]|$)".format(re.escape(section_title))
    match = re.search(pattern, post, re.IGNORECASE | re.DOTALL)
    return match.group(1).strip() if match else None

def parse_key_value_list(section_text):
    items = re.findall(r"\*\s*(.*?):\s*(.*)", section_text)
    return {item[0].strip(): item[1].strip() for item in items}

def parse_numbered_list(section_text):
    items = re.findall(r"^(\d+)\.\s*(.*?)$", section_text, re.MULTILINE)
    return {f"#{num}": item.strip() for num, item in items}

def parse_simple_list(section_text):
    items = re.findall(r"\*\s*(.*)", section_text)
    return items


def parse_demographics(post):

    section = extract_section(post, "Academics")

    demographics = {}

    # Define patterns for each demographic category
    patterns = {
        'Gender': r'Gender\s*:\s*([^\n]+)',
        'Race/ethnicity': r'Race/ethnicity\s*:\s*([^\n]+)',
        'Residence': r'Residence\s*:\s*([^\n]+)',
        'Income Bracket': r'Income Bracket\s*:\s*([^\n]+)',
        'Type of School': r'Type of School\s*:\s*([^\n]+)',
        'Hooks': r'Hooks\s*:\s*([^\n]+)'
    }

    # Extract information for each category
    for category, pattern in patterns.items():
        match = re.search(pattern, post)
        demographics[category] = match.group(1).strip() if match else None

    return demographics


def parse_academics(post):
    section = extract_section(post, "Academics")
    return parse_key_value_list(section) if section else {}

def parse_standardized_testing(post):
    section = extract_section(post, "Standardized Testing")
    return parse_key_value_list(section) if section else {}

def parse_extracurriculars(post):
    section = extract_section(post, "Extracurriculars/Activities")
    return parse_numbered_list(section) if section else {}

def parse_awards(post):
    section = extract_section(post, "Awards/honors")
    return parse_numbered_list(section) if section else {}

def parse_letters_of_recommendation(post):
    section = extract_section(post, "Letters of Recommendation")
    return parse_simple_list(section) if section else {}

def parse_essays(post):
    section = extract_section(post, "Essays")
    return parse_key_value_list(section) if section else {}

def parse_decisions(post):
#     section = extract_section(post, "Decisions")
#     return parse_key_value_list(section) if section else {}
    # Find the section that starts with 'Acceptances:'
    # Find the section that starts with 'Acceptances:'
    acceptances_section = re.search(r'(Acceptances|Accepted):\s*\*?\s*(.*?)\s*(?=\*\*[A-Z]|$)', post, re.IGNORECASE | re.DOTALL)

    if acceptances_section:
        acceptances_text = acceptances_section.group(1)
        
        # Extract individual universities
        universities = re.findall(r'[\n\*]\s*(.+?)(?:\s*\n|$)', acceptances_text)
        return [uni.strip() for uni in universities if uni.strip()]
    else:

        return []
    

# Example usage
    
post_content = df.selftext.iloc[3]

demographics = parse_demographics(post_content)
academics = parse_academics(post_content)
standardized_testing = parse_standardized_testing(post_content)
extracurriculars = parse_extracurriculars(post_content)
awards = parse_awards(post_content)
letters_of_recommendation = parse_letters_of_recommendation(post_content)
essays = parse_essays(post_content)
decisions = parse_decisions(post_content)




# Output the results
print("Demographics:", demographics)
print("Academics:", academics)
print("Standardized Testing:", standardized_testing)
print("Extracurriculars:", extracurriculars)
print("Awards:", awards)
print("Letters of Recommendation:", letters_of_recommendation)
print("Essays:", essays)
print("Decisions:", decisions)




Demographics: {'Gender': 'F', 'Race/ethnicity': None, 'Residence': 'Illinois suburbs', 'Income Bracket': 'single parent makes around 70,000-90,000', 'Type of School': 'fairly competitive public high school', 'Hooks': 'One parent earned a Masters from WashU, not sure if that counts'}
Academics: {'GPA (UW/W)': '3.9 UW, 4.6 W', 'Rank (or percentile)': 'unranked, but I’d estimate top 20%?', 'AP Music Theory (5': 'Non-Aural 5, Aural 4)'}
Standardized Testing: {}
Extracurriculars: {'#1': 'Sport (leaving this vague, but no scholarships are offered in the US for this sport unless at a national/Olympic level): captain for a year, 4 year varsity, state top 12', '#2': 'Family responsibilities (in my circumstances, I felt like it was appropriate to list)', '#3': 'Local business job: worked for 3 years, assistant manager for 2 years, often had solo shifts', '#4': 'Keyboard-building (like the only thing I have relevant to my intended major): just a personal hobby I spent time on, no competitions or 

In [54]:
post_content = df.selftext.iloc[3]
print(post_content)

‼️ this was for the 2022-2023 admissions cycle (I’m class of 2027) ‼️

please don't dox me again... this is a repost since someone didn't hush😭. I made this post to try to help others like me who have relatively 'average' stats, but **please respect my privacy**! If i get doxxed again I will take down this post permanently, but all I wanted to do with this was to hopefully assist others with my experience of college apps and to just relieve some of the stress that comes with such an arduous, difficult process.

This is super late considering it's January 2024, but I've wanted to post my results here since I've lurked on A2C and this subreddit for the entirety of my college app season...I'm very grateful for my parents' support and friends' advice when I was applying, and so I hope that my stats and essay info can similarly help others going through the devilish process that is college app season! (If you know who I am shhhhh)

\*More tips and personal opinions I have at the bottom of t

In [41]:
post_content = df.selftext.loc[20]

extract_section(post_content, r'Gender\s*:\s*([^\n]+)')


In [26]:
post_content

'female, private school, first gen, NY\n\n- Rank: 10-15%\n- GPA: 3.94/4.0 (6 A-‘s, 1 B)\n- SAT: 1530\n- AP: 5544443322\n\n**Before I get attacked again😭 I meant I didn’t have perfect stats like most admitted students (referring to my AP scores and rank). I never said they were bad; I just said they “were’t the best” compared to others accepted to the same schools and peers at my school.**\n\nResearch\n- National Junior Science & Humanities Symposium \n- Initiated two independent biomed projects\n- Interned at my local university lab over the summer\n\nMusic\n- YoungArts Finalist\n- Attended prestigious summer music camp\n- Won a lot of international competitions \n- Famous musician wrote one of my letter of recs\n\nLeadership/Community\n-  President & founder of 2 organizations that address educational disparities\n- President’s Volunteer Service Award, Gold\n\nStanford REA ✅'

In [20]:
import re

def extract_section(post, section_title):
    # Creating a pattern that is case-insensitive
    pattern = r"\*\*{}.*?\*\*(.*?)(?=\*\*[A-Z]|$)".format(re.escape(section_title))
    match = re.search(pattern, post, re.IGNORECASE | re.DOTALL)
    return match.group(1).strip() if match else None


def parse_key_value_section(section_text):
    data = {}
    for line in section_text.split('\n\n'):
        if ':' in line:
            key, value = line.split(':', 1)
            data[key.strip()] = value.strip()
    return data

def parse_list_section(section_text):
    items = re.findall(r"\\#(\d+)\s*(.*)", section_text)
    return {f"#{num}": item.strip() for num, item in items}

# Assuming `post_content` contains the text of the Reddit post
post_content = df.selftext.loc[35]

# Parsing each section
demographics = extract_demographics(extract_section(post_content, "Demographics"))
academics = parse_key_value_section(extract_section(post_content, "Academics"))
standardized_testing = parse_key_value_section(extract_section(post_content, "Standardized Testing"))
extracurriculars = parse_list_section(extract_section(post_content, "Extracurriculars/Activities"))
awards = parse_list_section(extract_section(post_content, "Awards/Honors"))
letters_of_recommendation = parse_key_value_section(extract_section(post_content, "Letters of Recommendation"))
essays = parse_key_value_section(extract_section(post_content, "Essays"))
decisions = parse_key_value_section(extract_section(post_content, "Decisions"))

# Output the results
print("Demographics:", demographics)
print("Academics:", academics.keys())
print("Standardized Testing:", standardized_testing)
print("Extracurriculars:", extracurriculars)
print("Awards:", awards)
print("Letters of Recommendation:", letters_of_recommendation)
print("Essays:", essays)
print("Decisions:", decisions)


TypeError: expected string or bytes-like object, got 'NoneType'

In [19]:
# Assuming `post_content` contains the text of the Reddit post
post_content = df.selftext.loc[35]

# Parsing each section
demographics = parse_key_value_section(extract_section(post_content, "Demographics"))

print(demographics)

post_content[:1000]

NameError: name 'parse_key_value_section' is not defined

In [17]:
df["selftext"].iloc[1]

"Demographics\n\nGender: Female (trans)\n\nRace/Ethnicity: White\n\nResidence: NJ\n\nIncome Bracket: Middle class\n\nType of School: Public (wealthy area)\n\nHooks (Recruited Athlete, URM, First-Gen, Geographic, Legacy, etc.):  N/A\n\nIntended Major(s): Computer Science and Math\n\n&#x200B;\n\nAcademics\n\nGPA (UW/W):  3.9 / 4.5\n\nRank (or percentile): Top 10%\n\n\\# of Honors/AP/IB/Dual Enrollment/etc.: 10 APs (Calc AB+BC, Stat, Bio, Chem, Physics C, Pysch, Econ, Lang, CS A), rest honors\n\nSenior Year Course Load: 4 APs, 1 honors, study hall\n\n&#x200B;\n\nStandardized Testing\n\nSAT I: took once - 1540 (780RW, 760M lol)\n\n&#x200B;\n\nExtracurriculars/ActivitiesList all extracurricular involvements, including leadership roles, time commitments, major achievements, etc.\n\n\\#1   Self taught pianist - 10 years - Taught myself from age 7 using sheet music and gradually became better over the years by seeking out more and more challenging music that interested me.\n\n\\#2   Chess Club

In [9]:
for i in posts:
    print(i.is_original_content)

False
False
False
False
False
False
False
False
False
False


In [16]:
tmp = list(submission)

In [21]:
type(tmp[0])

praw.models.reddit.submission.Submission