In [1]:
import praw
import pandas as pd
import re


In [2]:
# Set up PRAW with your Reddit API credentials

reddit = praw.Reddit(client_id = "", 
                     client_secret = "",
                     user_agent = "", 
                     username="",
                    password = "")


# Function to extract posts from a subreddit
def extract_subreddit_data(subreddit, limit=100):
    posts = []
    for submission in reddit.subreddit(subreddit).hot(limit=limit):
        # Extract relevant data from each post
        posts.append({
            'author':submission.author,
            'author_flair_text':submission.author_flair_text,
            'created_utc': submission.created_utc,
            'is_self' : submission.is_self,
            'link_flair_text':submission.link_flair_text,
            'name':submission.name,
            'num_comments':submission.num_comments,
            'score':submission.score,
            'selftext':submission.selftext,
            'stickied':submission.stickied,
            'title':submission.title,
            'upvote_ratio':submission.upvote_ratio,
            'url': submission.url
        })
    return posts



# Extract data from r/collegeresults
data = extract_subreddit_data('collegeresults')

In [3]:

# Convert data to a pandas DataFrame for easier processing
df = pd.DataFrame(data)

In [4]:
df = df[~df.stickied]


In [5]:
df.link_flair_text.apply(lambda x: re.sub('\+',"",x.split('|')[0]) if x!= 'Other' else 'a').loc[95]



'Other'

In [6]:
df.link_flair_text.loc[95]

'Other|1400+/31+|Art/Hum'

In [7]:
df['gpa'] = df.link_flair_text.apply(lambda x: re.sub('\+',"",x.split('|')[0]) if x.split('|')[0]!= 'Other' else None)

df['test_scores'] = df.link_flair_text.apply(lambda x: (x.split('|'))[1])

df['major'] = df.link_flair_text.apply(lambda x: (x.split('|'))[2])



In [8]:
df['sat_score'] = df.test_scores.apply(lambda x: re.sub('\+',"",x.split('/')[0]) if x.split('/')[0]!='Other' else None).astype(float)
df['act_score'] = df.test_scores.apply(lambda x: re.sub('\+',"",x.split('/')[1]) if x!='Other' else None).astype(float)

In [9]:

def extract_section(post, section_title):
    pattern = r"\*\*{}.*?\*\*(.*?)(?=\*\*[A-Z]|$)".format(re.escape(section_title))
    match = re.search(pattern, post, re.IGNORECASE | re.DOTALL)
    return match.group(1).strip() if match else None

def parse_key_value_list(section_text):
    items = re.findall(r"\*\s*(.*?):\s*(.*)", section_text)
    return {item[0].strip(): item[1].strip() for item in items}

def parse_numbered_list(section_text):
    items = re.findall(r"^(\d+)\.\s*(.*?)$", section_text, re.MULTILINE)
    return {f"#{num}": item.strip() for num, item in items}

def parse_simple_list(section_text):
    items = re.findall(r"\*\s*(.*)", section_text)
    return items


def parse_demographics(post):

    section = extract_section(post, "Academics")

    demographics = {}

    # Define patterns for each demographic category
    patterns = {
        'Gender': r'Gender\s*:\s*([^\n]+)',
        'Race/ethnicity': r'Race/ethnicity\s*:\s*([^\n]+)',
        'Residence': r'Residence\s*:\s*([^\n]+)',
        'Income Bracket': r'Income Bracket\s*:\s*([^\n]+)',
        'Type of School': r'Type of School\s*:\s*([^\n]+)',
        'Hooks': r'Hooks\s*:\s*([^\n]+)'
    }

    # Extract information for each category
    for category, pattern in patterns.items():
        match = re.search(pattern, post)
        demographics[category] = match.group(1).strip() if match else None

    return demographics


def parse_academics(post):
    section = extract_section(post, "Academics")
    return parse_key_value_list(section) if section else {}

def parse_standardized_testing(post):
    section = extract_section(post, "Standardized Testing")
    return parse_key_value_list(section) if section else {}

def parse_extracurriculars(post):
    section = extract_section(post, "Extracurriculars/Activities")
    return parse_numbered_list(section) if section else {}

def parse_awards(post):
    section = extract_section(post, "Awards/honors")
    return parse_numbered_list(section) if section else {}

def parse_letters_of_recommendation(post):
    section = extract_section(post, "Letters of Recommendation")
    return parse_simple_list(section) if section else {}

def parse_essays(post):
    section = extract_section(post, "Essays")
    return parse_key_value_list(section) if section else {}

def parse_decisions(post):
#     section = extract_section(post, "Decisions")
#     return parse_key_value_list(section) if section else {}
    # Find the section that starts with 'Acceptances:'
    # Find the section that starts with 'Acceptances:'
    acceptances_section = re.search(r'(Acceptances|Accepted):\s*\*?\s*(.*?)\s*(?=\*\*[A-Z]|$)', post, re.IGNORECASE | re.DOTALL)

    if acceptances_section:
        acceptances_text = acceptances_section.group(1)
        
        # Extract individual universities
        universities = re.findall(r'[\n\*]\s*(.+?)(?:\s*\n|$)', acceptances_text)
        return [uni.strip() for uni in universities if uni.strip()]
    else:

        return []
    

# Example usage

demographics = parse_demographics(post_content)
academics = parse_academics(post_content)
standardized_testing = parse_standardized_testing(post_content)
extracurriculars = parse_extracurriculars(post_content)
awards = parse_awards(post_content)
letters_of_recommendation = parse_letters_of_recommendation(post_content)
essays = parse_essays(post_content)
decisions = parse_decisions(post_content)




# Output the results
print("Demographics:", demographics)
print("Academics:", academics)
print("Standardized Testing:", standardized_testing)
print("Extracurriculars:", extracurriculars)
print("Awards:", awards)
print("Letters of Recommendation:", letters_of_recommendation)
print("Essays:", essays)
print("Decisions:", decisions)




NameError: name 'post_content' is not defined

In [None]:
df['demographics'] = df.selftext.apply(lambda x: parse_demographics(x))
# df['academics']
# df['standardized_testing']
# df['extracurriculars']
# df['awards']
# df['letters_of_recommendation']
# df['essays']
df['decisions'] = df.selftext.apply(lambda x: parse_decisions(x))

df['decisions']

In [None]:
df.selftext.loc[98]

In [None]:
demographics_df = pd.json_normalize(df['demographics'])

new_df = pd.concat([df,demographics_df],axis=1)

In [None]:
new_df

In [None]:
sum(df.decisions.apply(lambda x: len(x))==0)

In [None]:
demographics['Gender'],demographics['Race/ethnicity'],
demographics['Residence'],demographics['Income Bracket'],
demographics['Type of School'],

In [None]:
pd.Series(demographics)

pd.Series(academics)

In [None]:
df.selftext.apply(lambda x: )

In [None]:
final_data = df

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Split data into features (X) and target (y)
X = df[['GPA', 'TestScores']]  # Include other features as necessary
y = df['Accepted']  # You need a target variable indicating acceptance

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a model (Random Forest as an example)
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Evaluate the model
predictions = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))


In [None]:
def get_user_input():
    # Get user data (GPA, Test Scores, etc.)
    # For example:
    user_gpa = float(input("Enter your GPA: "))
    user_test_scores = float(input("Enter your test scores: "))
    return [user_gpa, user_test_scores]

def predict_acceptance(model, user_input):
    # Predict using the model
    prediction = model.predict([user_input])
    return prediction

# Example usage
user_input = get_user_input()
prediction = predict_acceptance(model, user_input)
print("Probability of acceptance:", prediction)
