In [1]:
import praw
import spacy
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import ast

In [2]:
import json
# Read the API credentials from the local JSON file.
with open('keys.json') as key_file:
    keys = json.load(key_file)

# Pull out the four Reddit fields used when building the client.
reddit_keys = keys['reddit']
app_id, app_secret, username, password = (
    reddit_keys[field] for field in ('app_id', 'app_secret', 'username', 'password')
)

In [3]:
# Build the PRAW client from the app and account credentials; the user agent
# string embeds the account name.
reddit_credentials = {
    'client_id': app_id,
    'client_secret': app_secret,
    'user_agent': f'lse/0.0.1 by {username}',
    'username': username,
    'password': password,
}
reddit = praw.Reddit(**reddit_credentials)

In [4]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, pipeline
# Load the tokenizer and the sequence-classification model from the same
# checkpoint, then wrap both in a sentiment-analysis pipeline.
checkpoint = "finiteautomata/bertweet-base-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)
specific_model = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)






All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at finiteautomata/bertweet-base-sentiment-analysis.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [5]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk import sent_tokenize

# Fetch the NLTK resources this notebook relies on: sentence-tokenizer data
# and the VADER lexicon. 'punkt_tab' is requested alongside 'punkt' because
# recent NLTK releases load the sentence tokenizer from 'punkt_tab', and
# sent_tokenize raises LookupError when only 'punkt' is installed.
for resource in ('punkt', 'punkt_tab', 'vader_lexicon'):
    nltk.download(resource)

# Shared VADER scorer used by analyze_sentiment.
analyzer = SentimentIntensityAnalyzer()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [6]:
def analyze_sentiment(text):
    """Return the mean VADER compound score over the sentences of `text`.

    The text is split into sentences with `nltk.sent_tokenize`, each sentence
    is scored with the module-level `analyzer`, and the per-sentence
    'compound' values are averaged.

    Parameters
    ----------
    text : str
        Text to score. Non-string values (e.g. None, or the NaN that
        `pd.read_csv` produces for an empty cell) and blank strings are
        treated as containing no sentences.

    Returns
    -------
    float
        Mean compound score, or 0.0 when there is nothing to score.
    """
    # Missing values would make the tokenizer raise a TypeError; treat them
    # (and blank strings) as neutral instead of crashing a whole .apply().
    if not isinstance(text, str) or not text.strip():
        return 0.0

    compound_scores = [
        analyzer.polarity_scores(sentence)['compound']
        for sentence in nltk.sent_tokenize(text)
    ]

    # Guard against an empty list so np.mean never sees zero elements.
    return np.mean(compound_scores) if compound_scores else 0.0

In [7]:
import re
from datetime import datetime
import emoji

def remove_emojis(text):
    """Return `text` with all emoji characters removed.

    Uses `emoji.replace_emoji` when the installed emoji package provides it,
    and falls back to the regex from `emoji.get_emoji_regexp`, which was
    removed in emoji 2.0. Empty input is returned unchanged.
    """
    if not text:
        # Nothing to strip; also avoids touching the emoji package at all.
        return text
    if hasattr(emoji, 'replace_emoji'):
        return emoji.replace_emoji(text, replace='')
    # Older emoji releases: original regex-based removal.
    return emoji.get_emoji_regexp().sub(u'', text)

subreddit = reddit.subreddit("6thForm")

# Search settings for the current run. The other queries kept in this cell
# (each written with limit=1000) were:
#   "LSE+Lse+lse"
#   "UCL+ucl+Ucl"
#   "Kings+KINGS+KCL+Kcl+kcl"
#   "Cambridge+Oxford+Oxbridge"
SEARCH_QUERY = "Imperial+IMPERIAL+imperial+icl"
SEARCH_LIMIT = 10

# Each entry is (post_index, text, flair, post_type, date); comments carry the
# index and flair of the post they belong to.
post_text_lst = []
comment_text_lst = []

for index, submission in enumerate(subreddit.search(SEARCH_QUERY, limit=SEARCH_LIMIT, sort="relevance")):
    flair_text = submission.link_flair_text
    post_flair = remove_emojis(flair_text) if flair_text is not None else None

    post_text_lst.append((
        index,
        remove_emojis(submission.selftext),
        post_flair,
        "post_text",
        datetime.utcfromtimestamp(submission.created_utc),
    ))

    if submission.num_comments > 0:
        # limit=0 strips the "load more comments" placeholders without
        # fetching them, so .list() below only yields real comments. The
        # return value is not needed.
        submission.comments.replace_more(limit=0)
        for comment in submission.comments.list():
            # Skip the moderation bot's comments.
            if comment.author != "AutoModerator":
                comment_text_lst.append((
                    index,
                    remove_emojis(comment.body),
                    post_flair,
                    "comment_text",
                    datetime.utcfromtimestamp(comment.created_utc),
                ))

In [8]:
# Stack the post rows first, then the comment rows, into a single frame.
scraped_columns = ['post_index', 'text', 'flair', 'post_type', "date"]
scraped_rows = post_text_lst + comment_text_lst
df = pd.DataFrame(scraped_rows, columns=scraped_columns)
df

Unnamed: 0,post_index,text,flair,post_type,date
0,0,ICL = I Can't Lie and\nICL = Imperial College ...,MEME,post_text,2023-12-30 15:09:53
1,1,"So I'm an international student from Canada, a...",DISCUSSION,post_text,2024-04-09 09:13:03
2,2,"Hello, recently saw two people comparing UCL a...",DISCUSSION,post_text,2024-04-26 20:03:56
3,3,I want to apply for physics and I'm hopefully ...,UNI / UCAS,post_text,2024-04-03 14:13:32
4,4,Hello! \nI got an offer from imperial college ...,DISCUSSION,post_text,2024-02-27 16:46:51
...,...,...,...,...,...
156,9,[removed],UNI / UCAS,comment_text,2022-03-17 11:01:04
157,9,[removed],UNI / UCAS,comment_text,2022-03-17 10:59:34
158,9,[removed],UNI / UCAS,comment_text,2022-03-17 11:11:47
159,9,[removed],UNI / UCAS,comment_text,2022-03-17 11:13:56


In [9]:
# Minimum number of whitespace-separated words a fragment needs to be kept.
MIN_WORD_COUNT = 10

# Drop rows with no text, then break each text into fragments at
# "period + whitespace" boundaries and at line breaks.
df2 = df[df['text'] != ""].copy()
df2['text'] = df2['text'].str.split(r'\.\s+|\n+')

# One row per fragment; the remaining columns are repeated for each fragment.
df_exploded = df2.explode('text')
df_exploded = df_exploded[df_exploded['text'] != ""].copy()
df_exploded['Word_Count'] = df_exploded['text'].apply(lambda x: len(str(x).split()))

# .copy() makes the result an independent frame, so adding columns to
# df_exploded afterwards does not trigger SettingWithCopyWarning.
df_exploded = df_exploded[df_exploded['Word_Count'] >= MIN_WORD_COUNT].copy()
df_exploded


Unnamed: 0,post_index,text,flair,post_type,date,Word_Count
1,1,"So I'm an international student from Canada, a...",DISCUSSION,post_text,2024-04-09 09:13:03,30
1,1,I am having trouble deciding on which offer to...,DISCUSSION,post_text,2024-04-09 09:13:03,10
1,1,My main dilemma stems from the fact that I lik...,DISCUSSION,post_text,2024-04-09 09:13:03,37
1,1,For context I would like to go into some type ...,DISCUSSION,post_text,2024-04-09 09:13:03,15
1,1,U of T has much more freedom in terms of my de...,DISCUSSION,post_text,2024-04-09 09:13:03,12
...,...,...,...,...,...,...
146,9,doing x course led me to research further into...,UNI / UCAS,comment_text,2022-03-17 12:32:51,19
148,9,Would you say that the test was one of the mos...,UNI / UCAS,comment_text,2022-03-18 12:55:27,23
149,9,That’s very good news! I’ve read lots of books...,UNI / UCAS,comment_text,2022-03-17 09:27:55,18
152,9,Based on how they have worded emails it sounds...,UNI / UCAS,comment_text,2022-03-18 14:26:14,22


In [11]:
# Keyword arguments forwarded to the pipeline on every call.
pipeline_kwargs = {'truncation': True, 'padding': True, 'max_length': 128}


def _first_prediction(text):
    """Return the first element of the pipeline's result for `text`."""
    return specific_model(text, **pipeline_kwargs)[0]


# Score every fragment with both models, then write the frame to disk.
df_exploded['sentiment'] = df_exploded['text'].apply(_first_prediction)
df_exploded['vader_compound_score'] = df_exploded['text'].apply(analyze_sentiment)
df_exploded.to_csv('test_sentiment_data.csv', index=False)

In [17]:
# Previously exported sentiment files whose VADER score is recomputed.
csv_files = [
    'data/lse_sentiment_data.csv',
    'data/ucl_sentiment_data.csv',
    'data/imperial_sentiment_data.csv',
    'data/kings_sentiment_data.csv',
    'data/oxbridge_sentiment_data.csv'
]
for csv_file in csv_files:
    # Use a dedicated name instead of `df`, so the scraped frame built earlier
    # in the notebook is not overwritten by the last CSV read here.
    saved_scores = pd.read_csv(csv_file)
    saved_scores['vader_compound_score'] = saved_scores['text'].apply(analyze_sentiment)
    # Strip the ".csv" suffix and write next to the input: "x.csv" -> "x_2.csv".
    saved_scores.to_csv(csv_file[:-4] + "_2.csv", index=False)