In [1]:
import praw # importing praw to access reddit api
import pandas as pd
import numpy as np
import nltk # importing nltk for sentence tokenization and VADER sentiment analysis
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk import sent_tokenize

In [2]:
import json
# Read the credentials from a local JSON file instead of hard-coding them here.
with open('keys.json') as keys_file:
    keys = json.load(keys_file)

# Pull the four Reddit fields out of the 'reddit' section in one go.
app_id, app_secret, username, password = (
    keys['reddit'][field]
    for field in ('app_id', 'app_secret', 'username', 'password')
)

In [3]:
# Reddit API client built from the credentials read from keys.json;
# the user agent string embeds the account name.
reddit = praw.Reddit(
    client_id=app_id,
    client_secret=app_secret,
    username=username,
    password=password,
    user_agent=f'lse/0.0.1 by {username}',
)

In [4]:
import warnings

# Installed before the transformers import so that warnings raised while the
# library loads are filtered as well.
warnings.filterwarnings('ignore')

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, pipeline

# Load the tokenizer first, then the classifier, both from the same checkpoint.
tokenizer, model = (
    loader.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis")
    for loader in (AutoTokenizer, TFAutoModelForSequenceClassification)
)

# Sentiment-analysis pipeline wrapping the loaded tokenizer and model.
specific_model = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)





All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at finiteautomata/bertweet-base-sentiment-analysis.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [5]:
# Fetch the NLTK resources this notebook relies on:
#   - 'punkt' / 'punkt_tab': sentence-tokenizer data used by nltk.sent_tokenize.
#     Recent NLTK releases (3.9 and later) load the tokenizer from 'punkt_tab'
#     rather than the pickled 'punkt' package, so both are requested to keep
#     the notebook runnable on old and new installations alike.
#   - 'vader_lexicon': lexicon required by SentimentIntensityAnalyzer.
for resource in ('punkt', 'punkt_tab', 'vader_lexicon'):
    nltk.download(resource)

# Shared VADER analyzer instance, used by analyze_sentiment.
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [6]:
def analyze_sentiment(text):
    """Return the average VADER compound score over the sentences of ``text``.

    The text is split with ``nltk.sent_tokenize``; every sentence is scored
    with the module-level ``analyzer`` and the ``'compound'`` values are
    averaged with ``np.mean``.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    float or int
        Mean compound score, or ``0`` when the tokenizer yields no sentences.
    """
    scores = [
        analyzer.polarity_scores(piece)['compound']
        for piece in nltk.sent_tokenize(text)
    ]
    return np.mean(scores) if scores else 0

In [7]:
import re
from datetime import datetime
import emoji

def remove_emojis(text):
    """Return ``text`` with every emoji removed.

    ``emoji.get_emoji_regexp()`` was removed from the ``emoji`` package in
    version 2.0.0, so ``emoji.replace_emoji`` is used whenever the installed
    package provides it. The regexp-based path is kept as a fallback for older
    installations that do not have ``replace_emoji``.

    Parameters
    ----------
    text : str
        Text that may contain emoji characters.

    Returns
    -------
    str
        The input with emojis replaced by the empty string.
    """
    replace_emoji = getattr(emoji, 'replace_emoji', None)
    if replace_emoji is not None:
        return replace_emoji(text, replace='')
    # Legacy emoji releases: fall back to the compiled emoji regexp.
    return emoji.get_emoji_regexp().sub(u'', text)

# Search query selecting which university's posts are collected. To build the
# dataset for another university, swap in one of the alternatives:
#   "UCL+ucl+Ucl"
#   "Imperial+IMPERIAL+imperial+icl"   (this variant was previously run with limit=10)
#   "Kings+KINGS+KCL+Kcl+kcl"
#   "Cambridge+Oxford+Oxbridge"
search_query = "LSE+Lse+lse"

subreddit = reddit.subreddit("6thForm")

# Rows are (post_index, text, flair, post_type, date) tuples; comments carry the
# index and flair of the submission they belong to.
post_text_lst = []
comment_text_lst = []

for index, submission in enumerate(subreddit.search(search_query, limit=1000, sort="relevance")):
    # Flair is optional on a submission; only strip emojis when one is set.
    flair_text = submission.link_flair_text
    post_flair = remove_emojis(flair_text) if flair_text is not None else None

    post_text_lst.append((
        index,
        remove_emojis(submission.selftext),
        post_flair,
        "post_text",
        datetime.utcfromtimestamp(submission.created_utc),
    ))

    if submission.num_comments > 0:
        # Called for its effect on the comment forest; the return value is not
        # needed. NOTE(review): with limit=0 PRAW is expected to drop the
        # "load more comments" placeholders without fetching them - confirm.
        submission.comments.replace_more(limit=0)
        for comment in submission.comments.list():
            # Skip the subreddit's automated moderator messages.
            if comment.author != "AutoModerator":
                comment_text_lst.append((
                    index,
                    remove_emojis(comment.body),
                    post_flair,
                    "comment_text",
                    datetime.utcfromtimestamp(comment.created_utc),
                ))

In [8]:
# Stack the post rows followed by the comment rows into a single frame.
df = pd.DataFrame(
    [*post_text_lst, *comment_text_lst],
    columns=['post_index', 'text', 'flair', 'post_type', "date"],
)
df

Unnamed: 0,post_index,text,flair,post_type,date
0,0,Its coming up to deadline and has anyone else ...,UNI / UCAS,post_text,2024-04-12 11:31:40
1,1,I’ve heard that many people are dying to get a...,UNI / UCAS,post_text,2022-12-14 07:39:30
2,2,I know there isn't a conclusive answer anybody...,UNI / UCAS,post_text,2024-04-11 13:56:05
3,3,"A* A* A A(epq) predictions, 31 LNAT\ngot rejec...",UNI / UCAS,post_text,2024-04-26 19:40:48
4,4,LSE pure econ :((((((((\n\nPlease give me tips...,DISCUSSION,post_text,2024-04-25 11:04:22
...,...,...,...,...,...
5398,246,"Lol, look up what a normal distribution looks ...",UNI / UCAS,comment_text,2023-12-09 18:19:11
5399,246,Mhm cos thats why they dont know how averages ...,UNI / UCAS,comment_text,2023-12-08 18:55:39
5400,246,Yea and the normal distribution in this case h...,UNI / UCAS,comment_text,2023-12-09 18:43:20
5401,246,"how do you know that, you realise also that di...",UNI / UCAS,comment_text,2023-12-09 18:52:55


In [9]:
# Keep only rows that contain text, then cut each text into fragments at
# sentence ends (a full stop followed by whitespace) and at line breaks.
df2 = df.loc[df['text'] != ""].copy()
df2['text'] = df2['text'].str.split(r'\.\s+|\n+')

# One row per fragment, with the empty fragments left over from the split removed.
df_exploded = (
    df2.explode('text')
       .loc[lambda frame: frame['text'] != ""]
       .copy()
)

# Whitespace-separated word count per fragment; fragments under 10 words are dropped.
df_exploded['Word_Count'] = df_exploded['text'].map(lambda fragment: len(str(fragment).split()))
df_exploded = df_exploded.loc[df_exploded['Word_Count'] >= 10]
df_exploded


Unnamed: 0,post_index,text,flair,post_type,date,Word_Count
0,0,Its coming up to deadline and has anyone else ...,UNI / UCAS,post_text,2024-04-12 11:31:40,53
0,0,I'm wondering if I should send them an email a...,UNI / UCAS,post_text,2024-04-12 11:31:40,26
1,1,I’ve heard that many people are dying to get a...,UNI / UCAS,post_text,2022-12-14 07:39:30,27
2,2,I know there isn't a conclusive answer anybody...,UNI / UCAS,post_text,2024-04-11 13:56:05,28
2,2,I applied in October for econ and have been ac...,UNI / UCAS,post_text,2024-04-11 13:56:05,40
...,...,...,...,...,...,...
5400,246,Yea and the normal distribution in this case h...,UNI / UCAS,comment_text,2023-12-09 18:43:20,18
5401,246,"how do you know that, you realise also that di...",UNI / UCAS,comment_text,2023-12-09 18:52:55,20
5401,246,That could just be rng remember its à multiple...,UNI / UCAS,comment_text,2023-12-09 18:52:55,11
5401,246,Further looking at the tmua distribution it’s ...,UNI / UCAS,comment_text,2023-12-09 18:52:55,17


In [None]:
import os

# Score every fragment with the transformer pipeline. Inputs are truncated to
# at most 128 tokens, and the first element of the pipeline's result is stored.
# NOTE(review): that element is presumably a label/score dict - confirm against
# the transformers pipeline documentation.
df_exploded['sentiment'] = df_exploded['text'].apply(lambda x: specific_model(x, truncation=True, padding=True, max_length=128)[0])

# Average VADER compound score of each fragment.
df_exploded['vader_compound_score'] = df_exploded['text'].apply(analyze_sentiment)

# Destination file for the university selected by the search query. Alternatives:
#   'data/ucl_sentiment_data.csv'
#   'data/imperial_sentiment_data.csv'
#   'data/kings_sentiment_data.csv'
#   'data/oxbridge_sentiment_data.csv'
output_path = 'data/lse_sentiment_data.csv'

# Create the output directory first so the export cannot fail on a missing
# folder after the expensive scoring above has already run.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_exploded.to_csv(output_path, index=False)