In [None]:
import ast
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import praw
import spacy

In [None]:
import json
# Read the API credentials from the local keys.json file.
with open('keys.json') as keys_file:
    keys = json.load(keys_file)

# Pull the four values stored under the 'reddit' entry, in a fixed order.
reddit_keys = keys['reddit']
app_id, app_secret, username, password = (
    reddit_keys[field]
    for field in ('app_id', 'app_secret', 'username', 'password')
)

In [None]:
# Build the PRAW client from the credentials loaded from keys.json.
reddit_credentials = {
    'client_id': app_id,
    'client_secret': app_secret,
    'user_agent': f'lse/0.0.1 by {username}',
    'username': username,
    'password': password,
}
reddit = praw.Reddit(**reddit_credentials)

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, pipeline
# The tokenizer and the classifier are loaded from the same checkpoint.
SENTIMENT_MODEL_NAME = "finiteautomata/bertweet-base-sentiment-analysis"

tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_NAME)
model = TFAutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_NAME)

# Sentiment pipeline used later to label each text fragment.
specific_model = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)


In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk import sent_tokenize

# nltk.sent_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK
# releases load it from the 'punkt_tab' package rather than 'punkt', so both
# are fetched to keep the notebook working across NLTK versions.
# 'vader_lexicon' is the word list used by SentimentIntensityAnalyzer.
for nltk_resource in ('punkt', 'punkt_tab', 'vader_lexicon'):
    nltk.download(nltk_resource)

# Shared VADER analyzer used by analyze_sentiment().
analyzer = SentimentIntensityAnalyzer()


In [None]:
def analyze_sentiment(text):
    """Return the mean VADER ``compound`` score over the sentences of ``text``.

    The text is split into sentences with ``nltk.sent_tokenize``; each sentence
    is scored with the module-level ``analyzer`` and the ``compound`` values
    are averaged.

    Parameters
    ----------
    text : str
        Text to score. Anything that is not a string (for example the ``NaN``
        pandas produces for an empty CSV cell, or ``None``) is treated as
        having no content.

    Returns
    -------
    float
        The average compound score, or ``0.0`` when there is nothing to score.
    """
    # Guard first: sent_tokenize raises on non-string input, and blank text
    # contains no sentences anyway.
    if not isinstance(text, str) or not text.strip():
        return 0.0

    compound_scores = [
        analyzer.polarity_scores(sentence)['compound']
        for sentence in nltk.sent_tokenize(text)
    ]

    if not compound_scores:
        return 0.0

    # Always hand back a plain float so both return paths have the same type.
    return float(np.mean(compound_scores))

In [None]:
import re
from datetime import datetime
import emoji

def remove_emojis(text):
    """Return ``text`` with all emoji characters stripped out.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` with every emoji replaced by the empty string.
    """
    # emoji 2.0 removed get_emoji_regexp(); replace_emoji() is its successor.
    # Fall back to the regexp only on older releases that lack replace_emoji().
    if hasattr(emoji, 'replace_emoji'):
        return emoji.replace_emoji(text, replace='')
    return emoji.get_emoji_regexp().sub(u'', text)

subreddit = reddit.subreddit("6thForm")

# Search term for the university being collected. To scrape a different
# institution, swap in one of the alternatives below and re-run this cell.
SEARCH_QUERY = "LSE+Lse+lse"
# SEARCH_QUERY = "UCL+ucl+Ucl"
# SEARCH_QUERY = "Imperial+IMPERIAL+imperial+icl"  # this variant was previously run with limit=10
# SEARCH_QUERY = "Kings+KINGS+KCL+Kcl+kcl"
# SEARCH_QUERY = "Cambridge+Oxford+Oxbridge"
SEARCH_LIMIT = 1000

# Rows are tuples of (post index, cleaned text, cleaned flair, row type, creation time).
post_text_lst = []
comment_text_lst = []

for index, submission in enumerate(subreddit.search(SEARCH_QUERY, limit=SEARCH_LIMIT, sort="relevance")):
    # Flair is optional on a submission; keep None when it is absent.
    flair_text = submission.link_flair_text
    post_flair = remove_emojis(flair_text) if flair_text is not None else None

    post_text_lst.append((
        index,
        remove_emojis(submission.selftext),
        post_flair,
        "post_text",
        datetime.utcfromtimestamp(submission.created_utc),
    ))

    if submission.num_comments > 0:
        # replace_more() edits the comment forest in place; per the PRAW docs,
        # limit=0 drops the "load more comments" placeholders without fetching them.
        submission.comments.replace_more(limit=0)
        for comment in submission.comments.list():
            # Skip AutoModerator comments.
            if comment.author != "AutoModerator":
                comment_text_lst.append((
                    index,
                    remove_emojis(comment.body),
                    post_flair,
                    "comment_text",
                    datetime.utcfromtimestamp(comment.created_utc),
                ))

In [None]:
# Posts come first, followed by comments; both lists hold 5-field tuples.
COLUMN_NAMES = ['post_index', 'text', 'flair', 'post_type', "date"]
all_rows = post_text_lst + comment_text_lst
df = pd.DataFrame(all_rows, columns=COLUMN_NAMES)
df

In [None]:
# Fragments with fewer whitespace-separated words than this are discarded.
MIN_WORD_COUNT = 10

# Drop rows without text, then break each text into fragments at a period
# followed by whitespace, or at one or more line breaks.
df2 = df[df['text'] != ""].copy()
df2['text'] = df2['text'].str.split(r'\.\s+|\n+')

# One row per fragment; the other columns are repeated for each fragment.
df_exploded = df2.explode('text')
df_exploded = df_exploded[df_exploded['text'] != ""].copy()
df_exploded['Word_Count'] = df_exploded['text'].str.split().str.len()

# .copy() makes the filtered result an independent frame, so the columns added
# to it in the next cell do not trigger SettingWithCopyWarning.
df_exploded = df_exploded[df_exploded['Word_Count'] >= MIN_WORD_COUNT].copy()
df_exploded


In [None]:
# Output file for the scored fragments. The name is fixed to the LSE run:
# change it together with the search query when scraping another university.
SENTIMENT_CSV = Path('data/lse_sentiment_data.csv')

# Create the output folder up front so a missing directory cannot fail the
# save after the slow model scoring has already been done.
SENTIMENT_CSV.parent.mkdir(parents=True, exist_ok=True)

# Transformer label: the pipeline result is indexed with [0] and that first
# entry is stored as-is in the 'sentiment' column.
df_exploded['sentiment'] = df_exploded['text'].apply(
    lambda fragment: specific_model(fragment, truncation=True, padding=True, max_length=128)[0]
)

# VADER score averaged over the sentences of each fragment.
df_exploded['vader_compound_score'] = df_exploded['text'].apply(analyze_sentiment)

df_exploded.to_csv(SENTIMENT_CSV, index=False)

In [None]:
# Previously saved sentiment files, one per search run.
csv_files = [
    'data/lse_sentiment_data.csv',
    'data/ucl_sentiment_data.csv',
    'data/imperial_sentiment_data.csv',
    'data/kings_sentiment_data.csv',
    'data/oxbridge_sentiment_data.csv',
]

# Recompute the VADER score for every file and write the result next to the
# input with a "_2" suffix before the extension.
# NOTE: this loop rebinds the notebook-level name `df` on every iteration.
for csv_file in csv_files:
    df = pd.read_csv(csv_file).assign(
        vader_compound_score=lambda frame: frame['text'].apply(analyze_sentiment)
    )
    output_path = f"{csv_file[:-4]}_2.csv"
    df.to_csv(output_path, index=False)