### Hugging Face models
- cardiffnlp/twitter-roberta-base-sentiment (58M)
- spacesedan/reddit-sentiment-analysis-longformer (149M)

In [46]:
import pandas as pd

In [47]:
# Load the Reddit sentiment dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('reddit_sentiment_data.csv')

In [48]:
# Quick size check of the loaded frame: (number of rows, number of columns)
df.shape

(55546, 8)

In [51]:
# Import required libraries for sentiment analysis
import csv
import urllib.request

import numpy as np
import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [52]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in ``text`` with fixed placeholders.

    The text is split on single space characters only (other whitespace such
    as newlines or tabs is NOT a token boundary), each token is masked, and
    the tokens are re-joined with single spaces, so the original spacing is
    preserved.

    - A token that starts with ``@`` and is longer than one character
      becomes ``@user``.
    - A token that starts with ``http`` becomes ``http``.
    """
    def _mask(token):
        # A lone "@" is left untouched; only "@<something>" counts as a mention
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        return 'http' if token.startswith('http') else token

    return " ".join(_mask(token) for token in text.split(" "))

In [53]:
# Load the RoBERTa sentiment model
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Download label mapping: a tab-separated text file whose second field on
# each line is the label name.
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
# Bound the request so an unreachable host fails the cell instead of hanging it.
with urllib.request.urlopen(mapping_link, timeout=30) as response:
    mapping_lines = response.read().decode('utf-8').split("\n")

# Lines with fewer than two fields (e.g. a trailing blank line) are skipped.
labels = [row[1] for row in csv.reader(mapping_lines, delimiter='\t') if len(row) > 1]

# Fail loudly if the downloaded mapping does not line up with the model's
# output classes (e.g. an error page or truncated file was returned).
if len(labels) != model.config.num_labels:
    raise ValueError(
        f"Label mapping has {len(labels)} entries but the model predicts "
        f"{model.config.num_labels} classes: {labels}"
    )

print(f"Labels: {labels}")
print("Model loaded successfully!")

Loading tokenizer and model...
Labels: ['negative', 'neutral', 'positive']
Model loaded successfully!


In [54]:
# Function to get sentiment score (0=Negative, 1=Neutral, 2=Positive)
def get_sentiment_score(text, default_score=0):
    """Classify one text with the loaded RoBERTa model and return its class index.

    Relies on the notebook-level ``preprocess``, ``tokenizer`` and ``model``
    objects being defined before it is called.

    Parameters
    ----------
    text : object
        Value to score. It is converted with ``str()`` before preprocessing.
    default_score : int, default 0
        Value returned when ``text`` is missing/blank or when scoring raises.
        The default keeps the historical behaviour of mapping such rows to
        0 (Negative). Pass a sentinel such as -1 to keep missing or failed
        rows distinguishable from genuinely negative predictions.

    Returns
    -------
    int
        Index of the highest-scoring class (0=Negative, 1=Neutral,
        2=Positive), or ``default_score`` for missing input or errors.
    """
    # pd.isna covers None/NaN; the strip() check covers '' and whitespace-only
    if pd.isna(text) or str(text).strip() == '':
        return default_score

    try:
        # Mask mentions and links the same way for every input
        processed_text = preprocess(str(text))

        # Inputs longer than 512 tokens are truncated rather than rejected
        encoded_input = tokenizer(processed_text, return_tensors='pt', truncation=True, max_length=512)

        # Inference only: skip autograd bookkeeping so no graph is built per call
        with torch.no_grad():
            output = model(**encoded_input)

        # output[0] holds the logits; [0] selects the single item in the batch
        scores = softmax(output[0][0].numpy())

        # Get the predicted class (0=Negative, 1=Neutral, 2=Positive)
        return int(np.argmax(scores))

    except Exception as e:
        # Best effort: report the failing text and fall back instead of
        # aborting a long .apply() run over the whole frame.
        print(f"Error processing text: {str(text)[:50]}... - {e}")
        return default_score

In [57]:
%%time

# Apply sentiment analysis to title and body columns
print("Analyzing sentiment for titles...")
df['title_score_roberta'] = df['title'].apply(get_sentiment_score)

print("Analyzing sentiment for bodies...")
df['body_score_roberta'] = df['body'].apply(get_sentiment_score)

print("Sentiment analysis complete!")

Analyzing sentiment for titles...
Analyzing sentiment for bodies...
Sentiment analysis complete!
CPU times: user 1h 59min 51s, sys: 31min 44s, total: 2h 31min 35s
Wall time: 1h 39min 33s


In [59]:
# Persist the frame, including the RoBERTa score columns added above, so the
# long scoring run does not have to be repeated; index=False omits the row index.
df.to_csv('reddit_sentiment_data_models.csv', index=False)

In [61]:
import matplotlib.pyplot as plt

In [62]:
# List the column names to confirm the *_score_roberta columns are present
df.columns

Index(['title', 'score', 'comms_num', 'body', 'date', 'stock',
       'title_sentiment', 'body_sentiment', 'title_score_roberta',
       'body_score_roberta'],
      dtype='object')

In [65]:
# Inspect the row index of the frame
df.index

RangeIndex(start=0, stop=55546, step=1)