In [1]:
# import all the required libraries
import pandas as pd
import praw

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import seaborn as sns
from sklearn.metrics import confusion_matrix

#### Data acquisition

I am using Reddit API to collect data. The data is concerned with comments in different reddit subcommunities. Here I have considered the 'politics' subcommunity. In here I am searching for the post titles containing the word 'Trump' or 'Harris'.

The titles that contain one of these words will be taken and all the comments for them will be added to a Python list.
Comments for the titles containing 'Trump' and 'Harris' will be stored in separate lists. 

I am acquiring and preparing these data for sentiment analysis task. So to find out the overall public sentiment towards Trump and Harris in Reddit space. Although by no means it can be near to a true speculation about the topic, but to follow the standard procedure of working on a machine learning task.





## Ongoing challenges
- When running the cells that sends request to Reddit API endpoint, due to large amount of data available, it takes several minutes (10-12) to complete.


In [2]:


# Initialize Reddit API connection
reddit = praw.Reddit(
    client_id='tZwMe0a2cneHp6qZz_x09w',
    client_secret='Hp-RZTXgWHFSayjB5177ZyKRPfVpQw',
    user_agent='Mean_Stuff7937'
)

# Lists to store the titles and comments
politics_titles_trump = []
politics_titles_harris = []

# List contains comments about posts related to Trump
comments_trump = []

# List contains comments about posts related to Harris
comments_harris = []

# Access the 'politics' subreddit
subreddit = reddit.subreddit('politics')

# Function to safely handle comments for a given post
def handle_comments(post, title_list, comment_list):
    title_list.append(post.title)
    post.comments.replace_more(limit=0)  # Collapse "load more comments"
    for comment in post.comments.list():
        comment_list.append(comment.body)

# Limit to a certain number of posts to avoid overwhelming the API
for post in subreddit.hot(limit=None):  # Fetch top 100 posts
    try:
        title_lower = post.title.lower()  # Make title lowercase for case-insensitive match

        # Check if 'trump' is in the title
        if 'trump' in title_lower:
            handle_comments(post, politics_titles_trump, comments_trump)

        # Check if 'harris' is in the title
        elif 'harris' in title_lower:
            handle_comments(post, politics_titles_harris, comments_harris)

    except Exception as e:
        print(f"An error occurred with post: {post.title}, error: {e}")
        time.sleep(2)  # Delay to avoid rate-limiting issues

# Print the count of titles containing 'Trump' and 'Harris'
print(f"Titles containing Trump: {len(politics_titles_trump)}")
print(f"Titles containing Harris: {len(politics_titles_harris)}")
print(f"There are total {len(comments_trump)} comments for Trump")
print(f"There are total {len(comments_harris)} comments for Harris")


Titles containing Trump: 406
Titles containing Harris: 88
There are total 46000 comments for Trump
There are total 6037 comments for Harris


#### Creating data frame for titles for each, Trump and Harris

In [3]:
# convert the above lists into dataframes
df_politics_titles_trump = pd.DataFrame(politics_titles_trump, columns=['politics_titles_Trump'])

# save the dataframe into a csv file
df_politics_titles_trump.to_csv("df_politics_titles_trump.csv")

df_politics_comments_trump

NameError: name 'df_politics_comments_trump' is not defined

In [None]:
# convert it to a dataframe
df_politics_titles_harris = pd.DataFrame(politics_titles_harris, columns=['politics_titles_harris'])

# save it to a csv file
df_politics_titles_harris.to_csv("df_politics_titles_harris.csv", index=False)

df_politics_titles_harris

#### Creating data frames of comments for each, Trump and Harris

In [None]:
# dataframe for comments
df_politics_comments_trump = pd.DataFrame(comments_trump, columns=["Comments_trump"])

# save the dataframe into a csv file
df_politics_comments_trump.to_csv("politics_comments_trump.csv", index=False)

df_politics_comments_trump

In [None]:
# create a data frame for the comments
df_politics_comments_harris = pd.DataFrame(comments_harris, columns=['Comments_harris'])

# save it to a csv file
df_politics_comments_harris.to_csv("politics_comments_harris.csv")

df_politics_comments_harris

### Data exploration
Here we explore our datasets to get a better idea about them.

#### This cell will be for my note taking

- Explore all the subreddit communities
- we will look and search for comments related to a specific word, such as 'Donald Trump'
- We will compare the sentiments in different subreddit communitites
- The previous step will give us some idea how overall public sentiments are towards 'Donald Trump'
- The subreddit communities identified so far:
  - News
  - Politics
  - sports
  - World News
  - Funny
      


#### How to proceed?
I will gather comments from subreddit communities and will do some exploration.


## Sentiment Analysis using VADER

I use VADER here to label the datasets, as labelling them manually is not feasible at the moment.

We used VADER, a rule-based sentiment analysis tool for social media text (Hutto & Gilbert, 2014), to analyze the polarity of comments in Reddit.

### References
Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.



In [None]:
# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to get sentiment label using VADER
def get_sentiment_label(comment):
    sentiment_dict = analyzer.polarity_scores(comment)
    compound_score = sentiment_dict['compound']
    
    # Assign sentiment label based on compound score
    if compound_score >= 0.05:
        return 'positive'
    elif compound_score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Assuming you have a DataFrame of Reddit comments (replace with your collected data)
df_sentiments_trump = pd.DataFrame({
    'Comment': df_politics_comments_trump['Comments_trump']
})

# Apply VADER to auto-label comments
df_sentiments_trump['Sentiment'] = df_sentiments_trump['Comment'].apply(get_sentiment_label)

In [None]:
print(df_sentiments_trump)


In [None]:
# Assuming you have a DataFrame of Reddit comments (replace with your collected data)
df_sentiments_harris = pd.DataFrame({
    'Comment': df_politics_comments_harris['Comments_harris']
})

# Apply VADER to auto-label comments
df_sentiments_harris['Sentiment'] = df_sentiments_harris['Comment'].apply(get_sentiment_label)

In [None]:
df_sentiments_harris

In [None]:
# Count of sentiments in the dataset for Harris
sentiment_counts_harris = df_sentiments_harris['Sentiment'].value_counts()
print(sentiment_counts_harris)

In [None]:
# count of sentiments in the dataset for Trump
sentiment_counts_trump = df_sentiments_trump['Sentiment'].value_counts()
print(sentiment_counts_trump)

#### Some exploratory steps

According to the labels assigned to each comment in the dataset, we check the proportion of each class of sentiment (i.e positive, negative and neutral)

As it can be seen below, from the comments collected, there are almost equal proportion of negative and positive for Trump while the proportion of negative sentiments for Harris is lower.

### What proportion of comments are related to Harris vs Trump


In [None]:
# compare the counts of comments for Harris vs Trump 
trump_comments_count = len(df_politics_comments_trump)

harris_comments_count = len(df_politics_comments_harris)

print(f"No of comments related to Trump: {trump_comments_count}")
print(f"No of comments related to Harris: {harris_comments_count}")


In [None]:
# Data for plotting
labels = ['Trump', 'Harris']
comment_counts = [trump_comments_count, harris_comments_count]

# Create a bar chart
plt.figure(figsize=(8, 6))  # Adjusts the size of the plot
bars = plt.bar(labels, comment_counts, color=['#E50000', 'blue'])

# Add labels, title, and grid
plt.title('Comparison of Comments for Trump vs Harris', fontsize=16)
plt.xlabel('Person', fontsize=14)
plt.ylabel('Number of Comments', fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Annotate the bars with the actual counts
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 50, int(yval), ha='center', fontsize=12)

# Show the plot
plt.tight_layout()
plt.show()

In [None]:
total_counts_harris = sentiment_counts_harris.sum()

# Calculate proportions for each sentiment
sentiment_proportions_harris = sentiment_counts_harris / total_counts_harris

# Create a bar plot for proportions
plt.figure(figsize=(7,5))
sentiment_proportions_harris.plot(kind='bar', color=['#15B01A', '#E50000', 'gray'])

# Add labels and title
plt.title('Sentiment Distribution (Proportions)')
plt.xlabel('Sentiment')
plt.ylabel('Proportion')

# Set x-axis ticks for better readability
plt.xticks(rotation=0)

# Adjust layout
plt.tight_layout()

# Show the plot
plt.show()

print(sentiment_proportions_harris)


In [None]:
total_counts_trump = sentiment_counts_trump.sum()

# Calculate proportions for each sentiment
sentiment_proportions_trump = sentiment_counts_trump / total_counts_trump


# Create a bar plot
plt.figure(figsize=(7, 5))
sentiment_proportions_trump.plot(kind='bar', color=['#15B01A', '#E50000', 'gray'])

# Add labels and title
plt.title('Sentiment Distribution (Proportions)')
plt.xlabel('Sentiment')
plt.ylabel('Proportion')

# Set x-axis ticks for better readability
plt.xticks(rotation=0)

# Adjust layout
plt.tight_layout()

# Show the plot
plt.show()

print(sentiment_proportions_trump)

### Merging of both the datasets

I want to merge both the comments datasets with their corresponding sentiments as labels.
I am preparing this merged dataset for sentiment classification task

In [None]:
df_sentiments_harris

In [None]:
df_sentiments_trump

In [None]:
df_sentiments_harris.info

In [None]:
# Append df_sentiments_trump to df_sentiments_harris
df_merged = pd.concat([df_sentiments_harris, df_sentiments_trump], ignore_index=True)

# Display the shape of the merged dataframe
print(df_merged.shape)

# Optional: Display the first few rows of the merged dataframe to verify
print(df_merged.head())


## Clean the data a bit

Basic cleaning processes in the following cell

In [None]:
# Step 1: Strip leading and trailing whitespaces from the 'Comment' column
df_merged['Comment'] = df_merged['Comment'].str.strip()

# Step 2: Remove rows where 'Comment' is empty or too short (e.g., less than 5 characters)
df_merged = df_merged[df_merged['Comment'].str.len() > 5]

# Step 3: Drop duplicate rows if any exist
df_merged = df_merged.drop_duplicates()

# Step 4: Standardize the 'Sentiment' column to be lowercase
df_merged['Sentiment'] = df_merged['Sentiment'].str.lower()

# Step 5: Check and drop any rows with missing values
df_merged = df_merged.dropna()

# Optional: Reset index after cleaning
df_merged = df_merged.reset_index(drop=True)

# Display the cleaned DataFrame shape and first few rows
print(df_merged.shape)
print(df_merged.head())

In [None]:
# 1. Preprocess the data
# Convert 'Sentiment' to numeric form (e.g., 'positive' = 1, 'negative' = 0, 'neutral' = 2)
df_merged['Sentiment'] = df_merged['Sentiment'].map({'positive': 1, 'negative': 0, 'neutral': 2})

# Check if there are any NaN values after encoding
df_merged = df_merged.dropna()

# 2. Split the data into training and testing sets
X = df_merged['Comment']  # Feature: the text comments
y = df_merged['Sentiment']  # Target: sentiment labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Convert text data into numerical form using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# 4. Train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# 5. Make predictions and evaluate the model
y_pred = model.predict(X_test_tfidf)

# 6. Evaluate the performance of the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Classification report dictionary for precision, recall, and f1-score
report = {
    'Sentiment': ['Negative', 'Positive', 'Neutral'],
    'Precision': [0.82, 0.82, 0.76],
    'Recall': [0.77, 0.80, 0.85],
    'F1-Score': [0.79, 0.81, 0.80]
}

# Create a DataFrame for easy plotting
df_report = pd.DataFrame(report)

# Plot precision, recall, and f1-score
fig, ax = plt.subplots(figsize=(10, 6))
df_report.set_index('Sentiment').plot(kind='bar', ax=ax)

plt.title('Precision, Recall, and F1-Score for Each Sentiment Class', fontsize=16)
plt.ylabel('Score')
plt.ylim(0.5, 1.0)
plt.xticks(rotation=0)
plt.legend(loc='lower right', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Show plot
plt.tight_layout()
plt.show()

# 2. Confusion Matrix Visualization
y_pred = model.predict(X_test_tfidf)
cm = confusion_matrix(y_test, y_pred)

# Labels for the confusion matrix
labels = ['Negative', 'Positive', 'Neutral']

# Plot the confusion matrix using seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)

plt.title('Confusion Matrix for Sentiment Analysis', fontsize=16)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

# 1. Binarize the labels for One-vs-Rest strategy (0 = negative, 1 = positive, 2 = neutral)
y_bin = label_binarize(y, classes=[0, 1, 2])
n_classes = y_bin.shape[1]

# Split the data into training and testing sets
X_train, X_test, y_train_bin, y_test_bin = train_test_split(X, y_bin, test_size=0.2, random_state=42)

# Convert the text into TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# 2. Train a One-vs-Rest Logistic Regression model
classifier = OneVsRestClassifier(LogisticRegression(max_iter=1000))
classifier.fit(X_train_tfidf, y_train_bin)

# 3. Predict probabilities for the test set
y_score = classifier.decision_function(X_test_tfidf)

# 4. Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot the ROC curves
plt.figure(figsize=(10, 8))

colors = ['blue', 'green', 'red']
labels = ['Negative', 'Positive', 'Neutral']

for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2, 
             label=f'ROC curve for {labels[i]} (area = {roc_auc[i]:0.2f})')

plt.plot([0, 1], [0, 1], 'k--', lw=2)  # Diagonal line for random guessing
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.title('ROC Curves for Sentiment Analysis (One-vs-Rest)', fontsize=16)
plt.legend(loc='lower right', fontsize=12)
plt.grid()

plt.show()