In [24]:
pip install scikit-learn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp310-cp310-macosx_10_9_x86_64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp310-cp310-macosx_14_0_x86_64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp310-cp310-macosx_10_9_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading scipy-1.14.1-cp310-cp310-macosx_14_0_x86_64.whl (25.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.5/25.5 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, scikit-learn
Successfully installed scikit-learn-1.5.2 scipy-1.14.1 threadpoolctl-3.5.0
Note: you may nee

In [1]:
import pandas as pd
# Update the file path to include the full path to the dataset file
df = pd.read_csv("Reddit_Data.csv")

# Remove rows with NaN or None in the 'clean_comment' column
df = df[df['clean_comment'].notna()]

# Reduce the dataset to 300 samples
df = df.sample(n=1000, random_state=42).reset_index(drop=True)

# Map category values to sentiment labels
sentiment_map = {-1: "Negative", 0: "Neutral", 1: "Positive"}
df['Sentiment_Label'] = df['category'].map(sentiment_map)

df.head(5)

Unnamed: 0,clean_comment,category,Sentiment_Label
0,after seeing this uvjx3kwoehw video seems anyt...,0,Neutral
1,you killed karma,-1,Negative
2,was voluntary sale not forced anyone then wha...,1,Positive
3,weird see this because was just talking about...,1,Positive
4,modi undoubtedly the worst thing that has happ...,-1,Negative


In [2]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from transformers import pipeline

# Initialize sentiment analysis models
vader = SentimentIntensityAnalyzer()
bert = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Define VADER prediction function with standard ranges
def predict_vader(text):
    compound = vader.polarity_scores(text)['compound']
    if compound > 0.05:
        return "Positive"
    elif compound < -0.05:
        return "Negative"
    else:
        return "Neutral"

# Define TextBlob prediction function with standard ranges
def predict_textblob(text):
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0.1:  # Updated threshold for positive
        return "Positive"
    elif polarity < -0.1:  # Updated threshold for negative
        return "Negative"
    else:
        return "Neutral"

# Define BERT prediction function with standard ranges
def predict_bert(text):
    result = bert(text[:512])[0]  # Limit text to 512 tokens
    label = result['label']
    score = result['score']

    if score < 0.6:  # Low confidence means Neutral
        return "Neutral"
    elif label == "POSITIVE":
        return "Positive"
    elif label == "NEGATIVE":
        return "Negative"

# Apply sentiment analysis models to the dataset
df['VADER_Predicted'] = df['clean_comment'].apply(predict_vader)
df['TextBlob_Predicted'] = df['clean_comment'].apply(predict_textblob)
df['BERT_Predicted'] = df['clean_comment'].apply(predict_bert)

# Evaluate accuracy for each model
def calculate_accuracy(predicted_col, true_col):
    return (df[predicted_col] == df[true_col]).mean()

vader_accuracy = calculate_accuracy('VADER_Predicted', 'Sentiment_Label')
textblob_accuracy = calculate_accuracy('TextBlob_Predicted', 'Sentiment_Label')
bert_accuracy = calculate_accuracy('BERT_Predicted', 'Sentiment_Label')

# Create a dictionary of model accuracies
accuracy_data = {
    'Model': ['VADER', 'TextBlob', 'BERT'],
    'Accuracy': [vader_accuracy, textblob_accuracy, bert_accuracy]
}

# Convert the dictionary into a DataFrame
accuracy_df = pd.DataFrame(accuracy_data)

# Display the accuracy DataFrame
print(accuracy_df)


  from .autonotebook import tqdm as notebook_tqdm
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


      Model  Accuracy
0     VADER     0.642
1  TextBlob     0.818
2      BERT     0.354


In [5]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from transformers import pipeline
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

# Initialize sentiment analysis models
vader = SentimentIntensityAnalyzer()
bert = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Define VADER prediction function with standard ranges
def predict_vader(text):
    compound = vader.polarity_scores(text)['compound']
    if compound > 0.05:
        return "Positive"
    elif compound < -0.05:
        return "Negative"
    else:
        return "Neutral"

# Define TextBlob prediction function with standard ranges
def predict_textblob(text):
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0.1:  # Updated threshold for positive
        return "Positive"
    elif polarity < -0.1:  # Updated threshold for negative
        return "Negative"
    else:
        return "Neutral"

# Define BERT prediction function with standard ranges
def predict_bert(text):
    result = bert(text[:512])[0]  # Limit text to 512 tokens
    label = result['label']
    score = result['score']

    if score < 0.6:  # Low confidence means Neutral
        return "Neutral"
    elif label == "POSITIVE":
        return "Positive"
    elif label == "NEGATIVE":
        return "Negative"

# Apply sentiment analysis models to the dataset
df['VADER_Predicted'] = df['clean_comment'].apply(predict_vader)
df['TextBlob_Predicted'] = df['clean_comment'].apply(predict_textblob)
df['BERT_Predicted'] = df['clean_comment'].apply(predict_bert)

# Define function to generate metrics and confusion matrix
def evaluate_model(predicted_col, true_col):
    true_labels = df[true_col]
    predicted_labels = df[predicted_col]

    # Classification report
    report = classification_report(true_labels, predicted_labels, output_dict=True)

    # Confusion matrix
    cm = confusion_matrix(true_labels, predicted_labels, labels=["Positive", "Neutral", "Negative"])

    return report, cm

# Function to clean classification report for display
def clean_classification_report(report):
    # Convert report to DataFrame
    report_df = pd.DataFrame(report).transpose()
    # Drop 'support' column and unwanted rows
    report_df = report_df.drop(columns='support', errors='ignore')
    report_df = report_df.drop(index=['macro avg', 'weighted avg'], errors='ignore')
    return report_df

# Evaluate each model
vader_report, vader_cm = evaluate_model('VADER_Predicted', 'Sentiment_Label')
textblob_report, textblob_cm = evaluate_model('TextBlob_Predicted', 'Sentiment_Label')
bert_report, bert_cm = evaluate_model('BERT_Predicted', 'Sentiment_Label')

# Clean and display VADER report
vader_cleaned_report = clean_classification_report(vader_report)
print("\nVADER Classification Report (Cleaned):")
print(vader_cleaned_report)
print("\nVADER Confusion Matrix:")
print(vader_cm)

# Clean and display TextBlob report
textblob_cleaned_report = clean_classification_report(textblob_report)
print("\nTextBlob Classification Report (Cleaned):")
print(textblob_cleaned_report)
print("\nTextBlob Confusion Matrix:")
print(textblob_cm)

# Clean and display BERT report
bert_cleaned_report = clean_classification_report(bert_report)
print("\nBERT Classification Report (Cleaned):")
print(bert_cleaned_report)
print("\nBERT Confusion Matrix:")
print(bert_cm)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.



VADER Classification Report (Cleaned):
          precision    recall  f1-score
Negative   0.447811  0.627358  0.522593
Neutral    0.765957  0.591781  0.667697
Positive   0.695962  0.692671  0.694313
accuracy   0.642000  0.642000  0.642000

VADER Confusion Matrix:
[[293  38  92]
 [ 77 216  72]
 [ 51  28 133]]

TextBlob Classification Report (Cleaned):
          precision    recall  f1-score
Negative   1.000000  0.599057  0.749263
Neutral    0.667276  1.000000  0.800439
Positive   1.000000  0.770686  0.870494
accuracy   0.818000  0.818000  0.818000

TextBlob Confusion Matrix:
[[326  97   0]
 [  0 365   0]
 [  0  85 127]]

BERT Classification Report (Cleaned):
          precision    recall  f1-score
Negative   0.275148  0.877358  0.418919
Neutral    0.588235  0.027397  0.052356
Positive   0.514658  0.373522  0.432877
accuracy   0.354000  0.354000  0.354000

BERT Confusion Matrix:
[[158   5 260]
 [125  10 230]
 [ 24   2 186]]
