Importing and Preprocessing Datasets

In [None]:
import pandas as pd

test_data = pd.read_csv('sentiment-topic-test.tsv', sep='\t')
test_data.drop('sentence_id', axis=1, inplace=True)
print(test_data.head())

                                            sentence sentiment   topic
0  The stadium was alive with the roar of the cro...  positive  sports
1  That last-minute goal had me jumping out of my...  positive  sports
2  I couldn’t put the book down; it swept me into...  positive    book
3  The story had its moments, though some parts f...   neutral    book
4  I enjoyed the way the timelines shifted, even ...   neutral    book


In [73]:
sentences = pd.read_csv('stanfordSentimentTreebank\datasetSentences.txt', sep='\t')
sentiments = pd.read_csv('stanfordSentimentTreebank\sentiment_labels.txt', sep='|', engine='python')
sentiments.columns = ['sentence_index', 'sentiment_value']

train_data_stf = pd.merge(sentences, sentiments, on='sentence_index')
train_data_stf = train_data_stf.rename(columns={'sentence': 'text', 'sentiment_value': 'rating'})
train_data_stf.drop('sentence_index', axis=1, inplace=True)
print(train_data_stf.head())

                                                text   rating
0  The Rock is destined to be the 21st Century 's...  0.50000
1  The gorgeously elaborate continuation of `` Th...  0.44444
2                     Effective but too-tepid biopic  0.50000
3  If you sometimes like to go to the movies to h...  0.42708
4  Emerges as something rare , an issue movie tha...  0.37500


In [68]:
from datasets import load_dataset

dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023", 
    "raw_review_Movies_and_TV", 
    split="full[:100000]", 
    trust_remote_code=True
    )

df = pd.DataFrame(dataset)  # Convert to pandas DataFrame
train_data = df[['rating', 'text']]
# train_data = train_data[train_data['text'].str.split().str.len() <= 30]

# Check original distribution
print("Original rating distribution:")
print(train_data['rating'].value_counts().sort_index())

# Find the minimum count across all rating classes
min_count = train_data['rating'].value_counts().min()
print(f"\nMinimum count: {min_count}")

# Balance the classes by sampling equal amounts from each rating
balanced_data = train_data.groupby('rating').apply(
    lambda x: x.sample(n=min_count, random_state=42)
).reset_index(drop=True)

print(f"\nBalanced rating distribution:")
print(balanced_data['rating'].value_counts().sort_index())

print(f"\nOriginal dataset size: {len(train_data)}")
print(f"Balanced dataset size: {len(balanced_data)}")

# Use balanced_data for your training
train_data_amazon = balanced_data

Original rating distribution:
rating
1.0     6322
2.0     4423
3.0     8588
4.0    16531
5.0    64136
Name: count, dtype: int64

Minimum count: 4423

Balanced rating distribution:
rating
1.0    4423
2.0    4423
3.0    4423
4.0    4423
5.0    4423
Name: count, dtype: int64

Original dataset size: 100000
Balanced dataset size: 22115


  balanced_data = train_data.groupby('rating').apply(


In [72]:
train_data_amazon.head()

Unnamed: 0,rating,text
0,1.0,The same exact thing over and over. Nothing bu...
1,1.0,I turned it off after 10 minutes because it wa...
2,1.0,"Wow, I was SO EXCITED when this movie hit Amaz..."
3,1.0,Hollywood style attempt at recreating the horr...
4,1.0,"The Office is one of my very favorite shows, s..."


In [None]:
train_data_stf['rating'] = (train_data_stf['rating'] * 5).clip(1, 5).round()
train_data_combined = pd.concat([train_data_amazon, train_data_stf], ignore_index=True)

print(train_data_combined.head())

       rating                                               text
5         1.0         Don't be fooled, there is no mystery here.
6         1.0                                         Too boring
7         1.0  This is a scam.  Got an email for a free digit...
8         1.0  Another shock jock trying to pass off as humor...
9         1.0           Sorry I even watched 15 minutes of this.
...       ...                                                ...
33965     5.0                                    A real snooze .
33966     5.0                                     No surprises .
33967     5.0  We 've seen the hippie-turned-yuppie plot befo...
33968     5.0  Her fans walked out muttering words like `` ho...
33969     5.0                                In this case zero .

[33965 rows x 2 columns]


Analysis with Multinominal Naive Bayes

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

def rating_to_sentiment(rating):
    if rating >= 3.0:
        return 'positive'
    elif rating <= 2.0:
        return 'negative'
    else:
        return 'neutral'

vectorizer = CountVectorizer(min_df=3)

train_data_combined['sentiment'] = train_data_combined['rating'].apply(rating_to_sentiment)
X_train_text = train_data_combined['text']
y_train_labels = train_data_combined['sentiment']

# Fit vectorizer on training text and transform both datasets
X_train_vec = vectorizer.fit_transform(X_train_text)
X_test_vec = vectorizer.transform(test_data['sentence'])  # Use 'sentence' column from test data

# Train the model
clf = MultinomialNB()
clf.fit(X_train_vec, y_train_labels)
y_pred = clf.predict(X_test_vec)

y_test_actual = test_data['sentiment']

target_names = ['negative', 'neutral', 'positive']
print(classification_report(y_test_actual, y_pred, target_names=target_names))

              precision    recall  f1-score   support

    negative       0.56      0.83      0.67         6
     neutral       0.00      0.00      0.00         6
    positive       0.44      0.67      0.53         6

    accuracy                           0.50        18
   macro avg       0.33      0.50      0.40        18
weighted avg       0.33      0.50      0.40        18



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Analysis with VADER Model

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import spacy

nlp = spacy.load("en_core_web_sm")
vader_model = SentimentIntensityAnalyzer()
pos = set()

def run_vader(sentence, lemmatize=True, parts_of_speech_to_consider=pos):
    """
    Run VADER on a sentence and return the scores.
    """
    doc = nlp(sentence)
    input_to_vader = []

    for sent in doc.sents:
        for token in sent:
            to_add = token.text

            if lemmatize:
                to_add = token.lemma_

                if to_add == '-PRON-': 
                    to_add = token.text

            if parts_of_speech_to_consider:
                if token.pos_ in parts_of_speech_to_consider:
                    input_to_vader.append(to_add) 
            else:
                input_to_vader.append(to_add)

    scores = vader_model.polarity_scores(' '.join(input_to_vader))
    return scores

def vader_output_to_label(vader_output):
    """
    Convert VADER output to a label.
    """
    compound = vader_output['compound']
    
    if compound > 0:
        return 'positive'
    elif compound < 0:
        return 'negative'
    else:
        return 'neutral'
    
tweets = []
all_vader_output = []
gold = []

for id_, tweet_info in my_tweets.items():
    the_tweet = tweet_info['text_of_tweet']
    vader_output = run_vader(tweet_info['text_of_tweet']) # run vader
    vader_label = vader_output_to_label(vader_output) # convert vader output to category
    
    tweets.append(the_tweet)
    all_vader_output.append(vader_label)
    gold.append(tweet_info['sentiment_label'])
    

print(tweets[2])
print(f'prediction: {all_vader_output[2]}, gold: {gold[2]}')

print(classification_report(gold, all_vader_output))
print(confusion_matrix(gold, all_vader_output))
print(f'accuracy: {accuracy_score(gold, all_vader_output)}')