Importing and Preprocessing Datasets

In [1]:
import pandas as pd

test_data = pd.read_csv('sentiment-topic-test.tsv', sep='\t')
test_data.drop('sentence_id', axis=1, inplace=True)
print(test_data.head())

                                            sentence sentiment   topic
0  The stadium was alive with the roar of the cro...  positive  sports
1  That last-minute goal had me jumping out of my...  positive  sports
2  I couldn’t put the book down; it swept me into...  positive    book
3  The story had its moments, though some parts f...   neutral    book
4  I enjoyed the way the timelines shifted, even ...   neutral    book


In [4]:
sentences = pd.read_csv('stanfordSentimentTreebank\datasetSentences.txt', sep='\t')
sentiments = pd.read_csv('stanfordSentimentTreebank\sentiment_labels.txt', sep='|', engine='python')
sentiments.columns = ['sentence_index', 'sentiment_value']

train_data_stf = pd.merge(sentences, sentiments, on='sentence_index')
train_data_stf = train_data_stf.rename(columns={'sentence': 'text', 'sentiment_value': 'rating'})
train_data_stf.drop('sentence_index', axis=1, inplace=True)
print(train_data_stf.head())

                                                text   rating
0  The Rock is destined to be the 21st Century 's...  0.50000
1  The gorgeously elaborate continuation of `` Th...  0.44444
2                     Effective but too-tepid biopic  0.50000
3  If you sometimes like to go to the movies to h...  0.42708
4  Emerges as something rare , an issue movie tha...  0.37500


In [14]:
from datasets import load_dataset

dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023", 
    "raw_review_Books", 
    split="full[:100000]", 
    trust_remote_code=True
    )

df = pd.DataFrame(dataset)
train_data = df[['rating', 'text']]

print("Original rating distribution:")
print(train_data['rating'].value_counts().sort_index())

min_count = train_data['rating'].value_counts().min()
print(f"\nMinimum count: {min_count}")

train_data_amazon = train_data.groupby('rating').apply(
    lambda x: x.sample(n=min_count, random_state=42)
).reset_index(drop=True)

print(f"\nBalanced rating distribution:")
print(train_data_amazon['rating'].value_counts().sort_index())
print(f"\nOriginal dataset size: {len(train_data)}")
print(f"Balanced dataset size: {len(train_data_amazon)}")

Original rating distribution:
rating
1.0     2773
2.0     3666
3.0    10006
4.0    23875
5.0    59680
Name: count, dtype: int64

Minimum count: 2773

Balanced rating distribution:
rating
1.0    2773
2.0    2773
3.0    2773
4.0    2773
5.0    2773
Name: count, dtype: int64

Original dataset size: 100000
Balanced dataset size: 13865


  train_data_amazon = train_data.groupby('rating').apply(


In [33]:
train_data_stf['rating'] = (train_data_stf['rating'] * 5).clip(1, 5).round()
train_data_combined = pd.concat([train_data_amazon, train_data_stf], ignore_index=True)
train_data_combined = train_data_combined.sample(frac=1, random_state=42).reset_index(drop=True)
print(train_data_combined.head())

   rating                                               text
0     1.0  The premise of this book was a good one. A boo...
1     5.0  Simply and eloquently articulates the tangled ...
2     5.0  Janet Stevens is a wonderful illustrator.  Whe...
3     5.0  The film makes a tragic error by going on for ...
4     5.0  and your reward will be a thoughtful , emotion...


Multinominal Naive Bayes and Logistic Regression Implementation

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

def rating_to_sentiment(rating):
    if rating >= 3.0:
        return 'positive'
    elif rating <= 2.0:
        return 'negative'
    else:
        return 'neutral'

#vectorizer = CountVectorizer(min_df=2)
vectorizer = TfidfVectorizer(min_df=2) # Changing min_df doesnt make difference

train_data_combined['sentiment'] = train_data_combined['rating'].apply(rating_to_sentiment)

X_train, X_test, y_train, y_test = train_test_split(
    train_data_combined['text'], 
    train_data_combined['sentiment'], 
    test_size=0.2, 
    random_state=42)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

#clf = MultinomialNB()
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_train)
y_pred = clf.predict(X_test_vec)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.79      0.53      0.63      1062
    positive       0.89      0.96      0.92      4082

    accuracy                           0.87      5144
   macro avg       0.84      0.74      0.78      5144
weighted avg       0.87      0.87      0.86      5144



VADER Implementation

In [35]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import spacy

nlp = spacy.load("en_core_web_sm")
vader_model = SentimentIntensityAnalyzer()
pos = set()

def run_vader(sentence, lemmatize=True, parts_of_speech_to_consider=pos):
    """
    Run VADER on a sentence and return the scores.
    """
    doc = nlp(sentence)
    input_to_vader = []

    for sent in doc.sents:
        for token in sent:
            to_add = token.text

            if lemmatize:
                to_add = token.lemma_

                if to_add == '-PRON-': 
                    to_add = token.text

            if parts_of_speech_to_consider:
                if token.pos_ in parts_of_speech_to_consider:
                    input_to_vader.append(to_add) 
            else:
                input_to_vader.append(to_add)

    scores = vader_model.polarity_scores(' '.join(input_to_vader))
    return scores

def vader_output_to_label(vader_output):
    """
    Convert VADER output to a label.
    """
    compound = vader_output['compound']
    
    if compound >= 0.1:
        return 'positive'
    elif compound <= -0.1:
        return 'negative'
    else:
        return 'neutral'
    
train_data_half = train_data_combined.sample(frac=0.5, random_state=42)
predictions = train_data_half['text'].apply(lambda x: vader_output_to_label(run_vader(x)))
gold = train_data_half['sentiment']

print(f"Sample prediction: {predictions.iloc[2]}, gold: {gold.iloc[2]}")
print(classification_report(gold, predictions))
print(confusion_matrix(gold, predictions))
print(f'accuracy: {accuracy_score(gold, predictions)}')

Sample prediction: positive, gold: positive


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

    negative       0.35      0.42      0.38      2753
     neutral       0.00      0.00      0.00         0
    positive       0.83      0.64      0.73     10107

    accuracy                           0.60     12860
   macro avg       0.39      0.36      0.37     12860
weighted avg       0.73      0.60      0.65     12860

[[1161  300 1292]
 [   0    0    0]
 [2156 1443 6508]]
accuracy: 0.5963452566096423
