### Tariq Walid Bin Abd Aziz (SW01083016)
### Montasir Kamal Eldin Mohamed (IS01080844)

In [1]:
import re
import string
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Get nrows of data from CSV file
df = pd.read_csv("Reviews.csv", nrows=70000, encoding="ISO-8859-1")
df.shape

(70000, 10)

# 1. Text Preprocessing

In [3]:
# Get english stop words
stop_words = stopwords.words('english')

# Define custom stop words
more_stopwords = ['u', 'im', 'c']
stop_words = stop_words + more_stopwords

# Stemmer and lemmatizer
stemmer = nltk.SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

In [4]:
# Function to clean and standardize
def preprocess(text):
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'\[.*?\]', '', text)  # Remove text within square brackets
    text = re.sub(r'http\S+\s*\S+', '', text)  # Remove URLs starting with http
    text = re.sub(r'www\.\S+', '', text)  # Remove URLs starting with www
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\b\w*\d\w*\b', '', text)  # Remove words containing numbers
    text = ' '.join(word for word in text.split(' ') if word not in stop_words) # Remove stopwords
    text = ' '.join(stemmer.stem(word) for word in text.split(' ')) # Stemming
    text = ' '.join(lemmatizer.lemmatize(word) for word in text.split(' ')) # Lemmatizing   
    return text

In [5]:
# Map 'Score' in dataset to 'label' column (positive, negative, neutral)
df['label'] = df.Score.map({1:'negative', 2:'negative', 3:'neutral', 4:'positive', 5:'positive'})

# Apply text preprocessing to dataset
df['text_clean'] = df['Text'].apply(preprocess)
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,label,text_clean
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,positive,bought sever vital can dog food product found ...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,negative,product arriv label jumbo salt peanutsth peanu...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,positive,confect around centuri light pillowi citrus g...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,negative,look secret ingredi robitussin believ found g...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,positive,great taffi great price wide assort yummi taf...


# 2. Feature Extraction
## Bag-of-Words for ML Model

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

texts = df['text_clean']
labels = df['label']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.4, random_state=42)

# Extract features (bag-of-words representation)
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# 3. Model Selection & Evaluation
## Lexicon-based Approach

In [7]:
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import classification_report
from tabulate import tabulate
import multiprocessing
import time

# Function to perform sentiment analysis on a single review
def analyze_sentiment(review):
    # TextBlob
    blob = TextBlob(review)
    tb_polarity = blob.sentiment.polarity
    if tb_polarity > 0:
        tb_label = 'positive'
    elif tb_polarity < 0:
        tb_label = 'negative'
    else:
        tb_label = 'neutral'
    
    # VADER
    analyzer = SentimentIntensityAnalyzer()
    vs = analyzer.polarity_scores(review)
    vader_compound = vs['compound']
    if vader_compound > 0.05:
        vader_label = 'positive'
    elif vader_compound < -0.05:
        vader_label = 'negative'
    else:
        vader_label = 'neutral'
    
    return tb_label, vader_label

if __name__ == '__main__':
    reviews = df['text_clean']
    actual_labels = df['label']

    print("LEXICON-BASED APPROACH:")

    # Get start time
    start_time = time.time()
    
    # Initialize multiprocessing pool
    pool = multiprocessing.Pool()
    
    # Perform sentiment analysis using multiprocessing
    results = pool.map(analyze_sentiment, reviews)
    
    # Close the pool to release resources
    pool.close()
    pool.join()

    # Record end time
    end_time = time.time()
    
    # Unpack the results
    tb_labels, vader_labels = zip(*results)
    
    # Calculate classification report for TextBlob
    tb_report = classification_report(actual_labels, tb_labels)
    
    # Calculate classification report for VADER
    vader_report = classification_report(actual_labels, vader_labels)
    
    # Print classification report for TextBlob
    print("\nTextBlob Classification Report:")
    print(tb_report)
    
    # Print classification report for VADER
    print("\nVADER Classification Report:")
    print(vader_report)

    # Calculate and print execution time
    execution_time = end_time - start_time
    print(f"\nExecution Time: {execution_time} seconds")

LEXICON-BASED APPROACH:

TextBlob Classification Report:
              precision    recall  f1-score   support

    negative       0.40      0.26      0.31     10537
     neutral       0.06      0.04      0.04      5538
    positive       0.81      0.90      0.85     53925

    accuracy                           0.73     70000
   macro avg       0.42      0.40      0.40     70000
weighted avg       0.69      0.73      0.71     70000


VADER Classification Report:
              precision    recall  f1-score   support

    negative       0.49      0.26      0.34     10537
     neutral       0.10      0.05      0.06      5538
    positive       0.81      0.93      0.87     53925

    accuracy                           0.76     70000
   macro avg       0.47      0.41      0.42     70000
weighted avg       0.71      0.76      0.72     70000


Execution Time: 489.63700795173645 seconds


## Machine-Learning-Based Approach

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import multiprocessing
import time

In [9]:
# Funtion for training classifier
def train_classifier(classifier, X_train, y_train):
    classifier.fit(X_train, y_train)
    return classifier

# Initialize classifiers
nb_classifier = MultinomialNB()
svm_classifier = SVC(kernel='linear')

start_time = time.time()

# Initialize multiprocessing pool
pool = multiprocessing.Pool()

# Train classifiers using multiprocessing pool
nb_classifier = pool.apply(train_classifier, args=(nb_classifier, X_train, y_train))
svm_classifier = pool.apply(train_classifier, args=(svm_classifier, X_train, y_train))

# Close multiprocessing pool
pool.close()
pool.join()

end_time = time.time()

print("TRAINING COMPLETED")
# Calculate and print execution time
execution_time = end_time - start_time
print(f"\nExecution Time: {execution_time} seconds")

TRAINING COMPLETED

Execution Time: 2547.965203523636 seconds


In [11]:
# Machine-learning-based approach using Naive Bayes and SVM
print("MACHINE-LEARNING-BASED APPROACH:")

start_time = time.time()

# Predict sentiment using classifiers
for text, actual_label in zip(X_test, y_test):
    # Predict sentiment using Naive Bayes
    nb_prediction = nb_classifier.predict(text)[0]

    # Predict sentiment using SVM
    svm_prediction = svm_classifier.predict(text)[0]

end_time = time.time()

# Calculate classification report for Naive Bayes
nb_classification_report = classification_report(y_test, nb_classifier.predict(X_test), target_names=['negative', 'neutral', 'positive'])

# Calculate classification report for SVM
svm_classification_report = classification_report(y_test, svm_classifier.predict(X_test), target_names=['negative', 'neutral', 'positive'])

# Print classification report for Naive Bayes
print("\nClassification Report for Naive Bayes:")
print(nb_classification_report)

# Print classification report for SVM
print("\nClassification Report for SVM:")
print(svm_classification_report)

# Calculate and print execution time
execution_time = end_time - start_time
print(f"\nExecution Time: {execution_time} seconds")

MACHINE-LEARNING-BASED APPROACH:

Classification Report for Naive Bayes:
              precision    recall  f1-score   support

    negative       0.73      0.48      0.58      4231
     neutral       0.44      0.06      0.10      2221
    positive       0.84      0.98      0.90     21548

    accuracy                           0.83     28000
   macro avg       0.67      0.50      0.53     28000
weighted avg       0.79      0.83      0.79     28000


Classification Report for SVM:
              precision    recall  f1-score   support

    negative       0.65      0.66      0.65      4231
     neutral       0.36      0.27      0.31      2221
    positive       0.90      0.92      0.91     21548

    accuracy                           0.83     28000
   macro avg       0.64      0.62      0.62     28000
weighted avg       0.82      0.83      0.83     28000


Execution Time: 308.5540678501129 seconds


# 4. Discussion

#### Both lexicon-based and ML approaches perform well in classifying positive sentiments. However, ML methods (using Naive Bayes and SVM classifiers) perform slightly better over lexicon-based models in positive sentiment classification.

#### Both lexicon-based and ML approaches struggle poorly in accurately classifying neutral reviews, with lexicon methods almost always misclassifying them. Eventhough Naive Bayes and SVM classifiers outperform TextBlob and VADER analyzers, they still show weaknees in predicting neutral sentiments.

#### Both ML classification methods show decent performance in classifying negative reviews, with Naive Bayes demonstrating a slight edge over the SVM classifier. On the contrary, TextBlob and SVM analyzers show poor performance in classifying negative reviews.

#### In terms of accuracy, ML classification methods outperform lexicon-based analyzers by a margin of 7%. 