In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

nltk.download('stopwords')
nltk.download('wordnet')

df = pd.read_csv('Reviews.csv')
df = df[['Score', 'Text']]

df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

def map_sentiment(score):
    if score >= 4:
        return 'positive'
    elif score == 3:
        return 'neutral'
    else:
        return 'negative'

df['Sentiment'] = df['Score'].apply(map_sentiment)
def clean_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    words = text.split()
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    cleaned = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(cleaned)

df['CleanText'] = df['Text'].apply(clean_text)
df[['CleanText', 'Sentiment']].to_csv('processed_reviews.csv', index=False)

# Step 2: Feature Extraction

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['CleanText'])
y = df['Sentiment']

# Step 3: Train/Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Machine Learning Models

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Linear SVC': LinearSVC(),
    'Random Forest': RandomForestClassifier()
}


for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n===== {name} =====")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

# Step 5: Lexicon-Based Sentiment (VADER)

from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()

def vader_sentiment(text):
    score = sid.polarity_scores(text)['compound']
    if score >= 0.05:
        return 'positive'
    elif score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

df_sample = df.sample(5000, random_state=42)
df_sample['VADER_Predicted'] = df_sample['Text'].apply(vader_sentiment)

from sklearn.metrics import accuracy_score
vader_accuracy = accuracy_score(df_sample['Sentiment'], df_sample['VADER_Predicted'])
print(f"\n===== VADER Lexicon-Based Sentiment Accuracy: {vader_accuracy:.4f} =====")

print("""
Name: NUR SYAKILA BINTI IZWAN HADI WONG
Student ID: IS01082922

Discussion:
Lexicon-based sentiment analysis using VADER is simple and doesn't require training data,
but it struggles with neutral tones and complex expressions. Among ML models, Linear SVC
and Logistic Regression performed best, especially in handling large feature spaces like TF-IDF.
They can capture subtleties in review texts, though they require more processing and training time.

In conclusion, ML-based models provide better performance and flexibility for sentiment analysis
on this Amazon review dataset compared to lexicon-based approaches.
""")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nsbih\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nsbih\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



===== Naive Bayes =====
[[ 2923    12  8351]
 [  311    21  5553]
 [  284     3 61277]]
              precision    recall  f1-score   support

    negative       0.83      0.26      0.39     11286
     neutral       0.58      0.00      0.01      5885
    positive       0.82      1.00      0.90     61564

    accuracy                           0.82     78735
   macro avg       0.74      0.42      0.43     78735
weighted avg       0.80      0.82      0.76     78735


===== Logistic Regression =====
[[ 7275   446  3565]
 [ 1343   863  3679]
 [ 1506   564 59494]]
              precision    recall  f1-score   support

    negative       0.72      0.64      0.68     11286
     neutral       0.46      0.15      0.22      5885
    positive       0.89      0.97      0.93     61564

    accuracy                           0.86     78735
   macro avg       0.69      0.59      0.61     78735
weighted avg       0.83      0.86      0.84     78735


===== Linear SVC =====
[[ 7352   203  3731]
 [ 1468

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\nsbih\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!



===== VADER Lexicon-Based Sentiment Accuracy: 0.7968 =====

Name: NUR SYAKILA BINTI IZWAN HADI WONG
Student ID: IS01082922

Discussion:
Lexicon-based sentiment analysis using VADER is simple and doesn't require training data,
but it struggles with neutral tones and complex expressions. Among ML models, Linear SVC
and Logistic Regression performed best, especially in handling large feature spaces like TF-IDF.
They can capture subtleties in review texts, though they require more processing and training time.

In conclusion, ML-based models provide better performance and flexibility for sentiment analysis
on this Amazon review dataset compared to lexicon-based approaches.



In [4]:
df_processed = pd.read_csv('processed_reviews.csv')

midpoint = len(df_processed) // 2
df_part1 = df_processed.iloc[:midpoint]
df_part2 = df_processed.iloc[midpoint:]

df_part1.to_csv('processed_reviews_part1.csv', index=False)
df_part2.to_csv('processed_reviews_part2.csv', index=False)