This code tokenizes the words in each review and derives a sentiment label using NLTK packages and resources. The analysis is further supported by pre-trained GloVe word embeddings and by rule-based handling of negations and "but" clauses.

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gensim.downloader as api
import numpy as np

# Load your dataset
# The path lives under /content/drive, so Google Drive must already be mounted
# when this line runs (a drive.mount cell appears further down in this file).
# compute_sentiment reads the 'rating' and 'text' columns of this frame.
df = pd.read_excel('/content/drive/MyDrive/DSA3101 Data/Clean Data/dataset_tripadvisor-reviews_2024_cleaned.xlsx')

# Initialize nltk resources
# 'punkt' backs word_tokenize, 'stopwords' backs stopwords.words and
# 'wordnet' backs WordNetLemmatizer.
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' for
# word_tokenize -- confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# Stored as a set so the per-token membership test in preprocess_text is cheap.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Load pre-trained GloVe embeddings
# Fetched through gensim's downloader; get_word_vector looks words up in this
# object and sizes its fallback vector from glove.vector_size.
glove = api.load("glove-wiki-gigaword-100")

# Define sentiment dictionaries
# compute_sentiment adds 1 for each token found in positive_words and
# subtracts 1 for each token found in negative_words.
positive_words = ['good', 'excellent', 'lovely', 'amazing', 'perfect', 'awesome']
negative_words = ['bad', 'expensive', 'rude', 'terrible', 'poor', 'awful']
# neutral_words is not referenced by any code visible in this file.
neutral_words = ['average', 'okay', 'fine', 'decent', 'satisfactory']

# Function to get word vector from embeddings
def get_word_vector(word):
    """Return the GloVe embedding for ``word``.

    Words missing from the embedding vocabulary map to an all-zero vector
    of length ``glove.vector_size`` so callers always receive a vector.
    """
    if word in glove:
        return glove[word]
    return np.zeros(glove.vector_size)

# Tokens the stop-word filter must never discard: the rule-based steps that
# run on the token list (handle_but / handle_negations) search for them, and
# NLTK's English stop-word list contains "but", "not" and "no".
_SENTIMENT_CUES = frozenset({"but", "not", "no", "never"})

# Function to preprocess text and tokenize
def preprocess_text(text):
    """Tokenize a review and look up an embedding for every kept token.

    The text is lower-cased and tokenized with ``word_tokenize``. Purely
    alphabetic tokens that are not stop words are lemmatized and kept.
    The contrast/negation cues "but", "not", "no" and "never" are kept
    verbatim even though most of them are stop words, and the contraction
    fragment "n't" is normalised to "not", so that later negation and
    "but" handling can still see them.

    Args:
        text: Raw review text. Non-string values (e.g. NaN from an empty
            cell) are treated as an empty review.

    Returns:
        A ``(tokens, word_vectors)`` tuple: the list of kept tokens and a
        parallel list with one vector per token from ``get_word_vector``.
    """
    if not isinstance(text, str):
        return [], []

    tokens = []
    for raw in word_tokenize(text.lower()):
        # word_tokenize splits contractions ("isn't" -> "is", "n't"); the
        # "n't" piece would otherwise be dropped by the isalpha() check.
        if raw == "n't":
            raw = "not"

        if raw in _SENTIMENT_CUES:
            tokens.append(raw)  # keep cue words exactly as written
        elif raw.isalpha() and raw not in stop_words:
            tokens.append(lemmatizer.lemmatize(raw))

    word_vectors = [get_word_vector(word) for word in tokens]
    return tokens, word_vectors  # Return both tokens and their vectors

# Function to handle mixed feelings around "but"
def handle_but(tokens):
    """Split ``tokens`` around the first occurrence of "but".

    Returns a ``(before, after)`` pair. The "but" token itself is excluded
    from both halves. When there is no "but", the original token sequence
    is returned unchanged as ``before`` and ``after`` is an empty list.
    """
    if 'but' not in tokens:
        return tokens, []

    pivot = tokens.index('but')
    head, tail = tokens[:pivot], tokens[pivot + 1:]
    return head, tail

# Function to handle negations
def handle_negations(tokens):
    """Fold each negation word into the token that follows it.

    A negation word ("not", "no", "never", "isn't", "wasn't") is removed
    from the output and the next non-negation token is emitted with a
    "not_" prefix. Several negations in a row collapse into a single
    prefix, and a negation with nothing after it is simply dropped.
    Returns a new list; the input is not modified.
    """
    negation_words = ("not", "no", "never", "isn't", "wasn't")
    result = []
    pending = False  # True while a negation is waiting for its target word

    for token in tokens:
        if token in negation_words:
            pending = True
            continue
        result.append("not_" + token if pending else token)
        pending = False

    return result

# Helper: net score of a token list against the sentiment word lists
def _lexicon_score(tokens):
    """Return the net sentiment score of ``tokens``.

    Each token found in ``positive_words`` counts +1 and each token found in
    ``negative_words`` counts -1. A token carrying the "not_" prefix produced
    by ``handle_negations`` is looked up without the prefix and its
    contribution is inverted, so "not_good" counts -1 and "not_bad" +1.
    """
    score = 0
    for token in tokens:
        negated = token.startswith("not_")
        word = token[len("not_"):] if negated else token
        polarity = -1 if negated else 1

        if word in positive_words:
            score += polarity
        elif word in negative_words:
            score -= polarity
    return score

# Function to compute sentiment based on rating and text
def compute_sentiment(row):
    """Label one review as negative (-1), neutral (0) or positive (1).

    The score starts from the star rating (1-2 -> -1, 4-5 -> +1, anything
    else -> 0) and is then adjusted by the sentiment words in the text.
    If the review contains "but", the score is reset and only the words
    after the first "but" are counted, so the rating and the words before
    "but" no longer contribute.

    Args:
        row: A mapping or Series exposing 'rating' and 'text'.

    Returns:
        -1, 0 or 1 according to the sign of the final score.
    """
    rating = row['rating']
    text = row['text']

    # Missing cells typically arrive as NaN (a float) and numeric-only
    # reviews as numbers; normalise both so tokenization cannot fail.
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)

    # Start with the rating sentiment
    if rating in (1, 2):
        sentiment_score = -1  # Likely negative
    elif rating in (4, 5):
        sentiment_score = 1  # Likely positive
    else:
        sentiment_score = 0  # Likely neutral

    # Tokenize; the word vectors are not needed for the rule-based score
    tokens, _ = preprocess_text(text)

    # Handle negations
    tokens = handle_negations(tokens)

    # Handle mixed feelings with "but"
    before_but, after_but = handle_but(tokens)

    # Words before "but" (or the whole review when there is no "but")
    # adjust the rating-based score.
    sentiment_score += _lexicon_score(before_but)

    if after_but:
        # Reset to focus on post-"but" sentiment
        sentiment_score = _lexicon_score(after_but)

    # NOTE: no review-length weighting is applied. Only the sign of the score
    # is returned, and scaling by a positive factor cannot change a sign.

    # Return final sentiment (-1 for negative, 0 for neutral, 1 for positive)
    if sentiment_score > 0:
        return 1
    elif sentiment_score < 0:
        return -1
    else:
        return 0

# Apply the sentiment function to your dataset
# axis=1 passes each row to compute_sentiment; the returned -1/0/1 labels are
# stored in a new 'computed_sentiment' column.
df['computed_sentiment'] = df.apply(compute_sentiment, axis=1)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...




In [None]:
# Save the result to a new Excel file
# Written under /content rather than the Drive folder; index=False leaves the
# DataFrame index out of the sheet.
df.to_excel('/content/processed_reviews_with_improved_sentiment.xlsx', index=False)

In [None]:
# Mount Google Drive at /content/drive. Both the dataset path read at the top
# of this file and the CSV export folder below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# export as csv
# The target is built by plain string concatenation, so csv_folder_path must
# keep its trailing slash.
csv_file_name = "Edited_SentimentAnalysisBelle.csv"
csv_folder_path = '/content/drive/MyDrive/DSA3101 Data/Subgroup A/'

df.to_csv(csv_folder_path + csv_file_name, index=False)  # exclude index
print('Done')

Done
