## Aspect-Based Sentiment Analysis

## Load and preprocess data

In [1]:
import pandas as pd
import stanza

# Load the Excel file
file_path = './imdb_movies.xlsx'
df = pd.read_excel(file_path)

# Display the first few rows of the dataframe
print(df.head())


def _split_reviews(raw_reviews):
    """Split a semicolon-separated review string into a list of reviews.

    Each fragment is stripped of surrounding whitespace and double quotes.
    Fragments that end up blank (for example from a trailing ';' or from ';;')
    are dropped so they do not turn into empty review rows after exploding.
    """
    cleaned = (fragment.strip().strip('"') for fragment in raw_reviews.split(';'))
    return [review for review in cleaned if review.strip()]


# Drop rows with missing values and split the semicolon-separated reviews into
# individual entries. A new frame is built (no inplace mutation) so the cell
# gives the same result every time it is re-run.
df = df.dropna().assign(reviews=lambda frame: frame['reviews'].apply(_split_reviews))

# Explode the reviews into separate rows. A movie whose review list is empty
# explodes to a single NaN row, which is removed here; .copy() makes the result
# an independent frame that later cells can safely add columns to.
exploded = df.explode('reviews')
df_exploded = exploded[exploded['reviews'].notna()].copy()

# Print the length of the reviews column array
print(f'Number of reviews: {len(df_exploded["reviews"])}')

# Display the first few rows after processing
print(df_exploded.head())

           movie_title                     genres  year  \
0         The Fall Guy        action;comedy;drama  2024   
1              Hellboy  action;adventure; fantasy  2019   
2       Train to Busan     action;horror;thriller  2016   
3  Mother of the Bride       comedy;drama;romance  2024   
4        The Iron Claw      biography;drama;sport  2023   

                                             reviews  
0  "A disappointing spectacle ."; "Absolute pure ...  
1  "Terrible Writing, Weird Directing and Awful C...  
2  "Unforgettable experience! One of the best zom...  
3  "Poor acting."; "Another rom com nightmare .";...  
4  "To me, this was good, but not great."; "Wheth...  
Number of reviews: 36
    movie_title                     genres  year  \
0  The Fall Guy        action;comedy;drama  2024   
0  The Fall Guy        action;comedy;drama  2024   
0  The Fall Guy        action;comedy;drama  2024   
1       Hellboy  action;adventure; fantasy  2019   
1       Hellboy  action;adventure

## Define aspect keywords and collect their lemmas with Stanza

In [5]:
# Fetch the default English models, then build the tokenize + lemma pipeline
# used to normalise the aspect keywords.
stanza.download(lang='en')
nlp = stanza.Pipeline(lang='en', processors='tokenize,lemma')

# Aspect lexicon: for every aspect, the seed words that signal a positive or a
# negative opinion about it.
aspects_keywords = dict(
    Meaning=dict(
        positive=["good", "great", "meaningful"],
        negative=["lacks", "emptiness", "doomed"],
    ),
    Quality=dict(
        positive=["excellent", "impressive", "outstanding", "superb", "amazing"],
        negative=[
            "poor",
            "bad",
            "awful",
            "terrible",
            "disappointing",
            "mediocre",
            "low budget",
        ],
    ),
    Satisfaction=dict(
        positive=["enjoyable", "satisfying", "pleasurable", "entertaining"],
        negative=["boring", "tedious", "dull", "displeasing"],
    ),
)

# Collect the Stanza lemma(s) of an aspect keyword. Despite the name, no synonym
# lookup happens here: the keyword is only tokenized and lemmatized.
def get_synonyms(word):
    """Return the unique lemmas Stanza produces for ``word``, in first-seen order.

    Args:
        word: A keyword or short phrase (e.g. "low budget"); it is passed to the
            module-level ``nlp`` pipeline as-is.

    Returns:
        A list with one lemma per distinct word of the input. A multi-word
        phrase therefore yields several entries. Words for which the pipeline
        reports no lemma are skipped.
    """
    doc = nlp(word)
    # Dict keys de-duplicate while keeping insertion order. A set would make the
    # order of the returned list depend on string hashing, which changes
    # between kernel restarts.
    lemmas = {}
    for sentence in doc.sentences:
        for token in sentence.words:
            if token.lemma:
                lemmas[token.lemma] = None
    return list(lemmas)

# Build, for every aspect and polarity, the flat list of get_synonyms() results
# for each of its keywords (keyword order preserved, duplicates kept).
synonyms_dict = {
    aspect_name: {
        polarity_name: [
            variant
            for seed in polarity_keywords[polarity_name]
            for variant in get_synonyms(seed)
        ]
        for polarity_name in ("positive", "negative")
    }
    for aspect_name, polarity_keywords in aspects_keywords.items()
}

print(synonyms_dict)


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-06-02 20:13:10 INFO: Downloaded file to C:\Users\Hoang Anh\stanza_resources\resources.json
2024-06-02 20:13:10 INFO: Downloading default packages for language: en (English) ...
2024-06-02 20:13:11 INFO: File exists: C:\Users\Hoang Anh\stanza_resources\en\default.zip
2024-06-02 20:13:16 INFO: Finished downloading models and saved to C:\Users\Hoang Anh\stanza_resources
2024-06-02 20:13:16 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-06-02 20:13:16 INFO: Downloaded file to C:\Users\Hoang Anh\stanza_resources\resources.json
2024-06-02 20:13:17 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| lemma     | combined_nocharlm |

2024-06-02 20:13:17 INFO: Using device: cpu
2024-06-02 20:13:17 INFO: Loading: tokenize
2024-06-02 20:13:17 INFO: Loading: mwt
2024-06-02 20:13:17 INFO: Loading: lemma
2024-06-02 20:13:17 INFO: Done loading processors!


{'Meaning': {'positive': ['good', 'great', 'meaningful'], 'negative': ['lack', 'emptiness', 'doom']}, 'Quality': {'positive': ['excellent', 'impressive', 'outstanding', 'superb', 'amazing'], 'negative': ['poor', 'bad', 'awful', 'terrible', 'disappointing', 'mediocre', 'budget', 'low']}, 'Satisfaction': {'positive': ['enjoyable', 'satisfy', 'pleasurable', 'entertaining'], 'negative': ['boring', 'tedious', 'dull', 'displease']}}


## Classifying words

In [8]:
# Second Stanza pipeline: tokenization plus the sentiment classifier.
nlp_sentiment = stanza.Pipeline(lang='en', processors='tokenize,sentiment')

# Function to classify the words of a review into aspects using Stanza
def classify_words(sentence):
    """Assign every word of ``sentence`` to positive/negative lists per aspect.

    A word that matches an aspect keyword is filed under that keyword's
    polarity. A word is compared in its lowercased surface form and, when the
    pipeline provides one, its lemma, against both the original keywords in
    ``aspects_keywords`` and their lemmas in ``synonyms_dict``. Without the
    original keywords, words such as "lacks" could never match because only
    the lemma "lack" is stored in ``synonyms_dict``.

    A word that matches no keyword of an aspect falls back to the sentiment of
    the sentence that contains it (Stanza labels a whole sentence, not single
    words): it is added to the aspect's positive list for a positive sentence,
    to the negative list for a negative sentence, and ignored for a neutral one.

    Args:
        sentence: Review text passed to the module-level ``nlp_sentiment``
            pipeline.

    Returns:
        A tuple ``(aspect_positive_words, aspect_negative_words)``; each is a
        dict mapping every aspect name to a list of lowercased words.
    """
    # Stanza sentence-level sentiment labels: 0 = negative, 1 = neutral, 2 = positive.
    negative_label, positive_label = 0, 2

    aspect_positive_words = {aspect: [] for aspect in aspects_keywords.keys()}
    aspect_negative_words = {aspect: [] for aspect in aspects_keywords.keys()}

    # Lookup sets built once per call: lemmatized keywords plus the keywords as
    # originally written.
    lexicon = {}
    for aspect, words in synonyms_dict.items():
        written = aspects_keywords.get(aspect, {})
        lexicon[aspect] = {
            polarity: set(words[polarity]) | set(written.get(polarity, ()))
            for polarity in ("positive", "negative")
        }

    doc = nlp_sentiment(sentence)
    for sent in doc.sentences:
        sentence_sentiment = sent.sentiment
        for word in sent.words:
            token = word.text.lower()
            # The lemma is only populated when the pipeline includes the lemma
            # processor; otherwise only the surface form is compared.
            lemma = getattr(word, "lemma", None)
            forms = {token, lemma.lower()} if lemma else {token}
            for aspect, terms in lexicon.items():
                if forms & terms["positive"]:
                    aspect_positive_words[aspect].append(token)
                elif forms & terms["negative"]:
                    aspect_negative_words[aspect].append(token)
                elif sentence_sentiment == positive_label:
                    aspect_positive_words[aspect].append(token)
                elif sentence_sentiment == negative_label:
                    aspect_negative_words[aspect].append(token)
    return aspect_positive_words, aspect_negative_words

# Run the classifier over every review; keep a copy of the text under 'Sentence'.
df_exploded['Sentence'] = df_exploded['reviews']
classifications = df_exploded['Sentence'].apply(classify_words)

# Each classification is a (positive, negative) pair of per-aspect word lists.
# Write one space-joined text column per aspect and polarity.
for aspect in aspects_keywords:
    for slot, polarity in enumerate(("positive", "negative")):
        df_exploded[f"{aspect}_{polarity}"] = classifications.map(
            lambda result: " ".join(result[slot][aspect])
        )

# Display the dataframe with aspect-based sentiment columns
print(df_exploded.head())


2024-06-02 20:17:02 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-06-02 20:17:03 INFO: Downloaded file to C:\Users\Hoang Anh\stanza_resources\resources.json
2024-06-02 20:17:03 INFO: Loading these models for language: en (English):
| Processor | Package        |
------------------------------
| tokenize  | combined       |
| mwt       | combined       |
| sentiment | sstplus_charlm |

2024-06-02 20:17:03 INFO: Using device: cpu
2024-06-02 20:17:03 INFO: Loading: tokenize
2024-06-02 20:17:03 INFO: Loading: mwt
2024-06-02 20:17:03 INFO: Loading: sentiment
2024-06-02 20:17:04 INFO: Done loading processors!


    movie_title                     genres  year  \
0  The Fall Guy        action;comedy;drama  2024   
0  The Fall Guy        action;comedy;drama  2024   
0  The Fall Guy        action;comedy;drama  2024   
1       Hellboy  action;adventure; fantasy  2019   
1       Hellboy  action;adventure; fantasy  2019   

                                             reviews  \
0                        A disappointing spectacle .   
0      Absolute pure fun, you won't be disappointed.   
0   One Of The Most Entertaining Films Of The Year .   
1  Terrible Writing, Weird Directing and Awful CGI .   
1         Neither a complete disaster nor a triumph.   

                                            Sentence  \
0                        A disappointing spectacle .   
0      Absolute pure fun, you won't be disappointed.   
0   One Of The Most Entertaining Films Of The Year .   
1  Terrible Writing, Weird Directing and Awful CGI .   
1         Neither a complete disaster nor a triumph.   

             

## Output file

In [9]:
# Write the exploded, aspect-annotated reviews to a new workbook (index omitted).
output_file = './imdb_movies_aspect_sentiment.xlsx'
df_exploded.to_excel(excel_writer=output_file, index=False)

print("Results saved to {}".format(output_file))

Results saved to ./imdb_movies_aspect_sentiment.xlsx
