<a href="https://www.kaggle.com/code/yorkyong/aspect-based-sentiment-analysis-with-llama-3-1?scriptVersionId=201631764" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
#with reference from https://www.kaggle.com/code/lucamassaron/fine-tune-llama-2-for-sentiment-analysis

In [2]:
!pip install -q -U torch --index-url https://download.pytorch.org/whl/cu117
!pip install -q -U -i https://pypi.org/simple/ bitsandbytes
!pip install -q -U transformers
!pip install -q -U accelerate
!pip install -q -U datasets
!pip install -q -U trl
!pip install -q -U peft

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
import datetime as dt
import re
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoConfig,
                          AutoModelForCausalLM, 
                          AutoTokenizer,
                          LlamaForCausalLM, 
                          LlamaTokenizer,
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split


In [6]:
pd.set_option('display.max_colwidth', None)  # Show full width of each column

In [7]:
print(f"pytorch version {torch.__version__}")

pytorch version 2.4.0


In [8]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"working on {device}")

working on cuda:0


# Preparing the data and the core evaluation functions

In [9]:
# filename= '/kaggle/input/mcdonalds-review-dataset-6-months-clean-sentiment/cleaned_data _for_ABSA.xlsx'
filename= '/kaggle/input/mcdonalds-store-reviews/McDonald_s_Reviews.csv' #To identify use more Neutral sentiments + Aspects = Price for data augmentation

In [11]:
df = pd.read_csv(filename,encoding="utf8", encoding_errors="replace")

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33396 entries, 0 to 33395
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   reviewer_id    33396 non-null  int64  
 1   store_name     33396 non-null  object 
 2   category       33396 non-null  object 
 3   store_address  33396 non-null  object 
 4   latitude       32736 non-null  float64
 5   longitude      32736 non-null  float64
 6   rating_count   33396 non-null  object 
 7   review_time    33396 non-null  object 
 8   review         33396 non-null  object 
 9   rating         33396 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 2.5+ MB


In [13]:
df.head()

Unnamed: 0,reviewer_id,store_name,category,store_address,latitude,longitude,rating_count,review_time,review,rating
0,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,"Why does it look like someone spit on my food?\nI had a normal transaction, everyone was chill and polite, but now i dont want to eat this. Im trying not to think about what this milky white/clear substance is all over my food, i d*** sure am not coming back.",1 star
1,2,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,"It'd McDonalds. It is what it is as far as the food and atmosphere go. The staff here does make a difference. They are all friendly, accommodating and always smiling. Makes for a more pleasant experience than many other fast food places.",4 stars
2,3,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,Made a mobile order got to the speaker and checked it in.\nLine was not moving so I had to leave otherwise I���������������������������d be late for work.\nNever got the refund in the app.\nI called them and they said I could only get my money back in person because it was stuck in the system.\nWent there in person the next day and the manager told me she wasn�,1 star
3,4,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,a month ago,My mc. Crispy chicken sandwich was ������������������������������������ customer service was quick and p,5 stars
4,5,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2 months ago,"I repeat my order 3 times in the drive thru, and she still manage to mess it up , it was suppose to be a large meal double filet of fish with large fries , no cheese . It was all wrong , they either need to pay close attention to the order being made , understand English or they need not to work at a drive thru",1 star


In [14]:
df.review_time.unique()

array(['3 months ago', '5 days ago', 'a month ago', '2 months ago',
       '3 weeks ago', 'a year ago', '6 months ago', '5 months ago',
       '7 months ago', '10 months ago', '4 years ago', '8 months ago',
       '4 months ago', '3 years ago', '2 years ago', '9 months ago',
       '11 months ago', '5 years ago', '6 years ago', '7 years ago',
       '8 years ago', '3 days ago', 'a week ago', '2 days ago',
       '2 weeks ago', '10 years ago', '9 years ago', '6 days ago',
       '4 weeks ago', '21 hours ago', '4 days ago', '11 years ago',
       '12 years ago', 'a day ago', '23 hours ago', '6 hours ago',
       '20 hours ago', '22 hours ago', '8 hours ago'], dtype=object)

In [15]:
def map_rating_to_sentiment(rating):
    if rating in ['4 stars', '5 stars']:
        return 'positive'
    elif rating in ['1 star', '2 stars']:
        return 'negative'
    else:
        return 'neutral'

# Assuming your DataFrame is named 'df' and the ratings column is named 'ratings'
df['sentiment'] = df['rating'].apply(map_rating_to_sentiment)

In [16]:
def convert_to_datetime(time_str):
    current_time = dt.datetime.now()
    if 'year' in time_str:
        years = int(re.findall(r'\d+', time_str)[0]) if re.search(r'\d+', time_str) else 1
        return current_time - dt.timedelta(days=years*365)
    elif 'month' in time_str:
        months = int(re.findall(r'\d+', time_str)[0]) if re.search(r'\d+', time_str) else 1
        return current_time - dt.timedelta(days=months*30)
    elif 'week' in time_str:
        weeks = int(re.findall(r'\d+', time_str)[0]) if re.search(r'\d+', time_str) else 1
        return current_time - dt.timedelta(weeks=weeks)
    elif 'day' in time_str:
        days = int(re.findall(r'\d+', time_str)[0]) if re.search(r'\d+', time_str) else 1
        return current_time - dt.timedelta(days=days)
    elif 'hour' in time_str:
        hours = int(re.findall(r'\d+', time_str)[0]) if re.search(r'\d+', time_str) else 1
        return current_time - dt.timedelta(hours=hours)
    else:
        return current_time

# Apply the function to create a new column with actual dates
df['actual_date'] = df['review_time'].apply(convert_to_datetime)


In [17]:
# Drop unknown and missing rows

df['review'] = df['review'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]+', '', x) if isinstance(x, str) else x).copy()

In [19]:
df.head()

Unnamed: 0,reviewer_id,store_name,category,store_address,latitude,longitude,rating_count,review_time,review,rating,sentiment,actual_date
0,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Why does it look like someone spit on my food\nI had a normal transaction everyone was chill and polite but now i dont want to eat this Im trying not to think about what this milky whiteclear substance is all over my food i d sure am not coming back,1 star,negative,2024-07-19 04:07:20.534412
1,2,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,Itd McDonalds It is what it is as far as the food and atmosphere go The staff here does make a difference They are all friendly accommodating and always smiling Makes for a more pleasant experience than many other fast food places,4 stars,positive,2024-10-12 04:07:20.534628
2,3,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,Made a mobile order got to the speaker and checked it in\nLine was not moving so I had to leave otherwise Id be late for work\nNever got the refund in the app\nI called them and they said I could only get my money back in person because it was stuck in the system\nWent there in person the next day and the manager told me she wasn,1 star,negative,2024-10-12 04:07:20.534639
3,4,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,a month ago,My mc Crispy chicken sandwich was customer service was quick and p,5 stars,positive,2024-09-17 04:07:20.534647
4,5,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2 months ago,I repeat my order 3 times in the drive thru and she still manage to mess it up it was suppose to be a large meal double filet of fish with large fries no cheese It was all wrong they either need to pay close attention to the order being made understand English or they need not to work at a drive thru,1 star,negative,2024-08-18 04:07:20.534653


In [20]:
print(f"The dataframe for modelling has {len(df)} rows.")

The dataframe for modelling has 33396 rows.


In [21]:
# Group by review_time and sentiment, then compute the size of each group
sentiment_counts = df.groupby(['review_time', 'sentiment']).size().unstack(fill_value=0)

sentiment_counts

sentiment,negative,neutral,positive
review_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10 months ago,216,73,214
10 years ago,11,15,12
11 months ago,201,48,208
11 years ago,7,1,2
12 years ago,0,1,3
2 days ago,9,3,20
2 months ago,272,77,276
2 weeks ago,68,17,52
2 years ago,1448,498,1946
20 hours ago,1,0,0


In [38]:
# Define the relevant time periods for filtering
time_periods = ['11 months ago', '10 months ago', '9 months ago', '8 months ago', '7 months ago']

# Filter the dataframe based on review_time
filtered_df = df[df['review_time'].isin(time_periods)]

filtered_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 2374 entries, 18 to 33380
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   reviewer_id    2374 non-null   int64         
 1   store_name     2374 non-null   object        
 2   category       2374 non-null   object        
 3   store_address  2374 non-null   object        
 4   latitude       2309 non-null   float64       
 5   longitude      2309 non-null   float64       
 6   rating_count   2374 non-null   object        
 7   review_time    2374 non-null   object        
 8   review         2374 non-null   object        
 9   rating         2374 non-null   object        
 10  sentiment      2374 non-null   object        
 11  actual_date    2374 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(1), object(8)
memory usage: 241.1+ KB


In [39]:
# Group by review_time and sentiment, then compute the size of each group
sentiment_counts1 = filtered_df.groupby(['review_time', 'sentiment']).size().unstack(fill_value=0)

sentiment_counts1

sentiment,negative,neutral,positive
review_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10 months ago,216,73,214
11 months ago,201,48,208
7 months ago,179,72,221
8 months ago,234,47,217
9 months ago,162,65,217


# Prompt

In [40]:
def generate_prompt(review):
    return f"""
Analyze the sentiment of the following review for five specific aspects: service, cleanliness, food, price, and others.
For each aspect, assign a sentiment score based on the following scale:
0: Aspect is not mentioned
1: Negative sentiment
2: Neutral sentiment
3: Positive sentiment
IMPORTANT:
- If the review is too brief or generic and not about specific aspects, assess the overall sentiment and assign it to others.
- Only provide scores for aspects that are EXPLICITLY mentioned or can be DIRECTLY inferred from the review.
- If an aspect is not mentioned or cannot be directly inferred, assign it a score of 0.
- Use the 'others' category to capture any significant aspects mentioned that don't fit into service, cleanliness, food, or price.
- Do NOT repeat the review or provide any additional analysis or explanations beyond the scores.
- Provide the scores ONLY ONCE in the exact format shown below.
Provide your analysis in the following format ONLY:
service=[score]
cleanliness=[score]
food=[score]
price=[score]
others=[score]
Review: {review}
Analysis:
"""

In [41]:
# # Generate the full prompt (input) and retain sentiment as a separate column
# df['prompt'] = df.apply(generate_prompt, axis=1)

# Now convert the DataFrame to a dataset
df_data = Dataset.from_pandas(filtered_df)

In [42]:
df_data

Dataset({
    features: ['reviewer_id', 'store_name', 'category', 'store_address', 'latitude ', 'longitude', 'rating_count', 'review_time', 'review', 'rating', 'sentiment', 'actual_date', '__index_level_0__'],
    num_rows: 2374
})

# Testing model without fine-tuning

In [43]:
# # For clearing GPU memory
# import gc

# del [
#     model, 
#     tokenizer, 
# ]
# # del [df, X_train, X_eval]

In [44]:
# for _ in range(100):
#     torch.cuda.empty_cache()
#     gc.collect()

In [45]:
model_name = "/kaggle/input/llama-3.1/transformers/8b-instruct/2"
# model_name = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"
# model_name = "/kaggle/input/llama-3.1/transformers/405b-instruct/1"

compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_quant_type="nf4", 
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    torch_dtype=compute_dtype,
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                          trust_remote_code=True,
                                         )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model, tokenizer = setup_chat_format(model, tokenizer)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [46]:
print(len(tokenizer))  # Check vocabulary size

128258


In [51]:
def predict(df_data, model, tokenizer, debug=False):
    y_pred = []
    aspects = ['service', 'cleanliness', 'food', 'price', 'others']
    
    for i in tqdm(range(len(df_data))):
        review = df_data[i]['review']
        prompt = generate_prompt(review)
        
        pipe = pipeline(task="text-generation", 
                        model=model, 
                        tokenizer=tokenizer, 
                        max_new_tokens=21, 
                        temperature=0.07,
                       )
        try:
            result = pipe(prompt)
            generated_text = result[0]['generated_text']
            
            # Extract only the part that matches our expected format
            answer = re.search(r'Analysis:[\s\S]*', generated_text)
            if answer:
                answer = answer.group(0)
            else:
                raise ValueError("Unexpected output format")
            
            aspect_sentiments = {}
            
            # Updated regex to capture scores more accurately, even with partial output
            matches = re.findall(r'(\w+)=\[?(\d)\]?', answer)
            
            for match in matches:
                aspect, score = match
                aspect = aspect.lower()
                score = int(score)
                if aspect in aspects and score in [0, 1, 2, 3]:
                    aspect_sentiments[aspect] = score
            
            # If no aspects were found or all aspects have zero scores, it's treated as a brief/generic review
            if not aspect_sentiments or all(score == 0 for score in aspect_sentiments.values()):
                aspect_sentiments = {aspect: 0 for aspect in aspects}
            
            # Fill in any missing aspects with 0
            for aspect in aspects:
                if aspect not in aspect_sentiments:
                    aspect_sentiments[aspect] = 0
            
            y_pred.append(aspect_sentiments)
            
            if debug:
                print(f"\nItem {i}:")
                print(f"Review: {review}")
                print(f"Generated: {answer}")
                print(f"Parsed sentiments: {aspect_sentiments}")
        
        except Exception as e:
            print(f"Error processing item {i}: {str(e)}")
            y_pred.append({aspect: 0 for aspect in aspects})
    
    return y_pred

In [53]:
y_pred = predict(df_data, model, tokenizer, debug=False)

100%|██████████| 2374/2374 [1:36:26<00:00,  2.44s/it]


In [57]:
# Convert y_pred to a DataFrame
y_pred_df = pd.DataFrame(y_pred)

# Reset index of X_test to ensure it aligns with y_pred_df
filtered_df = filtered_df.reset_index(drop=True)

# Concatenate X_test with y_pred_df
result_df = pd.concat([filtered_df, y_pred_df], axis=1)

In [58]:
# Display the first few rows of the result
result_df.head()

Unnamed: 0,reviewer_id,store_name,category,store_address,latitude,longitude,rating_count,review_time,review,rating,sentiment,actual_date,service,cleanliness,food,price,others
0,19,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,7 months ago,I am not happy at all today I went and bought 4 Strawberry Banana smoothies and and 2 hash browns and my smoothies tasted like straight ice and water smh And I am very disappointed in there service my smoothie should never taste like that never especially the fact I bought it for not only me but my kids I need someone to contact me from corporate or something I am not happy at all waste of money,1 star,negative,2024-03-21 04:07:20.534762,1,0,1,0,1
1,20,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,7 months ago,The staff seems to always be different which makes ordering sometimes a little difficult because they are in a state of perpetual learning which can also lead to waiting a bit longer but the lobby is always clean as well as the bathrooms,4 stars,positive,2024-03-21 04:07:20.534769,1,3,0,0,0
2,21,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,10 months ago,Cant believe experience with order taker Went from drive thru and employee directly asks what do you want He keeps rushing while taking order with no other cars placing order We wanted to add one more item and asked for a minute He keeps asking what else When I said we are ready no response There were no other car placing order that time either I told again we are ready responded after a pause Again what do you want excuse me who talks to customers like this \nI asked for his name to put a review and he flat out denied I asked for receipt before handing my credit card He swipes it and returns back my card I m waiting for a receipt with my family in the car He starts inputing another order on his tablet Waited for sometime and then I waive my hand several times He opens window and said you can get receipt from next window Excuse me \nWent to next window picked up food asked for Manager and got his name JAMAR My receipt states 08062022 720pm Order 05\nI didnt want to complain but this was intolerable,1 star,negative,2023-12-22 04:07:20.534777,1,0,0,0,1
3,24,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,8 months ago,This is consistently the worst meal you will pay for Microwaved nuggets cold fries and a 30 minute wait What more can you ask for Dont go here ever Your better off going hungry or giving the meal you payed for to the homeless by the donation box behind the drive thru,1 star,negative,2024-02-20 04:07:20.534799,1,1,1,1,1
4,32,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,9 months ago,This location is wack\nThey will literally stop taking orders and make people sit in the parking lot for like 12 minutes until 300 am so that they dont get to order dinner food anymore\nWe have gone multiple times right after leaving the pinballz arcade just to sit in the parking lot in the DriveThru line waiting while they do nothing but wait for the order menu to change over\nLook McDonalds if I wanted a breakfast deluxe I wouldnt have shown up at 230 in the morning thank you,2 stars,negative,2024-01-21 04:07:20.534858,1,0,0,0,1


In [59]:
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2374 entries, 0 to 2373
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   reviewer_id    2374 non-null   int64         
 1   store_name     2374 non-null   object        
 2   category       2374 non-null   object        
 3   store_address  2374 non-null   object        
 4   latitude       2309 non-null   float64       
 5   longitude      2309 non-null   float64       
 6   rating_count   2374 non-null   object        
 7   review_time    2374 non-null   object        
 8   review         2374 non-null   object        
 9   rating         2374 non-null   object        
 10  sentiment      2374 non-null   object        
 11  actual_date    2374 non-null   datetime64[ns]
 12  service        2374 non-null   int64         
 13  cleanliness    2374 non-null   int64         
 14  food           2374 non-null   int64         
 15  price          2374 n

In [74]:
# List of columns to analyze
columns = ['service', 'cleanliness', 'food', 'price', 'others']

# Pivoting data based on sentiment
summary_df = result_df.groupby('sentiment')[columns].apply(lambda x: x.apply(pd.Series.value_counts).reindex([0, 1, 2, 3], fill_value=0))

# Reformat the result for better readability
summary_df = summary_df.stack(level=0).unstack().fillna(0)

# Add a row subtotal for each sentiment
summary_df['Total'] = summary_df.sum(axis=1)

# Add a column subtotal for each category (0, 1, 2, 3 types across sentiments)
summary_df.loc['Total'] = summary_df.sum(axis=0)

summary_df

Unnamed: 0,service,cleanliness,food,price,others,Total
"(negative, 0)",206.0,882.0,619.0,944.0,151.0,2802.0
"(negative, 1)",766.0,105.0,357.0,47.0,829.0,2104.0
"(negative, 2)",0.0,4.0,3.0,0.0,1.0,8.0
"(negative, 3)",20.0,1.0,13.0,1.0,11.0,46.0
"(neutral, 0)",150.0,268.0,203.0,281.0,158.0,1060.0
"(neutral, 1)",128.0,27.0,75.0,19.0,130.0,379.0
"(neutral, 2)",2.0,2.0,7.0,1.0,4.0,16.0
"(neutral, 3)",25.0,8.0,20.0,4.0,13.0,70.0
"(positive, 0)",406.0,830.0,762.0,1033.0,563.0,3594.0
"(positive, 1)",98.0,12.0,49.0,13.0,183.0,355.0


In [75]:
result_df.to_csv('Labelled_7mth_11mth.csv')