In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

# Load Dataset

In [6]:
# Read the dataset from its CSV file and preview the first rows.
DATASET_PATH = "final_data/cleaned_dataset.csv"
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,user_id,user_name,time,rating,review,gmap_id,business_name,latitude,longitude,business_desc,avg_rating,num_of_reviews,label
0,110199730286805608874,Michelle Banks,2018-02-11 03:33:20,5.0,It's a beautiful place to read books and have ...,0x80c8bf81f68a634f:0xe605b4c3043783c9,Barnes & Noble,36.157754,-115.289418,"['Book store', 'Cafe', 'Childrens book store',...",4.6,1719,relevant
1,106455408371866150988,Steven DeRyck [Staff],2018-09-08 20:53:20,4.0,"As previous reviews have stated, two small pie...",0x80c8c415f0a42c77:0x55c554fdc4ad8b9c,Carnegie Deli,36.120556,-115.173611,"['Deli', 'Takeout Restaurant', 'Sandwich shop']",4.1,706,relevant
2,110442613019980396910,Stevey Markovich,2020-07-23 18:46:40,5.0,Absolutely love this office! Afton is truly am...,0x80c8ce0f7732ee7b:0xea13348742f64327,Center for Cosmetic and Family Dentistry,36.001929,-115.107484,['Dentist'],4.9,318,relevant
3,102404509430936241440,William Campbell,2018-12-09 21:13:20,3.0,The food is as good as it usually is,0x80c8dc9da25847c7:0x27b862b824ac757c,Asian Garden,36.168901,-115.060601,"['Restaurant', 'Asian restaurant', 'Chinese re...",3.8,128,relevant
4,111658148729564710703,Beverly Thorman,2018-04-08 01:13:20,5.0,We came in without an appointment on a Saturda...,0x80c8c03de37488fd:0xdc3302fd9f8f44a,Great Clips,36.191055,-115.258969,"['Hair salon', 'Beauty salon']",4.3,168,relevant


# Sentiment Analysis on `review`

In [9]:
# Drop rows whose review text is missing and renumber the index from 0.
has_review = df['review'].notnull()
df = df[has_review].reset_index(drop=True)

# Initialize sentiment analysis pipeline
classifier = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# Function to get label and score
def get_sentiment_with_score(text):
    """Classify one review and return the predicted label with its score.

    Args:
        text: Review text. It is coerced with ``str`` so that a non-string
            cell does not fail on slicing.

    Returns:
        pd.Series: Two positional entries, ``result['label']`` followed by
        ``result['score']`` from the first classifier result.
    """
    # The 512-character slice is a cheap pre-trim, but characters are not
    # tokens: token-dense text can still exceed the model's input limit.
    # truncation=True asks the tokenizer to clip at the token level as well.
    result = classifier(str(text)[:512], truncation=True)[0]
    return pd.Series([result['label'], result['score']])

# Score every review, then copy the two result columns into df one at a time.
sentiment_results = df['review'].apply(get_sentiment_with_score)
df['sentiment'] = sentiment_results[0]
df['sentiment_score'] = sentiment_results[1]

# Assign human-readable labels with Neutral class
def label_with_neutral(row, threshold_low=0.4, threshold_high=0.6):
    """Map a classifier label and score to "Positive", "Negative" or "Neutral".

    The score is first converted to a positive-class probability, so both
    thresholds are compared on the same axis. This assumes a two-class model
    where ``sentiment_score`` is the probability of the label stored in
    ``sentiment`` (so the other class has probability ``1 - score``).

    Args:
        row: Mapping-like row with ``'sentiment'`` (``"POSITIVE"`` or another
            label, treated as negative) and ``'sentiment_score'``.
        threshold_low: Positive-class probability at or below which the row
            is labelled "Negative".
        threshold_high: Positive-class probability at or above which the row
            is labelled "Positive".

    Returns:
        str: "Positive", "Negative", or "Neutral" when the probability lies
        strictly between the two thresholds.
    """
    score = row['sentiment_score']
    # A low score for one label means the other label is the likely one, so
    # fold both cases onto the positive-class probability before comparing.
    positive_prob = score if row['sentiment'] == "POSITIVE" else 1.0 - score

    if positive_prob >= threshold_high:
        return "Positive"
    if positive_prob <= threshold_low:
        return "Negative"
    return "Neutral"

# Label each row with a row-wise apply and store the result in df.
row_labels = df.apply(label_with_neutral, axis=1)
df['sentiment_label'] = row_labels

# Optional: check distribution
label_counts = df['sentiment_label'].value_counts()
print(label_counts)

Device set to use mps:0


sentiment_label
Positive    7846
Negative    2915
Neutral      123
Name: count, dtype: int64


# Length of `review`

In [16]:
def _count_words(text):
    """Return the number of whitespace-separated pieces in ``text``."""
    return len(text.split())


# Word count per review, computed on the string form of each cell.
review_text = df['review'].astype(str)
df['review_length_words'] = review_text.map(_count_words)
df[['review', 'review_length_words']].head()

Unnamed: 0,review,review_length_words
0,It's a beautiful place to read books and have ...,17
1,"As previous reviews have stated, two small pie...",39
2,Absolutely love this office! Afton is truly am...,16
3,The food is as good as it usually is,9
4,We came in without an appointment on a Saturda...,48


# Embedding of `review`

In [None]:
# Load SBERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Keep only non-null reviews and reset index
review_present = df['review'].notnull()
df = df[review_present].reset_index(drop=True)

# Generate embeddings from the reviews as a plain list of strings, in row order
review_texts = df['review'].astype(str).tolist()
embeddings = model.encode(
    review_texts,
    batch_size=32,
    show_progress_bar=True,
)

# Append embeddings as a new column (one entry per row)
df['review_embedding'] = list(embeddings)

Batches: 100%|██████████| 341/341 [00:23<00:00, 14.54it/s]


# Embedding of `business_desc`

In [26]:
import ast


def _desc_to_text(raw):
    """Turn one ``business_desc`` cell into a single space-joined string.

    String cells are parsed with ``ast.literal_eval``; any non-string cell
    is treated as an empty list and therefore yields an empty string.
    """
    categories = ast.literal_eval(raw) if isinstance(raw, str) else []
    return " ".join(categories)


# Build the per-row description text in a local list, so no temporary
# columns have to be added to df and dropped again afterwards.
desc_texts = df['business_desc'].apply(_desc_to_text).tolist()

# Generate embeddings
desc_embeddings = model.encode(
    desc_texts,
    batch_size=32,
    show_progress_bar=True,
)

# Append embeddings as a new column
df['business_desc_embedding'] = list(desc_embeddings)

Batches: 100%|██████████| 341/341 [00:07<00:00, 45.41it/s]
