In [66]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

# Load Dataset

In [67]:
# Relative path, resolved against the current working directory
dataset_path = "final_data/cleaned_dataset.csv"
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,user_id,user_name,time,rating,review,gmap_id,business_name,latitude,longitude,business_desc,avg_rating,num_of_reviews,label
0,110199730286805608874,Michelle Banks,2018-02-11 03:33:20,5.0,It's a beautiful place to read books and have ...,0x80c8bf81f68a634f:0xe605b4c3043783c9,Barnes & Noble,36.157754,-115.289418,"['Book store', 'Cafe', 'Childrens book store',...",4.6,1719,relevant
1,106455408371866150988,Steven DeRyck [Staff],2018-09-08 20:53:20,4.0,"As previous reviews have stated, two small pie...",0x80c8c415f0a42c77:0x55c554fdc4ad8b9c,Carnegie Deli,36.120556,-115.173611,"['Deli', 'Takeout Restaurant', 'Sandwich shop']",4.1,706,relevant
2,110442613019980396910,Stevey Markovich,2020-07-23 18:46:40,5.0,Absolutely love this office! Afton is truly am...,0x80c8ce0f7732ee7b:0xea13348742f64327,Center for Cosmetic and Family Dentistry,36.001929,-115.107484,['Dentist'],4.9,318,relevant
3,102404509430936241440,William Campbell,2018-12-09 21:13:20,3.0,The food is as good as it usually is,0x80c8dc9da25847c7:0x27b862b824ac757c,Asian Garden,36.168901,-115.060601,"['Restaurant', 'Asian restaurant', 'Chinese re...",3.8,128,relevant
4,111658148729564710703,Beverly Thorman,2018-04-08 01:13:20,5.0,We came in without an appointment on a Saturda...,0x80c8c03de37488fd:0xdc3302fd9f8f44a,Great Clips,36.191055,-115.258969,"['Hair salon', 'Beauty salon']",4.3,168,relevant


# Sentiment Analysis on `review`

In [68]:
# Drop rows that have no review text and renumber the index from 0
df = df.dropna(subset=['review']).reset_index(drop=True)

# Initialize sentiment analysis pipeline
sentiment_model_name = "distilbert-base-uncased-finetuned-sst-2-english"
classifier = pipeline("sentiment-analysis", model=sentiment_model_name)

# Function to get label and score
def get_sentiment_with_score(text, clf=None):
    """Classify one review and return its sentiment label and score.

    Parameters
    ----------
    text : str
        Review text to classify.
    clf : callable, optional
        Classifier to use. Defaults to the notebook-level ``classifier``
        pipeline. It is called as ``clf(text, truncation=True, max_length=512)``
        and must return a sequence whose first item is a mapping with
        ``'label'`` and ``'score'`` keys.

    Returns
    -------
    pd.Series
        Two values, in order: the label and its score.
    """
    if clf is None:
        clf = classifier
    # Let the tokenizer truncate to 512 *tokens*. Slicing the string to 512
    # characters throws away most of the context the model could use for long
    # reviews, and still does not guarantee the token limit for text that
    # tokenizes to roughly one token per character.
    result = clf(text, truncation=True, max_length=512)[0]
    return pd.Series([result['label'], result['score']])

# Score every review; the two-element Series returned per row fills both columns
sentiment_columns = ['sentiment', 'sentiment_score']
df[sentiment_columns] = df['review'].apply(get_sentiment_with_score)

# Assign human-readable labels with Neutral class
def label_with_neutral(row, threshold_low=0.4, threshold_high=0.6):
    """Turn a (label, score) classifier verdict into Positive/Negative/Neutral.

    ``row['sentiment_score']`` is treated as the confidence attached to the
    label stored in ``row['sentiment']``. Any label other than "POSITIVE" is
    handled as the negative class.

    Parameters
    ----------
    row : mapping
        Must provide ``'sentiment'`` and ``'sentiment_score'``.
    threshold_low : float
        At or below this score the predicted label is considered refuted.
    threshold_high : float
        At or above this score the predicted label is accepted.

    Returns
    -------
    str
        "Positive", "Negative" or "Neutral".
    """
    score = row['sentiment_score']
    predicted_positive = row['sentiment'] == "POSITIVE"

    if score >= threshold_high:
        # Confident prediction: keep the predicted class.
        return "Positive" if predicted_positive else "Negative"
    if score <= threshold_low:
        # Very low confidence in the predicted class means the other class is
        # the likelier one, so the label is flipped (the previous code returned
        # the predicted class here, same as the confident branch).
        # NOTE(review): a top-1 binary classifier presumably never reports a
        # score below 0.5, so this branch mainly matters for other score sources.
        return "Negative" if predicted_positive else "Positive"
    return "Neutral"

# Row-wise labelling: each row supplies both the raw label and its score
sentiment_labels = df.apply(label_with_neutral, axis=1)
df['sentiment_label'] = sentiment_labels

# Optional: check distribution
label_counts = df['sentiment_label'].value_counts()
print(label_counts)

Device set to use mps:0


sentiment_label
Positive    7846
Negative    2915
Neutral      123
Name: count, dtype: int64


# `Sentiment Analysis` against `rating`

In [69]:
def check_mismatch(row):
    """Return "Suspicious" when the star rating contradicts the review sentiment.

    A rating of 4 or more paired with a "Negative" label, or a rating of 2 or
    less paired with a "Positive" label, is flagged. Every other combination
    (including mid-range ratings and "Neutral" labels) is "Legit".
    """
    stars = row['rating']
    if stars >= 4:
        contradicts = row['sentiment_label'] == "Negative"
    elif stars <= 2:
        contradicts = row['sentiment_label'] == "Positive"
    else:
        # Mid-range (or non-comparable, e.g. NaN) ratings are never flagged
        contradicts = False
    return "Suspicious" if contradicts else "Legit"

# Flag every review whose star rating contradicts its sentiment label
suspicion_flags = df.apply(check_mismatch, axis=1)
df['suspicion'] = suspicion_flags

In [70]:
# Tally of "Legit" vs. "Suspicious" reviews
suspicion_counts = df['suspicion'].value_counts()
print(suspicion_counts)

suspicion
Legit         9561
Suspicious    1323
Name: count, dtype: int64


# Length of `review`

In [16]:
# Word count = number of whitespace-separated tokens in each review
review_text = df['review'].astype(str)
df['review_length_words'] = review_text.str.split().str.len()
df[['review', 'review_length_words']].head()

Unnamed: 0,review,review_length_words
0,It's a beautiful place to read books and have ...,17
1,"As previous reviews have stated, two small pie...",39
2,Absolutely love this office! Afton is truly am...,16
3,The food is as good as it usually is,9
4,We came in without an appointment on a Saturda...,48


# Embedding of `review`

In [None]:
# Load SBERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Keep only rows with review text so the embeddings line up one-to-one with the frame
df = df.dropna(subset=['review']).reset_index(drop=True)

# Generate embeddings (one vector per review, in row order)
review_texts = df['review'].astype(str).tolist()
embeddings = model.encode(review_texts, batch_size=32, show_progress_bar=True)

# Store one embedding per row as a new column
df['review_embedding'] = list(embeddings)

Batches: 100%|██████████| 341/341 [00:23<00:00, 14.54it/s]


# Embedding of `business_desc`

In [26]:
import ast

# Parse each stringified category list; anything that is not a string becomes an empty list
parsed_desc = df['business_desc'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else []
)

# Join into a single string per row
desc_texts = [" ".join(parts) for parts in parsed_desc]

# Generate embeddings
desc_embeddings = model.encode(desc_texts, batch_size=32, show_progress_bar=True)

# Append embeddings as a new column (the intermediates above are plain locals,
# so no temporary columns need to be dropped from df)
df['business_desc_embedding'] = list(desc_embeddings)

Batches: 100%|██████████| 341/341 [00:07<00:00, 45.41it/s]


# `Name Validity` boolean column

In [65]:
import spacy

# The model must be installed once beforehand:
#   python -m spacy download en_core_web_sm
spacy_model_name = "en_core_web_sm"

# English NLP pipeline; its NER (Named Entity Recognition) output is used below
nlp = spacy.load(spacy_model_name)

def get_validity_of_name(name):
    """
    Tell whether spaCy recognises a PERSON entity anywhere in ``name``.

    Returns False for anything that is not a string (e.g. NaN) and for
    empty or whitespace-only strings; otherwise True if at least one
    entity found by ``nlp`` carries the PERSON label.
    """
    usable_text = isinstance(name, str) and bool(name.strip())
    if not usable_text:
        return False

    # Run NER and look for the first PERSON-labelled entity
    return any(entity.label_ == "PERSON" for entity in nlp(name).ents)


# Run the PERSON-entity check on every 'user_name' value
name_flags = df["user_name"].apply(get_validity_of_name)
df["name_validity"] = name_flags
df.head()

Unnamed: 0,user_id,user_name,time,rating,review,gmap_id,business_name,latitude,longitude,business_desc,avg_rating,num_of_reviews,label,sentiment,sentiment_score,sentiment_label,review_length_words,embedding,business_desc_embedding,name_validity
0,110199730286805608874,Michelle Banks,2018-02-11 03:33:20,5.0,It's a beautiful place to read books and have ...,0x80c8bf81f68a634f:0xe605b4c3043783c9,Barnes & Noble,36.157754,-115.289418,"['Book store', 'Cafe', 'Childrens book store',...",4.6,1719,relevant,POSITIVE,0.999885,Positive,17,"[0.019050553, -0.007937221, -0.021419825, 0.06...","[-0.001835555, 0.00961866, 0.028814249, 0.0438...",True
1,106455408371866150988,Steven DeRyck [Staff],2018-09-08 20:53:20,4.0,"As previous reviews have stated, two small pie...",0x80c8c415f0a42c77:0x55c554fdc4ad8b9c,Carnegie Deli,36.120556,-115.173611,"['Deli', 'Takeout Restaurant', 'Sandwich shop']",4.1,706,relevant,POSITIVE,0.997185,Positive,39,"[-0.03985757, 0.0456756, 0.03532694, 0.0130935...","[-0.04080368, 0.026135705, 0.031749316, -0.081...",True
2,110442613019980396910,Stevey Markovich,2020-07-23 18:46:40,5.0,Absolutely love this office! Afton is truly am...,0x80c8ce0f7732ee7b:0xea13348742f64327,Center for Cosmetic and Family Dentistry,36.001929,-115.107484,['Dentist'],4.9,318,relevant,POSITIVE,0.999889,Positive,16,"[-0.04052647, -0.030089611, -0.009051651, -0.0...","[-0.08501962, 0.02050875, -0.04436949, -0.0035...",True
3,102404509430936241440,William Campbell,2018-12-09 21:13:20,3.0,The food is as good as it usually is,0x80c8dc9da25847c7:0x27b862b824ac757c,Asian Garden,36.168901,-115.060601,"['Restaurant', 'Asian restaurant', 'Chinese re...",3.8,128,relevant,POSITIVE,0.999851,Positive,9,"[0.013494, -0.01855295, 0.03622511, 0.10009426...","[-0.02385982, 0.056261, 0.04095394, 0.04023178...",True
4,111658148729564710703,Beverly Thorman,2018-04-08 01:13:20,5.0,We came in without an appointment on a Saturda...,0x80c8c03de37488fd:0xdc3302fd9f8f44a,Great Clips,36.191055,-115.258969,"['Hair salon', 'Beauty salon']",4.3,168,relevant,POSITIVE,0.999873,Positive,48,"[-0.0027537446, 0.010088735, 0.04393, 0.019915...","[0.022430552, -0.01589779, 0.012581493, 0.0249...",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10879,106056314955409000000,Britt,2024-07-03 09:46:40,3.0,No experience with Pho Kinh Do. Still writing ...,0x80c8c6a5752edf13:0xc2eae525daec627c,Pho Kinh Do,36.126711,-115.197821,['Vietnamese restaurant'],4.5,538,rant,NEGATIVE,0.997033,Negative,11,"[-0.017284725, 0.022090089, 0.05470626, 0.0090...","[-0.019427117, 0.020295473, -0.042405277, -0.0...",False
10880,112728804194032000000,Laurie Brekken,2023-07-22 04:26:40,2.0,Why does SlotZilla Zipline exist? No thanks.,0x80c8c375619da61f:0xd29657d04cdc6e02,SlotZilla Zipline,36.169472,-115.141264,['Tourist attraction'],4.4,9637,rant,NEGATIVE,0.997852,Negative,7,"[-0.05944578, -0.035353906, -0.07918974, 0.068...","[0.083074145, 0.022992572, -0.01932243, 0.0200...",True
10881,117864101451928000000,Danny Kenney,2023-07-22 04:26:40,4.0,"Why is The Mob Museum even here? Never been, b...",0x80c8c30a83b0d4e9:0x5535db1cc0e7329d,The Mob Museum,36.172820,-115.141239,"['Museum', 'Historical landmark', 'History mus...",4.6,9949,rant,NEGATIVE,0.999409,Negative,12,"[0.047451008, 0.055764362, 0.0031792677, 0.058...","[0.022421965, 0.09233307, -0.051740035, 0.0303...",True
10882,111163141152686000000,NATALIE LOPEZ,2024-10-27 03:33:20,5.0,I don't know what happens inside Las Vegas Sco...,0x80c8c69a4bc9a5ab:0x742b93383454f91e,Las Vegas Scooters,36.111262,-115.199144,"['Scooter rental service', 'Wheelchair rental ...",4.2,77,rant,NEGATIVE,0.995747,Negative,14,"[0.07422001, 0.039953854, -0.060588937, 0.0678...","[-0.018990772, -0.053306464, -0.047045518, -0....",False
