## Load Libraries

In [1]:
import pandas as pd
from pathlib import Path

# from nltk.sentiment import SentimentIntensityAnalyzer
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity

# %pip install -q transformers sentence-transformers pandas
from sentence_transformers import SentenceTransformer, util

# Disable truncation of long strings in pandas output
pd.set_option('display.max_colwidth', None)


## Load Parquet files

In [2]:
import pandas as pd
import requests
from io import BytesIO

def load_parquet_from_url(url):
    """
    Downloads a Parquet file from a URL and loads it into a DataFrame.

    Parameters:
    - url (str): The URL to the Parquet file.

    Returns:
    - pd.DataFrame: The loaded DataFrame if successful, None otherwise.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raises HTTPError if status is 4xx, 5xx
        file_stream = BytesIO(response.content)
        customer_reviews_df = pd.read_parquet(file_stream)
        print("File loaded successfully!")
        # display(customer_reviews_df.head())
        return customer_reviews_df
    except requests.exceptions.RequestException as e:
        print(f"Failed to download the file: {e}")
        return None
    except Exception as e:
        print(f"Failed to load the parquet file: {e}")
        return None

## Load unclassified dataset:

In [3]:
# from pathlib import Path

# load from local Parquet file
# file_path = Path("./Resources/philly_reviews_without_mood.parquet")
# Load the Parquet file
# customer_reviews_df = pd.read_parquet(file_path)

# Load parquet file
url = "https://github.com/tlockhart/project-3/releases/download/v2.0/philly_reviews_without_mood.parquet"
customer_reviews_df = load_parquet_from_url(url)


# Rename "text" column to "review" if needed
customer_reviews_df = customer_reviews_df.rename(columns={"text": "review"})

# Display the first few rows
display(customer_reviews_df.head(1))

File loaded successfully!


Unnamed: 0,review_id,user_id,business_id,review,business_name,address,city,categories,friends,review_stars,...,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,YpDx1X1OvnwUgdkHeZNvWw,OfhZlqYMrCVAhM-bt_cQug,J-ciDDEdIHMcChGIyKZnOg,Beautiful clean shop with knock your socks off Everything ...\nHomemade breakfast tarts and chicken pot pies .\nAll sweets looked well beyond beautiful and delish.\nI felt some of the items were a tad pricey considering the sizes being on a smaller size. But I know premium quality costs more .. \nIt's a great place for locals and worth a trip if traveling from center city etc .\nI found counter staff to be most friendly and engaging \nWill be back and certainly will recommend,Cake Life Bake Shop,1306 Frankford Ave,Philadelphia,"[Caterers, Cafes, Event Planning & Services, Food, Cupcakes, Bakeries, Restaurants, Desserts]","[rFANUt3nNW1auqcQTILUGQ, x1_sp_rHxZtI8VLX_IF5cw, 4My9e4rcNrAIIQEkNRqsBg, beN_tzswevSz73ur-O3zOA, _sZ6KsJyDRvaTBRZY9Welg, kTpsjDUhl9BlRl0FVuNreA]",4.0,...,0,0,0,0,0,6,0,0,0,0


## Classify review

In [4]:
"""
Deprecated code to classify review's mood (16 mins)
Required library Installations::
1. transformers: Hugging Face library for state-of-the-art NLP models
2. sentence-transformers: Simplified interface for computing sentence embeddings and semantic similarity
3. pandas: Data handling and manipulation library
4. SentenceTransformer: Loads pre-trained models for sentence embeddings.
5. util: Provides similarity utilities like cosine similarity.
"""
# Store Semantic Categories:
moods = [
    "adventurous", "comforting", "energizing", "romantic",
    "cozy", "festive", "indulgent", "refreshing"
]

"""
Use HugginFace sentence-transformers package and the 
all-MiniLM-L6-v2 model to classify review text
Initialize sentence Embedding Model with all_miniLM-L6-v2
to training and calculate the probalistic distance between
vectors to determine semantic similarity
"""
model = SentenceTransformer("all-MiniLM-L6-v2")  
mood_embeddings = model.encode(moods, convert_to_tensor=True)


def assign_mood_and_score(text):
    """
    function classifies the mood of yelp customer review text
    """
    # Checks if the input is a non-empty string.
    if not isinstance(text, str) or len(text.strip()) == 0:
        return "neutral"
    
    # Encodes the review text into an embedding.
    # Note: SentenceTransformers models (based on BERT model) 
    # include tokenization inside the encode method.
    text_embedding = model.encode(text, convert_to_tensor=True)
    
    # Compares the review embedding against mood embeddings using cosine similarity
    similarity_scores = util.pytorch_cos_sim(text_embedding, mood_embeddings)
    
    # Returns the mood with the highest similarity
    best_match = similarity_scores.argmax().item()
    
    # Get similarity scores as a list
    scores = similarity_scores[0].tolist()
    
    return moods[best_match], scores
#########################################
#  Add the classified moods to a new column called mood
results = customer_reviews_df["review"].apply(assign_mood_and_score)
#########################################

## Seperate results
moods_predicted = []
similarity_scores_list = []

for mood, scores in results:
    moods_predicted.append(mood)
    similarity_scores_list.append(scores)

# Add predicted mood to original dataframe
customer_reviews_df["mood"] = moods_predicted

# Create a new dataframe for similarity scores
similarity_scores_df = pd.DataFrame(similarity_scores_list, columns=moods)
similarity_scores_df = pd.concat([customer_reviews_df[["review", "mood"]], similarity_scores_df], axis=1)

if "review" in similarity_scores_df.columns:
    similarity_scores_df["short_review"] = similarity_scores_df["review"].apply(
        lambda x: x[:50] + "..." if isinstance(x, str) and len(x) > 50 else x
    )

# Rename 'short_review' back to 'review'
similarity_scores_df = similarity_scores_df.rename(columns={"short_review": "review"})
similarity_scores_df = similarity_scores_df.drop(columns=["review"])

# Pring first 5 rows
print("*****Similarity_Scores_DF")
print(similarity_scores_df.head(1))

# Stored as Github Release: Save only similarity scores to a separate file
# scores_output_path = "./Resources/review_mood_similarity_scores.parquet"
# similarity_scores_df.to_parquet(scores_output_path, index=False)

# Stored as Github Release: Output the new dataset with mood labels
# output_path = "./Resources/philly_reviews_with_mood.parquet"
# customer_reviews_df.to_parquet(output_path, index=False)

# print(f"Saved labeled file to: {output_path}")




*****Similarity_Scores_DF
        mood  adventurous  comforting  energizing  romantic      cozy  \
0  indulgent     0.145074    0.012018    0.089682  0.051065  0.208144   

    festive  indulgent  refreshing  
0  0.225149   0.280261    0.029452  
