In [1]:
import json
import nltk
import torch
import numpy as np
from textblob import TextBlob
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans


nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package stopwords to
[nltk_data]     /hpc/home/sz243/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /hpc/home/sz243/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /hpc/home/sz243/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
def load_data(file_path, target_business_id):
    with open(file_path, "r") as f:
        lines = f.readlines()

    reviews = []
    for line in lines:
        review = json.loads(line)
        if review["business_id"] == target_business_id:
            reviews.append(review["text"])

    return reviews

# Remove stopwords from the text
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Extract unique words from the text
def extract_unique_words(text):
    words = set(text.split())
    return list(words)

# Filter adjectives from the list of unique words
def filter_adjectives(words):
    adjectives = []
    for word in words:
        blob = TextBlob(word)
        if blob.tags and blob.tags[0][1].startswith('JJ'):
            adjectives.append(word)
    return adjectives

# Count the number of reviews containing the specified words
def count_reviews_containing_words(reviews, words):
    counts = {word: 0 for word in words}
    model = SentenceTransformer('paraphrase-mpnet-base-v2', device='cuda')

    for text in reviews:
        review_embedding = model.encode([text])
        max_similarity = 0
        max_word = None

        for word in words:
            word_embedding = model.encode([word])
            similarity = cosine_similarity(review_embedding, word_embedding)

            if similarity > max_similarity:
                max_similarity = similarity
                max_word = word

        if max_word:
            counts[max_word] += 1

    return counts

def extract_keywords_bert(words, model_name, n_clusters=10):
    model = SentenceTransformer(model_name)
    word_embeddings = model.encode(words)

    clustering_model = KMeans(n_clusters=n_clusters)
    clustering_model.fit(word_embeddings)

    cluster_centers = clustering_model.cluster_centers_

    # Find the closest word in each cluster
    keywords = []
    for cluster_center in cluster_centers:
        distances = np.linalg.norm(word_embeddings - cluster_center, axis=1)
        closest_word_index = np.argmin(distances)
        keywords.append(words[closest_word_index])

    return keywords

In [3]:
file_path = "yelp_academic_dataset_review.json"  # Replace with your Yelp dataset path
target_business_id = "XQfwVwDr-v0ZS3_CbbE5Xw"  # The business_id you want to extract reviews for
data = load_data(file_path, target_business_id)
combined_data = ' '.join(data)
filtered_data = remove_stopwords(combined_data)
unique_words = extract_unique_words(filtered_data)
adjectives = filter_adjectives(unique_words)
model_name = 'distilbert-base-nli-mean-tokens'
keywords = extract_keywords_bert(adjectives, model_name)
review_counts = count_reviews_containing_words(data, keywords)
sorted_adjectives = sorted(review_counts.items(), key=lambda x: x[1], reverse=True)[:5]

print("Top 5 Adjectives by Review Count (using BERT):")
for word, count in sorted_adjectives:
    print(f"{word}: {count} reviews")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Top 5 Adjectives by Review Count (using BERT):
great: 84 reviews
unacceptable: 65 reviews
large: 8 reviews
15-30: 7 reviews
new: 6 reviews
