In [None]:
# NLP Pipeline

import os
import json
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import spacy
import nltk
from collections import Counter
import networkx as nx

from gensim import corpora
from gensim.models import LdaModel, CoherenceModel

from transformers import pipeline
import torch

# Fetch the NLTK stopword corpus, then import its reader.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Load the small English spaCy pipeline with the "ner" and "parser" components disabled.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# Input locations of the Yelp review / business JSON files and the output folder.
# NOTE(review): both input paths are absolute paths at the filesystem root — confirm
# this matches the environment the notebook is meant to run in.
review_fp = "/yelp_academic_dataset_review.json"
business_fp = "/yelp_academic_dataset_business.json"
output_dir = "./output"
# Create the output folder up front; exist_ok=True makes re-running this cell safe.
os.makedirs(output_dir, exist_ok=True)

# 1. ESG Keyword Filtering
# 1.1 ESG Keywords
# Keywords and multi-word phrases used to select ESG-related reviews.
# load_and_filter_reviews escapes each entry and joins them into one regex
# alternation that is matched case-insensitively anywhere in the review text,
# so an entry also matches inside longer words.
ESG_KEYWORDS = [
    # Environmental
    "sustainable", "organic", "eco-friendly", "recyclable", "renewable",
    # Social
    "fair trade", "community", "local sourcing", "ethical",
    # Governance
    "transparency", "responsible", "ESG"
]

# 1.2 Load & Filter Data
def _iter_review_chunks(review_fp, chunksize):
    """Yield the review file as one or more DataFrames of at most ``chunksize`` rows."""
    if chunksize is None:
        # Single-shot read: the whole file is parsed into one frame.
        yield pd.read_json(review_fp, lines=True)
        return
    # The chunked reader is a context manager; leaving the block closes the file.
    with pd.read_json(review_fp, lines=True, chunksize=chunksize) as reader:
        yield from reader


def load_and_filter_reviews(review_fp, business_fp, keywords, chunksize=100_000):
    """Load reviews that mention any keyword and attach business name/categories.

    The review file is streamed in chunks and filtered *before* the merge, so
    only matching reviews are ever held in memory or joined. The resulting rows
    and their order are the same as filtering after a full left merge, because
    the filter only looks at the review's own ``text`` column.

    Args:
        review_fp: Path to the line-delimited JSON review file. Each record must
            provide ``business_id`` and ``text``.
        business_fp: Path to the line-delimited JSON business file. Each record
            must provide ``business_id``, ``name`` and ``categories``.
        keywords: Iterable of literal strings. Each is regex-escaped and matched
            case-insensitively as a substring of the review text. An empty
            iterable yields an empty pattern, which matches every review.
        chunksize: Number of review lines parsed per chunk. ``None`` reads the
            whole file at once.

    Returns:
        DataFrame of the matching reviews, left-joined with the business
        ``name`` and ``categories`` columns, with a fresh 0..n-1 index.
    """
    print("Loading business data...")
    business_df = pd.read_json(business_fp, lines=True)
    # Only these columns take part in the merge; drop the rest immediately.
    business_cols = business_df[["business_id", "name", "categories"]]

    pattern = "|".join(re.escape(k) for k in keywords)

    print("Loading review data...")
    matched_chunks = []
    empty_template = None  # zero-row frame that preserves the review columns
    for chunk in _iter_review_chunks(review_fp, chunksize):
        if empty_template is None:
            empty_template = chunk.iloc[0:0]
        # na=False treats missing review text as "no match".
        hits = chunk[chunk["text"].str.contains(pattern, case=False, na=False)]
        if not hits.empty:
            matched_chunks.append(hits)

    if matched_chunks:
        review_df = pd.concat(matched_chunks, ignore_index=True)
    elif empty_template is not None:
        review_df = empty_template
    else:
        # The review file contained no records at all.
        review_df = pd.DataFrame(columns=["business_id", "text"])

    # Merge on business_id
    filtered_df = review_df.merge(business_cols, on="business_id", how="left").reset_index(drop=True)

    print(f"Total reviews after ESG filter: {len(filtered_df)}")
    return filtered_df

# Build the working frame: keyword-matching reviews joined with business name/categories.
df = load_and_filter_reviews(review_fp, business_fp, ESG_KEYWORDS)


In [None]:
# 3. LDA Topic Modeling
# 3.1 Preprocess for LDA
# NLTK's English stopword list, held as a set for membership tests.
stop_words = set(stopwords.words("english"))

def preprocess(text):
    """Turn one review into a list of lemmas for topic modelling.

    The text is lowercased and passed through the spaCy pipeline ``nlp``. A
    token contributes its lemma only when it is purely alphabetic and its
    surface form is not in ``stop_words``.
    """
    lemmas = []
    for tok in nlp(text.lower()):
        if not tok.is_alpha:
            continue
        if tok.text in stop_words:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

# Tokenise every review one at a time (tqdm shows progress); each cell of the
# new "tokens" column holds the list returned by preprocess.
df["tokens"] = [preprocess(t) for t in tqdm(df["text"], desc="Preprocessing")]

# 3.2 LDA Topic Modeling with Coherence Selection
# Map every distinct token to an integer id, then encode each review as a
# bag-of-words with that dictionary.
dictionary = corpora.Dictionary(df["tokens"])
corpus = [dictionary.doc2bow(text) for text in df["tokens"]]

def find_best_lda(dictionary, corpus, texts, start=3, limit=10, passes=5, random_state=42):
    """Train one LDA model per topic count and keep the most coherent one.

    Args:
        dictionary: gensim dictionary used to build ``corpus``.
        corpus: Bag-of-words documents to train on.
        texts: Tokenised documents used for the c_v coherence computation.
        start: Smallest number of topics to try (inclusive).
        limit: Largest number of topics to try (inclusive).
        passes: Training passes over the corpus for every candidate model.
        random_state: Seed passed to every candidate model.

    Returns:
        Tuple ``(best_model, best_k, best_score)`` for the candidate with the
        highest coherence; the first one wins ties.

    Raises:
        ValueError: If ``start > limit``, i.e. there is nothing to try.
        RuntimeError: If no candidate produced a comparable coherence score
            (for example when every score is NaN).
    """
    if start > limit:
        raise ValueError(f"Empty topic range: start={start} is greater than limit={limit}")

    # -inf so that the first comparable score always becomes the initial best.
    best_score, best_model, best_k = float("-inf"), None, None
    for k in range(start, limit + 1):
        model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=k, passes=passes, random_state=random_state)
        coherence = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence="c_v").get_coherence()
        print(f"Topics: {k}, Coherence: {coherence:.4f}")
        # NaN never compares greater, so a NaN score cannot be selected.
        if coherence > best_score:
            best_score, best_model, best_k = coherence, model, k

    if best_model is None:
        raise RuntimeError("No LDA candidate produced a comparable coherence score")
    return best_model, best_k, best_score

# Run the model search with the default range of topic counts and report the winner.
lda_model, best_k, best_score = find_best_lda(dictionary, corpus, df["tokens"])
print(f"Best topic count: {best_k} with coherence {best_score:.4f}")

# 3.3 Extract topic keywords and probabilities
def get_topic_keywords(lda_model, dictionary, num_words=10):
    """Summarise every topic of ``lda_model`` by its highest-weighted words.

    Args:
        lda_model: Trained model exposing ``num_topics`` and
            ``show_topic(topic_id, topn)``.
        dictionary: Unused; kept so existing callers keep working.
        num_words: How many top words to report per topic.

    Returns:
        A list with one dict per topic, in topic-id order, with keys
        ``topic_id``, ``words`` (comma-separated words) and ``probabilities``
        (comma-separated weights formatted to four decimals).
    """
    topics = []
    # Take the topic count from the model itself rather than from a notebook
    # global, so the function is correct for whichever model is passed in.
    for topic_id in range(lda_model.num_topics):
        topic_terms = lda_model.show_topic(topic_id, num_words)
        topics.append({
            "topic_id": topic_id,
            "words": ", ".join(word for word, _ in topic_terms),
            "probabilities": ", ".join(f"{prob:.4f}" for _, prob in topic_terms),
        })
    return topics

# Tabulate the per-topic keyword summaries and write them to topics.csv in the output folder.
topics_data = get_topic_keywords(lda_model, dictionary)
topics_df = pd.DataFrame(topics_data)
topics_df.to_csv(os.path.join(output_dir, "topics.csv"), index=False)
print(f" Topic keywords saved to {os.path.join(output_dir, 'topics.csv')}")

# 3.4 Add topic distributions to reviews for ABM
def get_topic_distributions(lda_model, corpus, df):
    """Attach each document's topic probabilities to ``df``.

    Args:
        lda_model: Trained model exposing ``num_topics`` and
            ``get_document_topics(bow, minimum_probability=...)``.
        corpus: Bag-of-words documents, one per row of ``df`` and in row order.
        df: DataFrame that receives the new column. It is modified in place.

    Returns:
        The same ``df`` object, now with a ``topic_distribution`` column whose
        cells are lists of ``lda_model.num_topics`` Python floats indexed by
        topic id.
    """
    # Use the model's own topic count so the vector length always matches the
    # model that produced the probabilities.
    num_topics = lda_model.num_topics
    topic_distributions = []
    for bow in corpus:
        dist = [0.0] * num_topics
        for topic_id, prob in lda_model.get_document_topics(bow, minimum_probability=0.0):
            # Plain floats keep the lists clean when written to CSV/JSON later.
            dist[topic_id] = float(prob)
        topic_distributions.append(dist)
    df["topic_distribution"] = topic_distributions
    return df

# Add the per-review "topic_distribution" column to the working frame.
df = get_topic_distributions(lda_model, corpus, df)

In [116]:
# Print the current contents of df for inspection.
print(df)

                    review_id                 user_id             business_id  \
0      -7LkjSPzfVgnVpuVuRuOow  uAu772KpSkb-tPFgZmU-lA  2GYg3liJ9-m6Z67L_4_BRQ   
1      r-PjI5sBvNoBekk5mURNww  uzy_KYIZx65cp7Yh8_seeQ  ZuM1vcZ_ObCjCCGweYzItg   
2      hVs0KrkaRNaxmH2GQm2qLA  jcJGCNrEgPHsl7zkvumq2Q  QUEWdnmVCv2Ri8xJATTrIQ   
3      bVgpM_sA9AMAlL2R5TPNAQ  8DSuxkm7SyX9fgQTw2tuaw  dz_aIFbATP2PLWQSOBnMfw   
4      -RsFYcZz0HeSrAhu5xDWBg  -cb2SLDoCysQFxbDDf_0pg  Q1HHAb4FzrzfnnrRyA8fgg   
...                       ...                     ...                     ...   
78111  zWVJHU8L9txRPDlSMnHsYw  HuB6-5n480bnhtV6SHGUgQ  3LWPWhqWaLSDq6dktuB-jA   
78112  R4dfSB633wUohJn9eq2fLQ  81QjvIwJK5JZZT6Ady0x3w  7KkgY_GrKT3Pal_6NN46yQ   
78113  jxvrLD0XmCWFBNIrO17CRw  4bP86EzvAa0sfaxrYW5_BA  KQx5ZIETVo7_KBm0ovLfSw   
78114  m_WyTXe6z6FlAzG7qjebEA  uw9cwb4qvH0EKvUh-X_W-w  necj933-7IiKCyMGj6ZWGQ   
78115  kZiKvXxK7o5i7fa32u5Jgw  6jjHo9Lilv3kTy87pm2ycw  pQAQwhBlSQdG1HuuLuCqXw   

       stars  useful  funny

In [None]:
# 4. BERT Sentiment Analysis
# Build a Hugging Face "sentiment-analysis" pipeline from a model given as an absolute path.
# NOTE(review): the path name suggests a copy of the multilingual 1-5 star
# review classifier — confirm which checkpoint is stored there and what labels it emits.
sentiment_model = pipeline(
    "sentiment-analysis",
    model="/bert-base-multilingual-uncased-sentiment",
)
# Process sentiment and map to 1-5 scale
def map_sentiment(score):
    """Linearly rescale a polarity value from [-1, 1] onto the 1-5 scale.

    -1 maps to 1, 0 maps to 3 and +1 maps to 5. Inputs outside [-1, 1] are
    not clamped.
    """
    midpoint, half_range = 3, 2
    return half_range * score + midpoint

def _rating_from_prediction(prediction):
    """Convert one pipeline prediction into a rating on the 1-5 scale.

    The pipeline's ``score`` is the confidence of the predicted ``label`` (a
    probability in [0, 1]), not a polarity, so the rating has to be derived
    from the label:

    * "N star" / "N stars" labels give N directly.
    * positive / negative / neutral labels are turned into a signed confidence
      in [-1, 1] and rescaled with ``map_sentiment``.

    Raises:
        ValueError: If the label follows neither convention, instead of
            silently producing a meaningless rating.
    """
    label = str(prediction["label"]).strip().lower()
    star_match = re.match(r"([1-5])\s*stars?$", label)
    if star_match:
        return float(star_match.group(1))
    polarity_sign = {"positive": 1.0, "pos": 1.0, "negative": -1.0, "neg": -1.0, "neutral": 0.0}
    if label in polarity_sign:
        return map_sentiment(polarity_sign[label] * prediction["score"])
    raise ValueError(f"Unrecognised sentiment label: {prediction['label']!r}")


# Score every review. The 512-character slice keeps inputs short, and
# truncation=True additionally guarantees the tokenised input fits the model
# even when those characters expand to more tokens than it accepts.
df["sentiment"] = [
    _rating_from_prediction(sentiment_model(t[:512], truncation=True)[0])
    for t in tqdm(df["text"], desc="BERT Sentiment")
]



In [None]:
# Save outputs & model params
# Write the full working frame, with every column, to CSV.
# The "tokens" and "topic_distribution" cells hold Python lists, which end up
# in the file as their string representation.
df.to_csv(os.path.join(output_dir, "esg_reviews_processed.csv"), index=False)

# Save sentiment results with topic_distribution
sentiment_output = df[["review_id", "business_id", "text", "sentiment", "tokens", "topic_distribution"]]
sentiment_output.to_csv(os.path.join(output_dir, "sentiment_results.csv"), index=False)
print(f" Sentiment results saved to {os.path.join(output_dir, 'sentiment_results.csv')}")

# Convert tuple keys to "word1,word2" strings for JSON
# NOTE(review): `coocc` is not assigned in any cell shown in this notebook, so
# on a fresh kernel this line raises NameError after the two CSV files above
# have already been written. The loop expects an iterable of (pair, count)
# items where `pair` is a sequence of strings — restore the cell that builds
# it, or drop the "keyword_cooccurrence" entry below.
coocc_str_keys = {",".join(pair): count for pair, count in coocc}

# Run parameters stored next to the data outputs.
model_params = {
    "best_topic_count": best_k,
    "coherence_score": best_score,
    "esg_keywords": ESG_KEYWORDS,
    "keyword_cooccurrence": coocc_str_keys
}

# The context manager closes the file once the JSON has been written.
with open(os.path.join(output_dir, "model_params.json"), "w") as f:
    json.dump(model_params, f, indent=4)

print(" NLP pipeline completed. Outputs saved.")

In [119]:
# Print the current contents of df for inspection.
print (df)

                    review_id                 user_id             business_id  \
0      -7LkjSPzfVgnVpuVuRuOow  uAu772KpSkb-tPFgZmU-lA  2GYg3liJ9-m6Z67L_4_BRQ   
1      r-PjI5sBvNoBekk5mURNww  uzy_KYIZx65cp7Yh8_seeQ  ZuM1vcZ_ObCjCCGweYzItg   
2      hVs0KrkaRNaxmH2GQm2qLA  jcJGCNrEgPHsl7zkvumq2Q  QUEWdnmVCv2Ri8xJATTrIQ   
3      bVgpM_sA9AMAlL2R5TPNAQ  8DSuxkm7SyX9fgQTw2tuaw  dz_aIFbATP2PLWQSOBnMfw   
4      -RsFYcZz0HeSrAhu5xDWBg  -cb2SLDoCysQFxbDDf_0pg  Q1HHAb4FzrzfnnrRyA8fgg   
...                       ...                     ...                     ...   
78111  zWVJHU8L9txRPDlSMnHsYw  HuB6-5n480bnhtV6SHGUgQ  3LWPWhqWaLSDq6dktuB-jA   
78112  R4dfSB633wUohJn9eq2fLQ  81QjvIwJK5JZZT6Ady0x3w  7KkgY_GrKT3Pal_6NN46yQ   
78113  jxvrLD0XmCWFBNIrO17CRw  4bP86EzvAa0sfaxrYW5_BA  KQx5ZIETVo7_KBm0ovLfSw   
78114  m_WyTXe6z6FlAzG7qjebEA  uw9cwb4qvH0EKvUh-X_W-w  necj933-7IiKCyMGj6ZWGQ   
78115  kZiKvXxK7o5i7fa32u5Jgw  6jjHo9Lilv3kTy87pm2ycw  pQAQwhBlSQdG1HuuLuCqXw   

       stars  useful  funny

In [None]:
# Calculate topic prevalence
# Dominant topic of a review = index of its largest probability (the lowest
# index wins a tie). Count how many reviews each topic dominates.
dominant_topics = [
    max(range(len(weights)), key=weights.__getitem__)
    for weights in df["topic_distribution"]
]
dominant_tally = Counter(dominant_topics)
topic_counts = [dominant_tally[topic_id] for topic_id in range(best_k)]

# One row per topic: how many reviews it dominates and what share of all reviews that is.
n_reviews = len(df)
prevalence_path = os.path.join(output_dir, "topic_prevalence.csv")
prevalence_df = pd.DataFrame({
    "topic_id": range(best_k),
    "count": topic_counts,
    "proportion": [hits / n_reviews for hits in topic_counts],
})
prevalence_df.to_csv(prevalence_path, index=False)
print(f" Topic prevalence saved to {prevalence_path}")