In [None]:
pip install pandas nltk scikit-learn gensim

In [None]:
pip install pyLDAvis

In [None]:
import nltk

# Download NLTK stop words (if not already downloaded)
nltk.download('stopwords')

In [1]:
import pandas as pd
import re
import spacy
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
import nltk

# Load data
file_path = './00_google_maps_reviews_all_pages.csv'
df = pd.read_csv(file_path)

# Custom stop words
custom_stop_words = {'good', 'nice', 'place', 'food', 'restaurant', 'service', 'try'}

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Preprocessing
def preprocess_text_spacy(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A).lower()
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.lemma_.strip() != '' and token.lemma_ not in custom_stop_words]
    return tokens

# Apply preprocessing
df['cleaned_text'] = df['description'].apply(lambda x: preprocess_text_spacy(str(x)))

# Tokenize reviews for bigram and trigram models
tokenized_reviews = df['cleaned_text'].tolist()

# Create bigram and trigram models
bigram = Phrases(tokenized_reviews, min_count=3, threshold=80)
trigram = Phrases(bigram[tokenized_reviews], threshold=80)
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

# Apply bigrams and trigrams
tokenized_reviews = [bigram_mod[review] for review in tokenized_reviews]
tokenized_reviews = [trigram_mod[review] for review in tokenized_reviews]

# Create dictionary and corpus
dictionary = corpora.Dictionary(tokenized_reviews)
corpus = [dictionary.doc2bow(text) for text in tokenized_reviews]

# Initialize variables for loop
target_coherence = 0.7
max_iterations = 20
current_iteration = 0
best_coherence_score = 0
best_lda_model = None

# Parameter settings for tuning
num_topics = 5
passes = 15
alpha = 'auto'
eta = 'auto'

# Loop to train model with parameter adjustments
while best_coherence_score < target_coherence and current_iteration < max_iterations:
    current_iteration += 1
    print(f"\nIteration {current_iteration} with num_topics={num_topics}, passes={passes}, alpha={alpha}, eta={eta}...")

    # Train LDA model with current parameters
    lda_model = LdaModel(
        corpus=corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        alpha=alpha,
        eta=eta
    )

    # Calculate coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_reviews, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()
    print(f"Coherence Score: {coherence_score}")

    # Check if this model has the best coherence score so far
    if coherence_score > best_coherence_score:
        best_coherence_score = coherence_score
        best_lda_model = lda_model  # Save best model

    # Adjust parameters based on the current iteration
    if coherence_score < target_coherence:
        num_topics = 5          # Increment topics to refine granularity
        passes += 2               # Increase passes to allow more refinement per iteration
        if current_iteration % 3 == 0:  # Change alpha and eta periodically
            alpha = max(0.01, alpha - 0.1) if alpha != 'auto' else 'auto'
            eta = max(0.01, eta - 0.1) if eta != 'auto' else 'auto'

# Final model and score
print(f"\nBest Coherence Score achieved: {best_coherence_score} with num_topics={num_topics}, passes={passes}")

# Display topics from the best model
if best_lda_model:
    topics = best_lda_model.print_topics(num_words=8)
    for idx, topic in topics:
        print(f"Topic {idx}: {topic}")

# Document-topic distribution (for the best model)
doc_topics = best_lda_model.get_document_topics(corpus)
print(doc_topics)



Iteration 1 with num_topics=5, passes=15, alpha=auto, eta=auto...
Coherence Score: 0.4468896744121138

Iteration 2 with num_topics=5, passes=17, alpha=auto, eta=auto...
Coherence Score: 0.4896448650842897

Iteration 3 with num_topics=5, passes=19, alpha=auto, eta=auto...
Coherence Score: 0.4701742825884379

Iteration 4 with num_topics=5, passes=21, alpha=auto, eta=auto...
Coherence Score: 0.49020002180466965

Iteration 5 with num_topics=5, passes=23, alpha=auto, eta=auto...
Coherence Score: 0.4421429846357

Iteration 6 with num_topics=5, passes=25, alpha=auto, eta=auto...
Coherence Score: 0.4716637922271417

Iteration 7 with num_topics=5, passes=27, alpha=auto, eta=auto...
Coherence Score: 0.492841692543713

Iteration 8 with num_topics=5, passes=29, alpha=auto, eta=auto...
Coherence Score: 0.46988361625844693

Iteration 9 with num_topics=5, passes=31, alpha=auto, eta=auto...
Coherence Score: 0.4543265546048449

Iteration 10 with num_topics=5, passes=33, alpha=auto, eta=auto...
Coheren

In [19]:
# Load data
file_path = './00_google_maps_reviews_all_pages.csv'
df = pd.read_csv(file_path)

# Custom stop words
custom_stop_words = {'good', 'nice', 'place', 'food', 'restaurant', 'service', 'try','indian'}

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Preprocessing
def preprocess_text_spacy(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A).lower()
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.lemma_.strip() != '' and token.lemma_ not in custom_stop_words]
    return tokens

# Apply preprocessing
df['cleaned_text'] = df['description'].apply(lambda x: preprocess_text_spacy(str(x)))

# Tokenize reviews for bigram and trigram models
tokenized_reviews = df['cleaned_text'].tolist()

# Create bigram and trigram models
bigram = Phrases(tokenized_reviews, min_count=3, threshold=80)
trigram = Phrases(bigram[tokenized_reviews], threshold=80)
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

# Apply bigrams and trigrams
tokenized_reviews = [bigram_mod[review] for review in tokenized_reviews]
tokenized_reviews = [trigram_mod[review] for review in tokenized_reviews]

# Create dictionary and corpus
dictionary = corpora.Dictionary(tokenized_reviews)
corpus = [dictionary.doc2bow(text) for text in tokenized_reviews]

# LDA model with tuned parameters
lda_model = LdaModel(
    corpus=corpus,
    num_topics=5,         # experiment with X topics
    id2word=dictionary,
    passes=50,            # Increase passes to allow model to refine further
    alpha='auto',         # Auto-adjust alpha and eta for topic sparsity
    eta='auto'
)

# Calculate coherence
coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_reviews, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_score}')

# Display topics
topics = lda_model.print_topics(num_words=7)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

# Document-topic distribution
doc_topics = lda_model.get_document_topics(corpus)
print(doc_topics)

Coherence Score: 0.4460524288301274
Topic 0: 0.339*"nan" + 0.012*"drink" + 0.012*"visit" + 0.011*"ambience" + 0.010*"great" + 0.009*"tasty" + 0.008*"serve"
Topic 1: 0.024*"mutton" + 0.024*"taste" + 0.017*"biryani" + 0.015*"chicken" + 0.014*"like" + 0.013*"awesome" + 0.011*"dish"
Topic 2: 0.014*"authentic" + 0.014*"chicken" + 0.012*"location" + 0.012*"order" + 0.012*"dish" + 0.008*"pax" + 0.008*"go"
Topic 3: 0.019*"chicken" + 0.017*"order" + 0.017*"authentic" + 0.015*"taste" + 0.012*"ghee_roast" + 0.012*"fish" + 0.011*"not"
Topic 4: 0.026*"great" + 0.020*"taste" + 0.018*"friendly" + 0.017*"staff" + 0.013*"price" + 0.011*"mangalore" + 0.011*"worth"
<gensim.interfaces.TransformedCorpus object at 0x326cc7290>


In [20]:
# Display per-document topic distributions
for doc_num, topics in enumerate(lda_model.get_document_topics(corpus)):
    print(f"Document {doc_num}:")
    for topic, prob in topics:
        print(f"  Topic {topic} - Probability: {prob:.4f}")

Document 0:
  Topic 0 - Probability: 0.0181
  Topic 1 - Probability: 0.9711
Document 1:
  Topic 1 - Probability: 0.9946
Document 2:
  Topic 0 - Probability: 0.9761
Document 3:
  Topic 3 - Probability: 0.9905
Document 4:
  Topic 1 - Probability: 0.9940
Document 5:
  Topic 3 - Probability: 0.9832
Document 6:
  Topic 0 - Probability: 0.0254
  Topic 3 - Probability: 0.9565
Document 7:
  Topic 0 - Probability: 0.0378
  Topic 1 - Probability: 0.0125
  Topic 2 - Probability: 0.9343
Document 8:
  Topic 0 - Probability: 0.0123
  Topic 1 - Probability: 0.9803
Document 9:
  Topic 0 - Probability: 0.5902
  Topic 1 - Probability: 0.3945
Document 10:
  Topic 0 - Probability: 0.0380
  Topic 1 - Probability: 0.9396
Document 11:
  Topic 0 - Probability: 0.0124
  Topic 1 - Probability: 0.9802
Document 12:
  Topic 2 - Probability: 0.9947
Document 13:
  Topic 2 - Probability: 0.9857
Document 14:
  Topic 4 - Probability: 0.9841
Document 15:
  Topic 0 - Probability: 0.0137
  Topic 2 - Probability: 0.9760
Do

In [21]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Prepare the visualization data for pyLDAvis
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)

# Display the visualization in a notebook or as an HTML file
pyLDAvis.display(vis_data)
# Alternatively, save the visualization as an HTML file
pyLDAvis.save_html(vis_data, '03_lda_visualization.html')