In [None]:
pip install pandas nltk scikit-learn gensim

In [None]:
pip install pyLDAvis

In [None]:
import nltk

# Download NLTK stop words (if not already downloaded)
nltk.download('stopwords')

# Load necessary NLTK resources
nltk.download('wordnet')

In [8]:
import re
import pandas as pd
import spacy
from gensim.models import Phrases, CoherenceModel
from gensim.models.phrases import Phraser
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Define dictionary for standardizing specific words
standardize_dict = {
    'indian': 'india',
    'indain': 'india',
    'indians': 'india',
    'ambiance': 'ambience',
    'knew': 'know',
    'lets': 'let',
    'looked': 'look',
    'mangaloreaan': 'mangalore',
    'mangalorean': 'mangalore',
    'mangaloren': 'mangalore',
    'mangaloreso': 'mangalore',
    'manglorean': 'mangalore',
    'manglorian': 'mangalore',
    'gd': 'good'
    # Add other words as needed
}

# Load data
file_path = './00_google_maps_reviews_all_pages.csv'
df = pd.read_csv(file_path)

# Custom stop words
custom_stop_words = {'good', 'nice', 'place', 'food', 'restaurant', 'service', 'try'}

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Preprocessing function with additional steps
def preprocess_text_spacy(text):
    # Remove non-alphabetic characters and lowercase the text
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I | re.A).lower()
    doc = nlp(text)
    
    # Initial tokenization and stop word removal
    tokens = [
        token.lemma_ for token in doc
        if not token.is_stop and token.lemma_.strip() != '' and token.lemma_ not in custom_stop_words
    ]
    
    # Step 1: Correct spelling for each word
    corrected_words = [str(TextBlob(word).correct()) for word in tokens]
    
    # Step 2: Lemmatize the corrected words
    lemmatized_words = [lemmatizer.lemmatize(word) for word in corrected_words]
    
    # Step 3: Standardize words based on the dictionary
    standardized_words = [standardize_dict.get(word, word) for word in lemmatized_words]
    
    return standardized_words

# Apply preprocessing
df['cleaned_text'] = df['description'].apply(lambda x: preprocess_text_spacy(str(x)))

# Tokenize reviews for bigram and trigram models
tokenized_reviews = df['cleaned_text'].tolist()

# Create bigram and trigram models
bigram = Phrases(tokenized_reviews, min_count=2, threshold=80)
trigram = Phrases(bigram[tokenized_reviews], threshold=80)
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

# Apply bigrams and trigrams
tokenized_reviews = [bigram_mod[review] for review in tokenized_reviews]
tokenized_reviews = [trigram_mod[review] for review in tokenized_reviews]

# Create dictionary and corpus
dictionary = corpora.Dictionary(tokenized_reviews)
corpus = [dictionary.doc2bow(text) for text in tokenized_reviews]

# LDA model with tuned parameters
lda_model = LdaModel(
    corpus=corpus,
    num_topics=5,         # Experiment with number of topics
    id2word=dictionary,
    passes=90,            # Increase passes to allow model to refine further
    alpha='auto',         # Auto-adjust alpha and eta for topic sparsity
    eta='auto'
)

# Calculate coherence
coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_reviews, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_score}')

# Display topics
topics = lda_model.print_topics(num_words=7)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

print("\n\n")
# Display topics without weights
topics = lda_model.print_topics(num_words=7)
for idx, topic in topics:
    # Extract only the words from the topic string by splitting and filtering out weights
    words = [word.split('*')[1].replace('"', '') for word in topic.split(' + ')]
    # Join the words into a single string and print the topic number with words only
    print(f"Topic {idx}: {', '.join(words)}")
    
# Document-topic distribution
doc_topics = lda_model.get_document_topics(corpus)
print(doc_topics)

Coherence Score: 0.47365321060762494
Topic 0: 0.041*"india" + 0.034*"taste" + 0.019*"little" + 0.014*"price" + 0.012*"staff" + 0.010*"visit" + 0.010*"authentic"
Topic 1: 0.033*"mutton" + 0.028*"bryan" + 0.027*"chicken" + 0.020*"taste" + 0.018*"mangalore" + 0.017*"order" + 0.014*"like"
Topic 2: 0.030*"india" + 0.020*"taste" + 0.016*"great" + 0.014*"delicious" + 0.011*"south" + 0.011*"ve" + 0.011*"authentic"
Topic 3: 0.310*"nan" + 0.013*"great" + 0.013*"chicken" + 0.011*"fish" + 0.009*"taste" + 0.008*"carry" + 0.008*"absence"
Topic 4: 0.025*"mangalore" + 0.014*"order" + 0.014*"britain" + 0.012*"visit" + 0.011*"authentic" + 0.009*"mutton" + 0.009*"quinine"



Topic 0: india, taste, little, price, staff, visit, authentic
Topic 1: mutton, bryan, chicken, taste, mangalore, order, like
Topic 2: india, taste, great, delicious, south, ve, authentic
Topic 3: nan, great, chicken, fish, taste, carry, absence
Topic 4: mangalore, order, britain, visit, authentic, mutton, quinine
<gensim.interfaces.T

In [None]:
# Display per-document topic distributions
for doc_num, topics in enumerate(lda_model.get_document_topics(corpus)):
    print(f"Document {doc_num}:")
    for topic, prob in topics:
        print(f"  Topic {topic} - Probability: {prob:.4f}")

In [None]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Prepare the visualization data for pyLDAvis
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)

# Display the visualization in a notebook or as an HTML file
pyLDAvis.display(vis_data)
# Alternatively, save the visualization as an HTML file
pyLDAvis.save_html(vis_data, 'lda_visualization.html')