In [None]:
pip install pandas nltk scikit-learn gensim

In [None]:
pip install pyLDAvis

In [None]:
import nltk

# Download NLTK stop words (if not already downloaded)
nltk.download('stopwords')

In [6]:
import pandas as pd
import re
import nltk
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from gensim.models.ldamodel import LdaModel
from gensim import corpora

# Load the CSV file
file_path = './google_maps_reviews_all_pages.csv'
df = pd.read_csv(file_path)

# Preview the data
print(df.head())

# Load SpaCy's English model
nlp = spacy.load("en_core_web_sm")

# Step 1: Preprocess the text data (Lemmatization)
def preprocess_text_spacy(text):
    # Remove non-alphabetical characters
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    # Convert to lowercase
    text = text.lower()
    # Apply SpaCy model
    doc = nlp(text)
    # Lemmatize and remove stop words
    tokens = [token.lemma_ for token in doc if not token.is_stop]
    return tokens

# Apply preprocessing to the 'description' column (assuming it's the review text)
df['cleaned_text'] = df['description'].apply(lambda x: preprocess_text(str(x)))

# Step 2: Create a document-term matrix using the CountVectorizer
# Convert preprocessed tokens back to sentences for CountVectorizer
cleaned_reviews = df['cleaned_text'].apply(lambda x: ' '.join(x)).tolist()

# Create a CountVectorizer object (Similar to Doc2Bow, term frequency of words)
vectorizer = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')
document_term_matrix = vectorizer.fit_transform(cleaned_reviews)

# Step 3: Create the dictionary and corpus for LDA
# Tokenize the cleaned reviews again (needed for gensim LDA)
tokenized_reviews = df['cleaned_text'].tolist()

# Create a dictionary and corpus for LDA (Doc2Bow)
dictionary = corpora.Dictionary(tokenized_reviews)
corpus = [dictionary.doc2bow(text) for text in tokenized_reviews]

# Step 4: Perform LDA
lda_model = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary, passes=15)

# Step 5: Display the topics
topics = lda_model.print_topics(num_words=10)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

# Optional: Convert topics for visualization
# To show per-document topics distribution:
doc_topics = lda_model.get_document_topics(corpus)
print(doc_topics)

                                                link  rating          date  \
0  https://www.google.com/maps/reviews/data=!4m8!...     5.0   a month ago   
1  https://www.google.com/maps/reviews/data=!4m8!...     4.0  5 months ago   
2  https://www.google.com/maps/reviews/data=!4m8!...     4.0   a month ago   
3  https://www.google.com/maps/reviews/data=!4m8!...     5.0  6 months ago   
4  https://www.google.com/maps/reviews/data=!4m8!...     5.0  7 months ago   

        user_name                                        description  likes  
0  Kok Jacqueline  I have invited a 3 friends for the dinner.\nTh...      0  
1    Supreet Kini  Pleased to see a Mangalorean restaurant finall...      0  
2      Robin CHAN  Good service.   Fish a bit too spicy and lassi...      0  
3    Stallon Rego  Nothing quite satisfies your taste buds like a...      0  
4     Nivi Prabhu  My uncle called me up, excited to tell me abou...      3  
Topic 0: 0.534*"nan" + 0.006*"go" + 0.005*"quite" + 0.004*"aver

In [7]:
# Display per-document topic distributions
for doc_num, topics in enumerate(lda_model.get_document_topics(corpus)):
    print(f"Document {doc_num}:")
    for topic, prob in topics:
        print(f"  Topic {topic} - Probability: {prob:.4f}")

Document 0:
  Topic 4 - Probability: 0.4956
  Topic 9 - Probability: 0.4644
Document 1:
  Topic 8 - Probability: 0.9902
Document 2:
  Topic 9 - Probability: 0.9250
Document 3:
  Topic 0 - Probability: 0.4892
  Topic 2 - Probability: 0.0738
  Topic 3 - Probability: 0.4253
Document 4:
  Topic 3 - Probability: 0.9886
Document 5:
  Topic 7 - Probability: 0.4869
  Topic 8 - Probability: 0.4896
Document 6:
  Topic 8 - Probability: 0.9308
Document 7:
  Topic 0 - Probability: 0.0125
  Topic 1 - Probability: 0.0125
  Topic 2 - Probability: 0.8875
  Topic 3 - Probability: 0.0125
  Topic 4 - Probability: 0.0125
  Topic 5 - Probability: 0.0125
  Topic 6 - Probability: 0.0125
  Topic 7 - Probability: 0.0125
  Topic 8 - Probability: 0.0125
  Topic 9 - Probability: 0.0125
Document 8:
  Topic 7 - Probability: 0.9719
Document 9:
  Topic 6 - Probability: 0.9400
Document 10:
  Topic 0 - Probability: 0.0125
  Topic 1 - Probability: 0.0125
  Topic 2 - Probability: 0.8875
  Topic 3 - Probability: 0.0125
  T

In [8]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Prepare the visualization data for pyLDAvis
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)

# Display the visualization in a notebook or as an HTML file
pyLDAvis.display(vis_data)
# Alternatively, save the visualization as an HTML file
pyLDAvis.save_html(vis_data, 'lda_visualization.html')