In [1]:
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models
import os
import gensim
import nltk
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Load the review data set; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='reviews_2.csv')

In [3]:
# Start from NLTK's English stop-word list.
stop_words = set(stopwords.words('english'))

# Extra words to drop on top of the NLTK list.
# Every entry must be lower-case: generate_tokens() lower-cases the text
# before filtering, so a capitalised entry (the list used to contain 'The')
# can never match a token. The previous duplicate 'stay' is listed once.
custom_stop_words = [
    'hotel', 'room', 'stay', 'airport', 'and', 'the', 'i', 'bristol',
    'one', 'hilton', 'said', 'say', 'to', 'is', 'was', 'for', 'in', 'of',
    'but', 'at', 'a', 'an', 'as', 'had', 'have', 'on', 'from', 'you',
    'would', 'will', 'not', 'us', 'we', 'get', 'could',
]
stop_words.update(custom_stop_words)

# Function to generate tokens
def generate_tokens(text):
    """Turn one review into a list of filtered, lower-cased tokens.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example ``None`` or the
        float NaN that stands in for a missing cell) is treated as an empty
        review.

    Returns
    -------
    list of str
        Tokens produced by ``word_tokenize(text.lower())`` that are not in
        the module-level ``stop_words`` set and consist only of alphabetic
        characters. Empty list for non-string input.
    """
    # Missing reviews are not strings; calling .lower() on them would raise
    # AttributeError and abort the whole column-wide apply.
    if not isinstance(text, str):
        return []

    # Tokenize and filter out stop words and any other potential noise
    return [
        word
        for word in word_tokenize(text.lower())
        if word not in stop_words and word.isalpha()
    ]

# Tokenise each entry of the 'Review' column and store the result in a new 'tokens' column.
df['tokens'] = df['Review'].map(generate_tokens)


In [4]:
def create_dictionary(words):
    """Build a gensim ``corpora.Dictionary`` from ``words``.

    ``words`` is handed to ``corpora.Dictionary`` as-is and the resulting
    dictionary object is returned.
    """
    dictionary = corpora.Dictionary(words)
    return dictionary
# Dictionary built from the 'tokens' column.
id2word = create_dictionary(words=df['tokens'])

In [5]:
def create_document_matrix(tokens, id2word):
    """Convert token lists into bag-of-words vectors.

    Parameters
    ----------
    tokens : iterable
        One entry per document; each entry is passed to ``id2word.doc2bow``.
    id2word : object with a ``doc2bow`` method
        Dictionary used to encode each document.

    Returns
    -------
    list
        The ``doc2bow`` result for every document, in input order.
    """
    return [id2word.doc2bow(document) for document in tokens]
# Encode the 'tokens' column as a bag-of-words corpus using the dictionary.
corpus = create_document_matrix(tokens=df['tokens'], id2word=id2word)
# Sanity check: first document's tokens, then its bag-of-words encoding.
print(df['tokens'][0], corpus[0], sep='\n')

['faulty', 'air', 'con', 'poor', 'service', 'used', 'base', 'early', 'morning', 'flight', 'ate', 'restaurant', 'fine', 'portions', 'quite', 'small', 'service', 'friendly', 'efficient', 'family', 'despite', 'setting', 'air', 'con', 'awake', 'midnight', 'like', 'sauna', 'checking', 'air', 'con', 'unit', 'fault', 'code', 'flashing', 'traipse', 'reception', 'midnight', 'solution', 'small', 'fans', 'gave', 'level', 'noise', 'pneumatic', 'drill', 'pretty', 'poor', 'seemed', 'fully', 'aware', 'fault', 'decided', 'try', 'haggle', 'argue', 'discount', 'give', 'wanted', 'sleep', 'left', 'staff', 'member', 'manager', 'argue', 'still', 'refund', 'offered', 'weeks', 'afterwards', 'barely', 'slept', 'due', 'temperature', 'ridiculously', 'loud', 'plug', 'fans', 'solution']
[(0, 1), (1, 3), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 3), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 2), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1

In [6]:
# Build the LDA model: 10 topics over the bag-of-words corpus, seeded with random_state=100.
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,
)

In [7]:
# Visualise the fitted LDA model with pyLDAvis inside the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=id2word,
    mds="mmds",
    R=30,
)
vis



In [10]:
# Retrain the LDA model with 15 passes over the corpus and list its topics.
# NOTE(review): this cell was headed "analyse csr report topic", but the corpus
# it uses is the one built from the 'Review' column — confirm the intent.
#
# The stop-word set is intentionally not rebuilt here: it is already defined
# in the tokenisation cell, and nothing in this cell reads it (corpus and
# id2word were built before this point).
#
# NOTE: this rebinds `lda_model`, replacing the model built earlier; any cell
# that reads `lda_model` sees whichever of the two cells ran last.

# Apply LDA
num_topics = 10
lda_model = LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=id2word,
    passes=15,
    random_state=100,  # fixed seed so the printed topics are reproducible; same seed as the first model
)

# Print the topics
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))


Topic: 0 
Words: 0.035*"early" + 0.033*"breakfast" + 0.032*"flight" + 0.025*"good" + 0.024*"location" + 0.019*"convenient" + 0.018*"comfortable" + 0.017*"food" + 0.014*"expensive" + 0.013*"night"
Topic: 1 
Words: 0.035*"alarm" + 0.026*"fire" + 0.012*"meet" + 0.010*"going" + 0.009*"went" + 0.009*"noise" + 0.009*"son" + 0.009*"building" + 0.008*"review" + 0.008*"suited"
Topic: 2 
Words: 0.044*"bed" + 0.027*"shower" + 0.024*"bathroom" + 0.018*"small" + 0.016*"air" + 0.014*"water" + 0.012*"noisy" + 0.011*"uncomfortable" + 0.010*"pleasant" + 0.010*"hard"
Topic: 3 
Words: 0.014*"staff" + 0.011*"reception" + 0.009*"parking" + 0.008*"told" + 0.008*"time" + 0.008*"check" + 0.007*"day" + 0.007*"service" + 0.007*"made" + 0.007*"night"
Topic: 4 
Words: 0.059*"staff" + 0.035*"clean" + 0.035*"great" + 0.033*"location" + 0.030*"friendly" + 0.028*"good" + 0.026*"breakfast" + 0.023*"comfortable" + 0.023*"helpful" + 0.019*"excellent"
Topic: 5 
Words: 0.290*"superb" + 0.017*"v" + 0.014*"usb" + 0.014*"poi

In [9]:
# Print the topic mixture of every document in the corpus, one block per document.
for i, bow in enumerate(corpus):
    topic_weights = lda_model.get_document_topics(bow)
    report = [f"Document {i} Topics:"]
    report.extend(f"Topic {topic}: {prob:.4f}" for topic, prob in topic_weights)
    # The trailing "\n" entry reproduces the blank lines that separate documents.
    report.append("\n")
    print("\n".join(report))

Document 0 Topics:
Topic 0: 0.0376
Topic 1: 0.3340
Topic 2: 0.3572
Topic 4: 0.0354
Topic 6: 0.2113
Topic 7: 0.0213


Document 1 Topics:
Topic 0: 0.0407
Topic 4: 0.0873
Topic 7: 0.8459


Document 2 Topics:
Topic 2: 0.5249
Topic 4: 0.1709
Topic 6: 0.2885


Document 3 Topics:
Topic 1: 0.1373
Topic 3: 0.3458
Topic 6: 0.2852
Topic 7: 0.2177


Document 4 Topics:
Topic 1: 0.1631
Topic 3: 0.0589
Topic 7: 0.7468


Document 5 Topics:
Topic 0: 0.4643
Topic 1: 0.3896
Topic 6: 0.1259


Document 6 Topics:
Topic 1: 0.2180
Topic 2: 0.1727
Topic 3: 0.0564
Topic 4: 0.0306
Topic 6: 0.3925
Topic 7: 0.1253


Document 7 Topics:
Topic 1: 0.7687
Topic 2: 0.0326
Topic 6: 0.1945


Document 8 Topics:
Topic 0: 0.0707
Topic 3: 0.1108
Topic 4: 0.7976


Document 9 Topics:
Topic 0: 0.4486
Topic 6: 0.2117
Topic 7: 0.3136


Document 10 Topics:
Topic 0: 0.3450
Topic 2: 0.5743
Topic 4: 0.0575


Document 11 Topics:
Topic 1: 0.5011
Topic 2: 0.3015
Topic 5: 0.1063
Topic 6: 0.0852


Document 12 Topics:
Topic 0: 0.1309
Topic 