In [10]:
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models
import os
import gensim
import nltk
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [11]:
df = pd.read_csv('reviews_1.csv')

In [12]:
# Load standard stop words
stop_words = set(stopwords.words('english'))

# Add custom stop words
custom_stop_words = [
    'hotel', 'room', 'stay', 'airport', 'and', 'the', 'i', 'bristol', 
    'stay', 'one', 'hilton', 'said', 'say','to','is','was','for','in','of','The','but','at','a','an','as','had','have','on','from','you','would','will','not','us','we','get','could'  # Ensure not to duplicate words
]
stop_words.update(custom_stop_words)

# Function to generate tokens
def generate_tokens(text):
    # Tokenize and filter out stop words and any other potential noise
    return [word for word in word_tokenize(text.lower()) if word not in stop_words and word.isalpha()]

# Apply the tokenization function to the 'Review' column
df['tokens'] = df['Review'].apply(generate_tokens)


In [13]:
def create_dictionary(words):
    return corpora.Dictionary(words)
id2word=create_dictionary(df['tokens'])

In [14]:
def create_document_matrix(tokens,id2word):
    corpus = []
    for text in tokens:
        corpus.append(id2word.doc2bow(text))
    return corpus
#passing the dataframe column having tokens and dictionary
corpus=create_document_matrix(df['tokens'],id2word)
print(df['tokens'][0])
print(corpus[0])

['enjoyable', 'hub', 'traving', 'region', 'train', 'clean', 'nice', 'amazing', 'staff', 'front', 'desk', 'perfect', 'location', 'near', 'train', 'station']
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 2), (14, 1)]


In [15]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10,
                                            random_state=100,
                                             ) #building up LDA momdel

In [16]:
#applying LDA model
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis 



In [17]:
# analyse csr report topic            
# Standard NLTK stop words
stop_words = set(stopwords.words('english'))

# Add custom stop words
custom_stop_words = [ 'hotel', 'room', 'stay', 'airport', 'and', 'the', 'i', 'bristol', 
    'stay', 'one', 'hilton', 'said', 'say','to','is','was','for','in','of','The','but','at','a','an','as','had','have','on','from','you','would','will','not','us','we','get','could']  # Add more as needed
stop_words.update(custom_stop_words)

# Apply LDA
num_topics = 10  
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=id2word, passes=15)

# Print the topics
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))


Topic: 0 
Words: 0.028*"breakfast" + 0.027*"food" + 0.020*"good" + 0.018*"staff" + 0.014*"comfortable" + 0.014*"location" + 0.013*"clean" + 0.013*"price" + 0.012*"night" + 0.011*"expensive"
Topic: 1 
Words: 0.034*"parking" + 0.025*"good" + 0.018*"car" + 0.018*"location" + 0.015*"park" + 0.012*"walk" + 0.011*"convenience" + 0.010*"bar" + 0.009*"clean" + 0.009*"area"
Topic: 2 
Words: 0.234*"superb" + 0.032*"air" + 0.013*"con" + 0.012*"conditioning" + 0.010*"passable" + 0.010*"et" + 0.008*"la" + 0.006*"personnel" + 0.006*"shuttle" + 0.006*"très"
Topic: 3 
Words: 0.035*"la" + 0.032*"de" + 0.026*"muy" + 0.023*"en" + 0.017*"el" + 0.015*"e" + 0.012*"personal" + 0.010*"desayuno" + 0.010*"del" + 0.009*"que"
Topic: 4 
Words: 0.091*"early" + 0.077*"flight" + 0.050*"breakfast" + 0.036*"good" + 0.034*"location" + 0.033*"convenient" + 0.028*"morning" + 0.022*"perfect" + 0.022*"comfortable" + 0.021*"great"
Topic: 5 
Words: 0.094*"good" + 0.026*"breakfast" + 0.023*"staff" + 0.017*"bed" + 0.012*"locati

In [18]:
for i, doc in enumerate(corpus):
    doc_topics = lda_model.get_document_topics(doc)
    print(f"Document {i} Topics:")
    for topic, prob in doc_topics:
        print(f"Topic {topic}: {prob:.4f}")
    print("\n")

Document 0 Topics:
Topic 0: 0.2646
Topic 8: 0.3301
Topic 9: 0.3640


Document 1 Topics:
Topic 0: 0.0143
Topic 1: 0.0143
Topic 2: 0.1572
Topic 3: 0.0143
Topic 4: 0.0143
Topic 5: 0.0143
Topic 6: 0.0143
Topic 7: 0.0143
Topic 8: 0.0143
Topic 9: 0.7285


Document 2 Topics:
Topic 2: 0.1000
Topic 6: 0.2769
Topic 8: 0.2994
Topic 9: 0.2691


Document 3 Topics:
Topic 0: 0.0250
Topic 1: 0.7749
Topic 2: 0.0250
Topic 3: 0.0250
Topic 4: 0.0250
Topic 5: 0.0250
Topic 6: 0.0250
Topic 7: 0.0250
Topic 8: 0.0250
Topic 9: 0.0250


Document 4 Topics:
Topic 5: 0.1688
Topic 6: 0.2476
Topic 8: 0.1412
Topic 9: 0.4090


Document 5 Topics:
Topic 0: 0.2079
Topic 3: 0.0524
Topic 8: 0.2151
Topic 9: 0.4960


Document 6 Topics:
Topic 5: 0.7303
Topic 9: 0.2125


Document 7 Topics:
Topic 1: 0.5432
Topic 5: 0.3134
Topic 9: 0.1239


Document 8 Topics:
Topic 0: 0.0125
Topic 1: 0.0125
Topic 2: 0.0125
Topic 3: 0.0125
Topic 4: 0.0125
Topic 5: 0.8874
Topic 6: 0.0125
Topic 7: 0.0125
Topic 8: 0.0125
Topic 9: 0.0125


Document 9 