In [None]:
import nltk
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from nltk.corpus import stopwords
from gensim.corpora import Dictionary
from sklearn.model_selection import GridSearchCV

# Ensure that the required NLTK data is downloaded
nltk.download('stopwords')

# Load the CSV file into a DataFrame
df = pd.read_csv('filtered_negreviews/filtered_negreviews.csv')

# Define a function for preprocessing text
def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\b\d+\b', '', text)
    
    # Tokenize the text by splitting on whitespace
    words = text.split()
    
    # Filter out words with 3 characters or less
    filtered_words = [word for word in words if len(word) > 3]
    
    # Join the filtered words back into a single string
    return ' '.join(filtered_words)

# Apply preprocessing to the 'ReviewBody' column
df['cleaned_review'] = df['ReviewBody'].apply(preprocess_text)

# Define domain-specific stopwords
domain_stopwords = ['british', 'airway', 'airways', 'flights', 'ba', 'plane', 
                     'flight', 'ife', 'yo', 'ba2058', 'ba169', 'us']

# Combine NLTK stopwords with domain-specific stopwords
stop_words = list(set(stopwords.words('english')).union(domain_stopwords))

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words=stop_words)

# Fit and transform the cleaned reviews
X = vectorizer.fit_transform(df['cleaned_review'])

# Set the number of topics
n_topics = 5

# Apply LDA model
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda_topics = lda_model.fit_transform(X)

# Apply NMF model
nmf_model = NMF(n_components=n_topics, random_state=42)
nmf_topics = nmf_model.fit_transform(X)

# Function to display the top words for each topic
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx + 1}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 10

print("LDA Model Topics:")
display_topics(lda_model, vectorizer.get_feature_names_out(), no_top_words)

print("\nNMF Model Topics:")
display_topics(nmf_model, vectorizer.get_feature_names_out(), no_top_words)
        

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vivienne/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


LDA Model Topics:
Topic 1:
communication worst miami stand mins passengers experienced club service aviv
Topic 2:
class service food business london seat crew staff seats cabin
Topic 3:
pandemic assignments diverted rotterdam releases 19th roof minor parents grandfather
Topic 4:
service london hours seat told time business class staff customer
Topic 5:
seat staff seats class delayed london business luggage required arrogant

NMF Model Topics:
Topic 1:
crew food cabin service meal water aircraft london staff poor
Topic 2:
class business first lounge seats economy worst middle seat never
Topic 3:
refund customer cancelled call booked service voucher email phone told
Topic 4:
luggage delayed hours staff airport check boarding minutes arrived hour
Topic 5:
seat seats economy extra premium airlines legroom paid front aisle
[--------------------------------------------------] 1.2% 20.1/1662.8MB downloaded

In [86]:
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
import numpy as np
import pandas as pd
import re
import string

# Load the CSV file into a DataFrame
df = pd.read_csv('filtered_negreviews/filtered_negreviews.csv')

# Define a function for preprocessing text
def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\b\d+\b', '', text)
    # Tokenize the text by splitting on whitespace
    words = text.split()
    return ' '.join(words)

# Apply preprocessing to the 'ReviewBody' column
df['cleaned_review'] = df['ReviewBody'].apply(preprocess_text)

# Preprocess the text data
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['cleaned_review'])
feature_names = vectorizer.get_feature_names_out()

# Create a Gensim Dictionary and Corpus
texts = [doc.split() for doc in df['cleaned_review']]  # Tokenize the documents
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

def calculate_coherence(model, X, feature_names, texts, dictionary):
    topics_words = []
    for topic_idx, topic in enumerate(model.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-11:-1]]
        topics_words.append(top_words)
    coherence_model = CoherenceModel(topics=topics_words, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    return coherence_score

# LDA model Coherence
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_model.fit(X)
lda_coherence = calculate_coherence(lda_model, X, feature_names, texts, dictionary)
print(f"LDA Model Coherence Score: {lda_coherence}")

# NMF model Coherence
nmf_model = NMF(n_components=5, random_state=42)
nmf_model.fit(X)
nmf_coherence = calculate_coherence(nmf_model, X, feature_names, texts, dictionary)
print(f"NMF Model Coherence Score: {nmf_coherence}")

LDA Model Coherence Score: 0.4080976120755132
NMF Model Coherence Score: 0.4413820044520914


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the CSV file into a DataFrame
df = pd.read_csv('filtered_negreviews/cleaned_reviews.csv')
median_value = df['SeatComfort'].median()
df['SeatComfort'].fillna(median_value, inplace=True)
# Define a function for preprocessing text
def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\b\d+\b', '', text)
    # Tokenize the text by splitting on whitespace
    words = text.split()
    # Filter out words with 3 characters or less
    filtered_words = [word for word in words if len(word) > 3]
    # Join the filtered words back into a single string
    return ' '.join(filtered_words)

# Apply preprocessing to the 'ReviewBody' column
df['cleaned_review'] = df['ReviewBody'].apply(preprocess_text)

# Define domain-specific stopwords
domain_stopwords = ['british', 'airway', 'airways', 'flights', 'ba', 'plane', 
                     'flight', 'ife', 'yo', 'ba2058', 'ba169', 'us']

# Combine NLTK stopwords with domain-specific stopwords
stop_words = list(set(stopwords.words('english')).union(domain_stopwords))

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words=stop_words)

# Fit and transform the cleaned reviews
X = vectorizer.fit_transform(df['cleaned_review'])

# Convert the sparse matrix to a dense matrix and then to an array
X_dense = X.toarray()

# Target variable
y = df['SeatComfort']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_dense, y, test_size=0.2, random_state=42)

# Train regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print(f'Mean Squared Error: {mean_squared_error(y_test, y_pred)}')

x_feature = X_dense[:, 0]  
plt.figure(figsize=(10, 6))
plt.scatter(x_feature, y, color='blue', label='Data Points')
plt.plot(X_test[:, 0], y_pred, color='red', linewidth=2, label='Regression Line')

plt.xlabel('Feature 1 (TF-IDF)')
plt.ylabel('Seat Comfort')
plt.title('Regression of Seat Comfort on Feature 1')
plt.legend()
plt.show()

In [64]:
pip install pyLDAvis

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Prepare the corpus and dictionary for Gensim
texts = df['cleaned_review'].map(tokenize).tolist()
dictionary = corpora.Dictionary(texts)
corpus_gensim = [dictionary.doc2bow(text) for text in texts]

# Apply LDA model using Gensim
num_topics = 5
lda_model = gensim.models.LdaModel(corpus_gensim, num_topics=num_topics, id2word=dictionary, passes=15)

# Visualize the topics using pyLDAvis
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus_gensim, dictionary)
pyLDAvis.display(vis)