In [None]:
# Standard Libraries
import pandas as pd
import re
import string
import os
from google.colab import files

# Natural Language Processing Libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# SKlearn Libraries
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

# Gensim Libraries
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Visualisation libraries
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Download the NLTK resources used by this notebook.
# 'punkt_tab' is the tokenizer data that word_tokenize looks up on recent NLTK
# releases; older releases use 'punkt', so both are requested.
# Each resource is fetched on its own so a failure on one does not stop the rest.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# English stop words shipped with NLTK
stop_words = list(stopwords.words('english'))

# Domain-specific words to drop as well. They are written in the singular
# because preprocess_text compares lemmatized tokens against this list.
new_words = ['british', 'airway', 'company', 'airline', 'flight', 'heathrow', 'service', 'london',
             'business', 'economy', 'customer', 'passenger', 'hour', 'minute']
stop_words.extend(new_words)


### 1. Load data

In [None]:
def load_data() -> pd.DataFrame:
  """
  Upload the reviews CSV and return a 20% sample of its review text.

  The function:
  - prompts the user to upload a file through the Colab upload widget
  - reads the uploaded CSV into a DataFrame: the entry named 'reviews.csv'
    when present, otherwise the first uploaded file; when nothing was
    uploaded it falls back to 'reviews.csv' in the working directory
  - keeps only the 'ReviewBody' column
  - samples 20% of the rows with a fixed seed and resets the index

  Returns:
    A DataFrame with the single column 'ReviewBody'.
  """
  import io

  # Prompt the user to upload the reviews csv file.
  # files.upload() is expected to return {filename: file content as bytes}.
  uploaded = files.upload()

  if uploaded:
    # Read what was actually uploaded instead of assuming its file name
    filename = 'reviews.csv' if 'reviews.csv' in uploaded else next(iter(uploaded))
    data = pd.read_csv(io.BytesIO(uploaded[filename]))
  else:
    # Nothing uploaded (e.g. the dialog was cancelled): use a file already on disk
    data = pd.read_csv('reviews.csv')

  data = data[['ReviewBody']]
  # Sample 20% of the data
  data = data.sample(frac=0.20, random_state=42).reset_index(drop=True)
  return data


In [None]:
# Upload the reviews file and keep the sampled 'ReviewBody' column for the steps below
df_reviews = load_data()

### 2. Data Preprocessing

In [None]:
def preprocess_text(text : str, min_token_length: int = 6) -> list:
    """
    Turn one raw review into a list of cleaned, lemmatized tokens.

    Steps, in order:
    1) Removal of text in square brackets
    2) Replacement of every character that is not an ASCII letter, digit or
       space (punctuation, newlines, emojis, ...) with a space, so the words on
       either side of the removed character stay separate
    3) Removal of words containing digits
    4) Collapsing of repeated whitespace
    5) Lower-casing and tokenization
    6) Lemmatization
    7) Removal of stop words, punctuation tokens and tokens shorter than
       ``min_token_length`` characters

    Args:
        text: The raw review text. ``None`` and NaN (a missing review) are
            treated as an empty review.
        min_token_length: Minimum number of characters a token needs in order
            to be kept. The default of 6 matches the previous ``len(token) > 5``.

    Returns:
        The list of remaining tokens, in their original order.
    """
    # A missing review has nothing to tokenize (NaN is the only float != itself)
    if text is None or (isinstance(text, float) and text != text):
        return []

    # Remove text in square brackets
    text = re.sub(r'\[.*?\]', ' ', text)
    # Replace special characters with a space rather than deleting them:
    # deleting would glue neighbouring words together ("late.Never" -> "lateNever").
    # This also drops emojis and any other non-ASCII character.
    text = re.sub(r'[^a-zA-Z0-9 ]', ' ', text)
    # Remove words containing numbers
    text = re.sub(r'\b\w*\d\w*\b', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    # Lemmatize tokens before filtering: the custom stop words are singular forms
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Set lookup instead of scanning the stop word list once per token
    stop_word_set = set(stop_words)
    # Remove stop words, punctuation and short tokens
    processed_tokens = [
        token for token in lemmatized_tokens
        if len(token) >= min_token_length
        and token not in stop_word_set
        and token not in string.punctuation
    ]
    return processed_tokens

In [None]:
# Replace every raw review string with its list of cleaned tokens.
# The column is overwritten, so re-running this cell would pass token lists
# (not strings) back into preprocess_text.
df_reviews['ReviewBody'] = df_reviews['ReviewBody'].apply(preprocess_text)

### 3. Bag of words and LDA (Gensim)

In [None]:
def apply_lda_with_bag_of_words(df_reviews:pd.DataFrame, num_topics:int = 6, n_top_words:int = 10) -> tuple:
    """
    This function applies LDA from gensim with bag of words approach.

    Args:
        df_reviews: DataFrame whose 'ReviewBody' column holds one list of
            tokens per review.
        num_topics: Number of topics the LDA model is asked to find.
        n_top_words: Number of highest-weighted words reported per topic.

    Returns:
        A 3-tuple ``(feature_names, model, topics_df)``:
        - feature_names: list of vocabulary words, where position ``i`` holds
          the word with dictionary id ``i``
        - model: the fitted gensim LdaModel
        - topics_df: DataFrame with one column per topic ('Topic 1', ...)
          listing that topic's top words, highest weight first
    """
    # Map each distinct word to an integer id, as required by the LDA model
    words = corpora.Dictionary(df_reviews['ReviewBody'])

    # Turn each review into a bag of words.
    corpus = [words.doc2bow(doc) for doc in df_reviews['ReviewBody']]

    # Apply the LDA model from gensim to establish topics
    lda_bag_of_words_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus, # text
        id2word=words, # representations
        num_topics=num_topics, # define number of topics
        random_state=42
    )

    # Feature names indexed by word id, so they line up with the columns of
    # get_topics(), which are indexed below by the positions argsort returns.
    feature_names_bag_of_words = [words[word_id] for word_id in range(len(words))]

    # Dominant words per LDA topic group
    topics_dict = {}
    for topic_idx, topic in enumerate(lda_bag_of_words_model.get_topics()):
        # argsort is ascending; the reversed slice takes the n_top_words largest weights
        top_word_ids = topic.argsort()[:-n_top_words - 1:-1]
        topics_dict[f'Topic {(topic_idx + 1)}'] = [feature_names_bag_of_words[i] for i in top_word_ids]

    # Turn the dict of topics/words into df
    lda_bag_of_words_model_topics_df = pd.DataFrame(topics_dict)

    return feature_names_bag_of_words, lda_bag_of_words_model, lda_bag_of_words_model_topics_df


In [None]:
# Fit the gensim LDA model on the tokenized reviews; keep the vocabulary, the model and the top-words table
feature_names_bag_of_words,  lda_bag_of_words_model, lda_with_bag_of_words_topics_df = apply_lda_with_bag_of_words(df_reviews)

In [None]:
def visualize_topics(model_method, feature_names:list, n_top_words:int) -> None:
    """
    Visualizes topics generated by a topic modeling method using word clouds.

    One word cloud is drawn per topic, three per row; the number of rows
    follows the number of topics (six topics give the 2 x 3 grid).

    Args:
        model_method: Iterable of per-topic weight vectors (one row per topic,
            one weight per entry of ``feature_names``); the callers in this
            notebook pass the model's topic-word matrix.
        feature_names: Vocabulary words; position ``i`` names weight ``i``.
        n_top_words: Number of highest-weighted words shown per topic.
    """
    topics = list(model_method)

    # Size the grid from the number of topics instead of assuming exactly six
    n_cols = 3
    n_rows = max(1, -(-len(topics) // n_cols))  # ceiling division
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(16, 4 * n_rows),
                             sharex=True, sharey=True, squeeze=False)
    axes = axes.flatten()

    # Hide every frame up front so unused grid cells stay blank
    for ax in axes:
        ax.axis('off')

    for topic_idx, (ax, topic) in enumerate(zip(axes, topics)):
        # argsort is ascending; the reversed slice takes the n_top_words largest weights
        top_words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        # Generate word cloud for each topic
        wordcloud = WordCloud(background_color="white", colormap="viridis").generate(' '.join(top_words))
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.set_title(f'Topic {topic_idx + 1}', fontsize=16)

    plt.tight_layout()
    plt.show()

In [None]:
# Word clouds of the 10 highest-weighted words per topic of the gensim model
visualize_topics(lda_bag_of_words_model.get_topics(), feature_names_bag_of_words, n_top_words=10)

## Topics from LDA (Gensim) and Bag of Words Method
1) Food and Beverages

2) Cabin and Luggage

3) Entertainment

4) Departures and Flights

5) Booking

6) Lounge and Boarding

### 4. TFIDF and LDA (scikit-learn)

In [None]:
def apply_lda_with_tfidf(df_reviews: pd.DataFrame, n_components: int = 6, n_top_words: int = 10) -> tuple:
    """
    This function applies LDA from scikit-learn with tfidf approach.

    Side effect: adds the column 'ReviewBody_tokenized' to ``df_reviews``,
    holding each review's tokens joined into one space-separated string.

    Args:
        df_reviews: DataFrame whose 'ReviewBody' column holds one list of
            tokens per review.
        n_components: Number of topics the LDA model is asked to find.
        n_top_words: Number of highest-weighted words reported per topic.

    Returns:
        A 3-tuple ``(feature_names, model, topics_df)``:
        - feature_names: array of the TF-IDF vocabulary terms
        - model: the fitted LatentDirichletAllocation
        - topics_df: DataFrame with one column per topic ('Topic 1', ...)
          listing that topic's top words, highest weight first
    """
    # Initialize TFIDF
    tfidf = TfidfVectorizer(max_df=.8, min_df=20, max_features=10000)
    # The vectorizer expects strings, so join each token list back into one string
    df_reviews['ReviewBody_tokenized'] = df_reviews['ReviewBody'].apply(' '.join)
    # Fit the TF-IDF vectorizer to the text data
    X = tfidf.fit_transform(df_reviews['ReviewBody_tokenized'])
    # Run LDA
    lda_model_with_tfidf = LatentDirichletAllocation(
        n_components=n_components, # define number of topics
        random_state=123)
    # Only the fitted model is used below, so the document-topic matrix
    # that fit_transform would also compute is not needed.
    lda_model_with_tfidf.fit(X)

    feature_names_tfidf = tfidf.get_feature_names_out() # extract features names from TFIDF

    # Dominant words per LDA topic group
    topics_dict = {}
    for topic_idx, topic in enumerate(lda_model_with_tfidf.components_):
        # argsort is ascending; the reversed slice takes the n_top_words largest weights
        top_word_ids = topic.argsort()[:-n_top_words - 1:-1]
        topics_dict[f'Topic {(topic_idx + 1)}'] = [feature_names_tfidf[i] for i in top_word_ids]

    # Turn the dict of topics/words into df
    lda_tfidf_topics_df = pd.DataFrame(topics_dict)
    return feature_names_tfidf, lda_model_with_tfidf, lda_tfidf_topics_df


In [None]:
# Fit the scikit-learn LDA model on TF-IDF features; keep the vocabulary, the model and the top-words table
feature_names_tfidf, lda_model_with_tfidf, lda_tfidf_topics_df = apply_lda_with_tfidf(df_reviews)

In [None]:
# Word clouds of the 10 highest-weighted words per topic of the scikit-learn model
visualize_topics(lda_model_with_tfidf.components_, feature_names_tfidf, n_top_words=10)

## Topics from LDA and TFIDF Method
1) Lounge Experience

2) Bookings

3) Departures, Ticketing and Cancellations

4) Luggage Handling and Delays

5) Cabin and Crew Experience

6) Baggage and Boarding

## **Final list of Topics based on business judgment:**

1) Boarding and Crew Experience

2) Entertainment and Food

3) Cabin Comfort and Baggage

4) Lounge Experience

5) Bookings and Refunds

6) Flights and Cancellations