In [3]:
import streamlit as st
from selenium import webdriver
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import openai
import faiss
import numpy as np
import re
import os 
import matplotlib.pyplot as plt
import groq
from dotenv import load_dotenv
import os

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
!pip install dotenv



In [254]:
# Load the .env file
load_dotenv()

# Retrieve the API key
groq_api_key = os.getenv("GROQ_API_KEY")

print("GROQ API Key:", groq_api_key)  # Ensure it's loaded correctly


True

GROQ API Key: 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


In [255]:
def scrape(base_url, num_pages):
    if os.path.exists('reviews.txt'):
        os.remove('reviews.txt')
    driver = webdriver.Chrome()
    for page_num in range(1, num_pages + 1):
        url = f'{base_url}/page/{page_num}' if page_num > 1 else base_url
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        reviews = soup.find_all('div', class_='body')
        with open('reviews.txt', 'a', encoding='utf-8') as file:
            for i, review in enumerate(reviews):
                text = review.text.strip()
                file.write(f'Review {i + 1 + (page_num - 1) * len(reviews)}:\n{text}\n{"-" * 80}\n')
    driver.quit()

In [256]:
scrape('https://www.airlinequality.com/airline-reviews/qatar-airways',5)

In [257]:
# Function to load text from 'reviews.txt'

def load_text(file_path='reviews.txt', encoding='utf-8'):
    try:
        with open(file_path, encoding=encoding) as f:
            text = f.read()
        return text
    except UnicodeDecodeError as e:
        st.error(f"Error decoding file {file_path} with encoding {encoding}: {e}")
 
# Function to perform text splitting

def split_text(data):
    docs = data.split("--------------------------------------------------------------------------------")
    docs = [review.strip() for review in docs if review.strip()]
    return docs

'''
def split_text(data):
    # Split the content into individual reviews based on the delimiter "Review"
    docs = data.split("Review")[1:]
    docs = [review.strip() for review in docs]
    #docs = reviews.split_text(data)
    return docs'
'''

'\ndef split_text(data):\n    # Split the content into individual reviews based on the delimiter "Review"\n    docs = data.split("Review")[1:]\n    docs = [review.strip() for review in docs]\n    #docs = reviews.split_text(data)\n    return docs\'\n'

## Preprocess the reviews

In [258]:
import tiktoken

def count_tokens(text, model="llama3-8b-8192"):
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))


In [259]:
import re

def clean_review_text(text):
    # 1️⃣ Replace multiple newlines with single space
    text = re.sub(r'\n+', ' ', text)
    
    # 2️⃣ Remove unnecessary 'reviews' counts (like "121 reviews")
    text = re.sub(r'\d+\s+reviews', '', text, flags=re.IGNORECASE)
    
    # 3️⃣ Remove seat rating blocks (sequence of '12345')
    text = re.sub(r'(?:12345\s*){3,}', '', text)
    
    # 4️⃣ Remove 'Close' or other repeated Close markers
    text = re.sub(r'\bClose\b', '', text)
    
    # 5️⃣ Remove multiple spaces
    text = re.sub(r'\s{2,}', ' ', text)

    # 6️⃣ Remove stray "|" at start/end
    text = text.strip(" |")

    # 7️⃣ Remove "✅ Trip Verified |" type symbols for a cleaner read
    text = re.sub(r'✅\s+Trip Verified\s*\|', '', text)
    
    # 8️⃣ Clean up repeated whitespace (final pass)
    text = re.sub(r'\s+', ' ', text).strip()

    return text


In [260]:
reviews_txt = load_text()

In [261]:
chunks = split_text(reviews_txt)
chunks = [clean_review_text(review) for review in chunks]
len(chunks)

50

In [262]:
def initialize_sentence_transformers_embeddings(docs):
    gist_embedding = SentenceTransformer("avsolatorio/GIST-Embedding-v0")
    _list = gist_embedding.encode(docs, convert_to_tensor=True)
    return gist_embedding, _list

In [263]:
gist_embedding, gist_embeddings_list = initialize_sentence_transformers_embeddings(chunks)
print(f'Gist Embeddings: {gist_embedding}')
print(f'Gist Embeddings List: {gist_embeddings_list}')


Gist Embeddings: SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)
Gist Embeddings List: tensor([[-0.0626, -0.0107, -0.0173,  ...,  0.0189,  0.0317,  0.0074],
        [-0.0426, -0.0343, -0.0034,  ...,  0.0111,  0.0671, -0.0031],
        [-0.0164,  0.0068,  0.0182,  ...,  0.0247,  0.0906, -0.0138],
        ...,
        [-0.0338,  0.0113,  0.0318,  ..., -0.0187,  0.0395,  0.0076],
        [-0.0239, -0.0231,  0.0334,  ..., -0.0017,  0.0528, -0.0113],
        [-0.0278, -0.0098,  0.0332,  ...,  0.0117,  0.0568,  0.0117]])


In [264]:
question = 'What is that Michael Schade said about the experience with Qatar Airways?'

In [265]:
def convert_question_to_embeddings(question, gist_embedding):
    search_vec = gist_embedding.encode(question, convert_to_tensor=True)  # Now, gist_embedding is a model
    svec = np.array(search_vec).reshape(1, -1)
    svec = np.ascontiguousarray(svec, dtype='float32')
    return svec


In [266]:
svec = convert_question_to_embeddings(question, gist_embedding)

  svec = np.array(search_vec).reshape(1, -1)


In [241]:
dim = gist_embeddings_list.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(gist_embeddings_list)
distance, I = index.search(svec, k=7)

In [242]:
lst = [chunks[i] for i in I[0]]
lst

['Review 23: "the flight with Qatar Airways was great" L Han (Germany) 5th January 2025 Overall the flight with Qatar Airways itself was great, but the flight with one of its partner Malaysia Airlines to KUL is just terrible. For the Munich to Doha route, usually it is flown by the Boeing 787 or A350-900 which does not have QSuite, but on the day of my travel, we got the 777 with QSuite which was a huge upgrade from these types of planes that usually operate this route. Food was great, Staff was great, Entertainment was Good, WIFI is free for Privilege Club Member up to 1 hour. Arrived in Doha and we enjoy their Al Mourjan Business Lounge located at The Garden. Then we continue to Kuala Lumpur with Malaysia Airlines and the food was terrible, entertainment was very limited, WIFI was slow and staff was not so great. Qatar Airways needs to stop handing over their flights to their partners like MH and continue to operate their own daily flights instead. AircraftBoeing 777-300ER Type Of Tr

## Formated output Test Code

In [251]:
import groq  # Ensure you have the groq library installed

def batch_retrieval_qa(prompt, documents, model="llama3-8b-8192", temperature=0.7, max_tokens=1000):
    """
    Analyzes customer feedback and generates a structured report formatted for senior management.

    Parameters:
    - prompt (str): Instruction for analysis.
    - documents (list): List of customer feedback documents.
    - model (str): The model used for analysis (default: "llama3-8b-8192").
    - temperature (float): Controls randomness in responses (default: 0.7).
    - max_tokens (int): Maximum token length for response (default: 1000).

    Returns:
    - Formatted report as a string.
    """

    client = groq.Client(api_key="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")  # Replace with your actual API key
    batch_size = 3  # Process 3 documents at a time
    results = []
    token_stats = []  # To store token usage info

    for i in range(0, len(documents), batch_size):
        batch_docs = documents[i:i + batch_size]
        #print(f"Processing batch {batch_docs}")  # Debugging line
        input_text = f"{prompt}\n\nDocuments:\n" + "\n".join(batch_docs)

        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": (
                        "As a business analyst, analyze the customer feedback provided in the structured review format below. "
                        "Focus on key themes and highlight actionable insights. "
                        "Use the following structured format for your response:\n\n"

                        "**Customer Feedback Analysis – Qatar Airways**\n\n"
                        
                        "**Summary:**\n"
                        "- Provide a high-level overview of customer feedback themes.\n\n"

                        "**Key Insights:**\n"
                        "- Identify positive and negative experiences.\n"
                        "- Highlight recurring concerns such as customer service, food quality, or operational inefficiencies.\n\n"

                        "**Specific Examples:**\n"
                        "- Mention customer names and describe their experiences in bullet points.\n\n"

                        "**Actionable Recommendations:**\n"
                        "- Provide practical solutions to improve identified issues.\n"
                        "- Keep recommendations concise and impactful.\n\n"

                        "Ensure the response is clear, concise, and formatted for senior management readability. Keep it to 100 words. Avoid inferences not supported by data."
                    )
                },
                {"role": "user", "content": input_text}
            ],
            temperature=temperature,
            max_tokens=max_tokens
        )

        results.append(response.choices[0].message.content.strip())

        # Extract token usage info (if available)
        try:
            usage = response.usage
            token_stats.append({
                'batch': (i // batch_size) + 1,
                'input_tokens': usage.prompt_tokens,
                'output_tokens': usage.completion_tokens,
                'total_tokens': usage.total_tokens
            })
        except AttributeError:
            token_stats.append({
                'batch': (i // batch_size) + 1,
                'input_tokens': None,
                'output_tokens': None,
                'total_tokens': None
            })

    return "\n\n".join(results), token_stats

    #return "\n\n".join(results)  # Returns a single formatted report


In [244]:
prompt_list = [
    "Analyze feedback focusing on customer satisfaction drivers.",
    "Identify operational challenges based on the feedback.",
    "Summarize recurring complaints and suggest quick fixes.",
    "List both compliments and complaints mentioned by customers.",
    "Extract insights on service quality and suggest improvements."
]


In [245]:
batch_response = batch_retrieval_qa(question, lst, model="llama3-8b-8192", temperature=0.7, max_tokens=1000)

Processing batch ['Review 23: "the flight with Qatar Airways was great" L Han (Germany) 5th January 2025 Overall the flight with Qatar Airways itself was great, but the flight with one of its partner Malaysia Airlines to KUL is just terrible. For the Munich to Doha route, usually it is flown by the Boeing 787 or A350-900 which does not have QSuite, but on the day of my travel, we got the 777 with QSuite which was a huge upgrade from these types of planes that usually operate this route. Food was great, Staff was great, Entertainment was Good, WIFI is free for Privilege Club Member up to 1 hour. Arrived in Doha and we enjoy their Al Mourjan Business Lounge located at The Garden. Then we continue to Kuala Lumpur with Malaysia Airlines and the food was terrible, entertainment was very limited, WIFI was slow and staff was not so great. Qatar Airways needs to stop handing over their flights to their partners like MH and continue to operate their own daily flights instead. AircraftBoeing 777

In [246]:
results, token_stats = batch_retrieval_qa(question, lst)

# Print all batch results and token stats together
print("========= Customer Feedback Analysis =========\n")
for idx, res in enumerate(results, 1):
    print(f"Batch {idx} Result:\n{res}\n")
print("==============================================\n")

print("========= Token Usage per Batch ==============")
for stat in token_stats:
    print(f"Batch {stat['batch']}: Input Tokens = {stat['input_tokens']}, Output Tokens = {stat['output_tokens']}, Total Tokens = {stat['total_tokens']}")
print("==============================================")


Processing batch ['Review 23: "the flight with Qatar Airways was great" L Han (Germany) 5th January 2025 Overall the flight with Qatar Airways itself was great, but the flight with one of its partner Malaysia Airlines to KUL is just terrible. For the Munich to Doha route, usually it is flown by the Boeing 787 or A350-900 which does not have QSuite, but on the day of my travel, we got the 777 with QSuite which was a huge upgrade from these types of planes that usually operate this route. Food was great, Staff was great, Entertainment was Good, WIFI is free for Privilege Club Member up to 1 hour. Arrived in Doha and we enjoy their Al Mourjan Business Lounge located at The Garden. Then we continue to Kuala Lumpur with Malaysia Airlines and the food was terrible, entertainment was very limited, WIFI was slow and staff was not so great. Qatar Airways needs to stop handing over their flights to their partners like MH and continue to operate their own daily flights instead. AircraftBoeing 777

In [247]:
total_input = sum(stat['input_tokens'] for stat in token_stats if stat['input_tokens'] is not None)
total_output = sum(stat['output_tokens'] for stat in token_stats if stat['output_tokens'] is not None)
total_tokens = sum(stat['total_tokens'] for stat in token_stats if stat['total_tokens'] is not None)

print(f"\nTotal Input Tokens: {total_input}")
print(f"Total Output Tokens: {total_output}")
print(f"Total Tokens Used: {total_tokens}")



Total Input Tokens: 2488
Total Output Tokens: 816
Total Tokens Used: 3304


In [248]:
# Loop over each prompt
token_stats = [] 
repsonses = []
for idx, question in enumerate(prompt_list, 1):
    print(f"\n----- Running Prompt {idx}: {question} -----")
    batch_response, stats = batch_retrieval_qa(question, lst, model="llama3-8b-8192", temperature=0.7, max_tokens=1000)
    repsonses.append(batch_response)
    print(f"\nResult for Prompt {idx}:\n{batch_response}")
    print(f"Token Stats for Prompt {idx}:\n{stats}")
    token_stats.extend(stats)  # Collect token stats from each prompt
#calulate the total tokens used
total_input = sum(stat['input_tokens'] for stat in token_stats if stat['input_tokens'] is not None)
total_output = sum(stat['output_tokens'] for stat in token_stats if stat['output_tokens'] is not None)
total_tokens = sum(stat['total_tokens'] for stat in token_stats if stat['total_tokens'] is not None)
print(f"\nTotal Input Tokens: {total_input}")
print(f"Total Output Tokens: {total_output}")
print(f"Total Tokens Used: {total_tokens}")
# Display token stats in a table    
import pandas as pd
token_df = pd.DataFrame(token_stats)
token_df['batch'] = token_df['batch'].astype(int)
token_df['prompt'] = token_df['batch'].apply(lambda x: f"Prompt {x}")
token_df['input_tokens'] = token_df['input_tokens'].astype(int)
token_df['output_tokens'] = token_df['output_tokens'].astype(int)
token_df['total_tokens'] = token_df['total_tokens'].astype(int)
print(token_df)


----- Running Prompt 1: Analyze feedback focusing on customer satisfaction drivers. -----
Processing batch ['Review 23: "the flight with Qatar Airways was great" L Han (Germany) 5th January 2025 Overall the flight with Qatar Airways itself was great, but the flight with one of its partner Malaysia Airlines to KUL is just terrible. For the Munich to Doha route, usually it is flown by the Boeing 787 or A350-900 which does not have QSuite, but on the day of my travel, we got the 777 with QSuite which was a huge upgrade from these types of planes that usually operate this route. Food was great, Staff was great, Entertainment was Good, WIFI is free for Privilege Club Member up to 1 hour. Arrived in Doha and we enjoy their Al Mourjan Business Lounge located at The Garden. Then we continue to Kuala Lumpur with Malaysia Airlines and the food was terrible, entertainment was very limited, WIFI was slow and staff was not so great. Qatar Airways needs to stop handing over their flights to their p

In [249]:
token_df['input_tokens'].sum()
token_df['output_tokens'].sum()
token_df['total_tokens'].sum()

np.int64(12443)

np.int64(4195)

np.int64(16638)

In [231]:
lst

['Review 23: "the flight with Qatar Airways was great" L Han (Germany) 5th January 2025 Overall the flight with Qatar Airways itself was great, but the flight with one of its partner Malaysia Airlines to KUL is just terrible. For the Munich to Doha route, usually it is flown by the Boeing 787 or A350-900 which does not have QSuite, but on the day of my travel, we got the 777 with QSuite which was a huge upgrade from these types of planes that usually operate this route. Food was great, Staff was great, Entertainment was Good, WIFI is free for Privilege Club Member up to 1 hour. Arrived in Doha and we enjoy their Al Mourjan Business Lounge located at The Garden. Then we continue to Kuala Lumpur with Malaysia Airlines and the food was terrible, entertainment was very limited, WIFI was slow and staff was not so great. Qatar Airways needs to stop handing over their flights to their partners like MH and continue to operate their own daily flights instead. AircraftBoeing 777-300ER Type Of Tr

## Load test by passing all the review.txt data in chunks of 3

In [252]:
question = "Identify operational challenges based on the feedback."
opereations_asnwer= batch_retrieval_qa(question, lst, model="llama3-8b-8192", temperature=0.7, max_tokens=1000)
opereations_asnwer

('**Customer Feedback Analysis – Qatar Airways**\n\n**Summary:**\nQatar Airways customer feedback highlights a mix of positive and negative experiences. While some customers praise the airline\'s modern aircraft, comfortable seating, and exceptional service, others express concerns about partner airline operations and limited inflight entertainment options.\n\n**Key Insights:**\n\n* Positive experiences: customers appreciate Qatar Airways\' modern aircraft, comfortable seating, and exceptional service.\n* Negative experiences: customers criticize partner airline operations, limited inflight entertainment options, and inconsistent service.\n\n**Specific Examples:**\n\n* Review 23: L Han (Germany) experienced a great flight with Qatar Airways, but was dissatisfied with the partner airline, Malaysia Airlines, which operated the second leg of the journey.\n* Review 16: Dr. Akeil Al-Faraj (Canada) had an outstanding experience with Qatar Airways, praising the check-in process, aircraft clea

## Evaluate the performance of the model

In [253]:
lst

['Review 23: "the flight with Qatar Airways was great" L Han (Germany) 5th January 2025 Overall the flight with Qatar Airways itself was great, but the flight with one of its partner Malaysia Airlines to KUL is just terrible. For the Munich to Doha route, usually it is flown by the Boeing 787 or A350-900 which does not have QSuite, but on the day of my travel, we got the 777 with QSuite which was a huge upgrade from these types of planes that usually operate this route. Food was great, Staff was great, Entertainment was Good, WIFI is free for Privilege Club Member up to 1 hour. Arrived in Doha and we enjoy their Al Mourjan Business Lounge located at The Garden. Then we continue to Kuala Lumpur with Malaysia Airlines and the food was terrible, entertainment was very limited, WIFI was slow and staff was not so great. Qatar Airways needs to stop handing over their flights to their partners like MH and continue to operate their own daily flights instead. AircraftBoeing 777-300ER Type Of Tr

In [156]:
lst[0]

'Review 23: "the flight with Qatar Airways was great" L Han (Germany) 5th January 2025 Overall the flight with Qatar Airways itself was great, but the flight with one of its partner Malaysia Airlines to KUL is just terrible. For the Munich to Doha route, usually it is flown by the Boeing 787 or A350-900 which does not have QSuite, but on the day of my travel, we got the 777 with QSuite which was a huge upgrade from these types of planes that usually operate this route. Food was great, Staff was great, Entertainment was Good, WIFI is free for Privilege Club Member up to 1 hour. Arrived in Doha and we enjoy their Al Mourjan Business Lounge located at The Garden. Then we continue to Kuala Lumpur with Malaysia Airlines and the food was terrible, entertainment was very limited, WIFI was slow and staff was not so great. Qatar Airways needs to stop handing over their flights to their partners like MH and continue to operate their own daily flights instead. AircraftBoeing 777-300ER Type Of Tra

In [152]:
batch_response

'**Customer Feedback Analysis – Qatar Airways**\n\n**Summary:**\nThe customer feedback highlights a mixed experience, with some customers praising the airline\'s service quality, while others express concerns about partner airline operations.\n\n**Key Insights:**\n\n* Positive experiences with Qatar Airways\' own flights, citing excellent cabin staff service, comfortable seating, and great food options.\n* Negative experiences with partner airline operations, particularly with Malaysia Airlines, citing poor food, limited entertainment, and slow WiFi.\n* Recurring theme of exceptional service provided by Qatar Airways\' cabin crew.\n\n**Specific Examples:**\n\n* L. Han (Germany) appreciated the upgrade to a Boeing 777 with QSuite, citing great food, staff, and entertainment, but expressed disappointment with the partner airline operation.\n* Dr. Akeil Al-Faraj (Canada) had an outstanding experience on a Qatar Airways flight from Doha to Amman, praising the check-in process, aircraft cle