In [1]:
import os
import re
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import LlamaCpp
import string
from bs4 import BeautifulSoup


In [9]:
pdf_directory = r"C:\Users\jenit\OneDrive\Desktop\Hotel_Q&A_bot\data\\"

In [3]:
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove bullets (including `●`, `•`, `-`, `*`) at the beginning of lines
    text = re.sub(r'^[●•*-]\s*', '', text, flags=re.MULTILINE)
    
    # Remove any remaining standalone bullets
    text = re.sub(r'●|•|-|\*', '', text)
    
    # Remove excess spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text


In [10]:
documents = []
for file in os.listdir(pdf_directory):
    if file.endswith(".pdf"):
        hotel_name = os.path.splitext(file)[0]  # Extract hotel name from filename
        loader = PyPDFLoader(os.path.join(pdf_directory, file))
        docs = loader.load()
        
        # Merge all pages into a single text block
        full_text = " ".join([clean_text(doc.page_content) for doc in docs])

        # Create a single document for the entire PDF
        cleaned_doc = Document(
            page_content=full_text, 
            metadata={"hotel": hotel_name}
        )
        documents.append(cleaned_doc)

print(f"Total documents loaded: {len(documents)}")
# print(f"Sample document metadata: {documents[3].metadata}")


incorrect startxref pointer(1)
parsing for Object Streams
found 0 objects within Object(27,0) whereas 76 expected


Total documents loaded: 5


In [12]:

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Adjust as needed
    chunk_overlap=200,  # Keeps some context between chunks
    separators=["\n\n", "\n", " ", "."],
)

chunks = text_splitter.split_documents(documents)

# Check if metadata is preserved
print(f"Total chunks created: {len(chunks)}")
print(f"Sample chunk metadata: {chunks[50].metadata}")
print(f"Sample chunk text: {chunks[2].page_content[:300]}")  # Print first 300 characters


Total chunks created: 108
Sample chunk metadata: {'hotel': 'keys_prima_rules'}
Sample chunk text: to win various prizes. we may combine personal information from promotions and contests with nonpersonal information collected through cookies and other means (described below). automatic collection when you use the site our servers may automatically collect nonpersonal information as you browse our


In [20]:
print(f"Sample chunk text: {chunks[11].page_content}")  # Print first 300 characters


Sample chunk text: before arrival no retention charge (full refund). if cancelled 15 days before arrival 50% retention charge from the full amount. if cancelled between 07 days of arrival 100% retention charge from the full amount. retention charges will be levied for all days of confirmed booking in case of no show peak period: no refund for cancellation in peak period bookings. contact eitticity road anachal, munnar685565 kerala, india tel.: +91 8547 802 563 info@cloudcastlemunnar.com


In [21]:
# Use a high-quality embedding model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [22]:
import shutil

# Path to the ChromaDB storage directory
chroma_db_path = "./chroma_db"

# Delete the existing ChromaDB storage to ensure fresh data
shutil.rmtree(chroma_db_path, ignore_errors=True)

print("✅ ChromaDB storage deleted successfully!")


✅ ChromaDB storage deleted successfully!


In [23]:
# Recreate the vector database with clean data
vector_db = Chroma.from_documents(
    documents=chunks,  # Your cleaned chunks
    embedding=embedding_model,
    persist_directory=chroma_db_path  # Ensure it's a fresh directory
)

vector_db.persist()
print("✅ Fresh ChromaDB created with new embeddings!")


✅ Fresh ChromaDB created with new embeddings!


In [24]:
def retrieve_answer(user_query, hotel_name):
    results = vector_db.similarity_search(
        query=user_query,
        k=5,  # Retrieve more candidates
        filter={"hotel": hotel_name}
    )

    if not results:
        return ["No relevant information found."]
    
    return [doc.page_content for doc in results]



In [43]:

# Load the local Mistral-7B-Instruct model
model_path = "C:/Users/jenit/OneDrive/Desktop/Hotel_Q&A_bot/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf"

llm = LlamaCpp(
    model_path=model_path,  
    temperature=0.1,  # Lower temperature to reduce randomness
    max_tokens=200,  # Reduce max token length to limit verbosity
    n_ctx=4096,  
    top_p=0.7,  # Restrict probability space to prevent excessive details
    verbose=False  
)


def generate_answer(user_query, hotel_name):
    # Retrieve relevant documents based on the selected hotel
    retrieved_docs = retrieve_answer(user_query, hotel_name)
    unique_chunks = list(set(retrieved_docs))

    if not unique_chunks:
        return "Sorry, no relevant information was found for this hotel."

    # Log retrieved chunks for debugging
    print("\n🔹 **Retrieved Chunks for Context:**")
    for idx, doc in enumerate(unique_chunks, 1):
        print(f"Chunk {idx}: {doc}\n{'-'*80}")

    # Combine retrieved docs into a single formatted context
    context = "\n\n".join(f"- {doc.strip()}" for doc in unique_chunks)

    # Few-shot examples to improve LLM accuracy
    few_shot_examples = """
    **Example 1:**  
    **Q:** What is the cancellation policy if I cancel 20 days before arrival?  
    **A:** If you cancel 20 days before arrival, a 50% retention charge from the full amount will apply.

    **Example 2:**  
    **Q:** What happens if I cancel my booking just 5 days before arrival?  
    **A:** If you cancel within 7 days of arrival, a 100% retention charge from the full amount will apply.

    **Example 3:**  
    **Q:** Will I get a refund if I cancel my booking during peak season?  
    **A:** No, there is no refund for cancellations during peak period bookings.

    **Example 4:**  
    **Q:** What documents are required for check-in?  
    **A:** It is mandatory to produce a valid government-issued photo ID or a passport with a visa page at the time of check-in.

    **Example 5:**  
    **Q:** What happens if someone misbehaves in the hotel premises?  
    **A:** The hotel reserves the right to ask any person who is not properly attired or misbehaves to leave the hotel premises immediately.
    """

    # Create the prompt for the LLM
    prompt = f"""
    You are a professional customer support assistant answering questions about hotel policies.  
    **Your response must be strictly factual, concise, and relevant to the user's question.**  

    **Rules for Answering:**  
    - **Do not add extra details** beyond what is in the policy.  
    - **Do not include promotional content** or greetings.  
    - **Do not speculate.** If the information is unavailable, say so.  

    **Hotel Name:** {hotel_name}  

    Below are correct and incorrect response examples:  
    {few_shot_examples}

    **Retrieved Hotel Policy Information:**  
    ------------------  
    {context}  
    ------------------  

    **Now, answer the following question based only on the provided policy:**  
    **Customer's Question:** "{user_query}"  
    **Your Response:**  
    """

    # Get LLM-generated response
    response = llm.invoke(prompt)

    return response


In [46]:

# Example Query
hotel = "keys_prima_rules"
query = "what are the late check-out timings?"
answer = generate_answer(query, hotel)

print(f"Generated Answer:\n{answer}")


🔹 **Retrieved Chunks for Context:**
Chunk 1: number: infinity members have a dedicated hotline which is designed to offer personalized assistance and resolve any issues or inquiries more efficiently for members of the program. hotlines are available around the clock, ensuring that members can get help at any time, no matter where they are. 13. late checkout – this benefit is subject to availability and is on request. member will have pre inform the hotel about the late checkout request.  silver – not applicable  gold – 2:00 pm  platinum – 4:00 pm  platinum plus 6:00 pm 14. early check in this benefit is subject to availability and is on request. member will have pre inform the hotel about the early checkin request  silver – not applicable  gold – not applicable  platinum not applicable  platinum plus – 12:00 pm 15. discount on f&b – the in–house members (members staying at participating hotels) are eligible for below belowmentioned f&b discount. the walkin members (not staying