In [3]:
!pip install faiss-cpu
!pip install sentence-transformers
!pip install transformers



In [4]:
import pandas as pd
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import torch


In [5]:
df = pd.read_csv("hotel_bookings.csv")

In [6]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [7]:
text_data = df.apply(lambda row: f"Booking ID {row.name}: {row.to_dict()}", axis=1).tolist()

In [8]:
embeddings = model.encode(text_data, convert_to_numpy=True)

In [9]:
#print("Embedding shape:", embeddings.shape)
#print("FAISS expected dimension:", d)

In [10]:
embeddings = embeddings.astype(np.float32)
d = embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(embeddings)

In [11]:
metadata = {i: text_data[i] for i in range(len(text_data))}
print("Embeddings stored successfully in FAISS!")

Embeddings stored successfully in FAISS!


In [12]:
qa_pipeline = pipeline("text-generation", model="EleutherAI/gpt-neo-2.7B", torch_dtype="auto", device=0)

config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Device set to use cuda:0


In [15]:
def retrieve_and_answer(query):
    """Retrieve relevant data and generate answers using LLM."""
    query_embedding = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, k=3)  # Retrieve top 3 results
    retrieved_docs = "\n".join([metadata[idx] for idx in indices[0]])

    # Generate response using LLM
    prompt = f"Context: {retrieved_docs}\nQuestion: {query}\nAnswer:"
    response = qa_pipeline(
    prompt,
    max_new_tokens=100,  # Generates up to 100 new tokens instead of cutting off at 200
    truncation=True,      # Explicitly enable truncation to avoid warnings
    do_sample=True,
    temperature=0.7,  # Adjust for randomness (0.7 is a good balance)
    top_p=0.9
)[0]["generated_text"]

    return response

In [16]:
queries = [
    "Show me total revenue for July 2017.",
    "Which locations had the highest booking cancellations?",
    "What is the average price of a hotel booking?"
]

for query in queries:
    print(f"\nðŸ”¹ {query}")
    print(retrieve_and_answer(query))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



ðŸ”¹ Show me total revenue for July 2017.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Context: Booking ID 24775: {'hotel': 'Resort Hotel', 'is_canceled': 0, 'lead_time': 0, 'arrival_date_year': 2016, 'arrival_date_month': 'May', 'arrival_date_week_number': 23, 'arrival_date_day_of_month': 30, 'stays_in_weekend_nights': 1, 'stays_in_week_nights': 3, 'adults': 1, 'children': 0.0, 'babies': 0, 'meal': 'BB', 'country': 'PRT', 'market_segment': 'Corporate', 'distribution_channel': 'Corporate', 'is_repeated_guest': 0, 'previous_cancellations': 0, 'previous_bookings_not_canceled': 0, 'reserved_room_type': 'A', 'assigned_room_type': 'A', 'booking_changes': 1, 'deposit_type': 'No Deposit', 'agent': nan, 'company': 47.0, 'days_in_waiting_list': 0, 'customer_type': 'Transient', 'adr': 57.0, 'required_car_parking_spaces': 0, 'total_of_special_requests': 0, 'reservation_status': 'Check-Out', 'reservation_status_date': '03-06-16'}
Booking ID 24770: {'hotel': 'Resort Hotel', 'is_canceled': 0, 'lead_time': 0, 'arrival_date_year': 2016, 'arrival_date_month': 'May', 'arrival_date_week_nu

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Context: Booking ID 97854: {'hotel': 'City Hotel', 'is_canceled': 0, 'lead_time': 1, 'arrival_date_year': 2016, 'arrival_date_month': 'September', 'arrival_date_week_number': 39, 'arrival_date_day_of_month': 20, 'stays_in_weekend_nights': 0, 'stays_in_week_nights': 1, 'adults': 1, 'children': 0.0, 'babies': 0, 'meal': 'BB', 'country': 'PRT', 'market_segment': 'Aviation', 'distribution_channel': 'Corporate', 'is_repeated_guest': 0, 'previous_cancellations': 0, 'previous_bookings_not_canceled': 0, 'reserved_room_type': 'A', 'assigned_room_type': 'A', 'booking_changes': 0, 'deposit_type': 'No Deposit', 'agent': nan, 'company': 153.0, 'days_in_waiting_list': 0, 'customer_type': 'Transient', 'adr': 95.0, 'required_car_parking_spaces': 0, 'total_of_special_requests': 0, 'reservation_status': 'Check-Out', 'reservation_status_date': '21-09-16'}
Booking ID 97870: {'hotel': 'City Hotel', 'is_canceled': 0, 'lead_time': 3, 'arrival_date_year': 2016, 'arrival_date_month': 'September', 'arrival_date