In [16]:
import os
from dotenv import load_dotenv
import numpy as np
from tqdm import tqdm
import pandas as pd
import sqlite3 as sql
# from transformers import pipeline
from google import genai
import chromadb as cdb
import chromadb.utils.embedding_functions as embedding_functions

In [3]:
# Load variables from the local .env file into the process environment, then
# read one API key per provider. A key that is not set comes back as None.
load_dotenv()
cohere_api_key, gemini_api_key, jina_api_key = (
    os.environ.get(env_name)
    for env_name in ("COHERE_API_KEY", "GEMINI_API_KEY", "JINA_API_KEY")
)

In [4]:
# Read the raw bookings CSV and preview its first five rows.
data_path = "data/hotel_bookings.csv"
db_df = pd.read_csv(filepath_or_buffer=data_path)
db_df.head(n=5)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [5]:
# SQLite creates the database file on demand but not its parent directory, so
# make sure ./Database exists before connecting (a fresh checkout may lack it).
os.makedirs("./Database", exist_ok=True)
database = sql.connect("./Database/hotel_database.db")

In [6]:
# Write the bookings to the `booking` table, replacing any previous copy and
# storing the DataFrame index as its own column. The call's return value is
# left as the cell output.
db_df.to_sql(name='booking', con=database, if_exists='replace', index=True)

119390

In [7]:
# Read the full `booking` table back from SQLite as the working DataFrame.
df = pd.read_sql_query(sql="SELECT * FROM booking", con=database)

In [8]:
# Summarise the frame read back from SQLite: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 33 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   index                           119390 non-null  int64  
 1   hotel                           119390 non-null  object 
 2   is_canceled                     119390 non-null  int64  
 3   lead_time                       119390 non-null  int64  
 4   arrival_date_year               119390 non-null  int64  
 5   arrival_date_month              119390 non-null  object 
 6   arrival_date_week_number        119390 non-null  int64  
 7   arrival_date_day_of_month       119390 non-null  int64  
 8   stays_in_weekend_nights         119390 non-null  int64  
 9   stays_in_week_nights            119390 non-null  int64  
 10  adults                          119390 non-null  int64  
 11  children                        119386 non-null  float64
 12  babies          

In [9]:
# Parse the reservation status date once, store the parsed values back on the
# frame, then derive separate year / month-name / day-of-month columns from it.
status_dates = pd.to_datetime(df['reservation_status_date'])
df['reservation_status_date'] = status_dates

# ISO-formatted (YYYY-MM-DD) string copy of the parsed dates; not used again in this cell.
check = status_dates.dt.strftime('%Y-%m-%d')

df['reservation_status_date_year'] = status_dates.dt.year
df['reservation_status_date_month'] = status_dates.dt.month_name()
df['reservation_status_date_day'] = status_dates.dt.day



In [10]:
# The combined date column is no longer needed once its year/month/day parts
# exist. Rebinding `df` (instead of inplace=True) with errors='ignore' keeps
# this cell re-runnable: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=['reservation_status_date'], errors='ignore')

In [11]:
# Re-inspect the schema after reservation_status_date was replaced by its year/month/day columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 35 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   index                           119390 non-null  int64  
 1   hotel                           119390 non-null  object 
 2   is_canceled                     119390 non-null  int64  
 3   lead_time                       119390 non-null  int64  
 4   arrival_date_year               119390 non-null  int64  
 5   arrival_date_month              119390 non-null  object 
 6   arrival_date_week_number        119390 non-null  int64  
 7   arrival_date_day_of_month       119390 non-null  int64  
 8   stays_in_weekend_nights         119390 non-null  int64  
 9   stays_in_week_nights            119390 non-null  int64  
 10  adults                          119390 non-null  int64  
 11  children                        119386 non-null  float64
 12  babies          

In [12]:
# Replace every remaining null with the literal string "missing", in place, so
# that no NaN/None reaches the documents and metadata built from this frame.
# NOTE(review): numeric columns that contain nulls (e.g. `children`) presumably
# end up holding a mix of numbers and the string "missing"; pandas also emits a
# warning for this call. Confirm that mixed-type columns are intended.
df.fillna("missing", inplace=True)

  df.fillna("missing", inplace=True)


In [13]:

def safe_value(val):
    """Return `val` unchanged, or the string "missing" when it is null.

    Nullness is decided by pandas (`pd.isnull`), so None and NaN both map to
    "missing".
    """
    if pd.isnull(val):
        return "missing"
    return val

# Columns copied into every document and every metadata record, in output
# order. Keeping them in one list stops the document text and the metadata
# from drifting apart when a column is added or removed.
BOOKING_COLUMNS = [
    'hotel',
    'is_canceled',
    'lead_time',
    'arrival_date_year',
    'arrival_date_month',
    'arrival_date_week_number',
    'arrival_date_day_of_month',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'adults',
    'children',
    'babies',
    'meal',
    'country',
    'market_segment',
    'distribution_channel',
    'is_repeated_guest',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'reserved_room_type',
    'assigned_room_type',
    'booking_changes',
    'deposit_type',
    'agent',
    'company',
    'days_in_waiting_list',
    'customer_type',
    'adr',
    'required_car_parking_spaces',
    'total_of_special_requests',
    'reservation_status',
    'reservation_status_date_year',
    'reservation_status_date_month',
    'reservation_status_date_day',
]

# Document labels that differ from the column name (only `hotel` is capitalised).
DOCUMENT_LABELS = {'hotel': 'Hotel'}

# Every document line after the first is indented by eight spaces; this keeps
# the text sent to the embedding model identical to the earlier layout.
DOCUMENT_LINE_SEPARATOR = "\n" + " " * 8


def combine_date(year, month, day):
    """Return 'year-month-day', or "missing" if any component is "missing"."""
    if "missing" in (year, month, day):
        return "missing"
    return f"{year}-{month}-{day}"


# One document string and one metadata dict per booking, in row order.
documents = []
metadatas = []

for idx, row in df.iterrows():
    # Resolve each column once; both the document and the metadata reuse it.
    values = {column: safe_value(row[column]) for column in BOOKING_COLUMNS}

    arrival_date_str = combine_date(
        values['arrival_date_year'],
        values['arrival_date_month'],
        values['arrival_date_day_of_month'],
    )
    reservation_status_date_str = combine_date(
        values['reservation_status_date_year'],
        values['reservation_status_date_month'],
        values['reservation_status_date_day'],
    )

    # Total nights is only meaningful when both stay counts are present.
    stays_weekend = values['stays_in_weekend_nights']
    stays_week = values['stays_in_week_nights']
    if stays_weekend == "missing" or stays_week == "missing":
        total_nights = "missing"
    else:
        total_nights = int(stays_weekend) + int(stays_week)

    # Human-readable "label: value" lines, followed by the two combined dates.
    lines = [
        f"{DOCUMENT_LABELS.get(column, column)}: {values[column]}"
        for column in BOOKING_COLUMNS
    ]
    lines.append(f"Combined arrival_date: {arrival_date_str}")
    lines.append(f"Combined reservation_status_date: {reservation_status_date_str}")
    documents.append(DOCUMENT_LINE_SEPARATOR.join(lines).strip())

    # Metadata: the row's index label as booking_id, every column, then the derived fields.
    metadatas.append({
        'booking_id': str(idx),
        **values,
        'arrival_date': arrival_date_str,
        'reservation_status_date': reservation_status_date_str,
        'total_nights': total_nights,
    })

# Example: Print the first document and its metadata.
print(documents[0])
print(metadatas[0])


Hotel: Resort Hotel
        is_canceled: 0
        lead_time: 342
        arrival_date_year: 2015
        arrival_date_month: July
        arrival_date_week_number: 27
        arrival_date_day_of_month: 1
        stays_in_weekend_nights: 0
        stays_in_week_nights: 0
        adults: 2
        children: 0.0
        babies: 0
        meal: BB
        country: PRT
        market_segment: Direct
        distribution_channel: Direct
        is_repeated_guest: 0
        previous_cancellations: 0
        previous_bookings_not_canceled: 0
        reserved_room_type: C
        assigned_room_type: C
        booking_changes: 3
        deposit_type: No Deposit
        agent: missing
        company: missing
        days_in_waiting_list: 0
        customer_type: Transient
        adr: 0.0
        required_car_parking_spaces: 0
        total_of_special_requests: 0
        reservation_status: Check-Out
        reservation_status_date_year: 2015
        reservation_status_date_month: July
        

In [None]:
# df["combined_text"] = df.apply(
#     lambda row: ' '.join([f"{col}: {row[col]}" for col in row.index]),
#     axis=1
# )
# df["combined_text"] = df.apply(
#     lambda row: ', '.join([f"{col}: {row[col]}" for col in row.index]),
#     axis=1
# )



In [None]:
# client = cdb.Client()
# collection = client.create_collection(name="hotel_collection")

In [23]:
# cohere_ef  = embedding_functions.CohereEmbeddingFunction(api_key=cohere_api_key,  model_name="large")
# cohere_ef(input=["document1","document2"])


In [None]:
# pipe = pipeline("feature-extraction", model="Linq-AI-Research/Linq-Embed-Mistral", use_auth_token=True, device=-1)

In [28]:
# Embedding function used by the collection to embed the documents added to it.
embedding_collection = embedding_functions.GoogleGenerativeAiEmbeddingFunction(
    api_key=gemini_api_key,
    model_name="models/text-embedding-004"  # You can change the model as needed
)

# Initialize the on-disk ChromaDB client.
client = cdb.PersistentClient(path="./Database")

# Start from a clean collection. Deleting is best-effort, since the collection
# may simply not exist yet. Catch Exception rather than using a bare `except`
# so Ctrl-C / SystemExit still propagate, and report why nothing was deleted
# instead of hiding it.
try:
    client.delete_collection("hotel_collection")
    print("Deleted existing collection")
except Exception as exc:
    print(f"No existing collection deleted ({type(exc).__name__}: {exc})")

collection = client.create_collection(
    name="hotel_collection",
    embedding_function=embedding_collection
)

Deleted existing collection


In [None]:
# Add the documents to the collection in fixed-size batches; the last batch
# may be shorter. IDs are "doc_<position>", where <position> is the document's
# index in `documents`.
batch_size = 1000
num_batches = (len(documents) + batch_size - 1) // batch_size  # ceiling division
for i in tqdm(range(num_batches)):
    # Bounds of the current batch (end is exclusive and clamped to the list length).
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, len(documents))

    current_docs = documents[start_idx:end_idx]
    current_metadatas = metadatas[start_idx:end_idx]

    # Add the batch to the vector database
    collection.add(
        documents=current_docs,
        metadatas=current_metadatas,
        ids=[f"doc_{j}" for j in range(start_idx, end_idx)]
    )

    # tqdm.write prints the message without overlapping the progress bar,
    # which a plain print() inside the loop does.
    tqdm.write(f"Added batch {i+1}/{num_batches} to vector database")

print("All documents have been added to the vector database")

  0%|                                                                                           | 0/120 [00:00<?, ?it/s]

  1%|▋                                                                              | 1/120 [08:14<16:21:25, 494.84s/it]

Added batch 1/120 to vector database


  2%|█▎                                                                             | 2/120 [16:19<16:01:28, 488.88s/it]

Added batch 2/120 to vector database


  2%|█▉                                                                             | 3/120 [23:50<15:19:30, 471.54s/it]

Added batch 3/120 to vector database


  3%|██▋                                                                            | 4/120 [31:23<14:57:25, 464.18s/it]

Added batch 4/120 to vector database


  4%|███▎                                                                           | 5/120 [38:53<14:39:37, 458.94s/it]

Added batch 5/120 to vector database


  5%|███▉                                                                           | 6/120 [46:26<14:28:43, 457.22s/it]

Added batch 6/120 to vector database


  6%|████▌                                                                          | 7/120 [54:02<14:19:53, 456.58s/it]

Added batch 7/120 to vector database


  7%|█████▏                                                                       | 8/120 [1:01:31<14:07:54, 454.24s/it]

Added batch 8/120 to vector database


  8%|█████▊                                                                       | 9/120 [1:08:48<13:50:23, 448.86s/it]

Added batch 9/120 to vector database


  8%|██████▎                                                                     | 10/120 [1:16:06<13:36:40, 445.46s/it]

Added batch 10/120 to vector database
