In [2]:
import os
from dotenv import load_dotenv
import numpy as np
import pandas as pd
from transformers import pipeline
from google import genai
import chromadb as cdb
import chromadb.utils.embedding_functions as embedding_functions

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
load_dotenv()
cohere_api_key = os.getenv("COHERE_API_KEY")
gemini_api_key = os.getenv("GEMINI_API_KEY")
jina_api_key = os.getenv("JINA_API_KEY")

In [4]:
data_path = "data/hotel_bookings.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [5]:
# Assuming df is your DataFrame
# Get numerical and textual columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
textual_cols = df.select_dtypes(include=['object', 'string', 'category']).columns.tolist()

# Print the column names
print("Numerical columns:")
print(numerical_cols)
print("\nTextual columns:")
print(textual_cols)

# Create separate DataFrames if needed
df_numerical = df[numerical_cols]
df_textual = df[textual_cols]

# Display the first few rows of each
print("\nNumerical DataFrame sample:")
display(df_numerical.head())
print("\nTextual DataFrame sample:")
display(df_textual.head())

Numerical columns:
['is_canceled', 'lead_time', 'arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'agent', 'company', 'days_in_waiting_list', 'adr', 'required_car_parking_spaces', 'total_of_special_requests']

Textual columns:
['hotel', 'arrival_date_month', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'deposit_type', 'customer_type', 'reservation_status', 'reservation_status_date']

Numerical DataFrame sample:


Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
0,0,342,2015,27,1,0,0,2,0.0,0,0,0,0,3,,,0,0.0,0,0
1,0,737,2015,27,1,0,0,2,0.0,0,0,0,0,4,,,0,0.0,0,0
2,0,7,2015,27,1,0,1,1,0.0,0,0,0,0,0,,,0,75.0,0,0
3,0,13,2015,27,1,0,1,1,0.0,0,0,0,0,0,304.0,,0,75.0,0,0
4,0,14,2015,27,1,0,2,2,0.0,0,0,0,0,0,240.0,,0,98.0,0,1



Textual DataFrame sample:


Unnamed: 0,hotel,arrival_date_month,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,customer_type,reservation_status,reservation_status_date
0,Resort Hotel,July,BB,PRT,Direct,Direct,C,C,No Deposit,Transient,Check-Out,2015-07-01
1,Resort Hotel,July,BB,PRT,Direct,Direct,C,C,No Deposit,Transient,Check-Out,2015-07-01
2,Resort Hotel,July,BB,GBR,Direct,Direct,A,C,No Deposit,Transient,Check-Out,2015-07-02
3,Resort Hotel,July,BB,GBR,Corporate,Corporate,A,A,No Deposit,Transient,Check-Out,2015-07-02
4,Resort Hotel,July,BB,GBR,Online TA,TA/TO,A,A,No Deposit,Transient,Check-Out,2015-07-03


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [29]:
# client = cdb.Client()
# collection = client.create_collection(name="hotel_collection")

In [30]:
# cohere_ef  = embedding_functions.CohereEmbeddingFunction(api_key=cohere_api_key,  model_name="large")
# cohere_ef(input=["document1","document2"])


In [9]:
pipe = pipeline("feature-extraction", model="Linq-AI-Research/Linq-Embed-Mistral", use_auth_token=True, device=-1)

Loading checkpoint shards:   0%|                                                                  | 0/3 [00:00<?, ?it/s]


RuntimeError: unable to mmap 4943161664 bytes from file </home/vishnusharma7/.cache/huggingface/hub/models--Linq-AI-Research--Linq-Embed-Mistral/snapshots/0c1a0b0589177079acc552433cad51d7c9132379/model-00001-of-00003.safetensors>: Cannot allocate memory (12)

In [33]:

# Initialize Cohere embedding function
cohere_ef = embedding_functions.CohereEmbeddingFunction(
    api_key=cohere_api_key,
    model_name="embed-english-v3.0"  # You can change the model as needed
)

# Initialize ChromaDB client and create collection
client = cdb.Client()
# If collection already exists, delete it to avoid errors
try:
    client.delete_collection("hotel_collection")
    print("Deleted existing collection")
except:
    pass
collection = client.create_collection(
    name="hotel_collection",
    embedding_function=cohere_ef
)

# Preprocess datetime columns if they exist
datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
date_cols = [col for col in df.columns if 'date' in col.lower()]

for col in date_cols:
    if col in df.columns:
        try:
            df[col] = pd.to_datetime(df[col])
            df[f'{col}_formatted'] = df[col].dt.strftime('%Y-%m-%d')
        except:
            print(f"Could not convert {col} to datetime")

# Create documents for embedding
documents = []
ids = []
metadatas = []

# Process each row in the dataframe
for idx, row in df.iterrows():
    # Create a text representation of the booking
    # Adjust the fields based on your actual data columns
    text_parts = []
    
    # Add all string columns to the text representation
    for col in df.select_dtypes(include=['object']).columns:
        if pd.notna(row[col]):
            text_parts.append(f"{col}: {row[col]}")
    
    # Add formatted date columns if they exist
    for col in date_cols:
        formatted_col = f'{col}_formatted'
        if formatted_col in df.columns and pd.notna(row.get(formatted_col)):
            text_parts.append(f"{col}: {row[formatted_col]}")
    
    # Add numerical columns
    for col in df.select_dtypes(include=['int64', 'float64']).columns:
        if pd.notna(row[col]):
            text_parts.append(f"{col}: {row[col]}")
    
    # Join all parts into a single text
    text_content = " | ".join(text_parts)
    
    # Create metadata dictionary
    metadata = {}
    for col in df.columns:
        if pd.notna(row[col]):
            # Convert numpy types to Python native types for compatibility with ChromaDB
            if isinstance(row[col], (np.integer, np.floating)):
                metadata[col] = row[col].item()
            elif isinstance(row[col], (pd.Timestamp)):
                metadata[col] = row[col].strftime('%Y-%m-%d')
            else:
                try:
                    # Try to convert to string for other types
                    metadata[col] = str(row[col])
                except:
                    # Skip if conversion fails
                    pass
    
    # Add to lists
    documents.append(text_content)
    ids.append(f"booking_{idx}")
    metadatas.append(metadata)
    
    # Add in batches of 100 to avoid memory issues
    if len(documents) >= 100:
        collection.add(
            documents=documents,
            ids=ids,
            metadatas=metadatas
        )
        print(f"Added batch of {len(documents)} documents")
        documents = []
        ids = []
        metadatas = []

# Add any remaining documents
if documents:
    collection.add(
        documents=documents,
        ids=ids,
        metadatas=metadatas
    )
    print(f"Added final batch of {len(documents)} documents")

# Verify the collection
print(f"\nCollection count: {collection.count()}")

# Test a simple query
query_text = "luxury hotel with pool"
results = collection.query(
    query_texts=[query_text],
    n_results=3
)

print("\nSample query results:")
for i, (doc, metadata, distance) in enumerate(zip(
    results['documents'][0],
    results['metadatas'][0],
    results['distances'][0]
)):
    print(f"\nResult {i+1} (distance: {distance}):")
    print(f"Document: {doc[:150]}...")  # Show first 150 chars
    print(f"Metadata sample: {list(metadata.items())[:5]}...")  # Show first 5 metadata items

Deleted existing collection


  df[col] = pd.to_datetime(df[col])


Could not convert arrival_date_month to datetime
Added batch of 100 documents
Added batch of 100 documents


TooManyRequestsError: status_code: 429, body: status_code: 429, body: {'message': 'trial token rate limit exceeded, limit is 100000 tokens per minute'} in add.