# Boosting


Often, our documents contain fields that we can use to rank the results returned by the vector database. For example, the Airbnb listings dataset contains average-rating and number-of-reviews fields that we can use to rank the documents.

In short, boosting means re-ranking the retrieved documents according to their metadata.

- It can improve relevance in vector search.
- It can personalize results.

In [8]:
# Warning control
# Installs a global "ignore" filter, so no Python warnings are shown after this
# point (presumably to keep the notebook output readable).
import warnings
warnings.filterwarnings('ignore')

In [19]:
import os
from typing import List, Optional
from pydantic import BaseModel, ValidationError
from datetime import datetime
import pandas as pd
import openai
from pymongo.collection import Collection
from pymongo.errors import OperationFailure
from pymongo.operations import SearchIndexModel
from pymongo.mongo_client import MongoClient
import time

from dotenv import load_dotenv, find_dotenv
# Load variables from the .env file located by find_dotenv() into the environment.
_ = load_dotenv(find_dotenv()) # read local .env file
# Direct indexing raises KeyError if OPENAI_API_KEY is not defined.
openai.api_key = os.environ['OPENAI_API_KEY']

# Database and collection names used by connect_to_database().
DB_NAME = "airbnb_dataset"
COLLECTION_NAME = "listings_reviews"

class Host(BaseModel):
    """Pydantic model for the ``host`` sub-document of a ``Listing``."""
    host_id: str
    host_url: str
    host_name: str
    host_location: str
    host_about: str
    # Optional: falls back to None when the record carries no value.
    host_response_time: Optional[str] = None
    host_thumbnail_url: str
    host_picture_url: str
    # Optional: falls back to None when the record carries no value.
    host_response_rate: Optional[int] = None
    host_is_superhost: bool
    host_has_profile_pic: bool
    host_identity_verified: bool

class Location(BaseModel):
    """Pydantic model for the ``location`` sub-document of an ``Address``."""
    # Geometry type; the sample output in this notebook shows 'Point'.
    type: str
    # NOTE(review): sample output suggests [longitude, latitude] order — confirm.
    coordinates: List[float]
    is_location_exact: bool

class Address(BaseModel):
    """Pydantic model for the ``address`` sub-document of a ``Listing``."""
    street: str
    government_area: str
    market: str
    country: str
    country_code: str
    # Nested geographic position, validated by the Location model.
    location: Location

class Review(BaseModel):
    """Pydantic model for one entry of a listing's ``reviews`` list."""
    # NOTE(review): pydantic does not treat underscore-prefixed names as model
    # fields, so `_id` is presumably neither validated nor included in the
    # `.dict()` output — confirm this is intended.
    _id: str
    date: Optional[datetime] = None
    listing_id: str
    reviewer_id: str
    reviewer_name: Optional[str] = None
    comments: Optional[str] = None

class Listing(BaseModel):
    """Pydantic model describing one Airbnb listing document.

    ``process_records`` validates every dataset row against this model and
    stores the resulting ``.dict()`` output in MongoDB.
    """
    # NOTE(review): pydantic does not treat underscore-prefixed names as model
    # fields, so `_id` is presumably dropped from the `.dict()` output and
    # MongoDB assigns its own `_id` on insert — confirm this is intended.
    _id: int
    listing_url: str
    name: str
    summary: str
    space: str
    description: str
    neighborhood_overview: Optional[str] = None
    notes: Optional[str] = None
    transit: Optional[str] = None
    access: str
    interaction: Optional[str] = None
    house_rules: str
    property_type: str
    room_type: str
    bed_type: str
    minimum_nights: int
    maximum_nights: int
    cancellation_policy: str
    # Date fields are optional; process_records turns NaT/NaN values into None.
    last_scraped: Optional[datetime] = None
    calendar_last_scraped: Optional[datetime] = None
    first_review: Optional[datetime] = None
    last_review: Optional[datetime] = None
    accommodates: int
    # Default to 0 when the key is absent; an explicit None is also accepted.
    bedrooms: Optional[float] = 0
    beds: Optional[float] = 0
    number_of_reviews: int
    bathrooms: Optional[float] = 0
    amenities: List[str]
    price: int
    security_deposit: Optional[float] = None
    cleaning_fee: Optional[float] = None
    extra_people: int
    guests_included: int
    images: dict
    # Nested sub-documents validated by their own models.
    host: Host
    address: Address
    availability: dict
    # Per-category review scores; read by the boosting stage as
    # `review_scores.review_scores_<category>`.
    review_scores: dict
    reviews: List[Review]
    # Embedding vector searched by the vector index (path "text_embeddings").
    text_embeddings: List[float]

def process_records(data_frame):
    """Turn a DataFrame of raw listings into validated listing dictionaries.

    Missing values (NaN / NaT) are replaced by ``None`` first, both for scalar
    cells and for the individual elements of list-valued cells. Every row is
    then validated through the ``Listing`` model.

    Parameters:
    - data_frame (pandas.DataFrame): One row per listing.

    Returns:
    - list: ``Listing(...).dict()`` for each row, or an empty list if any row
      fails validation (the validation error is printed).
    """
    rows = data_frame.to_dict(orient='records')

    for row in rows:
        for field, raw in row.items():
            if not isinstance(raw, list):
                # Scalar cell: blank it out when pandas considers it missing.
                if pd.isnull(raw):
                    row[field] = None
                continue
            # List cell: clean element by element.
            row[field] = [None if pd.isnull(item) else item for item in raw]

    try:
        return [Listing(**row).dict() for row in rows]
    except ValidationError as err:
        print("Validation error:", err)
        return []
    


def get_embedding(text):
    """Return the OpenAI embedding vector for ``text``.

    Uses the "text-embedding-3-small" model with 1536 dimensions. Returns
    ``None`` when ``text`` is empty or not a string, or when the API call
    fails (the error is printed).
    """
    # Guard clause: only non-empty strings are sent to the API.
    if not (text and isinstance(text, str)):
        return None

    try:
        response = openai.embeddings.create(
            input=text,
            model="text-embedding-3-small",
            dimensions=1536,
        )
        return response.data[0].embedding
    except Exception as err:
        print(f"Error in get_embedding: {err}")
        return None
    

def setup_vector_search_index(collection: Collection,
                              text_embedding_field_name: str = "text_embeddings",
                              vector_search_index_name: str = "vector_index_text"):
    """
    Sets up a vector search index for a MongoDB collection based on text embeddings.

    The index is only created when no search index with the same name exists
    on the collection. After a successful creation request the function sleeps
    for 20 seconds to give the index time to start its initial sync.

    Parameters:
    - collection (Collection): The MongoDB collection to which the index is applied.
    - text_embedding_field_name (str): The field in the documents that contains the text embeddings.
    - vector_search_index_name (str): The name for the vector search index.

    Returns:
    - None
    """
    # Define the model for the vector search index
    vector_search_index_model = SearchIndexModel(
        definition={
            "mappings": {  # describes how fields in the database documents are indexed and stored
                "dynamic": True,  # automatically index new fields that appear in the document
                "fields": {  # properties of the fields that will be indexed.
                    text_embedding_field_name: {
                        "dimensions": 1536,  # size of the vector.
                        "similarity": "cosine",  # algorithm used to compute the similarity between vectors
                        "type": "knnVector",
                    }
                },
            }
        },
        name=vector_search_index_name,  # identifier for the vector search index
    )

    # Search indexes are not reported by list_indexes() (which only covers
    # regular indexes), so the existence check must use list_search_indexes().
    try:
        index_exists = any(
            index.get("name") == vector_search_index_name
            for index in collection.list_search_indexes()
        )
    except OperationFailure as e:
        # If the listing itself fails, fall through and let the creation
        # attempt below report the definitive error.
        print(f"Could not list search indexes: {str(e)}")
        index_exists = False

    if index_exists:
        print(f"Index '{vector_search_index_name}' already exists.")
        return

    # Create the index because it doesn't exist yet
    try:
        result = collection.create_search_index(vector_search_index_model)
        print("Creating index...")
        time.sleep(20)  # give the vector index time to complete its initial sync before use
        print(f"Index created successfully: {result}")
        print("Wait a few minutes before conducting search with index to ensure index initialization.")
    except OperationFailure as e:
        print(f"Error creating vector search index: {str(e)}")


def setup_vector_search_index_with_filter(collection):
    """Create the "vector_index_with_filter" search index on ``collection``.

    The index maps ``text_embeddings`` as a 1536-dimension cosine knnVector
    and additionally indexes ``accommodates`` and ``bedrooms`` as numbers so
    they can be used as filters. Any error raised while creating the index is
    printed rather than propagated.

    Parameters:
    - collection: The MongoDB collection on which the index is created.

    Returns:
    - None
    """
    index_name = "vector_index_with_filter"

    # Field mappings: the embedding vector first, then the numeric filter fields.
    indexed_fields = {
        "text_embeddings": {
            "dimensions": 1536,
            "similarity": "cosine",
            "type": "knnVector",
        },
    }
    for numeric_field in ("accommodates", "bedrooms"):
        indexed_fields[numeric_field] = {"type": "number"}

    index_model = SearchIndexModel(
        definition={"mappings": {"dynamic": True, "fields": indexed_fields}},
        name=index_name,
    )

    try:
        created = collection.create_search_index(model=index_model)
        print("Creating index...")
        # Pause 20 seconds so the index can begin its initial sync before use.
        time.sleep(20)
        print("New index created successfully:", created)
    except Exception as err:
        print(f"Error creating new vector search index: {str(err)}")


def vector_search_with_filter(user_query, db, collection, additional_stages=None, vector_index="vector_index_text"):
    """
    Perform a filtered vector search in the MongoDB collection based on the user query.

    The search keeps listings that accommodate at least 2 guests and have at
    most 7 bedrooms, considers 150 candidates and returns the top 20 matches.
    After the search, an ``explain`` command is issued for the same pipeline
    and the time spent in the ``$vectorSearch`` stage is printed when the
    explain output contains it.

    Args:
    user_query (str): The user's query string.
    db (MongoClient.database): The database object.
    collection (MongoCollection): The MongoDB collection to search.
    additional_stages (list | None): Additional aggregation stages appended
        after the vector search stage. ``None`` means no additional stages.
    vector_index (str): Name of the vector search index to query.

    Returns:
    list: A list of matching documents. If the query embedding cannot be
    generated, the message string "Invalid query or embedding generation
    failed." is returned instead.
    """

    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    if query_embedding is None:
        return "Invalid query or embedding generation failed."

    # Define the vector search stage
    vector_search_stage = {
        "$vectorSearch": {
            "index": vector_index,  # specifies the index to use for the search
            "queryVector": query_embedding,  # the vector representing the query
            "path": "text_embeddings",  # field in the documents containing the vectors to search against
            "numCandidates": 150,  # number of candidate matches to consider
            "limit": 20,  # return top 20 matches
            "filter": {
                "$and": [
                    {"accommodates": {"$gte": 2}},
                    {"bedrooms": {"$lte": 7}}
                ]
            },
        }
    }

    # Build a fresh pipeline list; a None default avoids sharing one mutable
    # default list between calls.
    pipeline = [vector_search_stage] + list(additional_stages or [])

    # Execute the search and materialize the results right away so that a
    # problem while gathering diagnostics below cannot discard them.
    results = list(collection.aggregate(pipeline))

    explain_query_execution = db.command(  # sends a database command directly to the MongoDB server
        'explain', {  # return information about how MongoDB executes a query or command
            'aggregate': collection.name,  # specifies the name of the collection on which the aggregation is performed
            'pipeline': pipeline,  # the aggregation pipeline to analyze
            'cursor': {}  # indicates that default cursor behavior should be used
        },
        verbosity='executionStats')  # detailed statistics about the execution of each stage of the aggregation pipeline

    # The timing is diagnostic only; tolerate explain output that does not have
    # the expected layout instead of failing the whole search.
    try:
        vector_search_explain = explain_query_execution['stages'][0]['$vectorSearch']
        millis_elapsed = vector_search_explain['explain']['collectStats']['millisElapsed']
    except (KeyError, IndexError, TypeError):
        print("Execution statistics for the $vectorSearch stage were not found in the explain output.")
    else:
        print(f"Total time for the execution to complete on the database server: {millis_elapsed} milliseconds")

    return results




def connect_to_database():
    """Establish connection to the MongoDB.

    Reads the connection string from the ``MONGO_URI`` environment variable.

    Returns:
    tuple: ``(db, collection)`` handles for ``DB_NAME`` and ``COLLECTION_NAME``.

    Raises:
    ValueError: If ``MONGO_URI`` is missing or empty. Previously this case was
        only printed and a client was still created without a URI, after which
        "Connection to MongoDB successful" was reported regardless.
    """

    MONGO_URI = os.environ.get("MONGO_URI")

    if not MONGO_URI:
        print("MONGO_URI not set in environment variables")
        raise ValueError("MONGO_URI not set in environment variables")

    # gateway to interacting with a MongoDB database cluster
    mongo_client = MongoClient(MONGO_URI, appname="devrel.deeplearningai.python")
    print("Connection to MongoDB successful")

    # Pymongo client of database and collection
    db = mongo_client.get_database(DB_NAME)
    collection = db.get_collection(COLLECTION_NAME)

    return db, collection

<p style="background-color:#fff6ff; padding:15px; border-width:3px; border-color:#efe6ef; border-style:solid; border-radius:6px"> 💻 &nbsp; <b>Access <code>requirements.txt</code> and <code>utils</code> files:</b> To access <code>requirements.txt</code> for this notebook, 1) click on the <em>"File"</em> option on the top menu of the notebook and then 2) click on <em>"Open"</em>. For more help, please see the <em>"Appendix - Tips and Help"</em> Lesson.</p>

## Data Loading

In [20]:
from datasets import load_dataset
import pandas as pd

# Stream the training split and keep only its first 100 records.
dataset = load_dataset(
    "MongoDB/airbnb_embeddings",
    streaming=True,
    split="train",
).take(100)

# Materialize the streamed records as a pandas DataFrame.
dataset_df = pd.DataFrame(dataset)
# dataset_df.head(5)

In [21]:
# Show the column names of the loaded DataFrame.
print("Columns:", dataset_df.columns)

Columns: Index(['_id', 'listing_url', 'name', 'summary', 'space', 'description',
       'neighborhood_overview', 'notes', 'transit', 'access', 'interaction',
       'house_rules', 'property_type', 'room_type', 'bed_type',
       'minimum_nights', 'maximum_nights', 'cancellation_policy',
       'last_scraped', 'calendar_last_scraped', 'first_review', 'last_review',
       'accommodates', 'bedrooms', 'beds', 'number_of_reviews', 'bathrooms',
       'amenities', 'price', 'security_deposit', 'cleaning_fee',
       'extra_people', 'guests_included', 'images', 'host', 'address',
       'availability', 'review_scores', 'reviews', 'weekly_price',
       'monthly_price', 'text_embeddings', 'image_embeddings'],
      dtype='object')


## Document Modelling

In [22]:
# Validate every row against the Listing model; process_records returns an
# empty list if any row fails validation.
listings = process_records(dataset_df)

## Database Creation and Connection

In [23]:
# Handles for the DB_NAME database and the COLLECTION_NAME collection.
db, collection = connect_to_database()

Connection to MongoDB successful


In [24]:
# Delete any existing records in the collection
# (destructive: the empty filter {} matches every document).
collection.delete_many({})

DeleteResult({'n': 100, 'electionId': ObjectId('7fffffff0000000000000002'), 'opTime': {'ts': Timestamp(1722840369, 91), 't': 2}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1722840369, 100), 'signature': {'hash': b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', 'keyId': 0}}, 'operationTime': Timestamp(1722840369, 91)}, acknowledged=True)

## Data Ingestion

In [25]:
# Insert the validated listing dictionaries produced by process_records.
# NOTE(review): `listings` is empty when validation failed — confirm how
# insert_many should behave in that case.
collection.insert_many(listings)
print("Data ingestion into MongoDB completed")

Data ingestion into MongoDB completed


## Vector Search Index Definition

In [26]:
# Create vector search index
# The helper names it "vector_index_with_filter"; searches must pass that name
# as `vector_index` to use it.
setup_vector_search_index_with_filter(collection=collection)

Creating index...
New index created successfully: vector_index_with_filter


<p style="background-color:#fff6e4; padding:15px; border-width:3px; border-color:#f5ecda; border-style:solid; border-radius:6px"> ⏳ <b>Note:</b> If the output of the previous cell is <code>Error creating vector search index: Duplicate Index</code> you may proceed to the next cell if you intend to still use a previously created index.</p>

## Handling User Query

In [27]:
from pydantic import BaseModel
from typing import Optional

class SearchResultItem(BaseModel):
    """Subset of a search result document that is passed on to the LLM.

    Each document returned by the aggregation pipeline is validated against
    this model before being placed in the results DataFrame.
    """
    name: str
    accommodates: Optional[int] = None
    # Use the Address model defined in this file; no `custom_utils` module is
    # imported here, so `custom_utils.Address` would raise a NameError.
    address: Address

    # New fields for boosting purposes. They default to None so the model also
    # accepts results from pipelines without the boosting stages.
    averageReviewScore: Optional[float] = None
    number_of_reviews: Optional[float] = None
    combinedScore: Optional[float] = None


In [28]:
from IPython.display import display, HTML

def handle_user_query(query, db, collection, stages=None, vector_index="vector_index_text"):
    """Answer a user query with an LLM, grounded in vector search results.

    Runs ``vector_search_with_filter`` with the given extra pipeline stages,
    conforms the returned documents to ``SearchResultItem``, sends them as
    context to the "gpt-3.5-turbo" chat model, prints the question and the
    answer, and displays the results table as HTML.

    Args:
    query (str): The user's question.
    db: The database object, forwarded to the search helper.
    collection: The MongoDB collection to search.
    stages (list | None): Additional aggregation stages applied after the
        vector search. ``None`` means no additional stages.
    vector_index (str): Name of the vector search index to query.

    Returns:
    str: The model's answer when results were found. When the search yields
    nothing or reports an error, a ``(message, "No source information
    available.")`` tuple is returned instead.
    """
    # Pass a fresh list so no mutable default is shared between calls.
    extra_stages = list(stages) if stages else []
    get_knowledge = vector_search_with_filter(query, db, collection, extra_stages, vector_index)

    # The search helper signals an embedding failure with a message string;
    # indexing into it as if it were a list of documents would crash.
    if isinstance(get_knowledge, str):
        return get_knowledge, "No source information available."

    if not get_knowledge:
        return "No results found.", "No source information available."

    print("List of all fields of the first document, before model conformance")
    print(get_knowledge[0].keys())

    search_results_models = [
        SearchResultItem(**result)
        for result in get_knowledge
    ]

    search_results_df = pd.DataFrame([item.dict() for item in search_results_models])

    # `openai` is imported at the top of this file; no `custom_utils` module
    # is imported here, so going through `custom_utils.openai` would fail.
    completion = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": "You are a airbnb listing recommendation system."},
            {
                "role": "user",
                "content": f"Answer this user query: {query} with the following context:\n{search_results_df}"
            }
        ]
    )
    system_response = completion.choices[0].message.content
    print(f"- User Question:\n{query}\n")
    print(f"- System Response:\n{system_response}\n")
    display(HTML(search_results_df.to_html()))
    return system_response

## Boosting Search Results After Vector Search

In [29]:
# Aggregation stage that derives the inputs for boosting:
# - averageReviewScore: sum of the six per-category review scores divided by 6
# - reviewCountBoost: the listing's number of reviews, used as a boost factor
review_average_stage = {
    "$addFields": {
        "averageReviewScore": {
            "$divide": [
                {
                    "$add": [
                        f"$review_scores.review_scores_{category}"
                        for category in (
                            "accuracy",
                            "cleanliness",
                            "checkin",
                            "communication",
                            "location",
                            "value",
                        )
                    ]
                },
                6,  # number of review score categories
            ]
        },
        "reviewCountBoost": "$number_of_reviews",
    }
}

In [30]:
# Aggregation stage that blends the two boosting inputs into one score:
# combinedScore = 0.9 * averageReviewScore + 0.1 * reviewCountBoost
weighting_stage = {
    "$addFields": {
        "combinedScore": {
            "$add": [
                {"$multiply": [field, weight]}
                for field, weight in (
                    ("$averageReviewScore", 0.9),  # weighted average review score
                    ("$reviewCountBoost", 0.1),  # weighted review count boost
                )
            ]
        }
    }
}


In [31]:
# Order the results by combinedScore, highest first (-1 = descending), so the
# boosted listings rise to the top.
sorting_stage_sort = {"$sort": {"combinedScore": -1}}

In [32]:
# Order matters: weighting_stage reads the fields added by review_average_stage,
# and sorting_stage_sort sorts on the combinedScore added by weighting_stage.
additional_stages = [review_average_stage, weighting_stage, sorting_stage_sort]

## Results

In [33]:
# Natural-language question that is embedded for the vector search and also
# sent to the chat model.
query = """
I want to stay in a place that's warm and friendly, 
and not too far from resturants, can you recommend a place? 
Include a reason as to why you've chosen your selection"
"""
# Run the search with the boosting stages, using the index that also maps the
# `accommodates` and `bedrooms` filter fields.
handle_user_query(
    query, 
    db, 
    collection, 
    additional_stages, 
    vector_index="vector_index_with_filter"
)

Total time for the execution to complete on the database server: 0.084179 milliseconds
List of all fields of the first document, before model conformance
dict_keys(['_id', 'listing_url', 'name', 'summary', 'space', 'description', 'neighborhood_overview', 'notes', 'transit', 'access', 'interaction', 'house_rules', 'property_type', 'room_type', 'bed_type', 'minimum_nights', 'maximum_nights', 'cancellation_policy', 'last_scraped', 'calendar_last_scraped', 'first_review', 'last_review', 'accommodates', 'bedrooms', 'beds', 'number_of_reviews', 'bathrooms', 'amenities', 'price', 'security_deposit', 'cleaning_fee', 'extra_people', 'guests_included', 'images', 'host', 'address', 'availability', 'review_scores', 'reviews', 'text_embeddings', 'averageReviewScore', 'reviewCountBoost', 'combinedScore'])
- User Question:

I want to stay in a place that's warm and friendly, 
and not too far from resturants, can you recommend a place? 
Include a reason as to why you've chosen your selection"


- Syst

Unnamed: 0,name,accommodates,address,averageReviewScore,number_of_reviews,combinedScore
0,A bedroom far away from home,2,"{'street': 'Queens, NY, United States', 'government_area': 'Briarwood', 'market': 'New York', 'country': 'United States', 'country_code': 'US', 'location': {'type': 'Point', 'coordinates': [-73.82257, 40.71485], 'is_location_exact': True}}",9.833333,239.0,32.75
1,Homely Room in 5-Star New Condo@MTR,2,"{'street': 'Mongkok, Kowloon, Hong Kong', 'government_area': 'Yau Tsim Mong', 'market': 'Hong Kong', 'country': 'Hong Kong', 'country_code': 'HK', 'location': {'type': 'Point', 'coordinates': [114.17094, 22.32074], 'is_location_exact': False}}",9.5,179.0,26.45
2,Cozy double bed room 東涌鄉村雅緻雙人房,2,"{'street': 'Hong Kong, New Territories, Hong Kong', 'government_area': 'Islands', 'market': 'Hong Kong', 'country': 'Hong Kong', 'country_code': 'HK', 'location': {'type': 'Point', 'coordinates': [113.92823, 22.27671], 'is_location_exact': False}}",9.666667,162.0,24.9
3,The Garden Studio,2,"{'street': 'Marrickville, NSW, Australia', 'government_area': 'Marrickville', 'market': 'Sydney', 'country': 'Australia', 'country_code': 'AU', 'location': {'type': 'Point', 'coordinates': [151.15036, -33.90318], 'is_location_exact': False}}",9.833333,146.0,23.45
4,Best location 1BR Apt in HK - Shops & Sights,4,"{'street': 'Hong Kong, Kowloon, Hong Kong', 'government_area': 'Yau Tsim Mong', 'market': 'Hong Kong', 'country': 'Hong Kong', 'country_code': 'HK', 'location': {'type': 'Point', 'coordinates': [114.17088, 22.29663], 'is_location_exact': True}}",9.833333,145.0,23.35
5,Bondi Beach Dreaming 3-Bed House,8,"{'street': 'Bondi Beach, NSW, Australia', 'government_area': 'Waverley', 'market': 'Sydney', 'country': 'Australia', 'country_code': 'AU', 'location': {'type': 'Point', 'coordinates': [151.27448, -33.8872], 'is_location_exact': False}}",9.833333,139.0,22.75
6,Sydney Hyde Park City Apartment (checkin from 6am),2,"{'street': 'Darlinghurst, NSW, Australia', 'government_area': 'Sydney', 'market': 'Sydney', 'country': 'Australia', 'country_code': 'AU', 'location': {'type': 'Point', 'coordinates': [151.21346, -33.87603], 'is_location_exact': False}}",10.0,109.0,19.9
7,"Studio convenient to CBD, beaches, street parking.",5,"{'street': 'Balgowlah, NSW, Australia', 'government_area': 'Manly', 'market': 'Sydney', 'country': 'Australia', 'country_code': 'AU', 'location': {'type': 'Point', 'coordinates': [151.26108, -33.7975], 'is_location_exact': True}}",9.833333,104.0,19.25
8,Banyan Bungalow,2,"{'street': 'Waialua, HI, United States', 'government_area': 'North Shore Oahu', 'market': 'Oahu', 'country': 'United States', 'country_code': 'US', 'location': {'type': 'Point', 'coordinates': [-158.1602, 21.57561], 'is_location_exact': False}}",9.666667,99.0,18.6
9,Cheerful new renovated central apt,8,"{'street': 'Beyoğlu, İstanbul, Turkey', 'government_area': 'Beyoglu', 'market': 'Istanbul', 'country': 'Turkey', 'country_code': 'TR', 'location': {'type': 'Point', 'coordinates': [28.97477, 41.03735], 'is_location_exact': False}}",9.333333,77.0,16.1


'Based on your preferences for a warm and friendly place not too far from restaurants, I recommend "The Garden Studio" in Marrickville, NSW, Australia. This listing has an average review score of 9.83 and received positive feedback from 146 previous guests. Marrickville is known for its vibrant community atmosphere and diverse dining options, making it a great choice for someone looking for warmth and friendliness, as well as easy access to restaurants. Additionally, the studio offers a cozy and inviting atmosphere, perfect for a comfortable stay.'