### Data Preparation - Import the LangChain libraries and load the dataset into a pandas DataFrame

In [None]:
import os
from pymongo import MongoClient
from pymongo.operations import SearchIndexModel
from langchain_openai import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings, OpenAI, ChatOpenAI
from datasets import load_dataset
import pandas as pd

# Placeholders: replace both values with your own credentials before running.
MONGODB_ATLAS_CLUSTER_URI = 'YOUR MONGODB ATLAS CLUSTER URI'
os.environ['OPENAI_API_KEY'] = '<YOUR OPEN API KEY>'

# Embedding model handed to the LangChain vector store later in the notebook.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# initialize MongoDB python client
client = MongoClient(MONGODB_ATLAS_CLUSTER_URI)

DB_NAME = "langchain"
COLLECTION_NAME = "listings_reviews"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "default"
#ATLAS_VECTOR_SEARCH_INDEX_NAME = "default_new"

# Build the handle from the constants so the names live in exactly one place.
collection = client[DB_NAME][COLLECTION_NAME]

# Imported here so this cell stays self-contained.
from pymongo.errors import PyMongoError

# Start from a clean slate. Dropping a collection that does not exist is not
# expected to raise, so anything caught here is a real driver/server problem
# (bad URI, auth, network) and is reported as such instead of being mislabelled.
try:
    client[DB_NAME].drop_collection(COLLECTION_NAME)
except PyMongoError as exc:
    print(f"Could not drop collection {COLLECTION_NAME!r}: {exc}")


# Stream the "train" split from the Hugging Face hub and materialise it as a
# DataFrame for the cleaning/validation steps that follow.
dataset = load_dataset("MongoDB/airbnb_embeddings", streaming=True, split="train")

dataset_df = pd.DataFrame(dataset)

print("Columns: ", dataset_df.columns)

### Pydantic data model declaration of dataset

In [None]:
from typing import List, Optional
from pydantic import BaseModel, ValidationError
from datetime import datetime

class Host(BaseModel):
    """Host profile embedded in a listing (used as the type of ``Listing.host``)."""
    host_id: str
    host_url: str
    host_name: str
    host_location: str
    host_about: str
    # Optional: validation accepts a missing or null response time.
    host_response_time: Optional[str] = None
    host_thumbnail_url: str
    host_picture_url: str
    # Optional: validation accepts a missing or null response rate.
    host_response_rate: Optional[int] = None
    host_is_superhost: bool
    host_has_profile_pic: bool
    host_identity_verified: bool

class Location(BaseModel):
    """Geographic position of a listing (used as the type of ``Address.location``)."""
    # NOTE(review): looks like a GeoJSON-style object (type + coordinates) — confirm
    # against the dataset before relying on coordinate order.
    type: str
    coordinates: List[float]
    is_location_exact: bool

class Address(BaseModel):
    """Postal/area information of a listing (used as the type of ``Listing.address``)."""
    street: str
    government_area: str
    market: str
    country: str
    country_code: str
    # Nested coordinates object, validated with the Location model.
    location: Location

class Review(BaseModel):
    """A single guest review (elements of ``Listing.reviews``)."""
    # NOTE(review): pydantic treats underscore-prefixed names as private
    # attributes rather than fields, so `_id` is presumably neither validated
    # nor included in `.dict()` output — confirm with the pydantic version in use.
    _id: str
    date: Optional[datetime] = None
    listing_id: str
    reviewer_id: str
    # Reviewer name and comment text may be missing or null.
    reviewer_name: Optional[str] = None
    comments: Optional[str] = None

class Listing(BaseModel):
    """Schema for one dataset record; each record is validated with it before ingestion."""
    # NOTE(review): pydantic treats underscore-prefixed names as private
    # attributes rather than fields, so `_id` is presumably dropped from
    # `.dict()` output and MongoDB would assign its own `_id` — confirm.
    _id: int
    listing_url: str
    name: str
    summary: str
    space: str
    description: str
    # Free-text sections that may be missing or null.
    neighborhood_overview: Optional[str] = None
    notes: Optional[str] = None
    transit: Optional[str] = None
    access: str
    interaction: Optional[str] = None
    house_rules: str
    property_type: str
    room_type: str
    bed_type: str
    minimum_nights: int
    maximum_nights: int
    cancellation_policy: str
    # Scrape and review timestamps; all optional.
    last_scraped: Optional[datetime] = None
    calendar_last_scraped: Optional[datetime] = None
    first_review: Optional[datetime] = None
    last_review: Optional[datetime] = None
    accommodates: int
    # Declared default is 0; presumably it applies only when the key is absent,
    # while an explicit None is kept as None — confirm.
    bedrooms: Optional[float] = 0
    beds: Optional[float] = 0
    number_of_reviews: int
    bathrooms: Optional[float] = 0
    amenities: List[str]
    price: int
    security_deposit: Optional[float] = None
    cleaning_fee: Optional[float] = None
    extra_people: int
    guests_included: int
    # Untyped nested mappings: only checked to be dicts, contents not validated.
    images: dict
    host: Host
    address: Address
    availability: dict
    review_scores: dict
    reviews: List[Review]
    # Embedding vector stored with each record; this is the field the Atlas
    # vector index and the vector store are pointed at ("text_embeddings").
    text_embeddings: List[float]

# One plain dict per DataFrame row.
records = dataset_df.to_dict(orient='records')

# Replace pandas missing-value markers with None so that the Optional fields
# of the pydantic models accept them. Only top-level values and the direct
# elements of top-level lists are inspected; deeper nesting is left as-is.
for record in records:
    for key, value in record.items():
        if isinstance(value, list):
            record[key] = [None if pd.isnull(v) else v for v in value]
        elif pd.isnull(value):
            record[key] = None

try:
    # Validate each dictionary against the Listing model and convert it back
    # to a plain dict ready for insertion.
    listings = [Listing(**record).dict() for record in records]
except ValidationError as e:
    print(e)
    # Stop here: without `listings` the ingestion step would only fail later
    # with a less helpful NameError.
    raise
else:
    # Get an overview of a single datapoint
    if listings:
        print(listings[0].keys())

### Data Preparation - Ingest data into the MongoDB collection

In [None]:
def get_mongo_client(mongo_uri):
    """Create a MongoDB client and verify that the server is reachable.

    Args:
        mongo_uri: MongoDB connection string.

    Returns:
        The connected ``MongoClient``.

    Raises:
        pymongo.errors.PyMongoError: If the server cannot be reached or the
            ping command is rejected.
    """
    # gateway to interacting with a MongoDB database cluster
    client = MongoClient(mongo_uri)

    # MongoClient connects lazily, so force one round trip before claiming
    # success; otherwise the message below would be printed even for a bad URI.
    client.admin.command("ping")

    print("Connection to MongoDB successful")
    return client

# Fail fast: continuing without a connection string would only defer the
# failure to the connect/delete/insert calls below.
if not MONGODB_ATLAS_CLUSTER_URI:
    raise ValueError("MONGODB_ATLAS_CLUSTER_URI is not set")

mongo_client = get_mongo_client(MONGODB_ATLAS_CLUSTER_URI)

# Pymongo client of database and collection
db = mongo_client.get_database(DB_NAME)
collection = db.get_collection(COLLECTION_NAME)

# Remove any documents left from a previous run so re-running the cell does
# not duplicate the listings.
collection.delete_many({})

collection.insert_many(listings)
print("Data ingestion into MongoDB completed")




### Atlas vector search index creation

In [None]:


# Imported here so this cell stays self-contained.
from pymongo.errors import PyMongoError

# Vector index over the stored embeddings, plus filter fields so that queries
# can pre-filter on bathrooms, bedrooms and security_deposit.
search_index_model = SearchIndexModel(
    definition={
        "fields": [
            {
                "type": "vector",
                "numDimensions": 1536,
                "path": "text_embeddings",
                "similarity": "cosine",
            },
            {
                "type": "filter",
                "path": "bathrooms",
            },
            {
                "type": "filter",
                "path": "bedrooms",
            },
            {
                "type": "filter",
                "path": "security_deposit",
            },
        ]
    },
    name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
    type="vectorSearch",
)

# Best effort: keep the notebook running if the index cannot be created (for
# example because it already exists), but surface the actual driver error
# instead of assuming the cause.
try:
    result = collection.create_search_index(model=search_index_model)
except PyMongoError as e:
    print(f"Vector search index not created (it may already exist): {e}")

### Create a LangChain vector store object to perform vector search queries

In [None]:
#from langchain.llms import OpenAI
from langchain_openai import OpenAIEmbeddings, OpenAI, ChatOpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain_mongodb.vectorstores import MongoDBAtlasVectorSearch

# Metadata attributes the self-query retriever may turn into filters. The
# descriptions are shown to the LLM that builds the structured query, so each
# one must describe its own field.
metadata_field_info = [
    AttributeInfo(
        name="bathrooms",
        description="Number of bathrooms in any accomodation or hotel or stay",
        type="integer or double",
    ),
    AttributeInfo(
        name="bedrooms",
        # Fixed copy-paste error: this previously read "Number of bathrooms".
        description="Number of bedrooms in any accomodation or hotel or stay",
        type="integer or double",
    ),
    AttributeInfo(
        name="security_deposit",
        description="It also known as security deposit, It is the Amount of security deposit in any accomodation or hotel or stay",
        type="integer or double",
    ),
]
document_content_description = "Brief description of accomodation or hotel or stay"

# Vector store over the ingested collection: embeddings are read from
# "text_embeddings" and the "description" field is used as the document text.
vectorstore = MongoDBAtlasVectorSearch(
    collection=collection,
    embedding=embeddings,
    embedding_key='text_embeddings',
    text_key='description',
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
    relevance_score_fn="cosine",
)
llm = OpenAI(temperature=0)

# Retriever that asks the LLM to split a natural-language question into a
# semantic query plus metadata filters over the attributes declared above.
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
    verbose=True,
)


### Prompt from Traveller

In [None]:
from langchain.callbacks.tracers import ConsoleCallbackHandler
# Callback handler passed to the retriever run below.
handler = ConsoleCallbackHandler()

# Other questions to try:
#   "Give me ocean facing accomodation"
#   "Give me ocean facing accomodation with 3 bedrooms"
#   "Give me ocean facing accomodation with 3 bedrooms and 3 bathrooms"
#   "Give me ocean facing accomodation with 3 bedrooms and 2 bathrooms"
#   "Give me ocean facing accomodation with 3 bedrooms, 2 bathrooms and zero deposit"
question = "Give me riverside accomodation with 3 bedrooms"

# Run the retriever with the callback handler attached, then show the
# documents it returned.
run_config = {"callbacks": [handler]}
relevant_docs = retriever.invoke(question, run_config)
print(relevant_docs)

### Context augmented LLM response

In [None]:
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain

# Shape of a Document, for reference:
#   Document(page_content="Hello, world!", metadata={"source": "https://example.com"})

# Keep only the metadata needed for the answer; the listing URL is exposed
# under the key "source".
refined_documents = [
    Document(
        page_content=doc.page_content,
        metadata={
            "source": doc.metadata["listing_url"],
            "bathrooms": doc.metadata["bathrooms"],
            "bedrooms": doc.metadata["bedrooms"],
            "security_deposit": doc.metadata["security_deposit"],
        },
    )
    for doc in relevant_docs
]


prompt = PromptTemplate(
        input_variables = ["query","context"],
        template="""You are a travel assistant and you suggest travellers accomodations based on their requirements.
        Answer the question from a traveller:{query} by searching the following relevant documents :{context} and provide the best accomodation based on the requirements"""
    )

# By default the stuff-documents chain renders only each document's
# page_content, so the metadata collected above would never reach the LLM.
# Render it explicitly so the model can see the URL, room counts and deposit.
document_prompt = PromptTemplate.from_template(
    "{page_content}\n"
    "Listing URL: {source}\n"
    "Bedrooms: {bedrooms}\n"
    "Bathrooms: {bathrooms}\n"
    "Security deposit: {security_deposit}"
)

document_chain = create_stuff_documents_chain(
    llm,
    prompt,
    document_prompt=document_prompt,
)
print(document_chain.invoke({"query": question, "context": refined_documents}))

