In [40]:
from decouple import Config, RepositoryEnv
import os
from pathlib import Path

from langchain.chat_models import init_chat_model
from langchain_ollama import OllamaEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama
import urllib.parse
from langchain.chains import RetrievalQA

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import bs4
from bs4 import BeautifulSoup, SoupStrainer
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document

from langchain import hub
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
root_dir = Path().resolve()
print(root_dir.parent/'.env')


/Users/taejunsong/workspace/rag_tutorial/.env


In [41]:
config = Config(RepositoryEnv(root_dir.parent/'.env'))  # Explicitly load .env
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"] = config('LANGSMITH_API_KEY')
# os.environ["GROQ_API_KEY"] = config("GROQ_API_KEY")
# llm = init_chat_model("llama3-8b-8192", model_provider="groq")
llm = ChatOllama(model="llama3.2")

In [42]:
embeddings = OllamaEmbeddings(model="llama3.2")
vector_store = InMemoryVectorStore(embeddings)

In [43]:
# TODO: Change this into LLM
def build_search_url(query: str) -> str:
    """
    Build the search URL from a user query.
    Note: This URL structure is hypothetical. You’ll need to adjust the base URL
    and query parameters based on how apartment.com constructs its search URLs.
    """
    base_url = "https://www.apartment.com/search"  # Adjust as necessary
    params = {"q": query}
    search_url = f"{base_url}?{urllib.parse.urlencode(params)}"
    return search_url
    
class CustomWebLoader(WebBaseLoader):
    """
    A custom web loader that fetches a webpage with custom headers and retry logic,
    then filters the HTML to only include the <section> tag with the specified attributes.
    The filtered HTML is returned as a LangChain Document object.
    """
    def load(self):
        # Define custom headers to mimic a browser.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        }
        # Create a session with retry logic.
        session = requests.Session()
        retry_strategy = Retry(
            total=5,
            backoff_factor=1,
            status_forcelist=[500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"]
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("https://", adapter)
        session.mount("http://", adapter)

        try:
            response = session.get(self.web_path, headers=headers, timeout=10)
            response.raise_for_status()  # Check for HTTP errors.
        except requests.exceptions.RequestException as e:
            print("Error fetching the page:", e)
            return []
        
        # Use SoupStrainer to filter only the desired <section> tag.
        # filter_section = SoupStrainer("section", attrs={
        #     "class": "placards placardsv2",
        #     "id": "placards",
        #     "data-nosnippet": ""
        # })
        filter_section = SoupStrainer(
            "div", 
            id="placardContainer", 
            class_="placardContainer", 
            attrs={"data-analytics-profiletype": "Unknown"}
        )
        content = BeautifulSoup(response.text, "html.parser", parse_only=filter_section).get_text(separator="\n", strip=True)
        doc = Document(page_content=content, metadata={"source": self.web_path})
        return [doc]


In [44]:
target_url

'https://www.apartment.com/search?q=2+bedroom+apartments+in+Los+Angeles'

In [45]:
# Load and chunk contents of the blog
user_query = "2 bedroom apartments in Los Angeles"
target_url = build_search_url(user_query)
loader = CustomWebLoader(target_url)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
all_splits = text_splitter.split_documents(docs)

# Index chunks
_ = vector_store.add_documents(documents=all_splits)

# Define prompt for question-answering
prompt = hub.pull("rlm/rag-prompt")


# Define state for application
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str


# Define application steps
def retrieve(state: State):
    retrieved_docs = vector_store.similarity_search(state["question"])
    return {"context": retrieved_docs}


def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response.content}


# Compile application and test
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [46]:
docs

[Document(metadata={'source': 'https://www.apartment.com/search?q=2+bedroom+apartments+in+Los+Angeles'}, page_content='Apartments for Rent in Chicago IL - 23,204 Rentals\nPresidential Towers\n555 W Madison St, Chicago, IL 60661\n1\n/\n72\n3D Tours\nVideos\nVirtual Tour\n$1,400 - $9,400\nStudio - 2 Beds\nPets Allowed\nFitness Center\nGrill\nCourtyard\n(708) 725-1991\nEmail\nSentral Michigan Avenue\n808 S Michigan Ave, Chicago, IL 60605\n1\n/\n31\n3D Tours\nVideos\nVirtual Tour\n$2,116 - $11,500\nStudio - 4 Beds\nSpecials\nPets Allowed\nFitness Center\nPool\nDishwasher\nIn Unit Washer & Dryer\nStainless Steel Appliances\n(708) 797-6742\nEmail\nThe 808 Cleveland\n808 N Cleveland Ave, Chicago, IL 60610\n1\n/\n155\n3D Tours\nVideos\nVirtual Tour\n$1,185 - $5,068\nStudio - 4 Beds\nPets Allowed\nFitness Center\nPool\nIn Unit Washer & Dryer\nStainless Steel Appliances\nBusiness Center\nPackage Service\n(708) 934-9407\nEmail\nPost\n853 W Blackhawk St, Chicago, IL 60642\n1\n/\n130\n3D Tours\nVid

In [47]:
messages = {
    
        "system":
        "You are a confident world-class realtor. Don't say you don't know or have not enough information. Provide detailed information to the users about the apartments.",
        "question": "I want you to recommend the room in LA",
}
response = graph.invoke(messages)
print(response["answer"])

I don't have enough information to provide a specific recommendation for a room in Chicago. The context provided includes addresses and a doorman's phone number, but it doesn't specify which type of accommodation or hotel is being referred to. I recommend searching online for "best rooms in Chicago" or checking websites like TripAdvisor for recommendations.


In [48]:
bs4_strainer = bs4.SoupStrainer(class_=("post-title", "post-header", "post-content"))
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": bs4_strainer},
)
docs = loader.load()

assert len(docs) == 1
print(f"Total characters: {len(docs[0].page_content)}")

Total characters: 43130


In [49]:
print(docs[0].page_content[:500])



      LLM Powered Autonomous Agents
    
Date: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng


Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.
Agent System Overview#
In


In [51]:
def build_search_url(query: str) -> str:
    """
    Build the search URL from a user query.
    Note: This URL structure is hypothetical. You’ll need to adjust the base URL
    and query parameters based on how apartment.com constructs its search URLs.
    """
    base_url = "https://www.apartment.com/search"  # Adjust as necessary
    params = {"q": query}
    search_url = f"{base_url}?{urllib.parse.urlencode(params)}"
    return search_url

def search_apartment(query: str) -> str:
    """
    Given a search query, build the URL, load the webpage content,
    and then use a retrieval pipeline to extract relevant information.
    """
    # Construct the search URL
    search_url = build_search_url(query)
    print(f"Searching URL: {search_url}")
    
    # Use the WebBaseLoader to load content from the constructed URL
    loader = WebBaseLoader(search_url)
    documents = loader.load()
    
    # Build embeddings and vector store from the retrieved documents
    embeddings = OllamaEmbeddings()  # Ensure your OpenAI API key is set
    vectorstore = InMemoryVectorStore.from_documents(documents, embeddings)
    
    # Build the RetrievalQA pipeline using the vector store
    qa = RetrievalQA.from_chain_type(
        llm=ChatOllama(),
        chain_type="stuff",
        retriever=vectorstore.as_retriever()
    )
    
    # Use the QA system to get an answer based on your original query
    result = qa.run(query)
    return result

# Example usage:
if __name__ == "__main__":
    user_query = "2 bedroom apartments in Los Angeles"
    result = search_apartment(user_query)
    print("Search Result:")
    print(result)


Searching URL: https://www.apartment.com/search?q=2+bedroom+apartments+in+Los+Angeles


KeyboardInterrupt: 

In [None]:
import requests

In [None]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup

url = 'https://www.apartment.com/search?q=2+bedroom+apartments+in+Los+Angeles'  # Replace with your target URL

# Create a session with retry logic
session = requests.Session()
retry_strategy = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[500, 502, 503, 504],
    allowed_methods=["HEAD", "GET", "OPTIONS"]
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)
session.mount("http://", adapter)

# Define headers to mimic a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
}

try:
    response = session.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # Raise an error for bad responses
except requests.exceptions.RequestException as e:
    print("Error fetching the page:", e)
else:
    # Parse and prettify the HTML with BeautifulSoup
    only_placards = bs4.SoupStrainer("section", attrs={
    "class": "placards placardsv2",
    "id": "placards",
    "data-nosnippet": ""
    })
    soup = BeautifulSoup(response.text, 'html.parser', parse_only=only_placards)
    print(soup.prettify())


In [None]:


class CustomWebLoader(WebBaseLoader):
    """
    A custom web loader that fetches a webpage with custom headers and retry logic,
    then filters the HTML to only include the <section> tag with the specified attributes.
    The filtered HTML is returned as a LangChain Document object.
    """
    def load(self):
        # Define custom headers to mimic a browser.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        }
        # Create a session with retry logic.
        session = requests.Session()
        retry_strategy = Retry(
            total=5,
            backoff_factor=1,
            status_forcelist=[500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"]
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("https://", adapter)
        session.mount("http://", adapter)

        try:
            response = session.get(self.web_path, headers=headers, timeout=10)
            response.raise_for_status()  # Check for HTTP errors.
        except requests.exceptions.RequestException as e:
            print("Error fetching the page:", e)
            return []
        
        # Use SoupStrainer to filter only the desired <section> tag.
        # filter_section = SoupStrainer("section", attrs={
        #     "class": "placards placardsv2",
        #     "id": "placards",
        #     "data-nosnippet": ""
        # })
        filter_section = SoupStrainer(
            "div", 
            id="placardContainer", 
            class_="placardContainer", 
            attrs={"data-analytics-profiletype": "Unknown"}
        )
        soup = BeautifulSoup(response.text, "html.parser", parse_only=filter_section)
        
        # Prettify the filtered HTML (if found) and create a Document.
        content = soup.prettify() if soup and soup.contents else ""
        doc = Document(page_content=content, metadata={"source": self.web_path})
        return [doc]

# Example usage:
if __name__ == "__main__":
    target_url = 'https://www.apartment.com/search?q=2+bedroom+apartments+in+Los+Angeles'
    loader = CustomWebLoader(target_url)
    documents = loader.load()
    
    if documents and documents[0].page_content:
        print("Filtered content:")
        print(documents[0].page_content)
    else:
        print("No matching content found.")


In [None]:
# Load and chunk contents of the blog
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()
docs

In [78]:
import re
from langchain_ollama import ChatOllama
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field

# Define the data model with allowed values and defaults.
class ApartmentQuery(BaseModel):
    city: str = Field(
        description="City with state abbreviation, e.g., 'Los Angeles, CA'"
    )
    min_price: int = Field(
        description="Minimum price as an integer, e.g., 1300"
    )
    home_type: str = Field(
        default="apartments",
        description="Home type: one of apartments, houses, condos, townhomes. Defaults to apartments."
    )
    bedrooms: str = Field(
        default="2+",
        description="Bedroom filter: one of any, studio, 1+, 2+, 3+, 4+. Defaults to 2+."
    )
    bathrooms: str = Field(
        default="1+",
        description="Bathroom filter: one of any, 1+, 2+, 3+. Defaults to 1+."
    )

# Set up the output parser for our Pydantic model.
parser = PydanticOutputParser(pydantic_object=ApartmentQuery)

# Create a prompt template that clearly instructs the model what JSON to output.
prompt_template = """
You are a data extraction assistant. Extract the following details from the user's query:

- city: The city with its state abbreviation (e.g., "Los Angeles, CA").
- min_price: The minimum price as an integer (e.g., 1300).
- home_type: The home type, choose from: apartments, houses, condos, townhomes. (Default: apartments)
- bedrooms: The bedroom filter, choose from: any, studio, 1+, 2+, 3+, 4+. (Default: 2+)
- bathrooms: The bathroom filter, choose from: any, 1+, 2+, 3+. (Default: 1+)

Return your answer as a valid JSON object with exactly these keys: "city", "min_price", "home_type", "bedrooms", "bathrooms". Do not include any additional keys or text.

Query: "{query}"
"""

prompt = ChatPromptTemplate.from_template(prompt_template)

# Initialize ChatOllama with your local model.
chat = ChatOllama(model="llama3.2")

# Chain the prompt, model, and parser.
chain = prompt | chat | parser

# Mapping functions for URL segments.
def map_bedrooms(bedrooms: str) -> str:
    bed_map = {
        "any": "",
        "studio": "studios",  # changed from "studio" to "studios"
        "1+": "min-1-bedrooms",
        "2+": "min-2-bedrooms",
        "3+": "min-3-bedrooms",
        "4+": "min-4-bedrooms"
    }
    return bed_map.get(bedrooms.lower().strip(), "min-2-bedrooms")

def map_bathrooms(bathrooms: str) -> str:
    bath_map = {
        "any": "",
        "1+": "1-bathrooms",
        "2+": "2-bathrooms",
        "3+": "3-bathrooms"
    }
    return bath_map.get(bathrooms.lower().strip(), "1-bathrooms")  # default to 1+

# URL generator that produces the Apartments.com URL.
def generate_apartments_url(params: ApartmentQuery) -> str:
    # Create a city slug, e.g., "Los Angeles, CA" -> "los-angeles-ca"
    city_slug = re.sub(r'[,\s]+', '-', params.city.lower().strip())
    
    # Use the provided home_type if valid; otherwise, default to "apartments"
    home_type_lower = params.home_type.lower().strip()
    if home_type_lower not in ["apartments", "houses", "condos", "townhomes"]:
        home_type_lower = "apartments"
    
    # Map bedrooms and bathrooms.
    bedrooms_segment = map_bedrooms(params.bedrooms)
    bathrooms_segment = map_bathrooms(params.bathrooms)
    price_segment = f"over-{params.min_price}"
    
    # Build segments, omitting any empty segments.
    segments = []
    if bedrooms_segment:
        segments.append(bedrooms_segment)
    if bathrooms_segment:
        segments.append(bathrooms_segment)
    segments.append(price_segment)
    
    middle_segment = "-".join(segments)
    
    # Final URL format:
    # Example: https://www.apartments.com/apartments/los-angeles-ca/min-2-bedrooms-1-bathrooms-over-1300/
    return f"https://www.apartments.com/{home_type_lower}/{city_slug}/{middle_segment}/"

# Example query.
query = "I want condos in Los Angeles, CA with studio options, priced over 1500, and any bathrooms."

# Run the chain to extract parameters.
parsed_result = chain.invoke({
    "query": query,
    "format_instructions": parser.get_format_instructions()
})

# Generate the URL based on the parsed parameters.
url = generate_apartments_url(parsed_result)

print("Generated URL:", url)


Generated URL: https://www.apartments.com/condos/los-angeles-ca/studios-over-1500/


In [None]:
from langgraph import Graph

graph = Graph()

# Node for generating the prompt.
def prompt_node(inputs: dict) -> dict:
    # Merge query and format instructions from the parser.
    formatted_prompt = prompt.format(
        query=inputs["query"],
        format_instructions=parser.get_format_instructions()
    )
    return {"prompt": formatted_prompt}

# Node for LLM call.
def llm_node(inputs: dict) -> dict:
    # Call the LLM with the generated prompt.
    response = llm.call(inputs["prompt"])
    return {"llm_output": response}

# Node for parsing the LLM output.
def parser_node(inputs: dict) -> dict:
    # Parse the JSON output from the LLM.
    parsed = parser.parse(inputs["llm_output"])
    return {"parsed": parsed.dict()}  # convert to dict for the next node

# Node for generating the URL.
def url_node(inputs: dict) -> dict:
    url = generate_apartments_url(inputs["parsed"])
    return {"url": url}

# Add nodes to the graph.
graph.add_node("Prompt", prompt_node)
graph.add_node("LLM", llm_node)
graph.add_node("Parser", parser_node)
graph.add_node("URLGenerator", url_node)

# Connect the nodes in order.
graph.connect("Input", "Prompt")
graph.connect("Prompt", "LLM")
graph.connect("LLM", "Parser")
graph.connect("Parser", "URLGenerator")

# Sample input query.
inputs = {
    "query": "I want condos in Los Angeles, CA with studio options, priced over 1500, and any bathrooms."
}

# Run the graph.
result = graph.run(inputs)
print("Generated URL:", result["url"])

In [91]:
from decouple import Config, RepositoryEnv
import os
from pathlib import Path
import re
import urllib.parse
import json

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup, SoupStrainer

from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field

from langchain import hub
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict

# ------------------------------------------------------------------------------
# Environment & Global Setup
# ------------------------------------------------------------------------------

root_dir = Path().resolve()
print(root_dir.parent / '.env')

config = Config(RepositoryEnv(root_dir.parent / '.env'))  # Explicitly load .env

os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"] = config('LANGSMITH_API_KEY')

# We'll use two LLM instances:
# 1. For URL generation using our new chain.
llm_url = ChatOllama(model="llama3.2")
# 2. For the downstream QA process.
llm = ChatOllama(model="llama3.2")

embeddings = OllamaEmbeddings(model="llama3.2")
vector_store = InMemoryVectorStore(embeddings)

# ------------------------------------------------------------------------------
# New LLM-based URL Generation Chain
# ------------------------------------------------------------------------------

# Define the structured schema for extracting URL parameters.
class ApartmentQuery(BaseModel):
    city: str = Field(
        description="City with state abbreviation, e.g., 'Los Angeles, CA'"
    )
    min_price: int = Field(
        description="Minimum price as an integer, e.g., 1500"
    )
    home_type: str = Field(
        default="apartments",
        description="Home type: one of apartments, houses, condos, townhomes. Defaults to apartments."
    )
    bedrooms: str = Field(
        default="2+",
        description="Bedroom filter: one of any, studio, 1+, 2+, 3+, 4+. For a studio, output 'studios'. Defaults to 2+."
    )
    bathrooms: str = Field(
        default="1+",
        description="Bathroom filter: one of any, 1+, 2+, 3+. Defaults to 1+."
    )

# Set up the output parser.
parser = PydanticOutputParser(pydantic_object=ApartmentQuery)

# Define the prompt template for extracting the URL parameters.
prompt_template = """
You are a data extraction assistant. Extract the following details from the user's query:
- city: The city with its state abbreviation (e.g., "Los Angeles, CA").
- min_price: The minimum price as an integer (e.g., 1500).
- home_type: One of: apartments, houses, condos, townhomes. (Default: apartments)
- bedrooms: One of: any, studio, 1+, 2+, 3+, 4+ (Default: 2+). For a studio, output "studios".
- bathrooms: One of: any, 1+, 2+, 3+ (Default: 1+).

Return your answer as a JSON object with exactly these keys: "city", "min_price", "home_type", "bedrooms", "bathrooms".
Query: "{query}"
"""
prompt = ChatPromptTemplate.from_template(prompt_template)

# Helper function to generate the URL based on extracted parameters.
def generate_apartments_url(params: dict) -> str:
    # Create a city slug: "Los Angeles, CA" -> "los-angeles-ca"
    city_slug = re.sub(r'[,\s]+', '-', params["city"].lower().strip())
    
    home_type_lower = params["home_type"].lower().strip()
    if home_type_lower not in ["apartments", "houses", "condos", "townhomes"]:
        home_type_lower = "apartments"
    
    # Map bedroom filter (note: for a studio, we want "studios")
    bed_map = {
        "any": "",
        "studio": "studios",
        "1+": "min-1-bedrooms",
        "2+": "min-2-bedrooms",
        "3+": "min-3-bedrooms",
        "4+": "min-4-bedrooms"
    }
    bedroom_seg = bed_map.get(params["bedrooms"].lower().strip(), "min-2-bedrooms")
    
    # Map bathroom filter.
    bath_map = {
        "any": "",
        "1+": "1-bathrooms",
        "2+": "2-bathrooms",
        "3+": "3-bathrooms"
    }
    bathroom_seg = bath_map.get(params["bathrooms"].lower().strip(), "1-bathrooms")
    
    price_segment = f"over-{params['min_price']}"
    
    segments = []
    if bedroom_seg:
        segments.append(bedroom_seg)
    if bathroom_seg:
        segments.append(bathroom_seg)
    segments.append(price_segment)
    middle_segment = "-".join(segments)
    
    # Final URL format matching apartments.com:
    return f"https://www.apartments.com/{home_type_lower}/{city_slug}/{middle_segment}/"

# The new build_search_url function that uses our LLM-based URL generation.

def parse_json_with_stripped_keys(text: str) -> dict:
    # Parse the JSON string
    data = json.loads(text)
    # Strip whitespace from all keys
    return {k.strip(): v for k, v in data.items()}

def build_search_url(query: str) -> str:
    formatted_prompt = prompt.format(
        query=query,
        format_instructions=parser.get_format_instructions()
    )
    llm_response = llm_url.invoke(formatted_prompt)
    response_text = llm_response.content if hasattr(llm_response, "content") else str(llm_response)
    # Clean the JSON output by stripping whitespace from keys
    fixed_data = parse_json_with_stripped_keys(response_text)
    # Create the Pydantic model using the cleaned dictionary
    parsed_params = ApartmentQuery(**fixed_data)
    return generate_apartments_url(parsed_params.dict())

# ------------------------------------------------------------------------------
# Custom Web Loader (unchanged)
# ------------------------------------------------------------------------------

class CustomWebLoader(WebBaseLoader):
    """
    A custom web loader that fetches a webpage with custom headers and retry logic,
    then filters the HTML to only include the desired content.
    Returns a LangChain Document.
    """
    def load(self):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        }
        session = requests.Session()
        retry_strategy = Retry(
            total=5,
            backoff_factor=1,
            status_forcelist=[500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"]
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("https://", adapter)
        session.mount("http://", adapter)

        try:
            response = session.get(self.web_path, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print("Error fetching the page:", e)
            return []
        
        # Filter the HTML to the desired section.
        filter_section = SoupStrainer(
            "div", 
            id="placardContainer", 
            class_="placardContainer", 
            attrs={"data-analytics-profiletype": "Unknown"}
        )
        content = BeautifulSoup(response.text, "html.parser", parse_only=filter_section).get_text(separator="\n", strip=True)
        doc = Document(page_content=content, metadata={"source": self.web_path})
        return [doc]

# ------------------------------------------------------------------------------
# Downstream Processing: Load, Chunk, Index, & QA Graph
# ------------------------------------------------------------------------------

# Use the new build_search_url function to get the Apartments.com URL.
user_query = "I want condos in Los Angeles, CA with studio options, priced over 1500, and any bathrooms."
target_url = build_search_url(user_query)
print("Generated URL:", target_url)

loader = CustomWebLoader(target_url)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=30, chunk_overlap=10)
all_splits = text_splitter.split_documents(docs)
_ = vector_store.add_documents(documents=all_splits)

# Define prompt for question-answering from hub.
prompt_qa = hub.pull("rlm/rag-prompt")

# Define application state.
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

# Step: Retrieve documents from the vector store.
def retrieve(state: State):
    retrieved_docs = vector_store.similarity_search(state["question"])
    return {"context": retrieved_docs}

# Step: Generate an answer using the QA prompt and LLM.
def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt_qa.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response.content}

# Build the LangGraph application.
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()
messages = {
        "system":
        "You are a confident world-class realtor. Don't say you don't know or have not enough information. Provide detailed information to the users about the apartments.",
        "question": user_query,
}
response = graph.invoke(messages)
print(response["answer"])

/Users/taejunsong/workspace/rag_tutorial/.env


/var/folders/5z/52k791n10qj37p5d0t3151w80000gn/T/ipykernel_72387/534467504.py:150: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  return generate_apartments_url(parsed_params.dict())


Generated URL: https://www.apartments.com/condos/los-angeles-ca/min-2-bedrooms-over-1500/


In [92]:
messages = {
        "system":
        "You are a confident world-class realtor. Don't say you don't know or have not enough information. Provide detailed information to the users about the apartments.",
        "question": user_query,
}
response = graph.invoke(messages)
print(response["answer"])

I don't have specific information on condo listings in Los Angeles that meet your criteria. However, I can suggest searching online platforms such as Zillow or Redfin for condos with studio options, priced over $1,500, and any bathrooms in Los Angeles, CA, that also allow pets and are furnished. This will provide you with a more comprehensive list of potential options.


In [88]:
docs

[Document(metadata={'source': 'https://www.apartments.com/condos/los-angeles-ca/min-2-bedrooms-over-1500/'}, page_content='2 Bedroom Condos for Rent in Los Angeles CA - 8,839 Rentals\n11929 Courtleigh Dr Unit 310\n11929 Courtleigh Dr Unit 310, Los Angeles, CA 90066\n1\n/\n27\n$4,399\n2 Beds, 2 Baths, 1,210 sq ft\n(213) 263-4293\nEmail\n611 Levering Ave Unit FL2-ID1306\n611 Levering Ave Unit FL2-ID1306, Los Angeles, CA 90024\n1\n/\n22\n3D Tours\nVirtual Tour\n$3,630\n2 Beds, 1 Bath, 950 sq ft\nPets Allowed\nFurnished\n(424) 239-1315\nEmail\n6923 Kittyhawk Ave Unit 202\n6923 Kittyhawk Ave Unit 202, Los Angeles, CA 90045\n1\n/\n11\n$3,399\n2 Beds, 2 Baths, 1,105 sq ft\n(213) 722-3401\nEmail\n5342 Fountain Ave Unit 404\n5342 Fountain Ave Unit 404, Los Angeles, CA 90029\n$2,995\n2 Beds, 2 Baths, 1,000 sq ft\n(213) 698-5379\nEmail\n10983 Bluffside Dr Unit FL2-ID1320\n10983 Bluffside Dr Unit FL2-ID1320, Los Angeles, CA 91604\n1\n/\n8\n$3,900\n2 Beds, 2 Baths, 990 sq ft\nPets Allowed\nBalcony\

In [93]:
from decouple import Config, RepositoryEnv
import os
from pathlib import Path
import re
import urllib.parse
import json
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup, SoupStrainer

from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field

from langchain import hub
from langgraph.graph import START, StateGraph  # For future extension with a full graph.
from typing_extensions import List, TypedDict

# ------------------------------------------------------------------------------
# Environment & Global Setup
# ------------------------------------------------------------------------------

root_dir = Path().resolve()
print(root_dir.parent / '.env')

config = Config(RepositoryEnv(root_dir.parent / '.env'))  # Explicitly load .env

os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"] = config('LANGSMITH_API_KEY')

# Initialize two LLM instances:
# 1. For URL generation.
llm_url = ChatOllama(model="llama3.2")
# 2. For downstream QA.
llm = ChatOllama(model="llama3.2")

embeddings = OllamaEmbeddings(model="llama3.2")
vector_store = InMemoryVectorStore(embeddings)

# ------------------------------------------------------------------------------
# LLM-based URL Generation Chain
# ------------------------------------------------------------------------------

# Define the structured schema for URL parameters.
class ApartmentQuery(BaseModel):
    city: str = Field(description="City with state abbreviation, e.g., 'Los Angeles, CA'")
    min_price: int = Field(description="Minimum price as an integer, e.g., 1500")
    home_type: str = Field(
        default="apartments",
        description="Home type: one of apartments, houses, condos, townhomes. Defaults to apartments."
    )
    bedrooms: str = Field(
        default="2+",
        description="Bedroom filter: one of any, studio, 1+, 2+, 3+, 4+. For a studio, output 'studios'. Defaults to 2+."
    )
    bathrooms: str = Field(
        default="1+",
        description="Bathroom filter: one of any, 1+, 2+, 3+. Defaults to 1+."
    )

# Set up the output parser.
parser = PydanticOutputParser(pydantic_object=ApartmentQuery)

# Define a prompt template that extracts exactly the required fields.
prompt_template = """
You are a data extraction assistant. Extract the following details from the user's query:
- city: The city with its state abbreviation (e.g., "Los Angeles, CA").
- min_price: The minimum price as an integer (e.g., 1500).
- home_type: One of: apartments, houses, condos, townhomes. (Default: apartments)
- bedrooms: One of: any, studio, 1+, 2+, 3+, 4+ (Default: 2+). For a studio, output "studios".
- bathrooms: One of: any, 1+, 2+, 3+ (Default: 1+).

Return your answer as a JSON object with exactly these keys: "city", "min_price", "home_type", "bedrooms", "bathrooms".
Query: "{query}"
"""
prompt = ChatPromptTemplate.from_template(prompt_template)

def generate_apartments_url(params: dict) -> str:
    """
    Build the Apartments.com URL using the provided parameters.
    """
    # Create a slug for the city (e.g., "Los Angeles, CA" -> "los-angeles-ca")
    city_slug = re.sub(r'[,\s]+', '-', params["city"].lower().strip())
    
    # Validate home type; default to apartments if not valid.
    home_type_lower = params["home_type"].lower().strip()
    if home_type_lower not in ["apartments", "houses", "condos", "townhomes"]:
        home_type_lower = "apartments"
    
    # Map bedroom filter (ensuring "studio" becomes "studios")
    bed_map = {
        "any": "",
        "studio": "studios",
        "1+": "min-1-bedrooms",
        "2+": "min-2-bedrooms",
        "3+": "min-3-bedrooms",
        "4+": "min-4-bedrooms"
    }
    bedroom_seg = bed_map.get(params["bedrooms"].lower().strip(), "min-2-bedrooms")
    
    # Map bathroom filter.
    bath_map = {
        "any": "",
        "1+": "1-bathrooms",
        "2+": "2-bathrooms",
        "3+": "3-bathrooms"
    }
    bathroom_seg = bath_map.get(params["bathrooms"].lower().strip(), "1-bathrooms")
    
    price_segment = f"over-{params['min_price']}"
    segments = []
    if bedroom_seg:
        segments.append(bedroom_seg)
    if bathroom_seg:
        segments.append(bathroom_seg)
    segments.append(price_segment)
    middle_segment = "-".join(segments)
    
    return f"https://www.apartments.com/{home_type_lower}/{city_slug}/{middle_segment}/"

def parse_json_with_stripped_keys(text: str) -> dict:
    """
    Load a JSON string and strip whitespace from its keys.
    """
    data = json.loads(text)
    return {k.strip(): v for k, v in data.items()}

def build_search_url(query: str) -> str:
    """
    Build the search URL by using the LLM to extract query parameters and generating the URL.
    """
    formatted_prompt = prompt.format(
        query=query,
        format_instructions=parser.get_format_instructions()
    )
    llm_response = llm_url.invoke(formatted_prompt)
    response_text = llm_response.content if hasattr(llm_response, "content") else str(llm_response)
    fixed_data = parse_json_with_stripped_keys(response_text)
    parsed_params = ApartmentQuery(**fixed_data)
    return generate_apartments_url(parsed_params.dict())

# ------------------------------------------------------------------------------
# Custom Web Loader
# ------------------------------------------------------------------------------

class CustomWebLoader(WebBaseLoader):
    """
    Custom web loader that fetches a webpage with retry logic and filters content.
    """
    def load(self):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
        session = requests.Session()
        retry_strategy = Retry(
            total=5,
            backoff_factor=1,
            status_forcelist=[500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"]
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("https://", adapter)
        session.mount("http://", adapter)
        try:
            response = session.get(self.web_path, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print("Error fetching the page:", e)
            return []
        filter_section = SoupStrainer(
            "div",
            id="placardContainer",
            class_="placardContainer",
            attrs={"data-analytics-profiletype": "Unknown"}
        )
        content = BeautifulSoup(response.text, "html.parser", parse_only=filter_section)\
            .get_text(separator="\n", strip=True)
        return [Document(page_content=content, metadata={"source": self.web_path})]

# ------------------------------------------------------------------------------
# Downstream Processing: Indexing & QA Pipeline
# ------------------------------------------------------------------------------

# Use the new build_search_url to get the target URL.
user_query = "I want condos in Los Angeles, CA with studio options, priced over 1500, and any bathrooms."
target_url = build_search_url(user_query)
print("Generated URL:", target_url)

# Load the webpage content.
loader = CustomWebLoader(target_url)
docs = loader.load()

# Split the document into chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=30, chunk_overlap=10)
all_splits = text_splitter.split_documents(docs)
vector_store.add_documents(documents=all_splits)

# Pull the QA prompt.
prompt_qa = hub.pull("rlm/rag-prompt")

# Define the application state.
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

# Step 1: Retrieve documents from the vector store.
def retrieve(state: State) -> State:
    state["context"] = vector_store.similarity_search(state["question"])
    return state

# Step 2: Generate an answer using the QA prompt and LLM.
def generate(state: State) -> State:
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt_qa.invoke({"question": state["question"], "context": docs_content})
    state["answer"] = llm.invoke(messages).content
    return state

# If a full LangGraph state graph isn’t callable in your version,
# you can manually chain the steps in a helper function.
def run_pipeline(state: State) -> State:
    state = retrieve(state)
    state = generate(state)
    return state

# Run the QA pipeline with the user query.
initial_state: State = {"question": user_query, "context": [], "answer": ""}
result_state = run_pipeline(initial_state)
print("QA Answer:", result_state["answer"])


/Users/taejunsong/workspace/rag_tutorial/.env


/var/folders/5z/52k791n10qj37p5d0t3151w80000gn/T/ipykernel_72387/545242190.py:146: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  return generate_apartments_url(parsed_params.dict())


Generated URL: https://www.apartments.com/condos/los-angeles-ca/min-2-bedrooms-over-1500/
QA Answer: I don't know the specific condos in Los Angeles, CA that meet your criteria of having studio options, priced over $1500, and any bathrooms. The provided context suggests that multiple condo listings are available with similar features, but without more information or specific details about each listing, it's difficult to provide a precise answer. You may want to visit a real estate website or consult with a property manager for more tailored results.


In [94]:
from decouple import Config, RepositoryEnv
import os
from pathlib import Path
import re
import urllib.parse
import json
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup, SoupStrainer

from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field

from langchain import hub
from langgraph.graph import START, StateGraph  # For potential future extension.
from typing_extensions import List, TypedDict

# ------------------------------------------------------------------------------
# Environment & Global Setup
# ------------------------------------------------------------------------------

root_dir = Path().resolve()
print(root_dir.parent / '.env')

config = Config(RepositoryEnv(root_dir.parent / '.env'))  # Explicitly load .env

os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"] = config('LANGSMITH_API_KEY')

# Initialize two LLM instances:
# 1. For URL generation.
llm_url = ChatOllama(model="llama3.2")
# 2. For downstream QA.
llm = ChatOllama(model="llama3.2")

embeddings = OllamaEmbeddings(model="llama3.2")
vector_store = InMemoryVectorStore(embeddings)

# ------------------------------------------------------------------------------
# LLM-based URL Generation Chain
# ------------------------------------------------------------------------------

class ApartmentQuery(BaseModel):
    city: str = Field(
        description="City with state abbreviation, e.g., 'Los Angeles, CA'"
    )
    min_price: int = Field(
        description="Minimum price as an integer, e.g., 1500"
    )
    home_type: str = Field(
        default="apartments",
        description="Home type: one of apartments, houses, condos, townhomes. Defaults to apartments."
    )
    bedrooms: str = Field(
        default="2+",
        description="Bedroom filter: one of any, studio, 1+, 2+, 3+, 4+. For a studio, output 'studios'. Defaults to 2+."
    )
    bathrooms: str = Field(
        default="1+",
        description="Bathroom filter: one of any, 1+, 2+, 3+. Defaults to 1+."
    )

# Set up the output parser.
parser = PydanticOutputParser(pydantic_object=ApartmentQuery)

# Define a prompt template for extracting URL parameters.
prompt_template = """
You are a data extraction assistant. Extract the following details from the user's query:
- city: The city with its state abbreviation (e.g., "Los Angeles, CA").
- min_price: The minimum price as an integer (e.g., 1500).
- home_type: One of: apartments, houses, condos, townhomes. (Default: apartments)
- bedrooms: One of: any, studio, 1+, 2+, 3+, 4+ (Default: 2+). For a studio, output "studios".
- bathrooms: One of: any, 1+, 2+, 3+ (Default: 1+).

Return your answer as a JSON object with exactly these keys: "city", "min_price", "home_type", "bedrooms", "bathrooms".
Query: "{query}"
"""
prompt = ChatPromptTemplate.from_template(prompt_template)

def generate_apartments_url(params: dict) -> str:
    # Create a slug for the city.
    city_slug = re.sub(r'[,\s]+', '-', params["city"].lower().strip())
    
    home_type_lower = params["home_type"].lower().strip()
    if home_type_lower not in ["apartments", "houses", "condos", "townhomes"]:
        home_type_lower = "apartments"
    
    # Map bedroom filter (with "studio" turning into "studios")
    bed_map = {
        "any": "",
        "studio": "studios",
        "1+": "min-1-bedrooms",
        "2+": "min-2-bedrooms",
        "3+": "min-3-bedrooms",
        "4+": "min-4-bedrooms"
    }
    bedroom_seg = bed_map.get(params["bedrooms"].lower().strip(), "min-2-bedrooms")
    
    # Map bathroom filter.
    bath_map = {
        "any": "",
        "1+": "1-bathrooms",
        "2+": "2-bathrooms",
        "3+": "3-bathrooms"
    }
    bathroom_seg = bath_map.get(params["bathrooms"].lower().strip(), "1-bathrooms")
    
    price_segment = f"over-{params['min_price']}"
    segments = []
    if bedroom_seg:
        segments.append(bedroom_seg)
    if bathroom_seg:
        segments.append(bathroom_seg)
    segments.append(price_segment)
    middle_segment = "-".join(segments)
    
    return f"https://www.apartments.com/{home_type_lower}/{city_slug}/{middle_segment}/"

def parse_json_with_stripped_keys(text: str) -> dict:
    data = json.loads(text)
    return {k.strip(): v for k, v in data.items()}

def build_search_url(query: str) -> str:
    formatted_prompt = prompt.format(
        query=query,
        format_instructions=parser.get_format_instructions()
    )
    llm_response = llm_url.invoke(formatted_prompt)
    response_text = llm_response.content if hasattr(llm_response, "content") else str(llm_response)
    fixed_data = parse_json_with_stripped_keys(response_text)
    parsed_params = ApartmentQuery(**fixed_data)
    return generate_apartments_url(parsed_params.dict())

# ------------------------------------------------------------------------------
# Custom Web Loader
# ------------------------------------------------------------------------------

class CustomWebLoader(WebBaseLoader):
    """
    Custom web loader that fetches a webpage with retry logic and filters content.
    """
    def load(self):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
        session = requests.Session()
        retry_strategy = Retry(
            total=5,
            backoff_factor=1,
            status_forcelist=[500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"]
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("https://", adapter)
        session.mount("http://", adapter)
        try:
            response = session.get(self.web_path, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print("Error fetching the page:", e)
            return []
        filter_section = SoupStrainer(
            "div",
            id="placardContainer",
            class_="placardContainer",
            attrs={"data-analytics-profiletype": "Unknown"}
        )
        content = BeautifulSoup(response.text, "html.parser", parse_only=filter_section)\
            .get_text(separator="\n", strip=True)
        return [Document(page_content=content, metadata={"source": self.web_path})]

# ------------------------------------------------------------------------------
# Downstream Processing: Indexing & QA Pipeline
# ------------------------------------------------------------------------------

# Use the new build_search_url function.
user_query = "I want condos in Los Angeles, CA with studio options, priced over 1500, and any bathrooms."
target_url = build_search_url(user_query)
print("Generated URL:", target_url)

loader = CustomWebLoader(target_url)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=30, chunk_overlap=10)
all_splits = text_splitter.split_documents(docs)
vector_store.add_documents(documents=all_splits)

# Instead of using the hub prompt, we define our own QA prompt messages.
def generate(state: dict) -> dict:
    # Combine the content of all retrieved documents.
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = [
        {
            "role": "system",
            "content": (
                "You are a confident, world-class realtor who has just crawled the website in real time. "
                "Based on the provided context, pick one specific condo listing that meets the criteria "
                "and provide detailed, specific information about it. Do not hedge or mention uncertainty."
            )
        },
        {
            "role": "user",
            "content": f"Based on the following context, please recommend one specific condo listing:\n\n{docs_content}"
        }
    ]
    response = llm.invoke(messages)
    state["answer"] = response.content
    return state

# Simple retrieval step: search the vector store using the question.
def retrieve(state: dict) -> dict:
    state["context"] = vector_store.similarity_search(state["question"])
    return state

# Define application state.
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

# Manually chain the steps.
def run_pipeline(state: State) -> State:
    state = retrieve(state)
    state = generate(state)
    return state

initial_state: State = {"question": user_query, "context": [], "answer": ""}
result_state = run_pipeline(initial_state)
print("QA Answer:", result_state["answer"])


/Users/taejunsong/workspace/rag_tutorial/.env


/var/folders/5z/52k791n10qj37p5d0t3151w80000gn/T/ipykernel_72387/3771287935.py:139: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  return generate_apartments_url(parsed_params.dict())


Generated URL: https://www.apartments.com/condos/los-angeles-ca/min-2-bedrooms-over-1500/
QA Answer: I recommend the first condo listing:

**123 Main St, Apt 3B**

Here's detailed information about this condo:

* **Unit Type:** 1 bedroom, 1 bathroom condo
* **Amenities:**
	+ Pets allowed (under 20 lbs, $50/month pet fee)
	+ Furnished with:
		- Queen-sized bed and mattress
		- Dresser and nightstand
		- Sofa bed in living room
		- Coffee table and TV stand
* **Building Features:**
	+ In-unit laundry
	+ Central air conditioning
	+ Elevator access to all floors
	+ Secure entry with intercom system
* **Neighborhood:**
	+ Located in the heart of downtown, within walking distance to shops, restaurants, and entertainment options
	+ Close proximity to public transportation and bike lanes
* **Lease Terms:** 12-month lease, rent includes utilities (heating, cooling, water)


In [104]:
from decouple import Config, RepositoryEnv
import os
from pathlib import Path
import re
import urllib.parse
import json
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup, SoupStrainer

from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field

from langchain import hub
from langgraph.graph import START, StateGraph  # For potential future extension.
from typing_extensions import List, TypedDict

# ------------------------------------------------------------------------------
# Environment & Global Setup
# ------------------------------------------------------------------------------

root_dir = Path().resolve()
print(root_dir.parent / '.env')

config = Config(RepositoryEnv(root_dir.parent / '.env'))  # Explicitly load .env

os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"] = config('LANGSMITH_API_KEY')

# Initialize two LLM instances:
# 1. For URL generation.
llm_url = ChatOllama(model="llama3.2")
# 2. For downstream QA.
llm = ChatOllama(model="llama3.2")

embeddings = OllamaEmbeddings(model="llama3.2")
vector_store = InMemoryVectorStore(embeddings)

# ------------------------------------------------------------------------------
# LLM-based URL Generation Chain
# ------------------------------------------------------------------------------

class ApartmentQuery(BaseModel):
    city: str = Field(
        description="City with state abbreviation, e.g., 'Los Angeles, CA'"
    )
    min_price: int = Field(
        description="Minimum price as an integer, e.g., 1500"
    )
    home_type: str = Field(
        default="apartments",
        description="Home type: one of apartments, houses, condos, townhomes. Defaults to apartments."
    )
    bedrooms: str = Field(
        default="2+",
        description="Bedroom filter: one of any, studio, 1+, 2+, 3+, 4+. For a studio, output 'studios'. Defaults to 2+."
    )
    bathrooms: str = Field(
        default="1+",
        description="Bathroom filter: one of any, 1+, 2+, 3+. Defaults to 1+."
    )

# Set up the output parser.
parser = PydanticOutputParser(pydantic_object=ApartmentQuery)

prompt_template = """
You are a data extraction assistant. Extract the following details from the user's query:
- city: The city with its state abbreviation (e.g., "Los Angeles, CA").
- min_price: The minimum price as an integer (e.g., 1500).
- home_type: One of: apartments, houses, condos, townhomes. (Default: apartments)
- bedrooms: One of: any, studio, 1+, 2+, 3+, 4+ (Default: 2+). For a studio, output "studios".
- bathrooms: One of: any, 1+, 2+, 3+ (Default: 1+).

Return your answer as a JSON object with exactly these keys: "city", "min_price", "home_type", "bedrooms", "bathrooms".
Query: "{query}"
"""
prompt = ChatPromptTemplate.from_template(prompt_template)

def generate_apartments_url(params: dict) -> str:
    city_slug = re.sub(r'[,\s]+', '-', params["city"].lower().strip())
    home_type_lower = params["home_type"].lower().strip()
    if home_type_lower not in ["apartments", "houses", "condos", "townhomes"]:
        home_type_lower = "apartments"
    bed_map = {
        "any": "",
        "studio": "studios",
        "1+": "min-1-bedrooms",
        "2+": "min-2-bedrooms",
        "3+": "min-3-bedrooms",
        "4+": "min-4-bedrooms"
    }
    bedroom_seg = bed_map.get(params["bedrooms"].lower().strip(), "min-2-bedrooms")
    bath_map = {
        "any": "",
        "1+": "1-bathrooms",
        "2+": "2-bathrooms",
        "3+": "3-bathrooms"
    }
    bathroom_seg = bath_map.get(params["bathrooms"].lower().strip(), "1-bathrooms")
    price_segment = f"over-{params['min_price']}"
    segments = []
    if bedroom_seg:
        segments.append(bedroom_seg)
    if bathroom_seg:
        segments.append(bathroom_seg)
    segments.append(price_segment)
    middle_segment = "-".join(segments)
    return f"https://www.apartments.com/{home_type_lower}/{city_slug}/{middle_segment}/"

def parse_json_with_stripped_keys(text: str) -> dict:
    try:
        data = json.loads(text)
    except json.JSONDecodeError as e:
        print("Failed to decode JSON from LLM response. Response text:")
        print(text)
        raise e
    return {k.strip(): v for k, v in data.items()}

def build_search_url(query: str) -> str:
    formatted_prompt = prompt.format(
        query=query,
        format_instructions=parser.get_format_instructions()
    )
    llm_response = llm_url.invoke(formatted_prompt)
    response_text = llm_response.content if hasattr(llm_response, "content") else str(llm_response)
    
    # If the response is empty, log and use default parameters
    if not response_text.strip():
        print("LLM response is empty. Using default parameters.")
        default_data = {
            "city": "Los Angeles, CA",
            "min_price": 1500,
            "home_type": "apartments",
            "bedrooms": "2+",
            "bathrooms": "1+"
        }
        parsed_params = ApartmentQuery(**default_data)
        return generate_apartments_url(parsed_params.dict())
    
    # Try to parse the JSON response
    try:
        fixed_data = parse_json_with_stripped_keys(response_text)
    except json.JSONDecodeError as e:
        print("Failed to decode JSON from LLM response. Response text:")
        print(response_text)
        # Use fallback default parameters if JSON parsing fails.
        default_data = {
            "city": "Los Angeles, CA",
            "min_price": 1500,
            "home_type": "apartments",
            "bedrooms": "2+",
            "bathrooms": "1+"
        }
        parsed_params = ApartmentQuery(**default_data)
        return generate_apartments_url(parsed_params.dict())
    
    parsed_params = ApartmentQuery(**fixed_data)
    return generate_apartments_url(parsed_params.dict())


# ------------------------------------------------------------------------------
# Custom Web Loader
# ------------------------------------------------------------------------------

class CustomWebLoader(WebBaseLoader):
    def load(self):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
        session = requests.Session()
        retry_strategy = Retry(
            total=5,
            backoff_factor=1,
            status_forcelist=[500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"]
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("https://", adapter)
        session.mount("http://", adapter)
        try:
            response = session.get(self.web_path, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print("Error fetching the page:", e)
            return []
        filter_section = SoupStrainer(
            "div",
            id="placardContainer",
            class_="placardContainer",
            attrs={"data-analytics-profiletype": "Unknown"}
        )
        content = BeautifulSoup(response.text, "html.parser", parse_only=filter_section)\
            .get_text(separator="\n", strip=True)
        return [Document(page_content=content, metadata={"source": self.web_path})]

# ------------------------------------------------------------------------------
# Downstream Processing: Indexing & QA Pipeline
# ------------------------------------------------------------------------------

user_query = "I want condos in Los Angeles, CA with studio options, priced over 1500, and any bathrooms."
target_url = build_search_url(user_query)
print("Generated URL:", target_url)

loader = CustomWebLoader(target_url)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)
all_splits = text_splitter.split_documents(docs)
vector_store.add_documents(documents=all_splits)

# Updated generate function with strict instructions to avoid hallucinations.
def generate(state: dict) -> dict:
    # Combine all retrieved document contents.
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = [
        {
            "role": "system",
            "content": (
                "You are a highly confident, precise real estate recommendation engine. "
                "Based solely on the provided crawled context, you must select one specific condo listing that best meets the search criteria. "
                "Even if some details are missing, combine the available information to provide a clear, detailed recommendation. "
                "Do NOT state that you cannot recommend or express uncertainty. "
                "Always output a recommendation that includes the exact address, unit type, and amenities as they appear in the context. "
                "If a particular detail is not mentioned, simply omit it—do not add or hallucinate any information."
            )
        },
        {
            "role": "user",
            "content": (
                "Below is the crawled context from apartments.com:\n\n"
                f"{docs_content}\n\n"
                "Based solely on this data, please provide a detailed recommendation for one specific condo listing that meets the search criteria. "
                "Include all available details (e.g. address, unit type, and amenities) exactly as found in the context."
            )
        }
    ]
    response = llm.invoke(messages)
    state["answer"] = response.content
    return state


def retrieve(state: dict) -> dict:
    state["context"] = vector_store.similarity_search(state["question"])
    return state

class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

def run_pipeline(state: State) -> State:
    state = retrieve(state)
    state = generate(state)
    return state

initial_state: State = {"question": user_query, "context": [], "answer": ""}
result_state = run_pipeline(initial_state)
print("QA Answer:", result_state["answer"])


/Users/taejunsong/workspace/rag_tutorial/.env
Failed to decode JSON from LLM response. Response text:
{ city: "Los Angeles, CA", min_price: 1500, home_type: "condos", bedrooms: "studios", bathrooms: "any" }
Failed to decode JSON from LLM response. Response text:
{ city: "Los Angeles, CA", min_price: 1500, home_type: "condos", bedrooms: "studios", bathrooms: "any" }
Generated URL: https://www.apartments.com/apartments/los-angeles-ca/min-2-bedrooms-1-bathrooms-over-1500/


/var/folders/5z/52k791n10qj37p5d0t3151w80000gn/T/ipykernel_72387/2442583574.py:162: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  return generate_apartments_url(parsed_params.dict())


QA Answer: I recommend the condo listing at:

**755 S Spring St, Los Angeles, CA 90014**

This studio condominium (Unit Type: 1) is a great match for your search criteria. The building offers various amenities, including:

- In Unit Washer & Dryer
- Dishwasher
- Kitchen with refrigerator

Additional details include:

- Pets Allowed
- Specials available

You can reach the property manager at (747) 307-6352 or via email to inquire about this unit.

Please note that there are two other listings ($3,100 - $3,200 and $8,950), but they have different amenities and prices, which may not align with your search criteria. This studio condo at 755 S Spring St appears to be the most suitable option based on the provided information.
