In [1]:
from decouple import Config, RepositoryEnv
import os
from pathlib import Path

from langchain.chat_models import init_chat_model
from langchain_ollama import OllamaEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama
import urllib.parse
from langchain.chains import RetrievalQA

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import bs4
from bs4 import BeautifulSoup, SoupStrainer
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document

from langchain import hub
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
root_dir = Path().resolve()
print(root_dir.parent/'.env')


USER_AGENT environment variable not set, consider setting it to identify your requests.


/Users/taejunsong/workspace/rag_tutorial/.env


In [4]:
config = Config(RepositoryEnv(root_dir.parent/'.env'))  # Explicitly load .env
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"] = config('LANGSMITH_API_KEY')
# os.environ["GROQ_API_KEY"] = config("GROQ_API_KEY")
# llm = init_chat_model("llama3-8b-8192", model_provider="groq")
llm = ChatOllama(model="llama3.2")

In [5]:
embeddings = OllamaEmbeddings(model="llama3.2")
vector_store = InMemoryVectorStore(embeddings)

In [18]:
# TODO: Change this into LLM
def build_search_url(query: str) -> str:
    """
    Build the search URL from a user query.
    Note: This URL structure is hypothetical. You’ll need to adjust the base URL
    and query parameters based on how apartment.com constructs its search URLs.
    """
    base_url = "https://www.apartment.com/search"  # Adjust as necessary
    params = {"q": query}
    search_url = f"{base_url}?{urllib.parse.urlencode(params)}"
    return search_url
    
class CustomWebLoader(WebBaseLoader):
    """
    A custom web loader that fetches a webpage with custom headers and retry logic,
    then filters the HTML to only include the <section> tag with the specified attributes.
    The filtered HTML is returned as a LangChain Document object.
    """
    def load(self):
        # Define custom headers to mimic a browser.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        }
        # Create a session with retry logic.
        session = requests.Session()
        retry_strategy = Retry(
            total=5,
            backoff_factor=1,
            status_forcelist=[500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"]
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("https://", adapter)
        session.mount("http://", adapter)

        try:
            response = session.get(self.web_path, headers=headers, timeout=10)
            response.raise_for_status()  # Check for HTTP errors.
        except requests.exceptions.RequestException as e:
            print("Error fetching the page:", e)
            return []
        
        # Use SoupStrainer to filter only the desired <section> tag.
        # filter_section = SoupStrainer("section", attrs={
        #     "class": "placards placardsv2",
        #     "id": "placards",
        #     "data-nosnippet": ""
        # })
        filter_section = SoupStrainer(
            "div", 
            id="placardContainer", 
            class_="placardContainer", 
            attrs={"data-analytics-profiletype": "Unknown"}
        )
        content = BeautifulSoup(response.text, "html.parser", parse_only=filter_section).get_text(separator="\n", strip=True)
        doc = Document(page_content=content, metadata={"source": self.web_path})
        return [doc]


In [39]:
target_url

'https://www.apartment.com/search?q=2+bedroom+apartments+in+Los+Angeles'

In [32]:
# Load and chunk contents of the blog
user_query = "2 bedroom apartments in Los Angeles"
target_url = build_search_url(user_query)
loader = CustomWebLoader(target_url)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
all_splits = text_splitter.split_documents(docs)

# Index chunks
_ = vector_store.add_documents(documents=all_splits)

# Define prompt for question-answering
prompt = hub.pull("rlm/rag-prompt")


# Define state for application
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str


# Define application steps
def retrieve(state: State):
    retrieved_docs = vector_store.similarity_search(state["question"])
    return {"context": retrieved_docs}


def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response.content}


# Compile application and test
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [33]:
docs

[Document(metadata={'source': 'https://www.apartment.com/search?q=2+bedroom+apartments+in+Los+Angeles'}, page_content='Apartments for Rent in Chicago IL - 23,465 Rentals\nPresidential Towers\n555 W Madison St, Chicago, IL 60661\n1\n/\n72\n3D Tours\nVideos\nVirtual Tour\n$1,550 - $9,400\nStudio - 2 Beds\nPets Allowed\nFitness Center\nGrill\nCourtyard\n(708) 725-1991\nEmail\nThe Residences at NewCity\n1457 N Halsted St, Chicago, IL 60642\n1\n/\n72\n3D Tours\nVideos\nVirtual Tour\n$2,699 - $5,265\nStudio - 2 Beds\n1 Month Free\nPets Allowed\nFitness Center\nPool\nDishwasher\nRefrigerator\nKitchen\n(708) 919-2291\nEmail\nMILA\n201 N Garland Ct, Chicago, IL 60601\n1\n/\n74\n3D Tours\nVideos\nVirtual Tour\n$2,232 - $5,765\nStudio - 2 Beds\nPets Allowed\nFitness Center\nPool\nPackage Service\n(833) 787-0144\nEmail\nSentral Michigan Avenue\n808 S Michigan Ave, Chicago, IL 60605\n1\n/\n31\n3D Tours\nVideos\nVirtual Tour\n$2,141 - $11,500\nStudio - 4 Beds\nSpecials\nPets Allowed\nFitness Center\

In [38]:
messages = {
    
        "system":
        "You are a confident world-class realtor. Don't say you don't know or have not enough information. Provide detailed information to the users about the apartments.",
        "question": "I want you to recommend the room in Chicago",
}
response = graph.invoke(messages)
print(response["answer"])

I don't know. The context seems to be about a building, but I'm not sure which one and what room features you're referring to. If you could provide more information or clarify your question, I'll do my best to assist you.


In [None]:
bs4_strainer = bs4.SoupStrainer(class_=("post-title", "post-header", "post-content"))
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": bs4_strainer},
)
docs = loader.load()

assert len(docs) == 1
print(f"Total characters: {len(docs[0].page_content)}")

In [None]:
print(docs[0].page_content[:500])

In [None]:
def build_search_url(query: str) -> str:
    """
    Build the search URL from a user query.
    Note: This URL structure is hypothetical. You’ll need to adjust the base URL
    and query parameters based on how apartment.com constructs its search URLs.
    """
    base_url = "https://www.apartment.com/search"  # Adjust as necessary
    params = {"q": query}
    search_url = f"{base_url}?{urllib.parse.urlencode(params)}"
    return search_url

def search_apartment(query: str) -> str:
    """
    Given a search query, build the URL, load the webpage content,
    and then use a retrieval pipeline to extract relevant information.
    """
    # Construct the search URL
    search_url = build_search_url(query)
    print(f"Searching URL: {search_url}")
    
    # Use the WebBaseLoader to load content from the constructed URL
    loader = WebBaseLoader(search_url)
    documents = loader.load()
    
    # Build embeddings and vector store from the retrieved documents
    embeddings = OllamaEmbeddings()  # Ensure your OpenAI API key is set
    vectorstore = InMemoryVectorStore.from_documents(documents, embeddings)
    
    # Build the RetrievalQA pipeline using the vector store
    qa = RetrievalQA.from_chain_type(
        llm=ChatOllama(),
        chain_type="stuff",
        retriever=vectorstore.as_retriever()
    )
    
    # Use the QA system to get an answer based on your original query
    result = qa.run(query)
    return result

# Example usage:
if __name__ == "__main__":
    user_query = "2 bedroom apartments in Los Angeles"
    result = search_apartment(user_query)
    print("Search Result:")
    print(result)


In [None]:
import requests

In [None]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup

url = 'https://www.apartment.com/search?q=2+bedroom+apartments+in+Los+Angeles'  # Replace with your target URL

# Create a session with retry logic
session = requests.Session()
retry_strategy = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[500, 502, 503, 504],
    allowed_methods=["HEAD", "GET", "OPTIONS"]
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)
session.mount("http://", adapter)

# Define headers to mimic a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
}

try:
    response = session.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # Raise an error for bad responses
except requests.exceptions.RequestException as e:
    print("Error fetching the page:", e)
else:
    # Parse and prettify the HTML with BeautifulSoup
    only_placards = bs4.SoupStrainer("section", attrs={
    "class": "placards placardsv2",
    "id": "placards",
    "data-nosnippet": ""
    })
    soup = BeautifulSoup(response.text, 'html.parser', parse_only=only_placards)
    print(soup.prettify())


In [2]:


class CustomWebLoader(WebBaseLoader):
    """
    A custom web loader that fetches a webpage with custom headers and retry logic,
    then filters the HTML to only include the <section> tag with the specified attributes.
    The filtered HTML is returned as a LangChain Document object.
    """
    def load(self):
        # Define custom headers to mimic a browser.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        }
        # Create a session with retry logic.
        session = requests.Session()
        retry_strategy = Retry(
            total=5,
            backoff_factor=1,
            status_forcelist=[500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"]
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("https://", adapter)
        session.mount("http://", adapter)

        try:
            response = session.get(self.web_path, headers=headers, timeout=10)
            response.raise_for_status()  # Check for HTTP errors.
        except requests.exceptions.RequestException as e:
            print("Error fetching the page:", e)
            return []
        
        # Use SoupStrainer to filter only the desired <section> tag.
        # filter_section = SoupStrainer("section", attrs={
        #     "class": "placards placardsv2",
        #     "id": "placards",
        #     "data-nosnippet": ""
        # })
        filter_section = SoupStrainer(
            "div", 
            id="placardContainer", 
            class_="placardContainer", 
            attrs={"data-analytics-profiletype": "Unknown"}
        )
        soup = BeautifulSoup(response.text, "html.parser", parse_only=filter_section)
        
        # Prettify the filtered HTML (if found) and create a Document.
        content = soup.prettify() if soup and soup.contents else ""
        doc = Document(page_content=content, metadata={"source": self.web_path})
        return [doc]

# Example usage:
if __name__ == "__main__":
    target_url = 'https://www.apartment.com/search?q=2+bedroom+apartments+in+Los+Angeles'
    loader = CustomWebLoader(target_url)
    documents = loader.load()
    
    if documents and documents[0].page_content:
        print("Filtered content:")
        print(documents[0].page_content)
    else:
        print("No matching content found.")


Filtered content:
<div class="placardContainer" data-analytics-profiletype="Unknown" id="placardContainer">
 <h1 class="placardSearchHeading">
  Apartments for Rent in Chicago IL - 23,460 Rentals
 </h1>
 <ul>
  <li class="mortar-wrapper">
   <article class="placard placard-option-diamond has-header js-diamond" data-ck="80ple21" data-countrycode="US" data-listingid="dsd9v8j" data-streetaddress="555 W Madison St" data-url="https://www.apartments.com/presidential-towers-chicago-il/dsd9v8j/">
    <header class="placard-header has-logo">
     <div class="property-information">
      <a aria-label="Presidential Towers, Chicago, IL" class="property-link" href="https://www.apartments.com/presidential-towers-chicago-il/dsd9v8j/">
       <div class="property-title" title="Presidential Towers, Chicago, IL">
        <span class="js-placardTitle title">
         Presidential Towers
        </span>
       </div>
       <div class="property-address js-url" title="555 W Madison St, Chicago, IL 60661">

In [15]:
# Load and chunk contents of the blog
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()
docs

[Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, page_content='\n\n      LLM Powered Autonomous Agents\n    \nDate: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng\n\n\nBuilding agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview#\nIn a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:\n\nPlanning\n\nSubgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\nReflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistake