### Import libraries and set up environment variables

In [69]:
import load_dotenv
load_dotenv.load_dotenv()
from langchain_aws import ChatBedrock
from langchain_core.tools import Tool
from langchain_google_community import GoogleSearchAPIWrapper
from langchain_core.messages import SystemMessage, HumanMessage
from langgraph.constants import Send
from langchain_community.document_loaders import AsyncChromiumLoader, AsyncHtmlLoader
from langchain_community.document_transformers import BeautifulSoupTransformer, Html2TextTransformer

### LLM Setup

In [55]:
# Chat model served through AWS Bedrock; the model id and the sampling
# temperature (0.2) are the only settings configured here.
llm = ChatBedrock(
    model_kwargs={"temperature": 0.2},
    model_id="us.anthropic.claude-3-5-sonnet-20241022-v2:0",
)

In [50]:
# Smoke test: send a trivial prompt and display the text content of the reply.
llm.invoke("hi").content

'Hello! How can I help you today?'

### Structured-output LLM definitions

In [None]:
# `operator` is imported directly: it is not part of typing's public API and
# was only reachable through `typing` because that module imports it internally.
import operator
from typing import Annotated, Dict, List, TypedDict

from pydantic import BaseModel, Field

class Product(BaseModel):
    # Structured-output schema for a single product.
    #
    # Every field defaults to None, so none of them is required.
    #
    # NOTE: there must be no trailing comma after a Field(...) call. With a
    # trailing comma the class attribute becomes a one-element tuple
    # `(FieldInfo,)`, so the tuple is used as the default value and the
    # description never reaches the generated schema.
    #
    # A `#` comment is used instead of a class docstring on purpose: adding a
    # docstring would change the description emitted in the model's schema.
    id: int = Field(None, description="Product ID.")
    name: str = Field(None, description="Product name.")
    description: str = Field(None, description="Product description.")
    model_number: str = Field(None, description="Product model number.")
    price: float = Field(None, description="Product price.")
    features: Dict = Field(None, description="Product features (key-value pairs).")
    images: List[str] = Field(None, description="Product images.")
 
class ProductListing(BaseModel):
    # Container schema: the list of products extracted for one category.
    products: List[Product] = Field(
        default=None,
        description="Product list with details for given category.",
    )

class SearchQuery(BaseModel):
    # Schema for a search-engine query plus the reason it was chosen.
    search_query: str = Field(
        default=None,
        title="Search Query",
        description="Query that is optimized for search engines",
    )
    justification: str = Field(
        default=None,
        title="Justification",
        description="Why this query is relevant for the user's request",
    )
 
# Bind each schema to the base model so responses are returned as instances
# of that schema rather than as free-form text.
product_listing_llm, search_query_llm = (
    llm.with_structured_output(ProductListing),
    llm.with_structured_output(SearchQuery),
)

### LangGraph state definition

In [64]:
# Graph State
class State(TypedDict):
    # Shared state of the main graph; each node returns a partial update
    # keyed by these field names.

    # Structured query; the search node reads its `search_query` text.
    search_query: SearchQuery
    # Set by the search node (`top_google_search_links`).
    google_search_links: list
    # Not read or written by any node visible in this part of the notebook.
    category_name: str
    # Not read or written by any node visible in this part of the notebook.
    products: List[Product]
    # `operator.add` is attached as annotation metadata; presumably LangGraph
    # uses it as the reducer that concatenates worker outputs - TODO confirm.
    completed_products: Annotated[list, operator.add] # all workers write to this in parallel

class WorkerState(TypedDict):
    # Per-worker state: one worker is expected to handle a single link.

    # URL assigned to this worker.
    link: str
    # Not read or written by any node visible in this part of the notebook.
    products: List[Product]
    # Same name and annotation as `State.completed_products`, so worker
    # results can be merged into the main state (presumably via the
    # `operator.add` reducer - TODO confirm).
    completed_products: Annotated[list, operator.add]

### LangGraph helper functions and tools

In [60]:
# Google Custom Search client shared by the helper below.
search = GoogleSearchAPIWrapper()


def top_results(query):
    """Return the first 10 Google results for ``query``.

    The request is sent with the extra search parameters ``gl="in"`` and
    ``hl="en"``.
    """
    extra_search_params = {"gl": "in", "hl": "en"}
    return search.results(query, 10, extra_search_params)

# Expose `top_results` as a LangChain tool named "google_search".
tool = Tool(
    func=top_results,
    name="google_search",
    description="Search Google for recent results.",
)

In [None]:
def scrap_link_content(link):
    """Fetch a web page and return its content converted to plain text.

    Args:
        link: URL of the page to fetch.

    Returns:
        The text of the first converted document, or an empty string when
        nothing was loaded for the URL.
    """
    html_docs = AsyncHtmlLoader([link]).load()
    text_docs = Html2TextTransformer().transform_documents(html_docs)
    if not text_docs:
        return ""
    return text_docs[0].page_content

### LangGraph node definitions

In [66]:
# Nodes
def top_google_search_links(state: State):
    """Graph node: run the Google search and collect the result links.

    Reads ``state["search_query"].search_query``, passes it to the
    ``google_search`` tool and keeps the ``link`` of every result whose URL
    does not contain "indiamart".

    Args:
        state: Graph state; only ``search_query`` is read.

    Returns:
        A partial state update ``{"google_search_links": [...]}`` holding the
        filtered URLs in search-result order.
    """
    results = tool.run(state["search_query"].search_query)
    links = []
    for result in results:
        # Entries without a "link" key (for example a "no results"
        # placeholder entry) are skipped instead of raising KeyError.
        link = result.get("link")
        if link and "indiamart" not in link.lower():
            links.append(link)
    return {"google_search_links": links}

In [None]:
from langgraph.constants import Send

def scrap_content_and_make_products(state: WorkerState):
    """Worker node: scrape one link and extract the products found on it.

    The page behind ``state["link"]`` is converted to text with
    ``scrap_link_content`` and handed to ``product_listing_llm``, which
    answers with a ``ProductListing``.

    Args:
        state: Worker state; only ``link`` is read.

    Returns:
        A partial state update ``{"completed_products": [...]}`` with the
        extracted products (an empty list when none were returned).
    """
    link = state["link"]
    page_text = scrap_link_content(link)
    listing = product_listing_llm.invoke(
        [
            SystemMessage(
                content=(
                    "You extract product listings from web page text. "
                    "Return every product described on the page, using only "
                    "information present in the text, and leave a field "
                    "empty when the page does not provide it."
                )
            ),
            HumanMessage(content=f"Source URL: {link}\n\nPage content:\n{page_text}"),
        ]
    )
    # `products` defaults to None in ProductListing, and the structured call
    # may yield no object at all; normalise both cases to an empty list.
    products = getattr(listing, "products", None) or []
    return {"completed_products": list(products)}

In [27]:
import asyncio

async def func():
    """Print a start marker, sleep for two seconds on the event loop, then print a done marker."""
    pause_seconds = 2
    print("started.")
    await asyncio.sleep(pause_seconds)
    print("done.")

# Top-level await: valid in an IPython/Jupyter cell, not in a plain Python script.
await func()

started.
done.


In [29]:
import asyncio
# Fetch asyncio's current event loop; `loop` is not referenced by any later
# cell shown in this notebook.
loop = asyncio.get_event_loop()
# loop.create_task(some_async_function())

In [31]:
# Select asyncio as the runner IPython uses for top-level `await` in cells.
%autoawait asyncio

In [33]:
from langchain_community.document_loaders import AsyncChromiumLoader, AsyncHtmlLoader
from langchain_community.document_transformers import BeautifulSoupTransformer, Html2TextTransformer

# Load the raw HTML of the Honda India water-pump category page.
# NOTE(review): the text extracted from this page further down is only
# "You need to enable JavaScript to run this app.", so this loader apparently
# does not execute JavaScript - confirm before relying on it for such sites.
loader = AsyncHtmlLoader(["https://www.hondaindiapower.com/product-category/water-pumps"])
html = loader.load()

In [34]:
# Display the loaded documents.
html

[Document(metadata={'source': 'https://www.hondaindiapower.com/product-category/water-pumps', 'language': 'en'}, page_content='<!doctype html><html lang="en"><head><meta charset="utf-8"><link rel="icon" href="/favicon.ico"><meta name="author" content="Honda Power"><meta name="viewport" content="width=device-width,initial-scale=1"><meta name="theme-color" content="#000000"><link rel="apple-touch-icon" href="/favicon.ico"><link rel="manifest" href="/manifest.json"><meta name="facebook-domain-verification" content="p7o95aw9ypz39twgx5yz1l188se21a"><link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,400,500,700&display=swap"><script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.6.0/jquery.slim.min.js"></script><script async src="https://www.googletagmanager.com/gtag/js?id=UA-116821454-1"></script><script>function gtag(){dataLayer.push(arguments)}window.dataLayer=window.dataLayer||[],gtag("js",new Date),gtag("config","UA-116821454-1")</script><script async 

In [35]:
# Convert the loaded HTML documents to plain text and show how many
# documents came out of the transformation.
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(html)
len(docs_transformed)

1

In [36]:
# Preview the first 500 characters of the converted text.
docs_transformed[0].page_content[0:500]

'You need to enable JavaScript to run this app.\n\n'

In [None]:
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer

from langchain_community.document_loaders import AsyncHtmlLoader

# Load the page through AsyncChromiumLoader (browser-based loading;
# presumably this lets JavaScript-rendered content appear in the HTML -
# TODO confirm against the loader documentation).
urls = ["https://docs.smith.langchain.com/"]
loader = AsyncChromiumLoader(urls)
docs = loader.load()

# Transform the documents that were just loaded. Passing `html` here would
# silently re-process the page fetched in an earlier cell instead.
bs_transformer = BeautifulSoupTransformer()
docs_transformed = bs_transformer.transform_documents(docs)

# Keep the preview as the last expression so the cell actually displays it.
docs[0].page_content[0:100]

In [34]:
# Inspect the first loaded document.
docs[0]



In [45]:
def scrap_link_content(link: str):
    """Fetch a web page and return its content converted to plain text.

    Error pages come back as ordinary content: the example call recorded in
    this notebook returns the text of an "Access Denied" page rather than
    raising, so callers that need real product data should inspect the
    returned text.

    Args:
        link: URL of the page to fetch.

    Returns:
        The text of the first converted document, or an empty string when
        nothing was loaded for the URL.
    """
    html_docs = AsyncHtmlLoader([link]).load()
    # Use a fresh transformer so the function does not depend on a
    # module-level `html2text` object created in an unrelated scratch cell.
    text_docs = Html2TextTransformer().transform_documents(html_docs)
    if not text_docs:
        return ""
    return text_docs[0].page_content

In [47]:
scrap_link_content("https://powerequipment.honda.com/pumps/models/wx10")

'# Access Denied\n\nYou don\'t have permission to access\n"http://powerequipment.honda.com/pumps/models/wx10" on this server.\n\nReference #18.356c3f17.1740743202.815cf18\n\nhttps://errors.edgesuite.net/18.356c3f17.1740743202.815cf18\n\n'