In [1]:
import getpass
import os

In [2]:
# Import relevant functionality
from langchain_anthropic import ChatAnthropic
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.messages import HumanMessage, SystemMessage
from langgraph.checkpoint.memory import MemorySaver
from langgraph.prebuilt import create_react_agent
from langchain_deepseek import ChatDeepSeek
from langchain.prompts import (
    SystemMessagePromptTemplate,
    PromptTemplate,
    ChatPromptTemplate,
    HumanMessagePromptTemplate
)

# Create the agent
memory = MemorySaver()
model = ChatDeepSeek(
    model="deepseek-chat",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)
search = TavilySearchResults(max_results=2)
tools = [search]
agent_executor = create_react_agent(model, tools, checkpointer=memory)

In [3]:
system_prompt = """
You are an expert support agent for partselect.com. 
Your role is to answer user questions related to refrigerator and dishwasher parts strictly and exclusively using the information found on the company website partselect.com.

Respond only and exclusively using the information contained in the provided website. 
Do not introduce any information that is not present in the website.
If the provided website does not contain sufficient information to answer the question, or if the website does not directly address the user’s query, redirect the user to customer support then prompt them to ask a question within your expertise in a happy manner.

Do not explain your answer or provide any additional commentary. 
Your responses should be concise and focused on addressing the user's query using only the provided information.

Adhere to the context and limitations at all times. 
If any part of the question cannot be answered with the provided website, you must refrain from speculation or the use of external knowledge.

Ask follow up questions if necessary. 
Provide answer with complete details in a proper formatted manner with working links and resources wherever applicable within the company's website. 
Never provide wrong links.
"""

In [8]:
msg = {"messages": [HumanMessage(content="I want to cancel my order"), SystemMessage(content=system_prompt)]}
config = {"configurable": {"thread_id": "dsooh"}}
answer = agent_executor.invoke(msg, config)

In [14]:
answer["messages"][-1].content

'To assist you with canceling your order, please contact our customer support team directly. They will be happy to help you with your request. \n\nYou can reach them [here](https://www.partselect.com/Support/). Let me know if you have any other questions about parts or appliances!'

In [75]:
# Use the agent
config = {"configurable": {"thread_id": "dsooh"}}
for step in agent_executor.stream(
    {"messages": [HumanMessage(content="I want to cancel my order"), SystemMessage(content=system_prompt)]},
    config,
    stream_mode="values",
):
    step["messages"][-1].pretty_print()



You are an expert support agent for partselect.com. 
Refer to partselect.com as 'our' website.
Your role is to answer user questions related to refrigerator and dishwasher parts strictly and exclusively using the information found on the company website partselect.com.

Respond only and exclusively using the information contained in the provided website. 
Do not introduce any information that is not present in the website.
If the provided website does not contain sufficient information to answer the question, or if the website does not directly address the user’s query, redirect the user to customer support then prompt them to ask a question within your expertise in a happy manner.

Do not explain your answer or provide any additional commentary. 
Your responses should be concise and focused on addressing the user's query using only the provided information.

Adhere to the context and limitations at all times. 
If any part of the question cannot be answered with the provided website,

In [49]:
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import Html2TextTransformer
import nest_asyncio

nest_asyncio.apply()

url = ["https://www.partselect.com/"]
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
loader = AsyncChromiumLoader(url, user_agent=user_agent, headless=False)
docs = loader.load()
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(docs)
docs_transformed[0].page_content

"Skip to main content\n\nSign In | Create Account\n\nYour Account | Your Account\n\n  * Your Orders \n  * Your Models \n  * Your Subscriptions \n  * Your Models \n  *   * Home\n  * Find by Brand\n  * Find by Product\n  * Find by Symptom\n  * Blog\n  * Repair Help\n  * Water Filter Finder\n  *   * 1-866-319-8402\n  * Contact \n  * Order Status \n  * Sign out\n\nBack\n\n  * Appliance Brands\n  * Admiral\n  * Frigidaire\n  * General Electric\n  * Jenn-Air\n  * KitchenAid\n  * LG\n  * Maytag\n  * Whirlpool\n  * White-Westinghouse\n  * See all Appliance Brands\n\n  * Lawn Equipment Brands\n  * Ariens\n  * Briggs and Stratton\n  * Echo\n  * Husqvarna\n  * MTD\n  * Murray\n  * Poulan\n  * Troy-Bilt\n  * See all Lawn Equipment Brands\n\nBack\n\n  * Appliances\n  * Dishwasher\n  * Dryer\n  * Stove\n  * Refrigerator\n  * Washer\n  * See all Appliances\n\n  * Lawn Equipment\n  * Lawn Mower\n  * Snow Blower\n  * Lawn Tractor\n  * Chainsaw\n  * Tiller\n  * Generator\n  * See all Lawn Equipment\n\nC

In [66]:
import asyncio
from crawl4ai import *
import nest_asyncio
nest_asyncio.apply()

async def main():
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=1, 
            include_external=False
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True
    )
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun(
            url="http://www.partselect.com/PS12364199-Frigidaire-242126602-Refrigerator-Door-Shelf-Bin.htm/",
            config=config
        )
        print(f"Crawled {len(results)} pages in total")
    return results
results = asyncio.run(main())

Crawled 181 pages in total


In [67]:
results.links

AttributeError: 'list' object has no attribute 'links'

In [68]:
import asyncio
from langchain.schema import Document
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

async def main():
    # 1) Browser config: headless, bigger viewport, no proxy
    browser_conf = BrowserConfig(
        headless=True,
        viewport_width=1280,
        viewport_height=720
    )

    # 2) Example extraction strategy
    schema = {
        "name": "Articles",
        "baseSelector": "div.article",
        "fields": [
            {"name": "title", "selector": "h2", "type": "text"},
            {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
        ]
    }
    extraction = JsonCssExtractionStrategy(schema)

    # 3) Example LLM content filtering

    llm_config = LLMConfig(
        provider="deepseek/deepseek-chat",
        api_token = "env:DEEPSEEK_API_TOKEN"
    )

    # Initialize LLM filter with specific instruction
    filter = LLMContentFilter(
        llm_config=llm_config,  # or your preferred provider
        instruction="""
        Focus on extracting the core educational content.
        Include:
        - Key concepts and explanations
        - Important code examples
        - Essential technical details
        Exclude:
        - Navigation elements
        - Sidebars
        - Footer content
        Format the output as clean markdown with proper code blocks and headers.
        """,
        chunk_token_threshold=500,  # Adjust based on your needs
        verbose=True
    )

    md_generator = DefaultMarkdownGenerator(
    content_filter=filter,
    options={"ignore_links": True}

    # 4) Crawler run config: skip cache, use extraction
    run_conf = CrawlerRunConfig(
        markdown_generator=md_generator,
        extraction_strategy=extraction,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler(config=browser_conf) as crawler:
        # 4) Execute the crawl
        result = await crawler.arun(url="https://example.com/news", config=run_conf)

        if result.success:
            print("Extracted content:", result.extracted_content)
        else:
            print("Error:", result.error_message)

if __name__ == "__main__":
    asyncio.run(main())

SyntaxError: '(' was never closed (3727510329.py, line 51)