In [None]:
# CELL 1: Setup Environment and Dependencies
#
# Writes a dotenv file with placeholder API keys and installs the packages
# the later cells import. Replace the placeholder values with real keys.
import os
import subprocess
import sys

# Create .env directory if it doesn't exist
os.makedirs('.env', exist_ok=True)

# Write environment file
env_content = """GOOGLE_API_KEY=xxxxxx
SERPER_API_KEY=xxxxxx"""

with open('.env/.env', 'w', encoding='utf-8') as f:
    f.write(env_content)

print("✅ Environment file created successfully!")

# Install required packages
packages = [
    "langgraph",
    "langchain",
    "langchain-google-genai",
    "beautifulsoup4",
    "requests",
    "python-dotenv"
]

failed_packages = []
for package in packages:
    # Run pip through the current interpreter so the packages land in the
    # environment this notebook kernel actually uses, not whichever `pip`
    # happens to be first on PATH.
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", package],
        capture_output=True,
        text=True,
    )
    if result.returncode == 0:
        print(f"✅ Installed {package}")
    else:
        # Output is captured, so surface the last stderr line; otherwise a
        # failed install would be completely invisible.
        failed_packages.append(package)
        stderr_lines = (result.stderr or "").strip().splitlines()
        reason = stderr_lines[-1] if stderr_lines else f"exit code {result.returncode}"
        print(f"❌ Failed to install {package}: {reason}")

if failed_packages:
    print(f"⚠️ Setup finished with errors; could not install: {', '.join(failed_packages)}")
else:
    print("🚀 Setup complete! Run the next cells.")

✅ Environment file created successfully!
✅ Installed langgraph
✅ Installed langchain
✅ Installed langchain-google-genai
✅ Installed beautifulsoup4
✅ Installed requests
✅ Installed python-dotenv
🚀 Setup complete! Run the next cells.


In [None]:

# COMPLETE FIXED SYSTEM - Copy and run this entire code

import os
from dotenv import load_dotenv
from typing import TypedDict, Literal
from langgraph.graph import StateGraph, END, START
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage, SystemMessage
import requests
from bs4 import BeautifulSoup
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
import json

# Read the dotenv file so GOOGLE_API_KEY / SERPER_API_KEY become available
# through the process environment
load_dotenv('.env/.env')

# Module-wide Gemini chat model used by the router, classifier and summarizer
llm = ChatGoogleGenerativeAI(
    temperature=0,
    google_api_key=os.environ.get("GOOGLE_API_KEY"),
    model="gemini-1.5-flash",
)

# FIXED State Schema
class ResearchState(TypedDict):
    """State dictionary passed between the graph nodes in this file.

    Each node receives the current state and returns a copy with the keys it
    is responsible for replaced.
    """

    # The user's query exactly as passed to execute_complete_fixed.
    original_query: str
    # Written by classifier_node; read by classifier_edge_router and the
    # post-research router to choose the next node.
    intent: Literal["research_only", "scrape_only", "research_and_scrape"]
    # URLs parsed from the classifier reply; api_scraper_node uses these first.
    urls_to_scrape: list[str]
    # Search terms produced by classifier_node and sent by api_research_node.
    search_query: str
    # Result dicts stored by api_research_node (at most five are kept).
    search_results: list[dict]
    # Text accumulated by api_scraper_node across the scraped pages.
    scraped_content: str
    # Summary text (or an error note) written by summarization_node.
    final_summary: str

# FIXED APP-LEVEL ROUTER
def app_level_router(query: str) -> str:
    """Ask the LLM which top-level pipeline should handle ``query``.

    Returns one of ``"research_and_summary"``, ``"research_only"`` or
    ``"summary_only"``. When the reply mentions none of them,
    ``"research_and_summary"`` is returned as the default.
    """

    routing_prompt = f"""
    Analyze this user query and determine workflow:
    Query: "{query}"

    Rules:
    - If query contains research + (summary OR summarize) → "research_and_summary"
    - If query only asks for research → "research_only"
    - If query asks to summarize provided text → "summary_only"

    Examples:
    - "research about X and provide summary" → research_and_summary
    - "find information about X" → research_only
    - "summarize this text: ..." → summary_only

    Respond with exactly one option:
    research_only
    summary_only
    research_and_summary
    """

    reply = llm.invoke([HumanMessage(content=routing_prompt)])
    normalized_reply = reply.content.strip().lower()

    # Labels are probed in this fixed order; the first one found in the
    # reply wins.
    for label in ("research_and_summary", "research_only", "summary_only"):
        if label in normalized_reply:
            return label

    return "research_and_summary"  # Default fallback

# FIXED CLASSIFIER NODE
def classifier_node(state: ResearchState) -> ResearchState:
    """Classify user intent - this determines the edge routing.

    Sends ``state["original_query"]`` to the LLM, parses the ``Intent:``,
    ``URLs:`` and ``Search Query:`` lines of the reply, and returns a copy of
    ``state`` with ``intent``, ``urls_to_scrape`` and ``search_query`` set.

    The parsed values are normalized before use:

    * the intent is lower-cased, stripped of quotes/markdown decoration and
      validated against the three known intents; anything else falls back to
      a keyword-derived default instead of leaking through as ``None`` or an
      unroutable string;
    * placeholder URL values (``empty``, ``none``, ``[]`` ...) and blank list
      entries are dropped;
    * a search query that ends up blank falls back to the original query.
    """
    valid_intents = ("research_only", "scrape_only", "research_and_scrape")
    # Decoration that models tend to wrap values in: quotes, backticks,
    # markdown bold markers, and the square brackets from the format hint.
    wrapper_chars = " \t\"'`*[]<>"
    no_url_markers = ("", "empty", "none", "n/a")

    query = state["original_query"]
    query_lower = query.lower()

    # Pre-check for research+summary pattern
    needs_summary = "research" in query_lower and (
        "summary" in query_lower or "summarize" in query_lower
    )
    fallback_intent = "research_and_scrape" if needs_summary else "research_only"

    prompt = f"""
    Classify this research query:
    Query: "{query}"

    Determine:
    1. Intent (choose exactly one):
       - "research_only" = just research, no summary needed
       - "research_and_scrape" = research + scrape content for summary
       - "scrape_only" = process provided URLs only

    2. URLs: Extract any URLs or return "empty"
    3. Search Query: Clean query for Google search

    Format:
    Intent: [choice]
    URLs: [urls or empty]
    Search Query: [clean terms]
    """

    response = llm.invoke([HumanMessage(content=prompt)])
    result = response.content.strip()

    print(f"🔍 Classifier Response:\n{result}")

    # Parse response
    try:
        intent = None
        urls_text = "empty"
        search_query = query

        for line in result.split('\n'):
            if 'Intent:' in line:
                intent = line.split(':', 1)[1].strip(wrapper_chars).lower()
            elif 'URLs:' in line:
                urls_text = line.split(':', 1)[1].strip(wrapper_chars)
            elif 'Search Query:' in line:
                search_query = line.split(':', 1)[1].strip()

        # Override with pattern detection if needed
        if needs_summary and intent != "research_and_scrape":
            intent = "research_and_scrape"
            print("🔄 Override: Detected summary request, setting intent to research_and_scrape")
        elif intent not in valid_intents:
            # Missing or unrecognized label: choose a routable default here
            # rather than passing None/garbage on to the edge router.
            print(f"⚠️ Unrecognized intent {intent!r}, using {fallback_intent}")
            intent = fallback_intent

        urls = []
        if urls_text.lower() not in no_url_markers:
            candidates = (part.strip(wrapper_chars) for part in urls_text.split(','))
            urls = [candidate for candidate in candidates if candidate]

        # Clean search query
        search_query = search_query.replace("research about", "").replace("provide summary", "").strip()
        if not search_query:
            # Never hand an empty string to the search step.
            search_query = query

    except Exception as e:
        # Defensive catch-all: classification must always yield a routable state.
        print(f"❌ Parsing error: {e}")
        intent = fallback_intent
        urls = []
        search_query = query

    print("✅ Final Classification:")
    print(f"   Intent: {intent}")
    print(f"   Search Query: {search_query}")

    return {
        **state,
        "intent": intent,
        "urls_to_scrape": urls,
        "search_query": search_query
    }

# CORRECT EDGE FUNCTION - Used right after classifier
def classifier_edge_router(state: ResearchState) -> str:
    """Map the classifier's intent to the label of the outgoing edge.

    Reads ``state["intent"]`` and returns ``"research_only"``, ``"scraper"``
    or ``"research_then_scrape"``. Any unrecognized intent is routed like
    ``"research_only"``. The decision is also printed for tracing.
    """
    # intent -> (trace message, edge label)
    routes = {
        "research_only": (
            "   → Route: research (no scraping needed)",
            "research_only",
        ),
        "scrape_only": (
            "   → Route: scraper (direct scraping)",
            "scraper",
        ),
        "research_and_scrape": (
            "   → Route: research (then will scrape)",
            "research_then_scrape",
        ),
    }
    fallback_route = ("   → Route: research (fallback)", "research_only")

    intent = state["intent"]

    print("🧭 Edge Router (after classifier):")
    print(f"   Intent from classifier: {intent}")

    trace_message, edge_label = routes.get(intent, fallback_route)
    print(trace_message)
    return edge_label

# RESEARCH NODE (unchanged)
def api_research_node(state: ResearchState) -> ResearchState:
    """Perform Google search using Serper API.

    Posts ``state["search_query"]`` to the Serper search endpoint and returns
    a copy of ``state`` whose ``search_results`` holds at most the first five
    entries of the response's ``"organic"`` list.

    Any failure (missing API key, network error, timeout, non-2xx status,
    invalid JSON) is reported on stdout and replaced by a single placeholder
    result, so the graph can keep running.
    """
    search_query = state["search_query"]

    print(f"🔎 Searching for: {search_query}")

    try:
        api_key = os.getenv("SERPER_API_KEY")
        if not api_key:
            # Fail with a clear message instead of sending an unauthenticated
            # request and silently getting zero results back.
            raise RuntimeError("SERPER_API_KEY is not set")

        url = "https://google.serper.dev/search"
        payload = json.dumps({"q": search_query})
        headers = {
            'X-API-KEY': api_key,
            'Content-Type': 'application/json'
        }

        # Without a timeout a stalled connection would block the whole graph.
        response = requests.post(url, headers=headers, data=payload, timeout=15)
        # Surface HTTP errors (bad key, quota, ...) rather than parsing an
        # error body as if it were a result set.
        response.raise_for_status()
        results = response.json()
        search_results = results.get("organic", [])[:5]

        print(f"✅ Found {len(search_results)} search results")

    except Exception as e:
        print(f"❌ Search error: {e}")
        # NOTE(review): this placeholder is not real search data; downstream
        # nodes will treat it like a genuine result. Kept for compatibility.
        search_results = [{"title": f"Mock result for {search_query}", "link": "https://example.com", "snippet": f"Information about {search_query}"}]

    return {**state, "search_results": search_results}

# SCRAPER NODE (unchanged)
def api_scraper_node(state: ResearchState) -> ResearchState:
    """Scrape web pages and collect their visible text.

    Uses ``state["urls_to_scrape"]`` when it is non-empty; otherwise falls
    back to the links of the first three ``state["search_results"]`` entries
    (entries without a link are skipped). Each page contributes at most 2000
    characters of text with ``<script>``/``<style>`` content removed.

    Returns a copy of ``state`` with ``scraped_content`` set. Per-URL
    failures are recorded inline as error notes, but when *no* page could be
    scraped the content is left empty so that consumers checking
    ``scraped_content`` fall back to the search results instead of treating
    error messages as research material.
    """
    def with_scheme(raw_url: str) -> str:
        # requests rejects scheme-less URLs such as "example.com"; a bare
        # domain coming from the classifier is assumed to be reachable
        # over HTTPS.
        cleaned = raw_url.strip()
        if cleaned.lower().startswith(("http://", "https://")):
            return cleaned
        return "https://" + cleaned.lstrip("/")

    urls_to_scrape = state.get("urls_to_scrape") or []

    if not urls_to_scrape:
        top_results = (state.get("search_results") or [])[:3]
        urls_to_scrape = [
            result["link"] for result in top_results if result.get("link")
        ]

    urls_to_scrape = [
        with_scheme(url) for url in urls_to_scrape if url and url.strip()
    ]

    print(f"🕷️ Scraping {len(urls_to_scrape)} URLs...")

    sections = []
    pages_scraped = 0
    for url in urls_to_scrape:
        try:
            response = requests.get(url, timeout=10)
            # Do not treat 4xx/5xx error pages as page content.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            for tag in soup(["script", "style"]):
                tag.decompose()

            text = soup.get_text()
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = ' '.join(chunk for chunk in chunks if chunk)[:2000]

            sections.append(f"\n\n--- Content from {url} ---\n{text}")
            pages_scraped += 1
            print(f"✅ Scraped: {url}")

        except Exception as e:
            # Best effort: one bad page must not abort the remaining URLs.
            sections.append(f"\n\n--- Error scraping {url}: {str(e)} ---")
            print(f"❌ Error scraping {url}")

    if pages_scraped:
        scraped_content = "".join(sections)
    else:
        # Only error notes (or nothing) were collected; report "no content".
        scraped_content = ""
        if urls_to_scrape:
            print("⚠️ No pages could be scraped; leaving scraped content empty")

    return {**state, "scraped_content": scraped_content}

# SUMMARY NODE
def summarization_node(state: ResearchState) -> ResearchState:
    """Summarize using research content.

    Picks the source material in this order: ``scraped_content`` if
    non-empty, else a text rendering of ``search_results``, else a one-line
    note built from ``original_query``. The material is sent to the LLM and
    the reply is stored in ``final_summary`` of the returned state copy.

    If the LLM call fails, ``final_summary`` holds the error message followed
    by the first 500 characters of the source material.
    """

    print("📝 Creating summary...")

    # Use scraped content if available, otherwise search results
    if state.get("scraped_content"):
        content = state["scraped_content"]
        print("✅ Using scraped content")
    elif state.get("search_results"):
        content = "\n\n".join([
            f"Title: {r.get('title', '')}\nURL: {r.get('link', '')}\nSnippet: {r.get('snippet', '')}"
            for r in state["search_results"]
        ])
        print("✅ Using search results")
    else:
        content = f"Limited information available about: {state['original_query']}"
        print("⚠️ Using fallback content")

    # `search_query` is present but empty when the classifier never ran
    # (summary-only path), so fall back on emptiness, not just on a missing
    # key; otherwise the prompt would ask for a summary about "".
    topic = state.get('search_query') or state['original_query']

    summary_prompt = f"""
    Create a comprehensive summary about "{topic}" based on this research:

    RESEARCH DATA:
    {content}

    Provide:
    1. Overview of the topic
    2. Key findings from research
    3. Important details
    4. Conclusion

    SUMMARY:
    """

    try:
        response = llm.invoke([HumanMessage(content=summary_prompt)])
        summary = response.content
        print("✅ Summary completed!")
    except Exception as e:
        # Keep the pipeline alive and hand back whatever research we have.
        summary = f"Summary error: {e}\n\nResearch data:\n{content[:500]}..."
        print(f"❌ Summary error: {e}")

    return {**state, "final_summary": summary}

# CORRECTED GRAPH CONSTRUCTION
def create_research_graph_corrected():
    """Build and compile the classifier → research → scraper graph.

    The classifier is the entry point. Its conditional edge sends the state
    either to the research node or straight to the scraper. After research,
    a second conditional edge continues to the scraper only when the intent
    is ``"research_and_scrape"``; otherwise the graph ends. The scraper
    always ends the graph.
    """
    def post_research_router(state: ResearchState) -> str:
        """Continue to the scraper only for the combined intent."""
        return "scraper" if state["intent"] == "research_and_scrape" else "end"

    graph_builder = StateGraph(ResearchState)

    # Register the nodes
    for node_name, node_fn in (
        ("classifier", classifier_node),
        ("research", api_research_node),
        ("scraper", api_scraper_node),
    ):
        graph_builder.add_node(node_name, node_fn)

    # Execution starts at the classifier
    graph_builder.set_entry_point("classifier")

    # Edge labels returned by classifier_edge_router -> destination node.
    # Both research labels start at the research node; whether scraping
    # follows is decided after research has run.
    after_classifier = {
        "research_only": "research",
        "research_then_scrape": "research",
        "scraper": "scraper",
    }
    graph_builder.add_conditional_edges(
        "classifier",
        classifier_edge_router,
        after_classifier,
    )

    after_research = {"scraper": "scraper", "end": END}
    graph_builder.add_conditional_edges(
        "research",
        post_research_router,
        after_research,
    )

    graph_builder.add_edge("scraper", END)

    return graph_builder.compile()

# MAIN EXECUTION
def execute_complete_fixed(user_query: str):
    """Run the full pipeline for ``user_query`` and print the outcome.

    The app-level router picks one of three paths: research followed by a
    summary, research only (prints result titles and links), or summary only
    (runs the summarizer on the untouched initial state). Nothing is
    returned; all output goes to stdout.
    """
    wide_rule = "=" * 60

    print(f"🚀 QUERY: {user_query}")
    print(wide_rule)

    # Decide which pipeline to run
    routing_decision = app_level_router(user_query)
    print(f"🧭 App Router: {routing_decision}")
    print()

    # Blank starting state; only the query is known up front
    initial_state = ResearchState(
        original_query=user_query,
        intent="research_only",
        urls_to_scrape=[],
        search_query="",
        search_results=[],
        scraped_content="",
        final_summary=""
    )

    # Both graphs are built up front, whichever path ends up being taken
    research_graph = create_research_graph_corrected()

    summary_builder = StateGraph(ResearchState)
    summary_builder.add_node("summarize", summarization_node)
    summary_builder.set_entry_point("summarize")
    summary_builder.add_edge("summarize", END)
    summary_graph = summary_builder.compile()

    if routing_decision == "research_only":
        research_state = research_graph.invoke(initial_state)
        print("📊 RESEARCH RESULTS:")
        for position, hit in enumerate(research_state.get('search_results', []), 1):
            print(f"{position}. {hit.get('title')}")
            print(f"   {hit.get('link')}")
            print()
        return

    if routing_decision == "summary_only":
        summary_state = summary_graph.invoke(initial_state)
        print("📋 SUMMARY:")
        print(summary_state.get('final_summary'))
        return

    if routing_decision != "research_and_summary":
        # Unknown decision: nothing to run
        return

    print("📊 EXECUTING: Research + Summary Pipeline")
    print("-" * 40)

    # Research phase
    research_state = research_graph.invoke(initial_state)
    result_count = len(research_state.get('search_results', []))
    print(f"Research completed: {result_count} results")
    print(f"Content scraped: {'Yes' if research_state.get('scraped_content') else 'No'}")

    # Summary phase, fed with the state the research graph produced
    summary_state = summary_graph.invoke(research_state)

    print("\n" + wide_rule)
    print("🎯 FINAL SUMMARY:")
    print(wide_rule)
    print(summary_state.get('final_summary'))
    print(wide_rule)

# Readiness banner shown once the cell has finished defining everything
print(
    "✅ COMPLETE FIXED SYSTEM READY!",
    "🎯 Edge function now correctly placed RIGHT AFTER classifier",
    "📊 Flow: Classifier → Edge Router → Research/Scraper → Summary",
    "\n🚀 Test with:",
    'execute_complete_fixed("research about gemengserv PMC and provide summary")',
    sep="\n",
)

✅ COMPLETE FIXED SYSTEM READY!
🎯 Edge function now correctly placed RIGHT AFTER classifier
📊 Flow: Classifier → Edge Router → Research/Scraper → Summary

🚀 Test with:
execute_complete_fixed("research about gemengserv PMC and provide summary")


In [None]:
# Example run: a query that names a website and asks for a summary.
execute_complete_fixed("go to gemengserv.com and extract only urls appearing in menu and provide summary")

🚀 QUERY: go to gemengserv.com and extract only urls appearing in menu and provide summary
🧭 App Router: research_and_summary

📊 EXECUTING: Research + Summary Pipeline
----------------------------------------
🔍 Classifier Response:
Intent: research_and_scrape
URLs: gemengserv.com
Search Query: gemengserv.com website menu links
✅ Final Classification:
   Intent: research_and_scrape
   Search Query: gemengserv.com website menu links
🧭 Edge Router (after classifier):
   Intent from classifier: research_and_scrape
   → Route: research (then will scrape)
🔎 Searching for: gemengserv.com website menu links
✅ Found 5 search results
🕷️ Scraping 1 URLs...
❌ Error scraping gemengserv.com
Research completed: 5 results
Content scraped: Yes
📝 Creating summary...
✅ Using scraped content
✅ Summary completed!

🎯 FINAL SUMMARY:
## Summary of gemengserv.com Website Menu Links

**1. Overview of the Topic:**

This summary aims to describe the menu links found on the website gemengserv.com.  The initial rese