In [None]:
!pip install langchain google-generativeai requests pydantic



In [None]:

# Cell 1: Setup and Configuration
# Install required packages - FINAL LIST
!pip install langchain google-generativeai requests pydantic

import os
import re
import json
import requests
import time
from typing import List, Optional
from datetime import datetime

# LangChain imports
from langchain.chains import SequentialChain, LLMChain
from langchain.prompts import PromptTemplate
from langchain.llms.base import LLM
from langchain.schema import BaseOutputParser

# Pydantic for structured outputs
from pydantic import BaseModel, Field

# Google Generative AI
import google.generativeai as genai

# API Configuration - keys are read from environment variables so real secrets
# never need to be saved inside the notebook. Export SERPER_API_KEY,
# BRIGHTDATA_API_TOKEN and GOOGLE_API_KEY before running; if a variable is
# unset the value falls back to "" (the API calls will then be rejected).
SERPER_API_KEY = os.getenv("SERPER_API_KEY", "")  # Serper.dev search API key
BRIGHTDATA_API_TOKEN = os.getenv("BRIGHTDATA_API_TOKEN", "")  # Bright Data bearer token
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")  # Google AI (Gemini) API key

# Configure Google AI
genai.configure(api_key=GOOGLE_API_KEY)

# Pydantic Models for Structured Outputs
class DomainOutput(BaseModel):
    """Domain extracted from an email address, paired with the source email.

    extract_domain_from_email stores the sentinel strings "invalid" or
    "error" in ``domain`` when no domain could be extracted.
    """
    domain: str = Field(description="Extracted domain from email")
    original_email: str = Field(description="Original email address")

class SearchQueryOutput(BaseModel):
    """Search query produced for a domain, together with that domain."""
    search_query: str = Field(description="Optimized search query for the domain")
    domain: str = Field(description="Domain being searched")

class SearchResult(BaseModel):
    """A single search hit: title, URL and descriptive snippet."""
    title: str = Field(description="Title of the search result")
    url: str = Field(description="URL of the search result")
    snippet: str = Field(description="Description/snippet of the search result")

class SearchResultsOutput(BaseModel):
    """Collection of search hits plus the query that produced them.

    ``results`` is empty when the search call failed (see call_serper_api).
    """
    results: List[SearchResult] = Field(description="List of top 5 search results")
    query_used: str = Field(description="Search query that was used")

class URLSelectionOutput(BaseModel):
    """URL chosen from the search results, with the model's justification."""
    selected_url: str = Field(description="The best URL selected by Gemini")
    reasoning: str = Field(description="Reasoning for URL selection")
    confidence_score: float = Field(description="Confidence score (0-1)")

class ScrapedContentOutput(BaseModel):
    """Outcome of a scraping attempt for one URL.

    ``html_content`` is an empty string on the failure paths in this file;
    ``scrape_status`` is a free-form status string (e.g. "success", "failed").
    """
    url: str = Field(description="URL that was scraped")
    html_content: str = Field(description="Raw HTML content from the page")
    scrape_status: str = Field(description="Status of scraping operation")

class FinalSummaryOutput(BaseModel):
    """Final one-line website summary with its URL, domain and timestamp."""
    summary: str = Field(description="One-line summary of the website")
    url: str = Field(description="URL of the summarized website")
    domain: str = Field(description="Domain of the website")
    timestamp: str = Field(description="When the analysis was completed")

# Custom Gemini LLM Wrapper
class GeminiLLM(LLM):
    """LangChain LLM adapter that forwards prompts to a Google Gemini model.

    Attributes are pydantic fields, so they can be overridden per instance,
    e.g. ``GeminiLLM(temperature=0.3)``.
    """

    model_name: str = "gemini-1.5-flash"  # Using Gemini 1.5 Flash for better performance
    temperature: float = 0.1  # sampling temperature sent with every request

    @property
    def _llm_type(self) -> str:
        """Return the identifier string for this LLM implementation."""
        return "gemini"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[object] = None,
        **kwargs,
    ) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: Text sent to the model.
            stop: Optional stop sequences, forwarded to the API when given.
            run_manager: Accepted for compatibility with the LangChain base
                signature; not used here.
            **kwargs: Accepted for compatibility; ignored.

        Returns:
            The model's text, or ``"Error: <message>"`` if the request fails
            (errors are reported in-band rather than raised).
        """
        # Previously the configured temperature was never sent, so the
        # model always ran with the API default.
        generation_config = {"temperature": self.temperature}
        if stop:
            generation_config["stop_sequences"] = list(stop)

        try:
            model = genai.GenerativeModel(self.model_name)
            response = model.generate_content(
                prompt,
                generation_config=generation_config,
            )
            return response.text
        except Exception as e:
            return f"Error: {str(e)}"

# Initialize Gemini LLM - single shared instance used by the LLM-backed chains
# (search_query_chain, url_selection_chain, summary_chain)
gemini_llm = GeminiLLM(temperature=0.1)

# Custom Output Parsers
class DomainOutputParser(BaseOutputParser):
    """Turn raw LLM text into a DomainOutput."""

    def parse(self, text: str) -> DomainOutput:
        """Parse ``text`` as JSON, falling back to per-field regex extraction.

        Args:
            text: Raw model output, expected to contain a JSON object with
                "domain" and "original_email" keys.

        Returns:
            A DomainOutput; fields that cannot be recovered are "unknown".
        """
        # Models often wrap JSON in ```json fences or add prose around it;
        # isolate the outermost {...} span so json.loads sees only the object.
        start, end = text.find("{"), text.rfind("}")
        candidate = text[start:end + 1] if 0 <= start < end else text

        try:
            return DomainOutput(**json.loads(candidate))
        except (ValueError, TypeError):
            # ValueError: malformed JSON or model validation failure.
            # TypeError: the JSON decoded to something other than an object.
            domain_match = re.search(r'"domain":\s*"([^"]+)"', text)
            email_match = re.search(r'"original_email":\s*"([^"]+)"', text)

            domain = domain_match.group(1) if domain_match else "unknown"
            email = email_match.group(1) if email_match else "unknown"

            return DomainOutput(domain=domain, original_email=email)

class SearchQueryOutputParser(BaseOutputParser):
    """Turn raw LLM text into a SearchQueryOutput."""

    def parse(self, text: str) -> SearchQueryOutput:
        """Parse ``text`` as JSON, falling back to per-field regex extraction.

        Args:
            text: Raw model output, expected to contain a JSON object with
                "search_query" and "domain" keys.

        Returns:
            A SearchQueryOutput. If no query can be recovered, the stripped
            raw text is used as the query; a missing domain is "unknown".
        """
        # Models often wrap JSON in ```json fences or add prose around it;
        # isolate the outermost {...} span so json.loads sees only the object.
        start, end = text.find("{"), text.rfind("}")
        candidate = text[start:end + 1] if 0 <= start < end else text

        try:
            return SearchQueryOutput(**json.loads(candidate))
        except (ValueError, TypeError):
            # ValueError: malformed JSON or model validation failure.
            # TypeError: the JSON decoded to something other than an object.
            query_match = re.search(r'"search_query":\s*"([^"]+)"', text)
            domain_match = re.search(r'"domain":\s*"([^"]+)"', text)

            query = query_match.group(1) if query_match else text.strip()
            domain = domain_match.group(1) if domain_match else "unknown"

            return SearchQueryOutput(search_query=query, domain=domain)

class URLSelectionOutputParser(BaseOutputParser):
    """Turn raw LLM text into a URLSelectionOutput."""

    def parse(self, text: str) -> URLSelectionOutput:
        """Parse ``text`` as JSON, falling back to per-field regex extraction.

        Args:
            text: Raw model output, expected to contain a JSON object with
                "selected_url", "reasoning" and "confidence_score" keys.

        Returns:
            A URLSelectionOutput. Unrecoverable fields default to "unknown",
            "No reasoning provided" and 0.5 respectively.
        """
        # Models often wrap JSON in ```json fences or add prose around it;
        # isolate the outermost {...} span so json.loads sees only the object.
        start, end = text.find("{"), text.rfind("}")
        candidate = text[start:end + 1] if 0 <= start < end else text

        try:
            return URLSelectionOutput(**json.loads(candidate))
        except (ValueError, TypeError):
            # ValueError: malformed JSON or model validation failure.
            # TypeError: the JSON decoded to something other than an object.
            url_match = re.search(r'"selected_url":\s*"([^"]+)"', text)
            reasoning_match = re.search(r'"reasoning":\s*"([^"]+)"', text)
            confidence_match = re.search(r'"confidence_score":\s*([0-9.]+)', text)

            url = url_match.group(1) if url_match else "unknown"
            reasoning = reasoning_match.group(1) if reasoning_match else "No reasoning provided"

            confidence = 0.5
            if confidence_match:
                try:
                    confidence = float(confidence_match.group(1))
                except ValueError:
                    # The permissive pattern can capture non-numbers such as
                    # "0.9." or "."; keep the neutral default in that case.
                    confidence = 0.5

            return URLSelectionOutput(
                selected_url=url,
                reasoning=reasoning,
                confidence_score=confidence
            )

class FinalSummaryOutputParser(BaseOutputParser):
    """Turn raw LLM text into a FinalSummaryOutput."""

    def parse(self, text: str) -> FinalSummaryOutput:
        """Parse ``text`` as JSON, falling back to using it verbatim.

        Args:
            text: Raw model output, expected to contain a JSON object with
                "summary", "url", "domain" and "timestamp" keys.

        Returns:
            A FinalSummaryOutput. If no valid JSON object is found, the
            stripped text becomes the summary, url/domain are "unknown" and
            the timestamp is the current local time in ISO format.
        """
        # Models often wrap JSON in ```json fences or add prose around it;
        # isolate the outermost {...} span so json.loads sees only the object.
        # Without this, a fenced reply was stored wholesale as the "summary".
        start, end = text.find("{"), text.rfind("}")
        candidate = text[start:end + 1] if 0 <= start < end else text

        try:
            return FinalSummaryOutput(**json.loads(candidate))
        except (ValueError, TypeError):
            # Fallback parsing - just use the text as summary
            return FinalSummaryOutput(
                summary=text.strip(),
                url="unknown",
                domain="unknown",
                timestamp=datetime.now().isoformat()
            )

# Cell 1 status banner: confirms the cell ran and points to the next step
print("✅ Setup complete! All imports and configurations loaded.")
print("🔧 Remember to replace API keys with your actual keys before running the chain.")
print("📦 Required packages: langchain, google-generativeai, requests, pydantic")
print("📋 Next: Run Cell 2 to create the individual chain components.")

✅ Setup complete! All imports and configurations loaded.
🔧 Remember to replace API keys with your actual keys before running the chain.
📦 Required packages: langchain, google-generativeai, requests, pydantic
📋 Next: Run Cell 2 to create the individual chain components.


In [None]:

# Cell 2: Chain Components and API Functions

import time  # Added for polling delays

# Utility Functions
def extract_domain_from_email(email: str) -> DomainOutput:
    """Pull the domain that follows the "@" out of an email address.

    Returns a DomainOutput whose ``domain`` is the matched domain, the
    string "invalid" when nothing matches, or "error" if matching raised.
    """
    try:
        # First "@<labels>.<tld>" occurrence, where the TLD is 2+ letters
        found = re.search(r'@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', email)
        domain = found.group(1) if found else "invalid"
        return DomainOutput(domain=domain, original_email=email)
    except Exception:
        return DomainOutput(domain="error", original_email=email)

def call_serper_api(query: str) -> SearchResultsOutput:
    """Run ``query`` through the Serper search API.

    Returns a SearchResultsOutput holding at most five organic results.
    Any non-200 response or exception yields an empty result list; the
    problem is printed rather than raised.
    """
    endpoint = "https://google.serper.dev/search"
    request_headers = {"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"}
    body = {"q": query, "num": 5}  # Request 5 results

    try:
        print(f"🔍 Searching Serper API: '{query}'")
        reply = requests.post(endpoint, headers=request_headers, json=body, timeout=30)

        # Guard clause: report the failure and bail out with no results
        if reply.status_code != 200:
            print(f"❌ Serper API Error: {reply.status_code}")
            hint = {
                401: "💡 Check your API key is correct",
                429: "💡 Rate limit exceeded - wait a moment",
            }.get(reply.status_code)
            if hint:
                print(hint)
            return SearchResultsOutput(results=[], query_used=query)

        organic = reply.json().get('organic', [])[:5]
        print(f"✅ Found {len(organic)} results")

        hits = [
            SearchResult(
                title=entry.get('title', ''),
                url=entry.get('link', ''),
                snippet=entry.get('snippet', ''),
            )
            for entry in organic
        ]
        return SearchResultsOutput(results=hits, query_used=query)

    except Exception as exc:
        print(f"❌ Serper API Exception: {exc}")
        return SearchResultsOutput(results=[], query_used=query)

def call_brightdata_api(url: str, dataset_id: str = "gd_m6gjtfmeh43we6cqc") -> ScrapedContentOutput:
    """Scrape ``url`` through Bright Data's trigger-then-poll workflow.

    Phase 1 posts a trigger request; phase 2 hands the returned snapshot id
    to poll_and_retrieve_results. If the trigger is rejected, lacks a
    snapshot id, or anything raises, simple_scrape(url) is used instead.
    """
    print(f"\n🤖 Starting Bright Data scraping for: {url}")

    try:
        print("⏳ Phase 1: Triggering scraping job...")
        trigger_reply = requests.post(
            "https://api.brightdata.com/datasets/v3/trigger",
            headers={
                "Authorization": f"Bearer {BRIGHTDATA_API_TOKEN}",
                "Content-Type": "application/json",
            },
            params={'dataset_id': dataset_id, 'format': 'json'},
            json=[{"url": url}],
            timeout=30,
        )

        if not trigger_reply.ok:
            print(f"❌ Trigger failed: {trigger_reply.status_code} - {trigger_reply.text}")
            return simple_scrape(url)

        job = trigger_reply.json()
        if 'snapshot_id' not in job:
            print(f"❌ No snapshot_id in response: {job}")
            return simple_scrape(url)

        snapshot_id = job['snapshot_id']
        print(f"✅ Job triggered successfully! Snapshot ID: {snapshot_id}")

        # Phase 2: wait for the job to finish and fetch its output
        return poll_and_retrieve_results(snapshot_id, url)

    except Exception as exc:
        print(f"❌ Bright Data API Error: {exc}")
        return simple_scrape(url)

def poll_and_retrieve_results(snapshot_id: str, original_url: str, max_wait_minutes: int = 5) -> ScrapedContentOutput:
    """Poll Bright Data until the snapshot is ready, then download it.

    Args:
        snapshot_id: Identifier returned when the scraping job was triggered.
        original_url: URL being scraped; echoed into the returned output and
            used for the fallback scrape.
        max_wait_minutes: Polling stops with a timeout result once this much
            time has elapsed (checked before each poll).

    Returns:
        The downloaded content on "done"/"ready", an empty-content output
        with status "failed" or "timeout_after_<N>min", or the result of
        simple_scrape(original_url) after three consecutive failed checks.
    """
    print(f"⏳ Phase 2: Polling for results (max {max_wait_minutes} minutes)...")

    headers = {
        "Authorization": f"Bearer {BRIGHTDATA_API_TOKEN}",
        "Content-Type": "application/json"
    }

    progress_url = f"https://api.brightdata.com/datasets/v3/progress/{snapshot_id}"
    start_time = time.time()
    max_wait_seconds = max_wait_minutes * 60
    poll_count = 0

    # Failures are counted separately from polls. The old check used
    # poll_count, so any single error after the third poll aborted polling
    # even when every earlier poll had succeeded.
    max_failed_checks = 3
    failed_checks = 0  # consecutive failures; reset after a readable status

    while True:
        elapsed_time = time.time() - start_time
        poll_count += 1

        if elapsed_time > max_wait_seconds:
            print(f"⏰ Timeout after {max_wait_minutes} minutes")
            return ScrapedContentOutput(
                url=original_url,
                html_content="",
                scrape_status=f"timeout_after_{max_wait_minutes}min"
            )

        try:
            print(f"📡 Poll #{poll_count} - Checking status... ({elapsed_time:.0f}s elapsed)")

            response = requests.get(progress_url, headers=headers, timeout=30)

            if response.ok:
                status_data = response.json()
                failed_checks = 0  # status was fetched and decoded successfully
                current_status = status_data.get('status', 'unknown')

                print(f"📊 Status: {current_status}")

                if current_status in ('done', 'ready'):
                    print("✅ Scraping completed! Downloading results...")
                    return download_scraped_results(snapshot_id, original_url)

                elif current_status == 'failed':
                    print("❌ Scraping failed!")
                    return ScrapedContentOutput(
                        url=original_url,
                        html_content="",
                        scrape_status="failed"
                    )

                elif current_status == 'running':
                    print("⏳ Still processing... waiting 30 seconds")
                    time.sleep(30)
                    continue

                else:
                    print(f"⚠️ Unknown status: {current_status}, waiting 30 seconds")
                    time.sleep(30)
                    continue

            else:
                print(f"❌ Status check failed: {response.status_code}")
                failed_checks += 1
                if failed_checks >= max_failed_checks:  # Give up after 3 failed status checks
                    break
                time.sleep(30)
                continue

        except Exception as e:
            print(f"❌ Error during polling: {e}")
            failed_checks += 1
            if failed_checks >= max_failed_checks:  # Give up after 3 errors
                break
            time.sleep(30)
            continue

    # If we get here, something went wrong
    print("❌ Polling failed, using fallback scraping")
    return simple_scrape(original_url)

def download_scraped_results(snapshot_id: str, original_url: str) -> ScrapedContentOutput:
    """Download the scraped content for a finished Bright Data snapshot.

    Several candidate endpoints are tried in order; the first one that
    answers with a successful status wins.

    Args:
        snapshot_id: Identifier of the finished scraping job.
        original_url: URL that was scraped; echoed into the returned output.

    Returns:
        Status "success" with content extracted from a JSON body,
        "success_text" with the raw body when it is not JSON, or
        "download_failed" with empty content when every endpoint fails.
    """
    headers = {
        "Authorization": f"Bearer {BRIGHTDATA_API_TOKEN}",
        "Content-Type": "application/json"
    }

    # Try multiple download endpoints
    download_urls = [
        f"https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}?format=json",
        f"https://api.brightdata.com/datasets/v3/download/{snapshot_id}",
        f"https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}"
    ]

    for download_url in download_urls:
        try:
            print(f"📥 Trying download: {download_url}")

            response = requests.get(download_url, headers=headers, timeout=60)

            if response.ok:
                print(f"✅ Download successful! ({len(response.content)} bytes)")

                try:
                    # Try to parse as JSON
                    scraped_data = response.json()
                except ValueError:
                    # Response.json() does not always raise
                    # json.JSONDecodeError (requests may use simplejson, and
                    # older versions raise a plain ValueError). ValueError
                    # covers every variant, so a non-JSON body is kept as
                    # text instead of being discarded by the outer handler.
                    return ScrapedContentOutput(
                        url=original_url,
                        html_content=response.text,
                        scrape_status="success_text"
                    )

                # Extract HTML content from the data structure
                html_content = extract_html_from_response(scraped_data)

                return ScrapedContentOutput(
                    url=original_url,
                    html_content=html_content,
                    scrape_status="success"
                )

            else:
                print(f"❌ Download failed: {response.status_code}")
                continue

        except Exception as e:
            # Best effort: report the problem and move on to the next endpoint
            print(f"❌ Download error: {e}")
            continue

    # All download attempts failed
    print("❌ All download attempts failed")
    return ScrapedContentOutput(
        url=original_url,
        html_content="",
        scrape_status="download_failed"
    )

def extract_html_from_response(scraped_data) -> str:
    """Pick the HTML-like payload out of a decoded scraping response.

    For a non-empty list, only the first element is inspected; for a dict,
    the dict itself is inspected. The first key present, in priority order,
    supplies the value (for dicts, a non-empty list value is reduced to its
    first element). Whatever is selected is returned via str(); when no key
    matches, the inspected object itself is stringified.
    """
    candidate_keys = ('html', 'page_html', 'content', 'body', 'raw_html')

    try:
        if isinstance(scraped_data, list) and scraped_data:
            first = scraped_data[0]
            if isinstance(first, dict):
                hit = next((key for key in candidate_keys if key in first), None)
                if hit is not None:
                    return str(first[hit])
            # No recognised field (or not a dict): stringify the whole item
            return str(first)

        if isinstance(scraped_data, dict):
            # Dict responses additionally accept a generic 'data' key
            hit = next(
                (key for key in candidate_keys + ('data',) if key in scraped_data),
                None,
            )
            if hit is None:
                return str(scraped_data)
            payload = scraped_data[hit]
            if isinstance(payload, list) and payload:
                payload = payload[0]
            return str(payload)

        # Neither a non-empty list nor a dict: return as-is
        return str(scraped_data)

    except Exception as exc:
        print(f"⚠️ Error extracting HTML: {exc}")
        return str(scraped_data)

def simple_scrape(url: str) -> ScrapedContentOutput:
    """Fetch ``url`` directly with requests as a fallback scraper.

    On HTTP 200 the first 5000 characters of the body are returned with
    status "success_fallback"; other status codes give "failed_<code>" and
    exceptions give "error_<message>", both with empty content.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        page = requests.get(url, headers=browser_headers, timeout=10)
        fetched = page.status_code == 200
        return ScrapedContentOutput(
            url=url,
            html_content=page.text[:5000] if fetched else "",  # Limit content size
            scrape_status="success_fallback" if fetched else f"failed_{page.status_code}",
        )
    except Exception as exc:
        return ScrapedContentOutput(
            url=url,
            html_content="",
            scrape_status=f"error_{exc}",
        )

# Chain 1: Domain Extraction (No LLM needed)
def domain_extraction_chain(inputs):
    """Chain step 1: map ``inputs['email']`` to its extracted domain.

    Returns a dict with the full DomainOutput under 'domain_output' and
    the bare domain string under 'domain'.
    """
    extracted = extract_domain_from_email(inputs['email'])
    return dict(domain_output=extracted, domain=extracted.domain)

# Chain 2: Search Query Builder
# Prompt asking the model to turn a domain into a search query. The doubled
# braces ({{ }}) are literal braces in the rendered prompt; {domain} is the
# only substituted variable.
search_query_prompt = PromptTemplate(
    input_variables=["domain"],
    template="""
You are tasked with creating an optimal search query to find the official website for a company domain.

Domain: {domain}

Create a search query that will help find the official company website. Consider:
- The domain name itself
- Adding terms like "official website" if helpful
- Avoiding overly complex queries

Return your response in this exact JSON format:
{{
    "search_query": "your optimized search query here",
    "domain": "{domain}"
}}
"""
)

# LLM step: runs the prompt on Gemini and stores the parsed
# SearchQueryOutput under the "search_query_output" key.
search_query_chain = LLMChain(
    llm=gemini_llm,
    prompt=search_query_prompt,
    output_parser=SearchQueryOutputParser(),
    output_key="search_query_output"
)

# Chain 3: Serper API Call (Custom function)
def serper_search_chain(inputs):
    """Chain step 3: run the generated search query through Serper.

    Reads ``inputs['search_query_output'].search_query`` and returns the
    API results under the 'search_results_output' key.
    """
    query_text = inputs['search_query_output'].search_query
    return {'search_results_output': call_serper_api(query_text)}

# Chain 4: URL Selection
# Prompt asking the model to pick the official site from formatted search
# results. {search_results} is the only substituted variable; the doubled
# braces ({{ }}) are literal braces in the rendered prompt.
url_selection_prompt = PromptTemplate(
    input_variables=["search_results"],
    template="""
You are an expert at identifying official company websites from search results.

Search Results:
{search_results}

Analyze these search results and select the BEST URL that represents the official company website. Consider:
- Official company domains vs third-party sites
- Homepage vs subpages
- Credibility and authority of the source
- Relevance to the original domain

Return your response in this exact JSON format:
{{
    "selected_url": "the best URL from the results",
    "reasoning": "brief explanation of why you chose this URL",
    "confidence_score": 0.95
}}
"""
)

def format_search_results_for_prompt(search_results_output):
    """Render search results as a numbered plain-text list for a prompt.

    Each result becomes three lines (title, URL, snippet) followed by a
    blank line, numbered from 1. An empty result list yields "".
    """
    entries = (
        f"{position}. Title: {hit.title}\n"
        f"   URL: {hit.url}\n"
        f"   Snippet: {hit.snippet}\n\n"
        for position, hit in enumerate(search_results_output.results, start=1)
    )
    return "".join(entries)

# LLM step: runs url_selection_prompt on Gemini and stores the parsed
# URLSelectionOutput under the "url_selection_output" key.
url_selection_chain = LLMChain(
    llm=gemini_llm,
    prompt=url_selection_prompt,
    output_parser=URLSelectionOutputParser(),
    output_key="url_selection_output"
)

# Chain 5: Content Scraping (Custom function)
def content_scraping_chain(inputs):
    """Chain step 5: scrape the root page of the selected URL.

    Args:
        inputs: Mapping containing 'url_selection_output', whose
            ``selected_url`` is the URL chosen by the model.

    Returns:
        A dict with the scraping result under 'scraped_content_output'.
    """
    from urllib.parse import urlparse

    selected_url = inputs['url_selection_output'].selected_url

    # Reduce the URL to scheme://host, but only when both parts exist.
    # Previously a value with no scheme or host (for example the parser's
    # "unknown" fallback) became the bogus URL "://".
    root_url = selected_url
    try:
        parsed = urlparse(selected_url)
    except (ValueError, TypeError, AttributeError):
        # Unparseable input: keep the URL exactly as selected
        parsed = None
    if parsed is not None and parsed.scheme and parsed.netloc:
        root_url = f"{parsed.scheme}://{parsed.netloc}"

    scraped_content = call_brightdata_api(root_url)
    return {'scraped_content_output': scraped_content}

# Chain 6: Summary Generation
# Prompt asking the model for a one-sentence summary of the scraped page.
# The template references {timestamp}, so it must be declared alongside the
# other variables; it was missing, which fails template validation on
# LangChain versions that check declared variables against the template.
# The doubled braces ({{ }}) are literal braces in the rendered prompt.
summary_prompt = PromptTemplate(
    input_variables=["scraped_content", "url", "domain", "timestamp"],
    template="""
You are tasked with creating a concise, one-line summary of a website based on its scraped content.

Website URL: {url}
Domain: {domain}
Scraped Content: {scraped_content}

Create a single, clear sentence that describes what this website/company does. Focus on:
- Main business purpose or service
- Industry or sector
- Key value proposition

Return your response in this exact JSON format:
{{
    "summary": "One clear sentence describing what this company/website does",
    "url": "{url}",
    "domain": "{domain}",
    "timestamp": "{timestamp}"
}}
"""
)

# LLM step: runs summary_prompt on Gemini and stores the parsed
# FinalSummaryOutput under the "final_summary" key.
summary_chain = LLMChain(
    llm=gemini_llm,
    prompt=summary_prompt,
    output_parser=FinalSummaryOutputParser(),
    output_key="final_summary"
)

# Cell 2 status banner: lists the six pipeline steps defined above
print("✅ All chain components created successfully!")
print("📋 Individual chains ready:")
print("   1. Domain Extraction ✓")
print("   2. Search Query Builder ✓")
print("   3. Serper API Search ✓")
print("   4. URL Selection ✓")
print("   5. Content Scraping ✓")
print("   6. Summary Generation ✓")
print("\n🚀 Next: Run Cell 3 to create the main SequentialChain and test it!")

✅ All chain components created successfully!
📋 Individual chains ready:
   1. Domain Extraction ✓
   2. Search Query Builder ✓
   3. Serper API Search ✓
   4. URL Selection ✓
   5. Content Scraping ✓
   6. Summary Generation ✓

🚀 Next: Run Cell 3 to create the main SequentialChain and test it!


In [None]:

# Cell 3: Main Sequential Chain and Execution

from langchain.chains.base import Chain
from typing import Dict, Any, List, ClassVar

# Custom wrapper chains to integrate functions with LangChain
class DomainExtractionChain(Chain):
    """Chain wrapper around domain_extraction_chain (email -> domain)."""

    # Keys this step consumes from / contributes to the shared chain state
    input_keys: ClassVar[List[str]] = ["email"]
    output_keys: ClassVar[List[str]] = ["domain_output", "domain"]

    def _call(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Delegate to domain_extraction_chain and return its dict unchanged."""
        return domain_extraction_chain(inputs)

class SerperSearchChain(Chain):
    """Chain wrapper around serper_search_chain (search query -> results)."""

    # Keys this step consumes from / contributes to the shared chain state
    input_keys: ClassVar[List[str]] = ["search_query_output"]
    output_keys: ClassVar[List[str]] = ["search_results_output"]

    def _call(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Delegate to serper_search_chain and return its dict unchanged."""
        return serper_search_chain(inputs)

class ContentScrapingChain(Chain):
    """Chain wrapper around content_scraping_chain (selected URL -> content)."""

    # Keys this step consumes from / contributes to the shared chain state
    input_keys: ClassVar[List[str]] = ["url_selection_output"]
    output_keys: ClassVar[List[str]] = ["scraped_content_output"]

    def _call(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Delegate to content_scraping_chain and return its dict unchanged."""
        return content_scraping_chain(inputs)

# Custom chain to format search results for URL selection
class URLSelectionPreprocessChain(Chain):
    """Convert structured search results into prompt-ready text."""

    input_keys: ClassVar[List[str]] = ["search_results_output"]
    output_keys: ClassVar[List[str]] = ["search_results"]

    def _call(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Format ``inputs['search_results_output']`` for url_selection_prompt."""
        return {
            'search_results': format_search_results_for_prompt(
                inputs['search_results_output']
            )
        }

# Custom chain to format scraped content for summary
class SummaryPreprocessChain(Chain):
    """Prepare the variables that summary_prompt needs from earlier steps."""

    input_keys: ClassVar[List[str]] = ["scraped_content_output", "url_selection_output", "domain"]
    output_keys: ClassVar[List[str]] = ["scraped_content", "url", "timestamp"]

    def _call(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Build the scraped-content excerpt, URL and timestamp.

        Content longer than 2000 characters is cut to 2000 and suffixed
        with "..."; shorter content is passed through unchanged.
        """
        scraped = inputs['scraped_content_output']
        selection = inputs['url_selection_output']
        # 'domain' is a declared input and is looked up here, but its value
        # is not part of this step's outputs.
        _ = inputs['domain']

        html = scraped.html_content
        if len(html) > 2000:
            excerpt = html[:2000] + "..."
        else:
            excerpt = html

        return {
            'scraped_content': excerpt,
            'url': selection.selected_url,
            'timestamp': datetime.now().isoformat(),
        }

# Create the main sequential chain
def create_email_to_summary_chain():
    """Assemble the eight-step email -> website-summary SequentialChain.

    The chain takes a single "email" input and exposes "final_summary",
    "domain", "url_selection_output" and "scraped_content_output".
    """
    pipeline = [
        DomainExtractionChain(),        # email → domain
        search_query_chain,             # domain → search_query
        SerperSearchChain(),            # search_query → search_results
        URLSelectionPreprocessChain(),  # search_results → formatted_results
        url_selection_chain,            # formatted_results → selected_url
        ContentScrapingChain(),         # selected_url → scraped_content
        SummaryPreprocessChain(),       # scraped_content → formatted_content
        summary_chain,                  # formatted_content → final_summary
    ]

    exposed_outputs = ["final_summary", "domain", "url_selection_output", "scraped_content_output"]

    return SequentialChain(
        chains=pipeline,
        input_variables=["email"],
        output_variables=exposed_outputs,
        verbose=True,
    )

# Main execution function
def analyze_email_domain(email: str):
    """Run the full pipeline for one email address and report the outcome.

    Progress and results are printed as the chain runs.

    Args:
        email (str): Email address (e.g., "name@company.com")

    Returns:
        dict: On success, the email, domain, selected URL, summary,
        confidence, reasoning, scrape status, content length, processing
        time, timestamp and the raw chain result. On failure, a dict with
        'email', 'error' and 'timestamp' only.
    """
    rule = "=" * 60
    for banner_line in (
        f"🚀 Starting analysis for: {email}",
        rule,
        "⏳ This process may take 3-5 minutes due to Bright Data scraping...",
        "📋 Steps: Domain Extract → Search → Select → Scrape (with polling) → Summarize",
        rule,
    ):
        print(banner_line)

    try:
        # Build the pipeline and time a single end-to-end run
        pipeline = create_email_to_summary_chain()

        started = time.time()
        outcome = pipeline({"email": email})
        elapsed = time.time() - started

        # Unpack the pieces of the chain output that get reported
        summary_out = outcome['final_summary']
        domain = outcome['domain']
        selection = outcome['url_selection_output']
        scraped = outcome['scraped_content_output']

        print(f"\n✅ Analysis Complete! (took {elapsed:.1f} seconds)")
        print(rule)
        print(f"📧 Original Email: {email}")
        print(f"🌐 Extracted Domain: {domain}")
        print(f"🔗 Selected URL: {selection.selected_url}")
        print(f"🤖 Scraping Status: {scraped.scrape_status}")
        print(f"📝 Website Summary: {summary_out.summary}")
        print(f"🎯 Confidence Score: {selection.confidence_score}")
        print(f"💭 Selection Reasoning: {selection.reasoning}")
        print(f"⏰ Completed: {summary_out.timestamp}")

        return {
            'email': email,
            'domain': domain,
            'selected_url': selection.selected_url,
            'summary': summary_out.summary,
            'confidence': selection.confidence_score,
            'reasoning': selection.reasoning,
            'scrape_status': scraped.scrape_status,
            'content_length': len(scraped.html_content),
            'processing_time': elapsed,
            'timestamp': summary_out.timestamp,
            'full_results': outcome,
        }

    except Exception as exc:
        print(f"❌ Error during analysis: {str(exc)}")
        import traceback
        traceback.print_exc()
        return {
            'email': email,
            'error': str(exc),
            'timestamp': datetime.now().isoformat(),
        }

# Test function with multiple emails
def test_multiple_emails(emails: List[str]):
    """Run analyze_email_domain on each email and collect the results.

    A 2-second pause is inserted between consecutive requests. No pause
    follows the final email, since there is no next request to wait for.

    Args:
        emails: Email addresses to analyze, processed in order.

    Returns:
        A list with one analyze_email_domain result dict per email.
    """
    results = []
    banner = '=' * 80

    for index, email in enumerate(emails):
        if index:
            # Pause only between requests, never after the last one
            print("\n⏳ Waiting 2 seconds before next request...")
            time.sleep(2)

        print(f"\n{banner}")
        print(f"Testing: {email}")
        print(banner)

        results.append(analyze_email_domain(email))

    return results

# Example usage and test cases
if __name__ == "__main__":
    # Usage banner only: nothing is analyzed here unless the call at the
    # bottom is uncommented.
    print("🔧 Email to Website Summary Chain Ready!")
    print("=" * 60)
    print("📋 Available functions:")
    print("1. analyze_email_domain(email) - Analyze single email")
    print("2. test_multiple_emails([emails]) - Test multiple emails")
    print()
    print("⚠️  Before running, make sure to:")
    print("   - Replace SERPER_API_KEY with your actual Serper.dev API key")
    print("   - Replace GOOGLE_API_KEY with your actual Google AI API key")
    # BRIGHTDATA_API_TOKEN is defined as an empty string in this file, so
    # the former "already configured" message was misleading.
    print("   - Replace BRIGHTDATA_API_TOKEN with your actual Bright Data API token")
    print()
    print("🚀 Example usage:")
    print("   result = analyze_email_domain('contact@gemengserv.com')")
    print()
    print("📊 Test with multiple emails:")
    print("   test_emails = ['contact@gemengserv.com', 'info@example.com']")
    print("   results = test_multiple_emails(test_emails)")

    # Uncomment to run a quick test (after adding your API keys):
    # result = analyze_email_domain('contact@gemengserv.com')

    print("\n" + "="*60)
    print("✅ Setup complete! Ready to analyze email domains.")

🔧 Email to Website Summary Chain Ready!
📋 Available functions:
1. analyze_email_domain(email) - Analyze single email
2. test_multiple_emails([emails]) - Test multiple emails

⚠️  Before running, make sure to:
   - Replace SERPER_API_KEY with your actual Serper.dev API key
   - Replace GOOGLE_API_KEY with your actual Google AI API key
   - Bright Data token is already configured

🚀 Example usage:
   result = analyze_email_domain('contact@gemengserv.com')

📊 Test with multiple emails:
   test_emails = ['contact@gemengserv.com', 'info@example.com']
   results = test_multiple_emails(test_emails)

✅ Setup complete! Ready to analyze email domains.


In [None]:
# Run the full pipeline once for a sample address; progress is printed as it runs
result = analyze_email_domain('contact@gemengserv.com')

🚀 Starting analysis for: contact@gemengserv.com
⏳ This process may take 3-5 minutes due to Bright Data scraping...
📋 Steps: Domain Extract → Search → Select → Scrape (with polling) → Summarize


[1m> Entering new SequentialChain chain...[0m
🔍 Searching Serper API: 'gemengserv.com official website'
✅ Found 5 results

🤖 Starting Bright Data scraping for: https://gemengserv.com
⏳ Phase 1: Triggering scraping job...
✅ Job triggered successfully! Snapshot ID: s_mfbc01331v88vjuqdd
⏳ Phase 2: Polling for results (max 5 minutes)...
📡 Poll #1 - Checking status... (0s elapsed)
📊 Status: running
⏳ Still processing... waiting 30 seconds
📡 Poll #2 - Checking status... (31s elapsed)
📊 Status: running
⏳ Still processing... waiting 30 seconds
📡 Poll #3 - Checking status... (61s elapsed)
📊 Status: running
⏳ Still processing... waiting 30 seconds
📡 Poll #4 - Checking status... (92s elapsed)
📊 Status: running
⏳ Still processing... waiting 30 seconds
📡 Poll #5 - Checking status... (122s elapsed)
📊 Statu