In [59]:
from openai import OpenAI
from dotenv import load_dotenv
import os

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

def get_chat_response(user_message, system_message="You are a helpful assistant.", model="gpt-4o"):
    """
    Get a response from OpenAI's chat completion API.
    
    Args:
        user_message (str): The user's message/question
        system_message (str): The system prompt (default: "You are a helpful assistant.")
        model (str): The OpenAI model to use (default: "gpt-4o")
    
    Returns:
        str: The assistant's response
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message}
        ]
    )
    return response.choices[0].message.content


## Load Data

In [60]:
import pandas as pd

vendors = pd.read_csv('data/vendors.csv')
specialties = pd.read_csv('data/specialties.csv')

# Display basic info
# print("Vendors data:")
# print(vendors.head())
# print(f"\nShape: {vendors.shape}")
# print(f"Columns: {vendors.columns.tolist()}")

# print("\n" + "="*50 + "\n")

# print("Specialties data:")
# print(specialties.head())
# print(f"\nShape: {specialties.shape}")
# print(f"Columns: {specialties.columns.tolist()}")
len(vendors)

100

In [61]:
tags = set(specialties.iloc[:, 0])
len(tags)

979

## scrape a given webpage 

In [None]:
import requests
from bs4 import BeautifulSoup

## obtains content for a given webpage 
def get_webpage_content(url, max_length=10000):
    """
    Fetch and extract clean text content from a webpage.
    
    Args:
        url (str): The URL to scrape
        max_length (int): Maximum characters to return (for LLM token limits)
    
    Returns:
        dict: Contains 'url', 'text', 'title', and 'success' status
    """
    try:
        # Set a user agent to avoid being blocked
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }
        
        # Fetch the page
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        # Parse with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Remove script and style elements
        for script in soup(["script", "style", "nav", "footer", "header"]):
            script.decompose()
        
        # Get text
        text = soup.get_text(separator=' ', strip=True)
        
        # Clean up whitespace
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        
        # Get page title
        title = soup.title.string if soup.title else "No title"
        
        # Truncate if too long
        if len(text) > max_length:
            text = text[:max_length] + "..."
        
        return {
            'url': url,
            'title': title,
            'text': text,
            'success': True
        }
        
    except Exception as e:
        return {
            'url': url,
            'title': None,
            'text': None,
            'success': False,
            'error': str(e)
        }

# Test it out
url = "http://www.atg-biosynthetics.com/"  
result = get_webpage_content(url)

if result['success']:
    print(f"Title: {result['title']}")
    print(f"\nContent preview (first 500 chars):")
    print(result['text'][:500])
    print(f"\nTotal characters: {len(result['text'])}")
else:
    print(f"Error: {result['error']}")

Title: ATG:biosynthetics GmbH - your reliable partner for synthetic biology

Content preview (first 500 chars):
ATG:biosynthetics GmbH - your reliable partner for synthetic biology Customer Sign In Home Contact Us Follow us Gene Synthesis Simple Genes Complex Genes Gene Clusters Modules Inquiry & Order Optimizations DNA Optimization (EvoMag) Cluster Design Inquiry DNA optimization Libraries PepID Tags Inquiry Expression & Construction MultiGene MultiLabel Toggle flex TEC Pricing Inquiry Company Contact Consulting Team Partners/Distributors Partner/Collaborate with us Our Customers Jobs Downloads Terms & C

Total characters: 1784


## scrape a website

In [68]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

def normalize_domain(url):
    """
    Normalize domain to handle www and http/https variants.
    """
    parsed = urlparse(url)
    domain = parsed.netloc.lower()
    # Remove www. prefix for comparison
    domain = domain.replace('www.', '')
    return domain

def get_all_links(url, base_url=None):
    """
    Extract all links from a webpage, handling redirects properly.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }
        
        session = requests.Session()
        response = session.get(url, headers=headers, timeout=10, allow_redirects=True)
        response.raise_for_status()
        
        # Use the final URL after redirects as the base
        actual_url = response.url
        if base_url is None:
            parsed = urlparse(actual_url)
            base_url = f"{parsed.scheme}://{parsed.netloc}"
        
        # Normalize base domain for comparison
        base_domain = normalize_domain(base_url)
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        internal_links = set()
        external_links = set()
        
        for link in soup.find_all('a', href=True):
            href = link.get('href')
            
            if not href or href.startswith(('#', 'javascript:', 'mailto:', 'tel:')):
                continue
            
            full_url = urljoin(actual_url, href)
            full_url = full_url.split('#')[0]
            
            # Normalize and compare domains
            link_domain = normalize_domain(full_url)
            
            if link_domain == base_domain:
                internal_links.add(full_url)
            else:
                external_links.add(full_url)
        
        return {
            'url': url,
            'actual_url': actual_url,
            'base_url': base_url,
            'internal_links': list(internal_links),
            'external_links': list(external_links),
            'total_internal': len(internal_links),
            'total_external': len(external_links),
            'success': True
        }
        
    except Exception as e:
        return {
            'url': url,
            'success': False,
            'error': str(e)
        }


def get_webpage_content(url, max_length=15000):
    """
    Fetch and extract clean text content from a webpage.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }
        
        session = requests.Session()
        response = session.get(url, headers=headers, timeout=10, allow_redirects=True)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Remove script and style elements
        for script in soup(["script", "style", "nav", "footer", "header"]):
            script.decompose()
        
        # Get text
        text = soup.get_text(separator=' ', strip=True)
        
        # Clean up whitespace
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        
        # Get page title
        title = soup.title.string if soup.title else "No title"
        
        # Truncate if too long
        if len(text) > max_length:
            text = text[:max_length] + "..."
        
        return {
            'url': url,
            'actual_url': response.url,
            'title': title,
            'text': text,
            'success': True
        }
        
    except Exception as e:
        return {
            'url': url,
            'title': None,
            'text': None,
            'success': False,
            'error': str(e)
        }


def prioritize_links_with_llm(links, vendor_name, vendor_description, top_k=15):
    """
    Use LLM to rank links by relevance for finding vendor specialties.
    
    Args:
        links (list): List of URLs to prioritize
        vendor_name (str): Name of vendor
        vendor_description (str): Description of vendor
        top_k (int): Number of top links to return
    
    Returns:
        list: Top k prioritized links
    """
    if len(links) <= top_k:
        return links
    
    # Create prompt
    links_str = "\n".join([f"{i+1}. {link}" for i, link in enumerate(links)])
    
    prompt = f"""You are helping prioritize which pages to crawl from a vendor website to find information about their specialties and services.

VENDOR: {vendor_name}
DESCRIPTION: {vendor_description[:500]}

AVAILABLE LINKS ({len(links)} total):
{links_str}

Task: Select the TOP {top_k} links most likely to contain information about their services, capabilities, specialties, or products. Prioritize:
- Service/product pages
- Capabilities/technology pages
- About/company pages
- Portfolio/case studies
- Avoid: Contact, careers, news, blog, legal, terms

Return ONLY a JSON array of the top {top_k} link numbers (1-{len(links)}):
[1, 5, 8, ...]
"""
    
    try:
        response = get_chat_response(
            user_message=prompt,
            system_message="You are an expert at identifying relevant pages on vendor websites. Return only valid JSON.",
            model="gpt-4o"
        )
        
        # Parse response
        response = response.strip()
        if response.startswith('```'):
            response = response.split('```')[1]
            if response.startswith('json'):
                response = response[4:]
        
        import json
        selected_indices = json.loads(response)
        
        # Convert 1-indexed to 0-indexed and get links
        prioritized = [links[i-1] for i in selected_indices if 0 < i <= len(links)]
        
        return prioritized[:top_k]
        
    except Exception as e:
        print(f"   Warning: LLM prioritization failed ({str(e)}), using first {top_k} links")
        return links[:top_k]


def crawl_website(start_url, max_pages=10, delay=2, use_llm_prioritization=True, 
                 vendor_name=None, vendor_description=None):
    """
    Crawl a website starting from a URL, following internal links.
    
    Args:
        start_url (str): Starting URL
        max_pages (int): Maximum pages to crawl
        delay (float): Delay between requests
        use_llm_prioritization (bool): Use LLM to prioritize links (NEW)
        vendor_name (str): Vendor name for LLM prioritization (NEW)
        vendor_description (str): Vendor description for LLM prioritization (NEW)
    
    Returns:
        dict: Crawl results
    """
    # First request to get the actual URL after redirects
    try:
        session = requests.Session()
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        }
        initial_response = session.get(start_url, headers=headers, timeout=10, allow_redirects=True)
        actual_start_url = initial_response.url
        print(f"Starting URL: {start_url}")
        print(f"Actual URL after redirect: {actual_start_url}\n")
    except:
        actual_start_url = start_url
    
    parsed = urlparse(actual_start_url)
    base_url = f"{parsed.scheme}://{parsed.netloc}"
    base_domain = normalize_domain(base_url)
    
    visited = set()
    to_visit = [actual_start_url]
    crawled_pages = []
    
    print(f"Starting crawl of {base_url}")
    print(f"Max pages: {max_pages}, Delay: {delay}s")
    if use_llm_prioritization:
        print(f"LLM prioritization: ENABLED")
    print()
    
    while to_visit and len(visited) < max_pages:
        current_url = to_visit.pop(0)
        
        # Normalize URL for comparison
        if normalize_domain(current_url) != base_domain:
            continue
        
        if current_url in visited:
            continue
        
        print(f"Crawling [{len(visited) + 1}/{max_pages}]: {current_url}")
        
        page_content = get_webpage_content(current_url)
        
        if page_content['success']:
            links_data = get_all_links(current_url, base_url)
            
            if links_data['success']:
                new_links = [link for link in links_data['internal_links'] 
                            if link not in visited and link not in to_visit]
                
                # NEW: Prioritize links with LLM if enabled
                if use_llm_prioritization and new_links and vendor_name:
                    remaining_slots = max_pages - len(visited) - len(to_visit)
                    if remaining_slots > 0:
                        print(f"   Prioritizing {len(new_links)} new links...")
                        prioritized = prioritize_links_with_llm(
                            links=new_links,
                            vendor_name=vendor_name,
                            vendor_description=vendor_description or "",
                            top_k=min(remaining_slots, 15)
                        )
                        to_visit.extend(prioritized)
                        print(f"   Added {len(prioritized)} prioritized links to queue")
                else:
                    # Original behavior: add all links
                    to_visit.extend(new_links)
                
                crawled_pages.append({
                    'url': current_url,
                    'title': page_content['title'],
                    'content': page_content['text'],
                    'internal_links_count': links_data['total_internal'],
                    'external_links_count': links_data['total_external']
                })
        else:
            print(f"   Error: {page_content.get('error', 'Unknown error')}")
        
        visited.add(current_url)
        time.sleep(delay)
    
    print(f"\nCrawl complete! Visited {len(visited)} pages.")
    
    return {
        'base_url': base_url,
        'pages_crawled': len(visited),
        'pages': crawled_pages,
        'visited_urls': list(visited)
    }

In [69]:
def analyze_page_for_specialties(page_content, specialty_list):
    """
    Use LLM to analyze a webpage and identify which specialties are mentioned.
    
    Args:
        page_content (dict): Dict with 'url', 'title', 'content'
        specialty_list (list): List of specialties to look for
    
    Returns:
        dict: Analysis results with found specialties
    """
    # Create a formatted list of specialties
    specialties_str = "\n".join([f"- {spec}" for spec in specialty_list])
    
    prompt = f"""Analyze this webpage content and identify which of the following specialties/services are offered or mentioned.

SPECIALTIES TO LOOK FOR:
{specialties_str}

WEBPAGE TITLE: {page_content['title']}
WEBPAGE URL: {page_content['url']}

WEBPAGE CONTENT:
{page_content['content'][:8000]}  

Instructions:
1. Only identify specialties that are clearly mentioned or strongly implied in the content
2. Be flexible with terminology - "DNA sequencing" matches "DNA Sequencing Services" 
3. Consider synonyms and related terms (e.g., "gene synthesis" could match "DNA synthesis")
4. Return ONLY a JSON array of objects, nothing else
5. Each object should have: "specialty" (exact name from list), "confidence" (HIGH/MEDIUM/LOW), "evidence" (brief quote or context)
6. If no specialties found, return an empty array: []

Response format:
[
  {{"specialty": "...", "confidence": "HIGH", "evidence": "..."}},
  {{"specialty": "...", "confidence": "MEDIUM", "evidence": "..."}}
]
"""
    
    try:
        response = get_chat_response(
            user_message=prompt,
            system_message="You are an expert at analyzing biotech/pharmaceutical vendor capabilities. Return only valid JSON arrays.",
            model="gpt-4o"
        )
        
        # Try to parse JSON response
        # Remove markdown code blocks if present
        response = response.strip()
        if response.startswith('```'):
            response = response.split('```')[1]
            if response.startswith('json'):
                response = response[4:]
        
        findings = json.loads(response)
        
        return {
            'url': page_content['url'],
            'title': page_content['title'],
            'findings': findings,
            'success': True
        }
        
    except json.JSONDecodeError as e:
        return {
            'url': page_content['url'],
            'title': page_content['title'],
            'findings': [],
            'success': False,
            'error': f"JSON parse error: {str(e)}",
            'raw_response': response
        }
    except Exception as e:
        return {
            'url': page_content['url'],
            'title': page_content['title'],
            'findings': [],
            'success': False,
            'error': str(e)
        }


def analyze_crawled_website(crawl_results, specialty_list):
    """
    Analyze all pages from a crawl for specialties.
    
    Args:
        crawl_results (dict): Results from crawl_website()
        specialty_list (list): List of specialties to search for
    
    Returns:
        dict: Combined analysis results
    """
    all_findings = []
    
    print(f"Analyzing {crawl_results['pages_crawled']} pages for specialties...\n")
    
    for i, page in enumerate(crawl_results['pages'], 1):
        print(f"Analyzing [{i}/{crawl_results['pages_crawled']}]: {page['title']}")
        
        analysis = analyze_page_for_specialties(page, specialty_list)
        
        if analysis['success'] and analysis['findings']:
            print(f"   ✓ Found {len(analysis['findings'])} specialties")
            all_findings.append(analysis)
        else:
            print(f"   - No specialties found")
        
        time.sleep(0.5)  # Small delay between API calls
    
    # Aggregate all unique specialties found
    unique_specialties = {}
    for finding in all_findings:
        for item in finding['findings']:
            spec_name = item['specialty']
            if spec_name not in unique_specialties:
                unique_specialties[spec_name] = {
                    'specialty': spec_name,
                    'highest_confidence': item['confidence'],
                    'occurrences': 1,
                    'pages': [{'url': finding['url'], 'evidence': item['evidence']}]
                }
            else:
                unique_specialties[spec_name]['occurrences'] += 1
                unique_specialties[spec_name]['pages'].append({
                    'url': finding['url'], 
                    'evidence': item['evidence']
                })
    
    return {
        'base_url': crawl_results['base_url'],
        'pages_analyzed': crawl_results['pages_crawled'],
        'page_analyses': all_findings,
        'unique_specialties_found': list(unique_specialties.values()),
        'total_unique_specialties': len(unique_specialties)
    }

In [70]:
import json 
def filter_specialties_by_vendor(vendor_info, all_specialties, max_specialties=50):
    """
    Use LLM to narrow down which specialties are likely relevant for a vendor
    based on their description and homepage content.
    
    Args:
        vendor_info (dict): Contains 'name', 'url', 'description', 'homepage_content'
        all_specialties (list): Complete list of all specialties
        max_specialties (int): Maximum number of specialties to return
    
    Returns:
        dict: Filtered specialty list with reasoning
    """
    specialties_str = "\n".join([f"- {spec}" for spec in all_specialties])
    
    prompt = f"""You are analyzing a biotech/pharmaceutical vendor to identify which specialties/services they are LIKELY to offer.

VENDOR INFORMATION:
Name: {vendor_info['name']}
Website: {vendor_info['url']}
Description: {vendor_info.get('description', 'N/A')}

HOMEPAGE CONTENT:
{vendor_info.get('homepage_content', 'N/A')[:5000]}

ALL POSSIBLE SPECIALTIES (choose from this list):
{specialties_str}

Task: Select UP TO {max_specialties} specialties that this vendor is MOST LIKELY to offer based on:
1. Their company description
2. Keywords and services mentioned on their homepage
3. The type of company they appear to be (CRO, CMO, testing lab, etc.)

Be selective - only include specialties that have a reasonable chance of being offered.
Exclude specialties that are clearly unrelated to their focus area.

Return ONLY a JSON object with this structure:
{{
  "likely_specialties": ["specialty1", "specialty2", ...],
  "reasoning": "Brief explanation of why these specialties were selected",
  "vendor_type": "Brief categorization (e.g., 'Analytical Testing Lab', 'Manufacturing CRO', etc.)"
}}
"""
    
    try:
        response = get_chat_response(
            user_message=prompt,
            system_message="You are an expert at categorizing biotech vendor capabilities. Return only valid JSON.",
            model="gpt-4o"
        )
        
        # Clean up response
        response = response.strip()
        if response.startswith('```'):
            response = response.split('```')[1]
            if response.startswith('json'):
                response = response[4:]
        
        result = json.loads(response)
        
        return {
            'vendor_name': vendor_info['name'],
            'vendor_url': vendor_info['url'],
            'filtered_specialties': result['likely_specialties'],
            'total_filtered': len(result['likely_specialties']),
            'original_total': len(all_specialties),
            'reasoning': result.get('reasoning', ''),
            'vendor_type': result.get('vendor_type', 'Unknown'),
            'success': True
        }
        
    except Exception as e:
        # If filtering fails, return all specialties as fallback
        return {
            'vendor_name': vendor_info['name'],
            'vendor_url': vendor_info['url'],
            'filtered_specialties': all_specialties,
            'total_filtered': len(all_specialties),
            'original_total': len(all_specialties),
            'reasoning': f'Filtering failed: {str(e)}',
            'vendor_type': 'Unknown',
            'success': False
        }

In [71]:
def analyze_vendor_complete(vendor_name, vendor_url, vendor_description, all_specialties, max_pages=10):
    """
    Complete pipeline: filter specialties, crawl website, analyze pages.
    """
    print(f"{'='*60}")
    print(f"ANALYZING VENDOR: {vendor_name}")
    print(f"{'='*60}\n")
    
    # Step 1: Get homepage content
    print("Step 1: Fetching homepage content...")
    homepage = get_webpage_content(vendor_url)
    
    if not homepage['success']:
        print(f"   ✗ Failed to fetch homepage: {homepage['error']}")
        return None
    
    print(f"   ✓ Homepage fetched: {len(homepage['text'])} characters\n")
    
    # Step 2: Filter specialties based on vendor info + homepage
    print(f"Step 2: Filtering specialties (from {len(all_specialties)} total)...")
    vendor_info = {
        'name': vendor_name,
        'url': vendor_url,
        'description': vendor_description,
        'homepage_content': homepage['text']
    }
    
    filter_results = filter_specialties_by_vendor(vendor_info, all_specialties, max_specialties=50)
    
    print(f"   ✓ Filtered to {filter_results['total_filtered']} likely specialties")
    print(f"   Vendor Type: {filter_results['vendor_type']}")
    print(f"   Reasoning: {filter_results['reasoning'][:150]}...\n")
    
    # Step 3: Crawl website
    print(f"Step 3: Crawling website (max {max_pages} pages)...")
    crawl_results = crawl_website(vendor_url, max_pages=max_pages, delay=2)
    print()
    
    # Step 4: Analyze pages with filtered specialty list
    print(f"Step 4: Analyzing {crawl_results['pages_crawled']} pages with LLM...")
    analysis_results = analyze_crawled_website(crawl_results, filter_results['filtered_specialties'])
    
    # Combine all results
    complete_results = {
        'vendor_name': vendor_name,
        'vendor_url': vendor_url,
        'vendor_description': vendor_description,
        'vendor_type': filter_results['vendor_type'],
        'filtering': {
            'original_specialty_count': filter_results['original_total'],
            'filtered_specialty_count': filter_results['total_filtered'],
            'filtered_specialties': filter_results['filtered_specialties'],
            'reasoning': filter_results['reasoning']
        },
        'crawling': {
            'pages_crawled': crawl_results['pages_crawled'],
            'urls_visited': crawl_results['visited_urls']
        },
        'analysis': {
            'pages_analyzed': analysis_results['pages_analyzed'],
            'unique_specialties_found': analysis_results['unique_specialties_found'],
            'total_unique_specialties': analysis_results['total_unique_specialties']
        }
    }
    
    # Display summary WITH EVIDENCE
    print(f"\n{'='*60}")
    print(f"ANALYSIS COMPLETE: {vendor_name}")
    print(f"{'='*60}")
    print(f"Vendor Type: {complete_results['vendor_type']}")
    print(f"Specialties filtered: {filter_results['original_total']} → {filter_results['total_filtered']}")
    print(f"Pages crawled: {crawl_results['pages_crawled']}")
    print(f"Specialties found: {analysis_results['total_unique_specialties']}")
    
    print(f"\n{'='*60}")
    print(f"FOUND SPECIALTIES WITH EVIDENCE:")
    print(f"{'='*60}\n")
    
    for i, spec in enumerate(analysis_results['unique_specialties_found'], 1):
        print(f"{i}. {spec['specialty']}")
        print(f"   Confidence: {spec['highest_confidence']}")
        print(f"   Found on {spec['occurrences']} page(s)")
        print(f"   Evidence:")
        
        # Show evidence from each page (limit to first 3 pages)
        for j, page in enumerate(spec['pages'][:3], 1):
            print(f"      [{j}] {page['url']}")
            print(f"          \"{page['evidence']}\"")
        
        if len(spec['pages']) > 3:
            print(f"      ... and {len(spec['pages']) - 3} more page(s)")
        
        print()
    
    return complete_results

In [72]:
def print_specialty_evidence(specialty_data, max_evidence_per_specialty=3):
    """
    Pretty print a specialty with all its evidence.
    
    Args:
        specialty_data (dict): Single specialty from unique_specialties_found
        max_evidence_per_specialty (int): Max evidence snippets to show
    """
    print(f"\n{'─'*60}")
    print(f"SPECIALTY: {specialty_data['specialty']}")
    print(f"{'─'*60}")
    print(f"Confidence Level: {specialty_data['highest_confidence']}")
    print(f"Mentioned on {specialty_data['occurrences']} page(s)")
    print(f"\nEvidence:")
    
    for i, page in enumerate(specialty_data['pages'][:max_evidence_per_specialty], 1):
        print(f"\n  [{i}] Page: {page['url']}")
        print(f"      Quote: \"{page['evidence']}\"")
    
    if len(specialty_data['pages']) > max_evidence_per_specialty:
        remaining = len(specialty_data['pages']) - max_evidence_per_specialty
        print(f"\n  ... {remaining} more occurrence(s) not shown")


def display_all_evidence(result):
    """
    Display detailed evidence for all found specialties.
    
    Args:
        result (dict): Output from analyze_vendor_complete()
    """
    print(f"\n{'='*70}")
    print(f"DETAILED EVIDENCE REPORT: {result['vendor_name']}")
    print(f"{'='*70}")
    
    specialties = result['analysis']['unique_specialties_found']
    
    # Sort by confidence and occurrences
    high_conf = [s for s in specialties if s['highest_confidence'] == 'HIGH']
    medium_conf = [s for s in specialties if s['highest_confidence'] == 'MEDIUM']
    low_conf = [s for s in specialties if s['highest_confidence'] == 'LOW']
    
    if high_conf:
        print(f"\n{'='*70}")
        print(f"HIGH CONFIDENCE SPECIALTIES ({len(high_conf)})")
        print(f"{'='*70}")
        for spec in high_conf:
            print_specialty_evidence(spec)
    
    if medium_conf:
        print(f"\n{'='*70}")
        print(f"MEDIUM CONFIDENCE SPECIALTIES ({len(medium_conf)})")
        print(f"{'='*70}")
        for spec in medium_conf:
            print_specialty_evidence(spec)
    
    if low_conf:
        print(f"\n{'='*70}")
        print(f"LOW CONFIDENCE SPECIALTIES ({len(low_conf)})")
        print(f"{'='*70}")
        for spec in low_conf:
            print_specialty_evidence(spec)

## results (preliminary)

In [73]:
vendor = vendors.iloc[5] ## for a given website ... 
print(vendor)
result = analyze_vendor_complete(
    vendor_name=vendor['company_name'],
    vendor_url=vendor['website_url'],
    vendor_description=vendor['company_description'],
    all_specialties=tags,
    max_pages=10
)

# Display detailed evidence
if result:
    display_all_evidence(result)

company_name                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  ATG:biosynthetics
website_url                                                                                                                                                                             

## Evaluation: Decoy Set

In [16]:
import random
import pandas as pd

def create_test_set(correct_tags, all_possible_tags, num_decoys=None):
    """
    Create a test set with correct tags + random decoy tags.
    
    Args:
        correct_tags (list): Ground truth tags for the vendor
        all_possible_tags (list): All available tags to sample decoys from
        num_decoys (int): Number of wrong tags to add (default: same as correct_tags)
    
    Returns:
        dict: Test set with correct and decoy tags
    """
    if num_decoys is None:
        num_decoys = len(correct_tags)
    
    # Remove correct tags from the pool to avoid duplicates
    available_decoys = [tag for tag in all_possible_tags if tag not in correct_tags]
    
    # Sample random decoy tags
    decoys = random.sample(available_decoys, min(num_decoys, len(available_decoys)))
    
    # Combine and shuffle
    test_tags = correct_tags + decoys
    random.shuffle(test_tags)
    
    return {
        'test_tags': test_tags,
        'correct_tags': set(correct_tags),
        'decoy_tags': set(decoys),
        'total_test': len(test_tags),
        'num_correct': len(correct_tags),
        'num_decoys': len(decoys)
    }


def evaluate_predictions(predicted_tags, ground_truth_tags, decoy_tags):
    """
    Evaluate prediction performance with precision, recall, F1.
    
    Args:
        predicted_tags (list): Tags predicted by the system
        ground_truth_tags (set): Actual correct tags
        decoy_tags (set): Known incorrect tags
    
    Returns:
        dict: Evaluation metrics
    """
    predicted_set = set(predicted_tags)
    
    # True Positives: correctly identified real tags
    true_positives = predicted_set & ground_truth_tags
    
    # False Positives: incorrectly identified decoy tags as real
    false_positives = predicted_set & decoy_tags
    
    # False Negatives: missed real tags
    false_negatives = ground_truth_tags - predicted_set
    
    # True Negatives: correctly rejected decoy tags
    true_negatives = decoy_tags - predicted_set
    
    # Calculate metrics
    precision = len(true_positives) / len(predicted_set) if len(predicted_set) > 0 else 0
    recall = len(true_positives) / len(ground_truth_tags) if len(ground_truth_tags) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    accuracy = (len(true_positives) + len(true_negatives)) / (len(ground_truth_tags) + len(decoy_tags))
    
    return {
        'true_positives': list(true_positives),
        'false_positives': list(false_positives),
        'false_negatives': list(false_negatives),
        'true_negatives': list(true_negatives),
        'num_tp': len(true_positives),
        'num_fp': len(false_positives),
        'num_fn': len(false_negatives),
        'num_tn': len(true_negatives),
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'accuracy': accuracy
    }


def run_verification_test(vendor_name, vendor_url, vendor_description, 
                         ground_truth_tags, all_possible_tags, 
                         num_decoys=None, max_pages=10, confidence_threshold='MEDIUM'):
    """
    Run complete verification test on a vendor.
    
    Args:
        vendor_name (str): Vendor name
        vendor_url (str): Vendor URL
        vendor_description (str): Vendor description
        ground_truth_tags (list): Known correct tags for this vendor
        all_possible_tags (list): All available tags
        num_decoys (int): Number of decoy tags to add
        max_pages (int): Max pages to crawl
        confidence_threshold (str): Minimum confidence to accept ('HIGH', 'MEDIUM', 'LOW')
    
    Returns:
        dict: Complete test results with metrics
    """
    print(f"\n{'='*70}")
    print(f"VERIFICATION TEST: {vendor_name}")
    print(f"{'='*70}\n")
    
    # Step 1: Create test set
    print("Step 1: Creating test set...")
    test_set = create_test_set(ground_truth_tags, all_possible_tags, num_decoys)
    print(f"   Test tags: {test_set['total_test']} ({test_set['num_correct']} correct + {test_set['num_decoys']} decoys)")
    
    # Step 2: Run analysis
    print("\nStep 2: Running analysis...")
    result = analyze_vendor_complete(
        vendor_name=vendor_name,
        vendor_url=vendor_url,
        vendor_description=vendor_description,
        all_specialties=test_set['test_tags'],
        max_pages=max_pages
    )
    
    if not result:
        print("   ✗ Analysis failed")
        return None
    
    # Step 3: Extract predictions based on confidence threshold
    print(f"\nStep 3: Extracting predictions (threshold: {confidence_threshold})...")
    
    confidence_levels = {'HIGH': 3, 'MEDIUM': 2, 'LOW': 1}
    threshold_value = confidence_levels[confidence_threshold]
    
    predicted_tags = []
    for spec in result['analysis']['unique_specialties_found']:
        spec_confidence = confidence_levels[spec['highest_confidence']]
        if spec_confidence >= threshold_value:
            predicted_tags.append(spec['specialty'])
    
    print(f"   Predicted {len(predicted_tags)} tags")
    
    # Step 4: Evaluate
    print("\nStep 4: Evaluating predictions...")
    metrics = evaluate_predictions(predicted_tags, test_set['correct_tags'], test_set['decoy_tags'])
    
    # Compile results
    verification_results = {
        'vendor_name': vendor_name,
        'vendor_url': vendor_url,
        'test_set': test_set,
        'analysis_result': result,
        'predicted_tags': predicted_tags,
        'metrics': metrics,
        'confidence_threshold': confidence_threshold
    }
    
    # Display results
    print(f"\n{'='*70}")
    print(f"VERIFICATION RESULTS: {vendor_name}")
    print(f"{'='*70}")
    print(f"\nTest Configuration:")
    print(f"  Ground truth tags: {test_set['num_correct']}")
    print(f"  Decoy tags: {test_set['num_decoys']}")
    print(f"  Total test tags: {test_set['total_test']}")
    print(f"  Confidence threshold: {confidence_threshold}")
    
    print(f"\nPerformance Metrics:")
    print(f"  Precision: {metrics['precision']:.2%} ({metrics['num_tp']}/{metrics['num_tp'] + metrics['num_fp']})")
    print(f"  Recall:    {metrics['recall']:.2%} ({metrics['num_tp']}/{metrics['num_tp'] + metrics['num_fn']})")
    print(f"  F1 Score:  {metrics['f1_score']:.2%}")
    print(f"  Accuracy:  {metrics['accuracy']:.2%}")
    
    print(f"\nConfusion Matrix:")
    print(f"  True Positives:  {metrics['num_tp']} (correctly found real tags)")
    print(f"  False Positives: {metrics['num_fp']} (incorrectly identified decoys)")
    print(f"  False Negatives: {metrics['num_fn']} (missed real tags)")
    print(f"  True Negatives:  {metrics['num_tn']} (correctly rejected decoys)")
    
    if metrics['false_positives']:
        print(f"\n❌ False Positives (decoys incorrectly identified):")
        for tag in metrics['false_positives']:
            print(f"     - {tag}")
    
    if metrics['false_negatives']:
        print(f"\n⚠️  False Negatives (real tags missed):")
        for tag in metrics['false_negatives']:
            print(f"     - {tag}")
    
    if metrics['true_positives']:
        print(f"\n✅ True Positives (correctly identified):")
        for tag in metrics['true_positives'][:5]:  # Show first 5
            print(f"     - {tag}")
        if len(metrics['true_positives']) > 5:
            print(f"     ... and {len(metrics['true_positives']) - 5} more")
    
    return verification_results


def save_verification_results(results, filename):
    """Save verification results to JSON file."""
    # Remove large nested objects for cleaner JSON
    simplified_results = {
        'vendor_name': results['vendor_name'],
        'vendor_url': results['vendor_url'],
        'test_config': {
            'ground_truth_count': results['test_set']['num_correct'],
            'decoy_count': results['test_set']['num_decoys'],
            'confidence_threshold': results['confidence_threshold']
        },
        'metrics': results['metrics'],
        'ground_truth_tags': list(results['test_set']['correct_tags']),
        'decoy_tags': list(results['test_set']['decoy_tags']),
        'predicted_tags': results['predicted_tags']
    }
    
    with open(filename, 'w') as f:
        json.dump(simplified_results, f, indent=2)
    
    print(f"\nResults saved to {filename}")

In [23]:
# Extract ground truth from your results
atg_result = result  # Your existing result

# Get all HIGH confidence tags as ground truth
ground_truth_tags = [
    spec['specialty'] 
    for spec in atg_result['analysis']['unique_specialties_found'] 
    if spec['highest_confidence'] == 'HIGH'
]

print(f"Ground Truth Tags (HIGH confidence): {len(ground_truth_tags)}")
for tag in ground_truth_tags:
    print(f"  - {tag}")

# Create test set with decoys
test_set = create_test_set(
    correct_tags=ground_truth_tags,
    all_possible_tags=tags,  # Your full tag list
    num_decoys=len(ground_truth_tags)  # Same number as correct tags
)

print(f"\n{'='*60}")
print(f"TEST SET CREATED")
print(f"{'='*60}")
print(f"Correct tags: {test_set['num_correct']}")
print(f"Decoy tags: {test_set['num_decoys']}")
print(f"Total test tags: {test_set['total_test']}")

print(f"\nSample of test tags (showing first 10):")
for i, tag in enumerate(test_set['test_tags'][:10], 1):
    is_correct = "✓" if tag in test_set['correct_tags'] else "✗"
    print(f"  {i}. {is_correct} {tag}")

Ground Truth Tags (HIGH confidence): 9
  - Gene Synthesis
  - DNA Assembly
  - Sequence Optimization
  - Bioinformatics Consulting
  - Synthetic and Analytical Bioinformatics
  - Protein Expression
  - Mammalian and Microbial Cell Manufacturing
  - Pathway Analysis and Metabolic Network Reconstruction
  - Comparative Genomics

TEST SET CREATED
Correct tags: 9
Decoy tags: 9
Total test tags: 18

Sample of test tags (showing first 10):
  1. ✗ Tablet Compression
  2. ✓ Pathway Analysis and Metabolic Network Reconstruction
  3. ✓ DNA Assembly
  4. ✓ Comparative Genomics
  5. ✓ Bioinformatics Consulting
  6. ✗ Inductively Coupled Plasma (ICP)
  7. ✗ Amination
  8. ✓ Synthetic and Analytical Bioinformatics
  9. ✓ Mammalian and Microbial Cell Manufacturing
  10. ✓ Sequence Optimization


In [72]:
# Strategy 1: Only HIGH confidence as ground truth
high_only = [s['specialty'] for s in atg_result['analysis']['unique_specialties_found'] 
             if s['highest_confidence'] == 'HIGH']

# Strategy 2: HIGH + MEDIUM as ground truth
high_medium = [s['specialty'] for s in atg_result['analysis']['unique_specialties_found'] 
               if s['highest_confidence'] in ['HIGH', 'MEDIUM']]

print(f"Strategy 1 (HIGH only): {len(high_only)} tags")
print(f"Strategy 2 (HIGH + MEDIUM): {len(high_medium)} tags")

# Test both
for strategy_name, ground_truth in [('HIGH_only', high_only), ('HIGH_MEDIUM', high_medium)]:
    print(f"\n{'#'*70}")
    print(f"Testing Strategy: {strategy_name}")
    print(f"{'#'*70}")
    
    results = run_verification_test(
        vendor_name=atg_result['vendor_name'],
        vendor_url=atg_result['vendor_url'],
        vendor_description=atg_result['vendor_description'],
        ground_truth_tags=ground_truth,
        all_possible_tags=tags,
        num_decoys=len(ground_truth),
        max_pages=10,
        confidence_threshold='MEDIUM'
    )
    
    if results:
        save_verification_results(results, f'verification_ATG_{strategy_name}.json')

Strategy 1 (HIGH only): 7 tags
Strategy 2 (HIGH + MEDIUM): 15 tags

######################################################################
Testing Strategy: HIGH_only
######################################################################

VERIFICATION TEST: ATG:biosynthetics

Step 1: Creating test set...
   Test tags: 14 (7 correct + 7 decoys)

Step 2: Running analysis...
ANALYZING VENDOR: ATG:biosynthetics

Step 1: Fetching homepage content...
   ✓ Homepage fetched: 1784 characters

Step 2: Filtering specialties (from 14 total)...
   ✓ Filtered to 7 likely specialties
   Vendor Type: Synthetic Biology and Bioinformatics Services Provider
   Reasoning: ATG:biosynthetics specializes in synthetic biology and biotechnology, focusing on gene synthesis, gene optimization, and multi-gene expression. They o...

Step 3: Crawling website (max 10 pages)...
Starting URL: http://www.atg-biosynthetics.com/
Actual URL after redirect: https://www.atg-biosynthetics.com/

Starting crawl of https://www.

## Eval Strategy 2: inter-annotator consistency

In [24]:
import numpy as np
from collections import defaultdict

def calculate_cohens_kappa(annotations1, annotations2, all_possible_tags):
    """
    Calculate Cohen's Kappa for agreement between two annotation runs.
    
    Args:
        annotations1 (set): Tags from first run
        annotations2 (set): Tags from second run
        all_possible_tags (set): All tags that could be identified
    
    Returns:
        float: Cohen's Kappa coefficient
    """
    # Create binary vectors for both annotations
    n = len(all_possible_tags)
    
    # Observed agreement
    agree = 0
    for tag in all_possible_tags:
        if (tag in annotations1 and tag in annotations2) or (tag not in annotations1 and tag not in annotations2):
            agree += 1
    
    po = agree / n  # Proportion of observed agreement
    
    # Expected agreement by chance
    p1_yes = len(annotations1) / n
    p2_yes = len(annotations2) / n
    p1_no = 1 - p1_yes
    p2_no = 1 - p2_yes
    
    pe = (p1_yes * p2_yes) + (p1_no * p2_no)  # Expected agreement
    
    # Cohen's Kappa
    if pe == 1:
        return 1.0
    kappa = (po - pe) / (1 - pe)
    
    return kappa


def calculate_fleiss_kappa(all_annotations, all_possible_tags):
    """
    Calculate Fleiss' Kappa for agreement among multiple runs (3+).
    
    Args:
        all_annotations (list of sets): List of tag sets from each run
        all_possible_tags (set): All tags that could be identified
    
    Returns:
        float: Fleiss' Kappa coefficient
    """
    n_items = len(all_possible_tags)  # Number of items being judged
    n_raters = len(all_annotations)   # Number of raters (runs)
    
    # Create matrix: items x raters
    # For each tag, count how many runs identified it
    counts = []
    for tag in all_possible_tags:
        yes_count = sum(1 for annotation in all_annotations if tag in annotation)
        no_count = n_raters - yes_count
        counts.append([yes_count, no_count])
    
    counts = np.array(counts)
    
    # Calculate P_i (proportion of agreement for each item)
    P_i = []
    for row in counts:
        sum_sq = sum(c**2 for c in row)
        P_i.append((sum_sq - n_raters) / (n_raters * (n_raters - 1)))
    
    P_bar = np.mean(P_i)  # Mean proportion of agreement
    
    # Calculate P_e (expected agreement by chance)
    p_j = counts.sum(axis=0) / (n_items * n_raters)  # Proportion for each category
    P_e = sum(p**2 for p in p_j)
    
    # Fleiss' Kappa
    if P_e == 1:
        return 1.0
    kappa = (P_bar - P_e) / (1 - P_e)
    
    return kappa


def run_inter_annotation_agreement(vendor_name, vendor_url, vendor_description, 
                                   test_tags, num_runs=3, max_pages=10, 
                                   confidence_threshold='MEDIUM'):
    """
    Run multiple annotations and calculate inter-annotator agreement.
    
    Args:
        vendor_name (str): Vendor name
        vendor_url (str): Vendor URL
        vendor_description (str): Vendor description
        test_tags (list): Tags to test
        num_runs (int): Number of times to run (3+ recommended)
        max_pages (int): Pages to crawl
        confidence_threshold (str): Confidence threshold for predictions
    
    Returns:
        dict: Agreement metrics and detailed results
    """
    print(f"\n{'='*70}")
    print(f"INTER-ANNOTATION AGREEMENT TEST")
    print(f"{'='*70}")
    print(f"Vendor: {vendor_name}")
    print(f"Number of runs: {num_runs}")
    print(f"Confidence threshold: {confidence_threshold}\n")
    
    all_annotations = []
    all_results = []
    confidence_levels = {'HIGH': 3, 'MEDIUM': 2, 'LOW': 1}
    threshold_value = confidence_levels[confidence_threshold]
    
    # Run multiple times
    for run_num in range(num_runs):
        print(f"\n{'='*60}")
        print(f"RUN {run_num + 1}/{num_runs}")
        print(f"{'='*60}")
        
        result = analyze_vendor_complete(
            vendor_name=vendor_name,
            vendor_url=vendor_url,
            vendor_description=vendor_description,
            all_specialties=test_tags,
            max_pages=max_pages
        )
        
        if result:
            # Extract predictions based on confidence threshold
            predicted_tags = set([
                spec['specialty'] 
                for spec in result['analysis']['unique_specialties_found']
                if confidence_levels[spec['highest_confidence']] >= threshold_value
            ])
            
            all_annotations.append(predicted_tags)
            all_results.append(result)
            
            print(f"   Found {len(predicted_tags)} tags in this run")
        else:
            print(f"   ✗ Run {run_num + 1} failed")
            return None
        
        time.sleep(2)  # Brief pause between runs
    
    # Calculate agreement metrics
    print(f"\n{'='*70}")
    print(f"CALCULATING AGREEMENT METRICS")
    print(f"{'='*70}")
    
    all_possible_tags = set(test_tags)
    
    # Pairwise Cohen's Kappa
    pairwise_kappas = []
    for i in range(len(all_annotations)):
        for j in range(i + 1, len(all_annotations)):
            kappa = calculate_cohens_kappa(all_annotations[i], all_annotations[j], all_possible_tags)
            pairwise_kappas.append(kappa)
            print(f"Cohen's Kappa (Run {i+1} vs Run {j+1}): {kappa:.3f}")
    
    avg_cohens_kappa = np.mean(pairwise_kappas)
    
    # Fleiss' Kappa (if 3+ runs)
    fleiss_kappa = None
    if num_runs >= 3:
        fleiss_kappa = calculate_fleiss_kappa(all_annotations, all_possible_tags)
        print(f"\nFleiss' Kappa (overall agreement): {fleiss_kappa:.3f}")
    
    # Analyze tag-level consistency
    tag_frequency = defaultdict(int)
    tag_confidence_by_run = defaultdict(list)
    
    for run_idx, result in enumerate(all_results):
        for spec in result['analysis']['unique_specialties_found']:
            tag_frequency[spec['specialty']] += 1
            tag_confidence_by_run[spec['specialty']].append(spec['highest_confidence'])
    
    # Categorize tags by consistency
    always_found = [tag for tag, count in tag_frequency.items() if count == num_runs]
    sometimes_found = [tag for tag, count in tag_frequency.items() if 0 < count < num_runs]
    never_found = [tag for tag in test_tags if tag not in tag_frequency]
    
    # Simple agreement percentage
    union_tags = set.union(*all_annotations)
    intersection_tags = set.intersection(*all_annotations)
    agreement_pct = len(intersection_tags) / len(union_tags) if union_tags else 0
    
    # Display results
    print(f"\n{'='*70}")
    print(f"INTER-ANNOTATION AGREEMENT RESULTS")
    print(f"{'='*70}")
    
    print(f"\nOverall Metrics:")
    print(f"  Average Cohen's Kappa: {avg_cohens_kappa:.3f}")
    if fleiss_kappa is not None:
        print(f"  Fleiss' Kappa: {fleiss_kappa:.3f}")
    print(f"  Simple Agreement: {agreement_pct:.2%}")
    
    print(f"\nInterpretation of Kappa:")
    kappa_to_check = fleiss_kappa if fleiss_kappa is not None else avg_cohens_kappa
    if kappa_to_check < 0:
        interpretation = "Poor (worse than chance)"
    elif kappa_to_check < 0.20:
        interpretation = "Slight"
    elif kappa_to_check < 0.40:
        interpretation = "Fair"
    elif kappa_to_check < 0.60:
        interpretation = "Moderate"
    elif kappa_to_check < 0.80:
        interpretation = "Substantial"
    else:
        interpretation = "Almost Perfect"
    print(f"  {interpretation}")
    
    print(f"\nTag Consistency:")
    print(f"  Found in ALL {num_runs} runs: {len(always_found)} tags")
    print(f"  Found in SOME runs: {len(sometimes_found)} tags")
    print(f"  Found in NO runs: {len(never_found)} tags")
    
    if always_found:
        print(f"\n✓ CONSISTENT TAGS (found in all {num_runs} runs):")
        for tag in sorted(always_found)[:10]:
            confidences = tag_confidence_by_run[tag]
            print(f"    • {tag} (confidence: {', '.join(confidences)})")
        if len(always_found) > 10:
            print(f"    ... and {len(always_found) - 10} more")
    
    if sometimes_found:
        print(f"\n~ INCONSISTENT TAGS (found in some runs):")
        for tag in sorted(sometimes_found)[:10]:
            count = tag_frequency[tag]
            confidences = tag_confidence_by_run[tag]
            print(f"    • {tag} ({count}/{num_runs} runs, confidence: {', '.join(confidences)})")
        if len(sometimes_found) > 10:
            print(f"    ... and {len(sometimes_found) - 10} more")
    
    # Detailed per-tag analysis
    print(f"\n{'='*70}")
    print(f"PER-TAG CONSISTENCY ANALYSIS")
    print(f"{'='*70}")
    
    tag_consistency_scores = {}
    for tag in union_tags:
        appearances = sum(1 for annotation in all_annotations if tag in annotation)
        consistency = appearances / num_runs
        tag_consistency_scores[tag] = consistency
    
    # Sort by consistency
    sorted_tags = sorted(tag_consistency_scores.items(), key=lambda x: x[1], reverse=True)
    
    print(f"\nTop 15 Most Consistent Tags:")
    for i, (tag, score) in enumerate(sorted_tags[:15], 1):
        print(f"  {i}. {tag}: {score:.1%} ({int(score * num_runs)}/{num_runs} runs)")
    
    # Return comprehensive results
    return {
        'vendor_name': vendor_name,
        'num_runs': num_runs,
        'confidence_threshold': confidence_threshold,
        'metrics': {
            'avg_cohens_kappa': avg_cohens_kappa,
            'fleiss_kappa': fleiss_kappa,
            'simple_agreement': agreement_pct,
            'pairwise_kappas': pairwise_kappas
        },
        'tag_analysis': {
            'always_found': sorted(always_found),
            'sometimes_found': sorted(sometimes_found),
            'never_found': sorted(never_found),
            'tag_frequency': dict(tag_frequency),
            'tag_consistency_scores': tag_consistency_scores
        },
        'all_annotations': [list(a) for a in all_annotations],
        'all_results': all_results
    }


def save_agreement_results(agreement_results, filename):
    """Save agreement results to JSON."""
    # Simplify for JSON serialization
    simplified = {
        'vendor_name': agreement_results['vendor_name'],
        'num_runs': agreement_results['num_runs'],
        'confidence_threshold': agreement_results['confidence_threshold'],
        'metrics': agreement_results['metrics'],
        'tag_analysis': {
            'always_found': agreement_results['tag_analysis']['always_found'],
            'sometimes_found': agreement_results['tag_analysis']['sometimes_found'],
            'never_found': agreement_results['tag_analysis']['never_found'][:50],  # Limit for file size
            'tag_frequency': agreement_results['tag_analysis']['tag_frequency'],
            'consistency_scores': {k: v for k, v in sorted(
                agreement_results['tag_analysis']['tag_consistency_scores'].items(), 
                key=lambda x: x[1], reverse=True
            )[:50]}  # Top 50 most consistent
        },
        'all_annotations': agreement_results['all_annotations']
    }
    
    with open(filename, 'w') as f:
        json.dump(simplified, f, indent=2)
    
    print(f"\nAgreement results saved to {filename}")

In [25]:
# Test inter-annotation agreement on ATG
agreement_results = run_inter_annotation_agreement(
    vendor_name='ATG:biosynthetics',
    vendor_url='http://www.atg-biosynthetics.com/',
    vendor_description=atg_result['vendor_description'],
    test_tags=atg_result['filtering']['filtered_specialties'],  # Use filtered tags
    num_runs=3,
    max_pages=10,
    confidence_threshold='MEDIUM'
)

if agreement_results:
    save_agreement_results(agreement_results, 'inter_annotation_agreement_ATG.json')


INTER-ANNOTATION AGREEMENT TEST
Vendor: ATG:biosynthetics
Number of runs: 3
Confidence threshold: MEDIUM


RUN 1/3
ANALYZING VENDOR: ATG:biosynthetics

Step 1: Fetching homepage content...
   ✓ Homepage fetched: 1784 characters

Step 2: Filtering specialties (from 29 total)...
   ✓ Filtered to 11 likely specialties
   Vendor Type: Synthetic Biology and Bioinformatics Service Provider
   Reasoning: ATG:biosynthetics focuses on synthetic biology and bioinformatics, offering services such as gene synthesis, optimization, and multi-gene expression s...

Step 3: Crawling website (max 10 pages)...
Starting URL: http://www.atg-biosynthetics.com/
Actual URL after redirect: https://www.atg-biosynthetics.com/

Starting crawl of https://www.atg-biosynthetics.com
Max pages: 10, Delay: 2s

Crawling [1/10]: https://www.atg-biosynthetics.com/
Crawling [2/10]: https://www.atg-biosynthetics.com/News/CompanyNews.html
Crawling [3/10]: https://www.atg-biosynthetics.com/Optimizations/InfoRequestOpt.html
C

## Eval Strategy 3: Audit the tags + evidence

In [27]:
def audit_evidence_quality(result, audit_criteria=None):
    """
    Audit the quality of evidence provided for each tag prediction.
    
    Args:
        result (dict): Result from analyze_vendor_complete()
        audit_criteria (dict): Custom criteria for evidence quality
    
    Returns:
        dict: Audit results with quality scores
    """
    if audit_criteria is None:
        audit_criteria = {
            'min_evidence_length': 20,  # Minimum characters in evidence
            'max_evidence_length': 500,  # Maximum for meaningful excerpt
            'keywords_required': True,   # Evidence should contain keywords
            'multiple_sources_bonus': True  # Higher score for multiple pages
        }
    
    audited_tags = []
    
    print(f"\n{'='*70}")
    print(f"EVIDENCE QUALITY AUDIT")
    print(f"{'='*70}")
    print(f"Vendor: {result['vendor_name']}\n")
    
    for spec in result['analysis']['unique_specialties_found']:
        tag_name = spec['specialty']
        confidence = spec['highest_confidence']
        occurrences = spec['occurrences']
        pages = spec['pages']
        
        # Audit each piece of evidence
        evidence_scores = []
        evidence_issues = []
        
        for page in pages:
            evidence = page['evidence']
            url = page['url']
            score = 0
            issues = []
            
            # Check 1: Evidence length
            if len(evidence) < audit_criteria['min_evidence_length']:
                issues.append(f"Too short ({len(evidence)} chars)")
                score -= 1
            elif len(evidence) > audit_criteria['max_evidence_length']:
                issues.append(f"Too long ({len(evidence)} chars)")
                score -= 0.5
            else:
                score += 1
            
            # Check 2: Contains relevant keywords
            tag_keywords = tag_name.lower().split()
            evidence_lower = evidence.lower()
            keyword_matches = sum(1 for keyword in tag_keywords if keyword in evidence_lower)
            
            if audit_criteria['keywords_required']:
                if keyword_matches == 0:
                    issues.append("No tag keywords found in evidence")
                    score -= 2
                elif keyword_matches < len(tag_keywords) / 2:
                    issues.append("Few tag keywords found")
                    score -= 0.5
                else:
                    score += 1
            
            # Check 3: Evidence is not too generic
            generic_phrases = [
                'services', 'we offer', 'our company', 'contact us',
                'home', 'about', 'menu', 'click here'
            ]
            generic_count = sum(1 for phrase in generic_phrases if phrase in evidence_lower)
            if generic_count > 2:
                issues.append("Evidence appears generic/boilerplate")
                score -= 1
            
            # Check 4: Evidence is specific and descriptive
            if any(word in evidence_lower for word in ['analysis', 'testing', 'development', 'synthesis', 'production']):
                score += 0.5
            
            evidence_scores.append({
                'url': url,
                'evidence': evidence,
                'score': score,
                'issues': issues
            })
        
        # Calculate aggregate quality score
        avg_evidence_score = sum(e['score'] for e in evidence_scores) / len(evidence_scores)
        
        # Bonus for multiple sources
        if audit_criteria['multiple_sources_bonus'] and occurrences > 1:
            avg_evidence_score += min(occurrences * 0.2, 1.0)
        
        # Normalize to 0-10 scale
        quality_score = max(0, min(10, (avg_evidence_score + 3) * 2))
        
        # Quality rating
        if quality_score >= 8:
            quality_rating = "EXCELLENT"
        elif quality_score >= 6:
            quality_rating = "GOOD"
        elif quality_score >= 4:
            quality_rating = "FAIR"
        else:
            quality_rating = "POOR"
        
        audited_tags.append({
            'specialty': tag_name,
            'confidence': confidence,
            'occurrences': occurrences,
            'quality_score': quality_score,
            'quality_rating': quality_rating,
            'evidence_details': evidence_scores,
            'all_issues': [issue for e in evidence_scores for issue in e['issues']]
        })
    
    # Sort by quality score
    audited_tags.sort(key=lambda x: x['quality_score'], reverse=True)
    
    # Display audit results
    print(f"\nTags sorted by evidence quality:\n")
    
    for i, tag_audit in enumerate(audited_tags, 1):
        print(f"{i}. {tag_audit['specialty']}")
        print(f"   Confidence: {tag_audit['confidence']}")
        print(f"   Quality Score: {tag_audit['quality_score']:.1f}/10 ({tag_audit['quality_rating']})")
        print(f"   Found on {tag_audit['occurrences']} page(s)")
        
        if tag_audit['all_issues']:
            print(f"   ⚠️  Issues: {len(tag_audit['all_issues'])} total")
            for issue in set(tag_audit['all_issues'])[:3]:
                print(f"      - {issue}")
        
        print()
    
    # Summary statistics
    avg_quality = sum(t['quality_score'] for t in audited_tags) / len(audited_tags)
    excellent_count = sum(1 for t in audited_tags if t['quality_rating'] == 'EXCELLENT')
    poor_count = sum(1 for t in audited_tags if t['quality_rating'] == 'POOR')
    
    print(f"\n{'='*70}")
    print(f"AUDIT SUMMARY")
    print(f"{'='*70}")
    print(f"Total tags audited: {len(audited_tags)}")
    print(f"Average quality score: {avg_quality:.1f}/10")
    print(f"Quality distribution:")
    print(f"  EXCELLENT: {excellent_count}")
    print(f"  GOOD: {sum(1 for t in audited_tags if t['quality_rating'] == 'GOOD')}")
    print(f"  FAIR: {sum(1 for t in audited_tags if t['quality_rating'] == 'FAIR')}")
    print(f"  POOR: {poor_count}")
    
    return {
        'vendor_name': result['vendor_name'],
        'audited_tags': audited_tags,
        'summary': {
            'avg_quality_score': avg_quality,
            'total_tags': len(audited_tags),
            'excellent_count': excellent_count,
            'poor_count': poor_count
        }
    }


def detailed_evidence_audit(tag_audit):
    """
    Display detailed audit for a single tag with all evidence.
    
    Args:
        tag_audit (dict): Single tag audit from audit_evidence_quality()
    """
    print(f"\n{'='*70}")
    print(f"DETAILED EVIDENCE AUDIT: {tag_audit['specialty']}")
    print(f"{'='*70}")
    print(f"Overall Quality: {tag_audit['quality_score']:.1f}/10 ({tag_audit['quality_rating']})")
    print(f"Confidence: {tag_audit['confidence']}")
    print(f"Found on {tag_audit['occurrences']} page(s)\n")
    
    for i, evidence_detail in enumerate(tag_audit['evidence_details'], 1):
        print(f"Evidence #{i}")
        print(f"{'─'*70}")
        print(f"URL: {evidence_detail['url']}")
        print(f"Score: {evidence_detail['score']:.1f}")
        print(f"Evidence: \"{evidence_detail['evidence']}\"")
        
        if evidence_detail['issues']:
            print(f"Issues:")
            for issue in evidence_detail['issues']:
                print(f"  ⚠️  {issue}")
        else:
            print(f"✓ No issues found")
        
        print()


def create_evidence_audit_report(result, filename):
    """
    Create a detailed evidence audit report and save to file.
    """
    audit_results = audit_evidence_quality(result)
    
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(f"EVIDENCE QUALITY AUDIT REPORT\n")
        f.write(f"{'='*70}\n\n")
        f.write(f"Vendor: {result['vendor_name']}\n")
        f.write(f"Date: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        
        f.write(f"SUMMARY\n")
        f.write(f"{'─'*70}\n")
        f.write(f"Total tags: {audit_results['summary']['total_tags']}\n")
        f.write(f"Average quality score: {audit_results['summary']['avg_quality_score']:.1f}/10\n")
        f.write(f"Excellent quality: {audit_results['summary']['excellent_count']}\n")
        f.write(f"Poor quality: {audit_results['summary']['poor_count']}\n\n")
        
        f.write(f"DETAILED TAG AUDITS\n")
        f.write(f"{'='*70}\n\n")
        
        for tag_audit in audit_results['audited_tags']:
            f.write(f"\n{'─'*70}\n")
            f.write(f"TAG: {tag_audit['specialty']}\n")
            f.write(f"{'─'*70}\n")
            f.write(f"Confidence: {tag_audit['confidence']}\n")
            f.write(f"Quality Score: {tag_audit['quality_score']:.1f}/10 ({tag_audit['quality_rating']})\n")
            f.write(f"Found on {tag_audit['occurrences']} page(s)\n\n")
            
            for i, evidence in enumerate(tag_audit['evidence_details'], 1):
                f.write(f"  Evidence #{i}:\n")
                f.write(f"    URL: {evidence['url']}\n")
                f.write(f"    Score: {evidence['score']:.1f}\n")
                f.write(f"    Text: \"{evidence['evidence']}\"\n")
                
                if evidence['issues']:
                    f.write(f"    Issues:\n")
                    for issue in evidence['issues']:
                        f.write(f"      - {issue}\n")
                else:
                    f.write(f"    ✓ No issues\n")
                
                f.write(f"\n")
    
    print(f"\nEvidence audit report saved to {filename}")
    return audit_results


def compare_evidence_quality_across_confidence(result):
    """
    Compare evidence quality across different confidence levels.
    """
    audit_results = audit_evidence_quality(result)
    
    by_confidence = {'HIGH': [], 'MEDIUM': [], 'LOW': []}
    
    for tag_audit in audit_results['audited_tags']:
        by_confidence[tag_audit['confidence']].append(tag_audit['quality_score'])
    
    print(f"\n{'='*70}")
    print(f"EVIDENCE QUALITY BY CONFIDENCE LEVEL")
    print(f"{'='*70}")
    
    for conf_level in ['HIGH', 'MEDIUM', 'LOW']:
        if by_confidence[conf_level]:
            avg_quality = sum(by_confidence[conf_level]) / len(by_confidence[conf_level])
            print(f"{conf_level:6s}: Avg quality = {avg_quality:.1f}/10 (n={len(by_confidence[conf_level])})")
    
    return by_confidence


def flag_suspicious_evidence(result, min_quality_score=4):
    """
    Flag tags with poor quality evidence for manual review.
    """
    audit_results = audit_evidence_quality(result)
    
    suspicious = [
        tag for tag in audit_results['audited_tags'] 
        if tag['quality_score'] < min_quality_score
    ]
    
    print(f"\n{'='*70}")
    print(f"TAGS FLAGGED FOR MANUAL REVIEW")
    print(f"{'='*70}")
    print(f"Quality threshold: {min_quality_score}/10")
    print(f"Flagged: {len(suspicious)}/{audit_results['summary']['total_tags']} tags\n")
    
    if suspicious:
        for tag in suspicious:
            print(f"⚠️  {tag['specialty']}")
            print(f"   Quality: {tag['quality_score']:.1f}/10 ({tag['quality_rating']})")
            print(f"   Confidence: {tag['confidence']}")
            print(f"   Common issues: {', '.join(set(tag['all_issues'])[:2])}")
            print()
    else:
        print("✓ No tags flagged. All evidence quality is acceptable.")
    
    return suspicious

In [28]:
def llm_validate_evidence(specialty, evidence, url):
    """
    Use LLM to validate if evidence actually supports the specialty claim.
    
    Args:
        specialty (str): The specialty/tag being claimed
        evidence (str): The evidence text
        url (str): URL where evidence was found
    
    Returns:
        dict: Validation results
    """
    prompt = f"""You are auditing evidence quality for a vendor tag classification system.

SPECIALTY/TAG: {specialty}
EVIDENCE PROVIDED: "{evidence}"
SOURCE URL: {url}

Evaluate this evidence and answer:

1. Does this evidence ACTUALLY support that the vendor offers "{specialty}"? (YES/NO/UNCLEAR)
2. Quality rating: STRONG/ADEQUATE/WEAK/IRRELEVANT
3. What specifically in the evidence supports or contradicts the tag?
4. Is this evidence specific enough, or is it too generic/vague?
5. Suggested improvement: What would better evidence look like?

Respond in JSON format:
{{
  "supports_tag": "YES/NO/UNCLEAR",
  "quality": "STRONG/ADEQUATE/WEAK/IRRELEVANT",
  "reasoning": "brief explanation",
  "specificity": "SPECIFIC/GENERIC/VAGUE",
  "improvement_suggestion": "what better evidence would look like"
}}
"""
    
    try:
        response = get_chat_response(
            user_message=prompt,
            system_message="You are an expert auditor evaluating evidence quality for classification systems.",
            model="gpt-4o"
        )
        
        # Parse response
        response = response.strip()
        if response.startswith('```'):
            response = response.split('```')[1]
            if response.startswith('json'):
                response = response[4:]
        
        validation = json.loads(response)
        return validation
        
    except Exception as e:
        return {
            'error': str(e),
            'supports_tag': 'UNCLEAR',
            'quality': 'UNKNOWN'
        }


def llm_audit_all_evidence(result, sample_size=None):
    """
    Use LLM to audit evidence for all or sampled tags.
    
    Args:
        result (dict): Result from analyze_vendor_complete()
        sample_size (int): If set, randomly sample this many tags to audit
    """
    tags_to_audit = result['analysis']['unique_specialties_found']
    
    if sample_size and sample_size < len(tags_to_audit):
        tags_to_audit = random.sample(tags_to_audit, sample_size)
        print(f"Auditing random sample of {sample_size} tags...")
    else:
        print(f"Auditing all {len(tags_to_audit)} tags...")
    
    audit_results = []
    
    for spec in tags_to_audit:
        print(f"\nAuditing: {spec['specialty']}")
        
        # Audit first piece of evidence for each tag
        evidence = spec['pages'][0]['evidence']
        url = spec['pages'][0]['url']
        
        validation = llm_validate_evidence(spec['specialty'], evidence, url)
        
        audit_results.append({
            'specialty': spec['specialty'],
            'confidence': spec['highest_confidence'],
            'validation': validation
        })
        
        print(f"  LLM says: {validation.get('supports_tag', 'UNKNOWN')} ({validation.get('quality', 'UNKNOWN')})")
        
        time.sleep(1)  # Rate limiting
    
    # Summary
    print(f"\n{'='*70}")
    print(f"LLM EVIDENCE AUDIT SUMMARY")
    print(f"{'='*70}")
    
    supports_yes = sum(1 for a in audit_results if a['validation'].get('supports_tag') == 'YES')
    supports_no = sum(1 for a in audit_results if a['validation'].get('supports_tag') == 'NO')
    supports_unclear = sum(1 for a in audit_results if a['validation'].get('supports_tag') == 'UNCLEAR')
    
    print(f"Tags with supporting evidence: {supports_yes}/{len(audit_results)}")
    print(f"Tags with contradicting evidence: {supports_no}/{len(audit_results)}")
    print(f"Tags with unclear evidence: {supports_unclear}/{len(audit_results)}")
    
    quality_strong = sum(1 for a in audit_results if a['validation'].get('quality') == 'STRONG')
    quality_adequate = sum(1 for a in audit_results if a['validation'].get('quality') == 'ADEQUATE')
    quality_weak = sum(1 for a in audit_results if a['validation'].get('quality') == 'WEAK')
    
    print(f"\nEvidence quality:")
    print(f"  STRONG: {quality_strong}")
    print(f"  ADEQUATE: {quality_adequate}")
    print(f"  WEAK/IRRELEVANT: {quality_weak}")
    
    # Show problematic cases
    problematic = [a for a in audit_results if a['validation'].get('supports_tag') in ['NO', 'UNCLEAR']]
    if problematic:
        print(f"\n⚠️  PROBLEMATIC TAGS (for manual review):")
        for a in problematic:
            print(f"  - {a['specialty']} ({a['confidence']})")
            print(f"    Reason: {a['validation'].get('reasoning', 'N/A')[:80]}...")
    
    return audit_results

In [30]:
# Or audit everything (expensive!)
llm_audit_results = llm_audit_all_evidence(atg_result)

Auditing all 18 tags...

Auditing: Gene Synthesis
  LLM says: YES (ADEQUATE)

Auditing: DNA Assembly
  LLM says: UNCLEAR (WEAK)

Auditing: Sequence Optimization
  LLM says: UNCLEAR (WEAK)

Auditing: Synthetic Gene Circuits
  LLM says: UNCLEAR (WEAK)

Auditing: Bioinformatics Consulting
  LLM says: UNCLEAR (WEAK)

Auditing: Synthetic and Analytical Bioinformatics
  LLM says: UNCLEAR (WEAK)

Auditing: Protein Expression
  LLM says: YES (ADEQUATE)

Auditing: Mammalian and Microbial Cell Manufacturing
  LLM says: YES (ADEQUATE)

Auditing: Computational Biology
  LLM says: UNCLEAR (WEAK)

Auditing: Gene Expression Measurements
  LLM says: UNCLEAR (WEAK)

Auditing: Protein Design
  LLM says: UNCLEAR (WEAK)

Auditing: Pathway Analysis and Metabolic Network Reconstruction
  LLM says: UNCLEAR (WEAK)

Auditing: Target Functional Assays
  LLM says: NO (IRRELEVANT)

Auditing: Comparative Genomics
  LLM says: UNCLEAR (WEAK)

Auditing: Peptide Biomarker Discovery
  LLM says: UNCLEAR (WEAK)

Auditing

In [31]:
atg_result

{'vendor_name': 'ATG:biosynthetics',
 'vendor_url': 'http://www.atg-biosynthetics.com/',
 'vendor_description': 'ATG:biosynthetics GmbH was founded in 2001 by Dr. Hubert Bernauer and has its headquarters in Merzhausen near Freiburg, Germany.  ATG is a synthetic biology and biotechnology company. We offer straightforward gene synthesis, gene optimization and expression systems but are specifically experts for synthetic bioinformatics that enable complex combinatorial gene designs, multi-gene expression strategies and projects involving gene cluster design.  We offer bioinformatics and wet-lab expertise, products and services. Our mainstays are multi-gene and multi-peptide expression products and services along with bioinformatics analysis and work-up for academic and industrial R&D in the fields of drug discovery, industrial and medical biotechnology, molecular diagnostics, and bioengineering.',
 'vendor_type': 'Synthetic Biology and Bioinformatics Service Provider',
 'filtering': {'ori

## Can we do better? Multi-agent debate

In [34]:
import json
from collections import Counter
import time

def multi_annotator_consensus(page_content, specialty_list, num_annotators=3, 
                              consensus_threshold=0.67, temperature=0):
    """
    Use multiple LLM calls to analyze the same page and reach consensus.
    Only include tags that most annotators agree on.
    
    Args:
        page_content (dict): Page to analyze
        specialty_list (list): Specialties to look for
        num_annotators (int): Number of independent annotations
        consensus_threshold (float): Minimum agreement (0.67 = 2/3 annotators)
        temperature (float): LLM temperature (0 for deterministic)
    
    Returns:
        dict: Consensus results with agreement scores
    """
    all_annotations = []
    
    print(f"   Running {num_annotators} independent annotations...")
    
    for i in range(num_annotators):
        annotation = analyze_page_for_specialties(page_content, specialty_list)
        if annotation['success']:
            all_annotations.append(annotation['findings'])
        time.sleep(0.3)  # Brief pause between calls
    
    if not all_annotations:
        return {'findings': [], 'success': False, 'error': 'All annotations failed'}
    
    # Count votes for each specialty
    specialty_votes = {}
    specialty_confidences = {}
    specialty_evidence = {}
    
    for annotation in all_annotations:
        for finding in annotation:
            spec = finding['specialty']
            if spec not in specialty_votes:
                specialty_votes[spec] = 0
                specialty_confidences[spec] = []
                specialty_evidence[spec] = []
            
            specialty_votes[spec] += 1
            specialty_confidences[spec].append(finding['confidence'])
            specialty_evidence[spec].append(finding['evidence'])
    
    # Calculate consensus
    consensus_findings = []
    for spec, votes in specialty_votes.items():
        agreement_rate = votes / num_annotators
        
        if agreement_rate >= consensus_threshold:
            # Most common confidence level
            confidence_counts = Counter(specialty_confidences[spec])
            consensus_confidence = confidence_counts.most_common(1)[0][0]
            
            # Use evidence from first annotator that found it
            evidence = specialty_evidence[spec][0]
            
            consensus_findings.append({
                'specialty': spec,
                'confidence': consensus_confidence,
                'evidence': evidence,
                'agreement_rate': agreement_rate,
                'votes': f"{votes}/{num_annotators}"
            })
    
    print(f"   Consensus: {len(consensus_findings)}/{len(specialty_votes)} tags met threshold")
    
    return {
        'url': page_content['url'],
        'title': page_content['title'],
        'findings': consensus_findings,
        'total_candidates': len(specialty_votes),
        'consensus_count': len(consensus_findings),
        'success': True
    }


def self_consistency_check(page_content, specialty_list, num_samples=3):
    """
    Sample multiple times and only keep tags that appear consistently.
    More efficient than full multi-annotator but still catches inconsistencies.
    """
    tag_appearances = {}
    
    for i in range(num_samples):
        result = analyze_page_for_specialties(page_content, specialty_list)
        if result['success']:
            for finding in result['findings']:
                spec = finding['specialty']
                if spec not in tag_appearances:
                    tag_appearances[spec] = {
                        'count': 0,
                        'confidences': [],
                        'evidence': finding['evidence']
                    }
                tag_appearances[spec]['count'] += 1
                tag_appearances[spec]['confidences'].append(finding['confidence'])
        
        time.sleep(0.2)
    
    # Only keep tags that appeared in majority of samples
    consistent_tags = []
    for spec, data in tag_appearances.items():
        if data['count'] >= (num_samples / 2):
            most_common_conf = Counter(data['confidences']).most_common(1)[0][0]
            consistent_tags.append({
                'specialty': spec,
                'confidence': most_common_conf,
                'evidence': data['evidence'],
                'consistency_score': data['count'] / num_samples
            })
    
    return {
        'url': page_content['url'],
        'findings': consistent_tags,
        'success': True
    }


def real_time_evidence_audit(finding, audit_threshold=5):
    """
    Audit a finding's evidence quality immediately and flag if poor.
    
    Args:
        finding (dict): Single specialty finding with evidence
        audit_threshold (float): Minimum quality score (0-10)
    
    Returns:
        dict: Finding with quality score and flag
    """
    evidence = finding['evidence']
    spec_name = finding['specialty']
    
    # Quick heuristic quality checks
    quality_score = 5.0  # Start at middle
    issues = []
    
    # Check 1: Evidence length
    if len(evidence) < 20:
        quality_score -= 2
        issues.append("Evidence too short")
    elif len(evidence) > 500:
        quality_score -= 0.5
        issues.append("Evidence too long/generic")
    else:
        quality_score += 1
    
    # Check 2: Contains specialty keywords
    spec_keywords = spec_name.lower().split()
    evidence_lower = evidence.lower()
    keyword_matches = sum(1 for kw in spec_keywords if kw in evidence_lower)
    
    if keyword_matches == 0:
        quality_score -= 2
        issues.append("No specialty keywords in evidence")
    elif keyword_matches >= len(spec_keywords) / 2:
        quality_score += 1
    
    # Check 3: Not too generic
    generic_words = ['service', 'offer', 'company', 'contact', 'home', 'menu']
    generic_count = sum(1 for word in generic_words if word in evidence_lower)
    if generic_count > 3:
        quality_score -= 1.5
        issues.append("Evidence appears generic")
    
    # Check 4: Contains technical/specific terms
    technical_terms = ['analysis', 'synthesis', 'testing', 'development', 'assay', 'protocol']
    if any(term in evidence_lower for term in technical_terms):
        quality_score += 0.5
    
    # Normalize to 0-10
    quality_score = max(0, min(10, quality_score))
    
    # Flag if below threshold
    flagged = quality_score < audit_threshold
    
    return {
        **finding,
        'quality_score': quality_score,
        'flagged': flagged,
        'issues': issues
    }


def analyze_crawled_website_with_quality_control(crawl_results, specialty_list, 
                                                 use_consensus=True, 
                                                 use_audit=True,
                                                 consensus_threshold=0.67,
                                                 audit_threshold=5):
    """
    Enhanced version with built-in quality control.
    
    Args:
        crawl_results (dict): Results from crawl_website()
        specialty_list (list): Specialties to search for
        use_consensus (bool): Use multi-annotator consensus
        use_audit (bool): Audit evidence quality in real-time
        consensus_threshold (float): Agreement threshold for consensus
        audit_threshold (float): Minimum quality score
    
    Returns:
        dict: Analysis with quality metrics
    """
    all_findings = []
    flagged_findings = []
    
    print(f"\n{'='*60}")
    print(f"ANALYZING WITH QUALITY CONTROL")
    print(f"{'='*60}")
    print(f"Consensus: {'ON' if use_consensus else 'OFF'}")
    print(f"Evidence Audit: {'ON' if use_audit else 'OFF'}")
    print(f"Pages to analyze: {crawl_results['pages_crawled']}\n")
    
    for i, page in enumerate(crawl_results['pages'], 1):
        print(f"[{i}/{crawl_results['pages_crawled']}] {page['title'][:50]}...")
        
        # Step 1: Get findings (with or without consensus)
        if use_consensus:
            analysis = multi_annotator_consensus(
                page_content=page,
                specialty_list=specialty_list,
                num_annotators=3,
                consensus_threshold=consensus_threshold
            )
        else:
            analysis = analyze_page_for_specialties(page, specialty_list)
        
        if not analysis['success'] or not analysis['findings']:
            print(f"   No specialties found")
            continue
        
        # Step 2: Audit evidence quality
        audited_findings = []
        for finding in analysis['findings']:
            if use_audit:
                audited = real_time_evidence_audit(finding, audit_threshold)
                audited_findings.append(audited)
                
                if audited['flagged']:
                    flagged_findings.append({
                        'url': page['url'],
                        'specialty': audited['specialty'],
                        'quality_score': audited['quality_score'],
                        'issues': audited['issues']
                    })
                    print(f"   ⚠️  {audited['specialty']}: Quality {audited['quality_score']:.1f}/10")
                else:
                    print(f"   ✓ {audited['specialty']}: Quality {audited['quality_score']:.1f}/10")
            else:
                audited_findings.append(finding)
                print(f"   ✓ {finding['specialty']}")
        
        if audited_findings:
            all_findings.append({
                'url': page['url'],
                'title': page['title'],
                'findings': audited_findings
            })
        
        time.sleep(0.5)
    
    # Aggregate results
    unique_specialties = {}
    for finding_group in all_findings:
        for finding in finding_group['findings']:
            spec_name = finding['specialty']
            
            if spec_name not in unique_specialties:
                unique_specialties[spec_name] = {
                    'specialty': spec_name,
                    'highest_confidence': finding['confidence'],
                    'occurrences': 1,
                    'pages': [{
                        'url': finding_group['url'],
                        'evidence': finding['evidence'],
                        'quality_score': finding.get('quality_score', None),
                        'flagged': finding.get('flagged', False)
                    }],
                    'avg_quality': finding.get('quality_score', 10),
                    'flagged_count': 1 if finding.get('flagged', False) else 0
                }
            else:
                unique_specialties[spec_name]['occurrences'] += 1
                unique_specialties[spec_name]['pages'].append({
                    'url': finding_group['url'],
                    'evidence': finding['evidence'],
                    'quality_score': finding.get('quality_score', None),
                    'flagged': finding.get('flagged', False)
                })
                
                # Update quality metrics
                if finding.get('quality_score'):
                    current_avg = unique_specialties[spec_name]['avg_quality']
                    current_count = unique_specialties[spec_name]['occurrences'] - 1
                    new_avg = (current_avg * current_count + finding['quality_score']) / unique_specialties[spec_name]['occurrences']
                    unique_specialties[spec_name]['avg_quality'] = new_avg
                
                if finding.get('flagged', False):
                    unique_specialties[spec_name]['flagged_count'] += 1
    
    # Quality summary
    print(f"\n{'='*60}")
    print(f"QUALITY CONTROL SUMMARY")
    print(f"{'='*60}")
    print(f"Total unique specialties: {len(unique_specialties)}")
    print(f"Flagged findings: {len(flagged_findings)}")
    
    high_quality = [s for s in unique_specialties.values() if s['avg_quality'] >= 7]
    medium_quality = [s for s in unique_specialties.values() if 5 <= s['avg_quality'] < 7]
    low_quality = [s for s in unique_specialties.values() if s['avg_quality'] < 5]
    
    print(f"\nQuality Distribution:")
    print(f"  High (7-10): {len(high_quality)} specialties")
    print(f"  Medium (5-7): {len(medium_quality)} specialties")
    print(f"  Low (<5): {len(low_quality)} specialties")
    
    if flagged_findings:
        print(f"\n⚠️  Flagged for Review:")
        for flag in flagged_findings[:5]:
            print(f"  • {flag['specialty']} (score: {flag['quality_score']:.1f})")
            print(f"    Issues: {', '.join(flag['issues'])}")
    
    return {
        'base_url': crawl_results['base_url'],
        'pages_analyzed': crawl_results['pages_crawled'],
        'page_analyses': all_findings,
        'unique_specialties_found': list(unique_specialties.values()),
        'total_unique_specialties': len(unique_specialties),
        'quality_metrics': {
            'high_quality_count': len(high_quality),
            'medium_quality_count': len(medium_quality),
            'low_quality_count': len(low_quality),
            'flagged_count': len(flagged_findings),
            'flagged_findings': flagged_findings
        }
    }


def llm_debate_for_ambiguous_tags(tag_name, evidence_list, specialty_description=""):
    """
    Use LLM debate/chain-of-thought for ambiguous cases.
    Two LLMs argue for/against including the tag.
    
    Args:
        tag_name (str): The specialty in question
        evidence_list (list): All evidence found for this tag
        specialty_description (str): Definition of the specialty
    
    Returns:
        dict: Final verdict with reasoning
    """
    evidence_str = "\n".join([f"- {e}" for e in evidence_list])
    
    # Round 1: Advocate argues FOR including the tag
    advocate_prompt = f"""You are advocating FOR including "{tag_name}" as a tag for this vendor.

EVIDENCE:
{evidence_str}

SPECIALTY DEFINITION: {specialty_description or 'N/A'}

Argue why this evidence DOES support that the vendor offers this specialty.
Be specific about which evidence supports your position.
Provide your argument in 2-3 sentences."""
    
    advocate_response = get_chat_response(
        user_message=advocate_prompt,
        system_message="You are an advocate making the strongest case for including this tag.",
        model="gpt-4o"
    )
    
    # Round 2: Critic argues AGAINST including the tag
    critic_prompt = f"""You are critically evaluating whether "{tag_name}" should be included as a tag for this vendor.

EVIDENCE:
{evidence_str}

SPECIALTY DEFINITION: {specialty_description or 'N/A'}

ADVOCATE'S ARGUMENT (FOR):
{advocate_response}

Critically evaluate: Does this evidence really prove the vendor offers this specialty, or is it too weak/generic/tangential?
Argue AGAINST including the tag if appropriate. Be specific.
Provide your argument in 2-3 sentences."""
    
    critic_response = get_chat_response(
        user_message=critic_prompt,
        system_message="You are a critical evaluator challenging weak evidence.",
        model="gpt-4o"
    )
    
    # Round 3: Judge makes final decision
    judge_prompt = f"""You are a neutral judge deciding whether to include "{tag_name}" as a tag for this vendor.

EVIDENCE:
{evidence_str}

ADVOCATE'S ARGUMENT (FOR):
{advocate_response}

CRITIC'S ARGUMENT (AGAINST):
{critic_response}

Make a final decision: Should this tag be included?
Respond in JSON format:
{{
  "decision": "INCLUDE" or "REJECT",
  "confidence": "HIGH/MEDIUM/LOW",
  "reasoning": "brief explanation of your decision"
}}
"""
    
    judge_response = get_chat_response(
        user_message=judge_prompt,
        system_message="You are a neutral judge making a final decision based on evidence and arguments.",
        model="gpt-4o"
    )
    
    # Parse judge's decision
    try:
        judge_response = judge_response.strip()
        if judge_response.startswith('```'):
            judge_response = judge_response.split('```')[1]
            if judge_response.startswith('json'):
                judge_response = judge_response[4:]
        
        decision = json.loads(judge_response)
        
        return {
            'tag': tag_name,
            'decision': decision['decision'],
            'confidence': decision['confidence'],
            'reasoning': decision['reasoning'],
            'advocate_argument': advocate_response,
            'critic_argument': critic_response,
            'success': True
        }
    except:
        return {
            'tag': tag_name,
            'decision': 'UNCERTAIN',
            'success': False,
            'error': 'Could not parse judge decision'
        }


def analyze_vendor_complete_with_qc(vendor_name, vendor_url, vendor_description, 
                                    all_specialties, max_pages=10,
                                    use_consensus=True, use_audit=True,
                                    use_debate_for_ambiguous=False):
    """
    Complete analysis pipeline with integrated quality control.
    """
    print(f"\n{'='*70}")
    print(f"ANALYZING VENDOR WITH QUALITY CONTROL: {vendor_name}")
    print(f"{'='*70}")
    print(f"Quality Control Settings:")
    print(f"  • Multi-annotator consensus: {'ON' if use_consensus else 'OFF'}")
    print(f"  • Real-time evidence audit: {'ON' if use_audit else 'OFF'}")
    print(f"  • LLM debate for ambiguous: {'ON' if use_debate_for_ambiguous else 'OFF'}")
    print()
    
    # Step 1: Get homepage
    print("Step 1: Fetching homepage...")
    homepage = get_webpage_content(vendor_url)
    if not homepage['success']:
        return None
    print(f"   ✓ Fetched: {len(homepage['text'])} chars\n")
    
    # Step 2: Filter specialties
    print("Step 2: Filtering specialties...")
    vendor_info = {
        'name': vendor_name,
        'url': vendor_url,
        'description': vendor_description,
        'homepage_content': homepage['text']
    }
    filter_results = filter_specialties_by_vendor(vendor_info, all_specialties, max_specialties=50)
    print(f"   ✓ Filtered to {filter_results['total_filtered']}\n")
    
    # Step 3: Crawl website
    print("Step 3: Crawling website...")
    crawl_results = crawl_website(vendor_url, max_pages=max_pages, delay=2)
    print()
    
    # Step 4: Analyze with quality control
    print("Step 4: Analyzing with quality control...")
    analysis_results = analyze_crawled_website_with_quality_control(
        crawl_results=crawl_results,
        specialty_list=filter_results['filtered_specialties'],
        use_consensus=use_consensus,
        use_audit=use_audit,
        consensus_threshold=0.67,
        audit_threshold=5
    )
    
    # Step 5: Optional debate for ambiguous cases
    debated_tags = []
    if use_debate_for_ambiguous:
        print("\nStep 5: LLM debate for ambiguous tags...")
        
        # Identify ambiguous tags (low quality or low confidence)
        ambiguous = [
            s for s in analysis_results['unique_specialties_found']
            if s.get('avg_quality', 10) < 6 or s.get('flagged_count', 0) > 0
        ]
        
        print(f"   Found {len(ambiguous)} ambiguous tags for debate")
        
        for spec in ambiguous[:3]:  # Limit debates to avoid costs
            print(f"   Debating: {spec['specialty']}...")
            evidence_list = [p['evidence'] for p in spec['pages']]
            
            debate_result = llm_debate_for_ambiguous_tags(
                tag_name=spec['specialty'],
                evidence_list=evidence_list
            )
            
            debated_tags.append(debate_result)
            print(f"      Decision: {debate_result['decision']}")
    
    # Compile results
    return {
        'vendor_name': vendor_name,
        'vendor_url': vendor_url,
        'analysis': analysis_results,
        'quality_metrics': analysis_results['quality_metrics'],
        'debated_tags': debated_tags if use_debate_for_ambiguous else []
    }

In [36]:
vendor

company_name                                           ATG:biosynthetics
website_url                            http://www.atg-biosynthetics.com/
company_description    ATG:biosynthetics GmbH was founded in 2001 by ...
Name: 5, dtype: object

## Usage / Testing

In [66]:
# # Example 1: Full quality control (recommended)
# result = analyze_vendor_complete_with_qc(
#     vendor_name=vendor["company_name"],
#     vendor_url=vendor["website_url"],
#     vendor_description=vendor["company_description"],
#     all_specialties=tags,
#     max_pages=15,
#     use_consensus=True,      # Multi-annotator consensus
#     use_audit=True,          # Real-time evidence audit
#     use_debate_for_ambiguous=False  # Too expensive for all vendors
# )

# # Example 2: Fast mode (single pass with audit only)
# result = analyze_vendor_complete_with_qc(
#     vendor_name=vendor["company_name"],
#     vendor_url=vendor["website_url"],
#     vendor_description=vendor["company_description"],
#     all_specialties=tags,
#     max_pages=10,
#     use_consensus=False,     # Skip consensus (faster)
#     use_audit=True,          # Keep audit
#     use_debate_for_ambiguous=False
# )

# # Example 3: Maximum quality (expensive)
# result = analyze_vendor_complete_with_qc(
#     vendor_name=vendor["company_name"],
#     vendor_url=vendor["website_url"],
#     vendor_description=vendor["company_description"],
#     all_specialties=tags,
#     max_pages=15,
#     use_consensus=True,
#     use_audit=True,
#     use_debate_for_ambiguous=True  # Use debate for unclear cases
# )

In [55]:
vendor = vendors.iloc[14]

result = analyze_vendor_complete_with_qc(
    vendor_name=vendor["company_name"],
    vendor_url=vendor["website_url"],
    vendor_description=vendor["company_description"],
    all_specialties=tags,
    max_pages=15,
    use_consensus=False,
    use_audit=False,
    use_debate_for_ambiguous=False  # Use debate for unclear cases
)


ANALYZING VENDOR WITH QUALITY CONTROL: Sartorius
Quality Control Settings:
  • Multi-annotator consensus: OFF
  • Real-time evidence audit: OFF
  • LLM debate for ambiguous: OFF

Step 1: Fetching homepage...
   ✓ Fetched: 3080 chars

Step 2: Filtering specialties...
   ✓ Filtered to 20

Step 3: Crawling website...
Starting URL: https://www.sartorius.com/en
Actual URL after redirect: https://www.sartorius.com/en

Starting crawl of https://www.sartorius.com
Max pages: 15, Delay: 2s

Crawling [1/15]: https://www.sartorius.com/en
Crawling [2/15]: https://www.sartorius.com/en/company/investor-relations
Crawling [3/15]: https://www.sartorius.com/en/products/water-purification?ban_position=portfolio_carousel&ban_name=water_purification
Crawling [4/15]: https://www.sartorius.com/en/products/oem?ban_position=portfolio_carousel&ban_name=oem
Crawling [5/15]: https://www.sartorius.com/en/products/weighing/pipette-calibration?ban_position=new_slider&ban_name=lps_20251027_pipette_calibration
Crawli

In [56]:
result

{'vendor_name': 'Sartorius',
 'vendor_url': 'https://www.sartorius.com/en',
 'analysis': {'base_url': 'https://www.sartorius.com',
  'pages_analyzed': 15,
  'page_analyses': [{'url': 'https://www.sartorius.com/en',
    'title': 'Sartorius | Biopharma, Laboratory, Applied & Life Sciences',
    'findings': [{'specialty': 'Plasmid DNA Manufacturing',
      'confidence': 'HIGH',
      'evidence': 'Plasmid Manufacturing at the scale you need...scalable solutions in state-of-the-art GMP facilities.'},
     {'specialty': 'Bioreactors & Fermenters',
      'confidence': 'HIGH',
      'evidence': 'Bioreactors & Fermenters Learn More'},
     {'specialty': 'Microbiological Testing',
      'confidence': 'HIGH',
      'evidence': 'Microbiological Testing Learn More'},
     {'specialty': 'Cell Culture',
      'confidence': 'HIGH',
      'evidence': 'Cell Culture Media Learn More'},
     {'specialty': 'Process Filtration',
      'confidence': 'HIGH',
      'evidence': 'Process Filtration Learn More'},

In [54]:
vendor = vendors.iloc[14]

result = analyze_vendor_complete_with_qc(
    vendor_name=vendor["company_name"],
    vendor_url=vendor["website_url"],
    vendor_description=vendor["company_description"],
    all_specialties=tags,
    max_pages=15,
    use_consensus=True,
    use_audit=True,
    use_debate_for_ambiguous=True  # Use debate for unclear cases
)


ANALYZING VENDOR WITH QUALITY CONTROL: Sartorius
Quality Control Settings:
  • Multi-annotator consensus: ON
  • Real-time evidence audit: ON
  • LLM debate for ambiguous: ON

Step 1: Fetching homepage...
   ✓ Fetched: 3080 chars

Step 2: Filtering specialties...
   ✓ Filtered to 36

Step 3: Crawling website...
Starting URL: https://www.sartorius.com/en
Actual URL after redirect: https://www.sartorius.com/en

Starting crawl of https://www.sartorius.com
Max pages: 15, Delay: 2s

Crawling [1/15]: https://www.sartorius.com/en
Crawling [2/15]: https://www.sartorius.com/en/company/investor-relations
Crawling [3/15]: https://www.sartorius.com/en/products/water-purification?ban_position=portfolio_carousel&ban_name=water_purification
Crawling [4/15]: https://www.sartorius.com/en/products/oem?ban_position=portfolio_carousel&ban_name=oem
Crawling [5/15]: https://www.sartorius.com/en/products/weighing/pipette-calibration?ban_position=new_slider&ban_name=lps_20251027_pipette_calibration
Crawling 

## Scrambling to get in the right format.... really messy....

In [40]:
def analyze_vendor_simple(vendor_name, vendor_url, vendor_description, 
                         all_specialties, max_pages=10):
    """
    Simple analysis without quality control - more robust to errors.
    
    Args:
        vendor_name (str): Vendor company name
        vendor_url (str): Vendor website URL
        vendor_description (str): Vendor description
        all_specialties (list): Complete list of all possible specialties
        max_pages (int): Max pages to crawl
    
    Returns:
        dict: Analysis results or None if failed
    """
    try:
        print(f"\n{'='*60}")
        print(f"ANALYZING: {vendor_name}")
        print(f"{'='*60}\n")
        
        # Step 1: Get homepage content
        print("Step 1: Fetching homepage...")
        homepage = get_webpage_content(vendor_url)
        
        if not homepage['success']:
            print(f"   ✗ Failed to fetch homepage: {homepage.get('error', 'Unknown error')}")
            return None
        
        print(f"   ✓ Homepage fetched: {len(homepage['text'])} characters\n")
        
        # Step 2: Filter specialties
        print(f"Step 2: Filtering specialties (from {len(all_specialties)} total)...")
        vendor_info = {
            'name': vendor_name,
            'url': vendor_url,
            'description': vendor_description,
            'homepage_content': homepage['text']
        }
        
        filter_results = filter_specialties_by_vendor(vendor_info, all_specialties, max_specialties=50)
        
        print(f"   ✓ Filtered to {filter_results['total_filtered']} likely specialties")
        print(f"   Vendor Type: {filter_results['vendor_type']}\n")
        
        # Step 3: Crawl website
        print(f"Step 3: Crawling website (max {max_pages} pages)...")
        crawl_results = crawl_website(vendor_url, max_pages=max_pages, delay=2)
        
        if not crawl_results or crawl_results['pages_crawled'] == 0:
            print(f"   ✗ No pages crawled")
            return None
        
        print()
        
        # Step 4: Analyze pages
        print(f"Step 4: Analyzing {crawl_results['pages_crawled']} pages with LLM...")
        analysis_results = analyze_crawled_website(crawl_results, filter_results['filtered_specialties'])
        
        # Combine all results
        complete_results = {
            'vendor_name': vendor_name,
            'vendor_url': vendor_url,
            'vendor_description': vendor_description,
            'vendor_type': filter_results['vendor_type'],
            'filtering': {
                'original_specialty_count': filter_results['original_total'],
                'filtered_specialty_count': filter_results['total_filtered'],
                'filtered_specialties': filter_results['filtered_specialties'],
                'reasoning': filter_results['reasoning']
            },
            'crawling': {
                'pages_crawled': crawl_results['pages_crawled'],
                'urls_visited': crawl_results['visited_urls']
            },
            'analysis': {
                'pages_analyzed': analysis_results['pages_analyzed'],
                'unique_specialties_found': analysis_results['unique_specialties_found'],
                'total_unique_specialties': analysis_results['total_unique_specialties']
            }
        }
        
        # Display summary
        print(f"\n{'='*60}")
        print(f"ANALYSIS COMPLETE: {vendor_name}")
        print(f"{'='*60}")
        print(f"Vendor Type: {complete_results['vendor_type']}")
        print(f"Pages crawled: {crawl_results['pages_crawled']}")
        print(f"Specialties found: {analysis_results['total_unique_specialties']}")
        
        return complete_results
        
    except Exception as e:
        print(f"\n✗ Error analyzing {vendor_name}: {str(e)}")
        import traceback
        traceback.print_exc()
        return None


def process_first_n_vendors(vendors_df, all_tags, n=10, max_pages=10):
    """
    Process first N vendors with simple approach.
    Handles errors gracefully and continues processing.
    
    Args:
        vendors_df (pd.DataFrame): Vendor data
        all_tags (list): All possible specialty tags
        n (int): Number of vendors to process
        max_pages (int): Max pages to crawl per vendor
    
    Returns:
        list: Results for all vendors (including failures)
    """
    results = []
    n = min(n, len(vendors_df))
    
    print(f"\n{'#'*70}")
    print(f"PROCESSING FIRST {n} VENDORS")
    print(f"{'#'*70}\n")
    
    for idx in range(n):
        vendor = vendors_df.iloc[idx]
        
        print(f"\n{'='*70}")
        print(f"VENDOR {idx+1}/{n}: {vendor['company_name']}")
        print(f"URL: {vendor['website_url']}")
        print(f"{'='*70}")
        
        try:
            result = analyze_vendor_simple(
                vendor_name=vendor['company_name'],
                vendor_url=vendor['website_url'],
                vendor_description=vendor['company_description'],
                all_specialties=all_tags,
                max_pages=max_pages
            )
            
            if result:
                results.append({
                    'vendor_idx': idx,
                    'vendor_name': vendor['company_name'],
                    'vendor_url': vendor['website_url'],
                    'result': result,
                    'success': True,
                    'error': None
                })
                print(f"\n✓ SUCCESS: Found {result['analysis']['total_unique_specialties']} specialties")
            else:
                results.append({
                    'vendor_idx': idx,
                    'vendor_name': vendor['company_name'],
                    'vendor_url': vendor['website_url'],
                    'result': None,
                    'success': False,
                    'error': 'Analysis returned None'
                })
                print(f"\n✗ FAILED: No results returned")
        
        except Exception as e:
            results.append({
                'vendor_idx': idx,
                'vendor_name': vendor['company_name'],
                'vendor_url': vendor['website_url'],
                'result': None,
                'success': False,
                'error': str(e)
            })
            print(f"\n✗ ERROR: {str(e)}")
        
        # Pause between vendors
        if idx < n - 1:
            print(f"\n{'─'*70}")
            print(f"Pausing 3 seconds before next vendor...")
            print(f"{'─'*70}")
            time.sleep(3)
    
    # Summary
    successful = sum(1 for r in results if r['success'])
    print(f"\n{'='*70}")
    print(f"BATCH COMPLETE")
    print(f"{'='*70}")
    print(f"Total vendors: {n}")
    print(f"Successful: {successful}")
    print(f"Failed: {n - successful}")
    
    return results


def create_dataframes_from_results(results):
    """
    Create DataFrames from processing results.
    Handles missing/failed vendors gracefully.
    
    Args:
        results (list): Results from process_first_n_vendors()
    
    Returns:
        dict: Dictionary with all DataFrames
    """
    # 1. Vendor-Tag DataFrame (long format)
    tag_rows = []
    
    for vendor_result in results:
        if not vendor_result['success'] or not vendor_result['result']:
            continue
        
        vendor_name = vendor_result['vendor_name']
        vendor_url = vendor_result['vendor_url']
        
        specialties = vendor_result['result']['analysis']['unique_specialties_found']
        
        for spec in specialties:
            tag_rows.append({
                'vendor_name': vendor_name,
                'vendor_url': vendor_url,
                'specialty': spec['specialty'],
                'confidence': spec['highest_confidence'],
                'occurrences': spec['occurrences'],
                'num_pages': spec['occurrences']
            })
    
    df_tags = pd.DataFrame(tag_rows)
    
    # 2. Vendor Summary DataFrame
    summary_rows = []
    
    for vendor_result in results:
        vendor_name = vendor_result['vendor_name']
        vendor_url = vendor_result['vendor_url']
        
        if not vendor_result['success'] or not vendor_result['result']:
            summary_rows.append({
                'vendor_name': vendor_name,
                'vendor_url': vendor_url,
                'vendor_type': None,
                'total_specialties': 0,
                'high_confidence_count': 0,
                'medium_confidence_count': 0,
                'low_confidence_count': 0,
                'pages_crawled': 0,
                'success': False,
                'error': vendor_result.get('error', 'Unknown error')
            })
            continue
        
        result = vendor_result['result']
        specialties = result['analysis']['unique_specialties_found']
        
        high_count = sum(1 for s in specialties if s['highest_confidence'] == 'HIGH')
        medium_count = sum(1 for s in specialties if s['highest_confidence'] == 'MEDIUM')
        low_count = sum(1 for s in specialties if s['highest_confidence'] == 'LOW')
        
        summary_rows.append({
            'vendor_name': vendor_name,
            'vendor_url': vendor_url,
            'vendor_type': result['vendor_type'],
            'total_specialties': len(specialties),
            'high_confidence_count': high_count,
            'medium_confidence_count': medium_count,
            'low_confidence_count': low_count,
            'pages_crawled': result['crawling']['pages_crawled'],
            'success': True,
            'error': None
        })
    
    df_summary = pd.DataFrame(summary_rows)
    
    # 3. Evidence DataFrame
    evidence_rows = []
    
    for vendor_result in results:
        if not vendor_result['success'] or not vendor_result['result']:
            continue
        
        vendor_name = vendor_result['vendor_name']
        specialties = vendor_result['result']['analysis']['unique_specialties_found']
        
        for spec in specialties:
            for page in spec['pages']:
                evidence_rows.append({
                    'vendor_name': vendor_name,
                    'specialty': spec['specialty'],
                    'confidence': spec['highest_confidence'],
                    'page_url': page['url'],
                    'evidence': page['evidence']
                })
    
    df_evidence = pd.DataFrame(evidence_rows)
    
    return {
        'df_tags': df_tags,
        'df_summary': df_summary,
        'df_evidence': df_evidence
    }


def save_results(results, dataframes, output_prefix='first_10_vendors'):
    """
    Save all results to files.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    # Save DataFrames
    tags_file = f'{output_prefix}_tags_{timestamp}.csv'
    summary_file = f'{output_prefix}_summary_{timestamp}.csv'
    evidence_file = f'{output_prefix}_evidence_{timestamp}.csv'
    json_file = f'{output_prefix}_full_{timestamp}.json'
    
    dataframes['df_tags'].to_csv(tags_file, index=False)
    dataframes['df_summary'].to_csv(summary_file, index=False)
    dataframes['df_evidence'].to_csv(evidence_file, index=False)
    
    # Save full JSON (with custom serialization for datetime)
    with open(json_file, 'w') as f:
        json.dump(results, f, indent=2, default=str)
    
    print(f"\n{'='*70}")
    print(f"RESULTS SAVED")
    print(f"{'='*70}")
    print(f"\n✓ Tags: {tags_file}")
    print(f"  Rows: {len(dataframes['df_tags'])}")
    
    print(f"\n✓ Summary: {summary_file}")
    print(f"  Rows: {len(dataframes['df_summary'])}")
    
    print(f"\n✓ Evidence: {evidence_file}")
    print(f"  Rows: {len(dataframes['df_evidence'])}")
    
    print(f"\n✓ Full JSON: {json_file}")
    
    return {
        'tags_file': tags_file,
        'summary_file': summary_file,
        'evidence_file': evidence_file,
        'json_file': json_file
    }

In [43]:
import datetime

In [41]:
# Load your data
vendors = pd.read_csv('data/vendors.csv')
specialties_df = pd.read_csv('data/specialties.csv')
tags = list(set(specialties_df.iloc[:, 0]))

print(f"Loaded {len(vendors)} vendors and {len(tags)} tags")

# Process first 10 vendors
results = process_first_n_vendors(
    vendors_df=vendors,
    all_tags=tags,
    n=10,
    max_pages=10
)

# Create DataFrames
dataframes = create_dataframes_from_results(results)

# Save everything
files = save_results(results, dataframes, output_prefix='first_10_vendors')

# Display results
print(f"\n{'='*70}")
print(f"TAGS DATAFRAME")
print(f"{'='*70}")
print(dataframes['df_tags'].head(20))

print(f"\n{'='*70}")
print(f"SUMMARY DATAFRAME")
print(f"{'='*70}")
print(dataframes['df_summary'].to_string())

# Access the DataFrames
df_tags = dataframes['df_tags']
df_summary = dataframes['df_summary']
df_evidence = dataframes['df_evidence']

Loaded 100 vendors and 979 tags

######################################################################
PROCESSING FIRST 10 VENDORS
######################################################################


VENDOR 1/10: Synergent Biochem
URL: http://www.synergentbiochem.com/

ANALYZING: Synergent Biochem

Step 1: Fetching homepage...
   ✓ Homepage fetched: 64 characters

Step 2: Filtering specialties (from 979 total)...
   ✓ Filtered to 10 likely specialties
   Vendor Type: Biological Products Manufacturing and Testing CMO

Step 3: Crawling website (max 10 pages)...
Starting URL: http://www.synergentbiochem.com/
Actual URL after redirect: http://www.synergentbiochem.com/

Starting crawl of http://www.synergentbiochem.com
Max pages: 10, Delay: 2s

Crawling [1/10]: http://www.synergentbiochem.com/

Crawl complete! Visited 1 pages.

Step 4: Analyzing 1 pages with LLM...
Analyzing 1 pages for specialties...

Analyzing [1/1]: Website Disabled
   - No specialties found

ANALYSIS COMPLETE: Syne

NameError: name 'datetime' is not defined

In [44]:
dataframes

{'df_tags':                                        vendor_name  \
 0                                          Biostir   
 1                                          Biostir   
 2                                          Biostir   
 3                                          Biostir   
 4                                          Biostir   
 5                                          Biostir   
 6                                          Biostir   
 7                                          Biostir   
 8                                          Biostir   
 9                                            ImYoo   
 10                                           ImYoo   
 11                                           ImYoo   
 12                                           ImYoo   
 13                                           ImYoo   
 14                                           ImYoo   
 15                                           ImYoo   
 16                                           ImYoo   

In [42]:
# Most common tags
print("\n" + "="*70)
print("MOST COMMON TAGS")
print("="*70)
print(df_tags['specialty'].value_counts().head(15))

# Tags by confidence
print("\n" + "="*70)
print("CONFIDENCE DISTRIBUTION")
print("="*70)
print(df_tags['confidence'].value_counts())

# Vendor statistics
print("\n" + "="*70)
print("VENDOR STATISTICS")
print("="*70)
print(df_summary[['vendor_name', 'total_specialties', 'high_confidence_count', 'pages_crawled']].to_string())

# Create pivot table (vendors x specialties)
if len(df_tags) > 0:
    pivot = df_tags.pivot_table(
        index='vendor_name',
        columns='specialty',
        values='confidence',
        aggfunc='first',
        fill_value=''
    )
    print("\n" + "="*70)
    print("VENDOR x SPECIALTY MATRIX (sample)")
    print("="*70)
    print(pivot.iloc[:, :5].to_string())  # Show first 5 specialties


MOST COMMON TAGS


NameError: name 'df_tags' is not defined

In [45]:
def create_vendor_tags_table(df_tags, df_summary):
    """
    Create a clean table: one row per vendor with all their tags.
    
    Args:
        df_tags (pd.DataFrame): Tags dataframe from results
        df_summary (pd.DataFrame): Summary dataframe from results
    
    Returns:
        pd.DataFrame: Clean vendor-tags table
    """
    # Group tags by vendor
    vendor_tags = df_tags.groupby('vendor_name').agg({
        'specialty': lambda x: list(x),
        'confidence': lambda x: list(x)
    }).reset_index()
    
    # Create formatted tag strings
    vendor_tags['all_tags'] = vendor_tags.apply(
        lambda row: ', '.join(row['specialty']),
        axis=1
    )
    
    # High confidence tags only
    high_conf_tags = df_tags[df_tags['confidence'] == 'HIGH'].groupby('vendor_name')['specialty'].apply(list).reset_index()
    high_conf_tags.columns = ['vendor_name', 'high_confidence_tags']
    
    # Merge with summary
    result = df_summary[['vendor_name', 'vendor_url', 'vendor_type', 'total_specialties', 
                         'high_confidence_count', 'pages_crawled', 'success']].copy()
    
    # Add tag lists
    result = result.merge(vendor_tags[['vendor_name', 'all_tags']], on='vendor_name', how='left')
    result = result.merge(high_conf_tags, on='vendor_name', how='left')
    
    # Format high confidence tags
    result['high_confidence_tags'] = result['high_confidence_tags'].apply(
        lambda x: ', '.join(x) if isinstance(x, list) else ''
    )
    
    # Fill NaN for failed vendors
    result['all_tags'] = result['all_tags'].fillna('None - Analysis Failed')
    result['high_confidence_tags'] = result['high_confidence_tags'].fillna('None')
    
    return result


# Create the table
df_vendor_tags = create_vendor_tags_table(dataframes['df_tags'], dataframes['df_summary'])

# Display
print("\n" + "="*100)
print("VENDOR TAGS TABLE")
print("="*100)
print(df_vendor_tags.to_string())

# Save
df_vendor_tags.to_csv('vendor_tags_table.csv', index=False)
print("\n✓ Saved to: vendor_tags_table.csv")


VENDOR TAGS TABLE
                                      vendor_name                                                           vendor_url                                              vendor_type  total_specialties  high_confidence_count  pages_crawled  success                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              all_tags                                                                                                                                                              

In [49]:
# Add this at the top of your notebook/script
import pandas as pd

# Set display options to show everything
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

print("✓ Pandas display options configured - no truncation!")

✓ Pandas display options configured - no truncation!
