In [None]:
!pip install exa-py openai pandas aiohttp nest-asyncio

In [None]:
import asyncio
import pandas as pd
import json
import re
import time
import hashlib
import pickle
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional
from functools import lru_cache
import aiohttp
import openai
from exa_py import Exa
import nest_asyncio
import os
from dotenv import load_dotenv

# Enable nested async in Jupyter
nest_asyncio.apply()

In [None]:
# Read key/value pairs from a .env file into the process environment.
load_dotenv()

EXA_API_KEY = os.getenv("EXA_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Verify keys are set. os.getenv returns None for an unset variable, so check
# for missing/empty values as well as the template placeholder strings.
_PLACEHOLDER_KEYS = {"your_exa_api_key_here", "your_openai_api_key_here"}
_unset_keys = [
    key_name
    for key_name, key_value in (
        ("EXA_API_KEY", EXA_API_KEY),
        ("OPENAI_API_KEY", OPENAI_API_KEY),
    )
    if not key_value or key_value in _PLACEHOLDER_KEYS
]

if _unset_keys:
    print("⚠️  WARNING: Please update your API keys in this cell!")
    print(f"   Missing: {', '.join(_unset_keys)}")
else:
    print("✅ API keys configured")

In [None]:
class AIPipeline:
    """API clients plus small helper methods shared by the pipeline steps.

    Attributes:
        exa: Exa client built from ``exa_api_key``.
        openai_client: OpenAI client built from ``openai_api_key``.
        cache: Dict created empty at construction (not used by the methods here).
        rate_limiter: ``asyncio.Semaphore(5)`` (not used by the methods here).
    """

    # Short acronyms must match as whole words: a plain substring test finds
    # "cto" inside "director" / "doctor" and would mislabel those titles.
    _CEO_PATTERN = re.compile(r"\bceo\b|chief executive")
    _CTO_PATTERN = re.compile(r"\bcto\b|chief technology")

    def __init__(self, exa_api_key: str, openai_api_key: str):
        self.exa = Exa(exa_api_key)
        self.openai_client = openai.OpenAI(api_key=openai_api_key)
        self.cache = {}
        self.rate_limiter = asyncio.Semaphore(5)

    def deduplicate_companies(self, companies: List[Dict]) -> List[Dict]:
        """Remove duplicate companies, keeping the first record per name.

        Names are compared case-insensitively after trimming whitespace.
        Records whose name is missing, not a string, or two characters or
        shorter (after trimming) are dropped.
        """
        unique = []
        seen_names = set()

        for company in companies:
            raw_name = company.get("name")
            # Extracted records can carry "name": null or a non-string value.
            if not isinstance(raw_name, str):
                continue
            name = raw_name.lower().strip()
            if len(name) > 2 and name not in seen_names:
                seen_names.add(name)
                unique.append(company)

        return unique

    def extract_name_from_linkedin_url(self, url: str) -> str:
        """Derive a display name from the slug of a ``/in/<slug>`` URL.

        Returns "Unknown" when ``url`` is not a string, has no ``/in/``
        segment, or the slug is empty.
        """
        if not isinstance(url, str):
            return "Unknown"

        _, marker, tail = url.partition("/in/")
        if not marker:
            return "Unknown"

        # The slug ends at the next path separator, query string or fragment.
        slug = re.split(r"[/?#]", tail, maxsplit=1)[0]
        name = slug.replace("-", " ").strip().title()
        return name or "Unknown"

    def extract_role_from_title(self, title: str) -> str:
        """Map a profile title to CEO / CTO / Founder / President / Executive.

        The first matching category wins, in that order; a missing or
        non-string title yields "Executive".
        """
        title_lower = title.lower() if isinstance(title, str) else ""

        if self._CEO_PATTERN.search(title_lower):
            return "CEO"
        elif self._CTO_PATTERN.search(title_lower):
            return "CTO"
        elif "founder" in title_lower:
            return "Founder"
        elif "president" in title_lower:
            return "President"
        else:
            return "Executive"

    def deduplicate_profiles(self, profiles: List[Dict]) -> List[Dict]:
        """Remove duplicate profiles, keeping the first record per LinkedIn URL.

        Profiles without a ``linkedin_url`` are dropped. URLs that differ only
        by query string, trailing slash or letter case count as the same
        profile; the stored records themselves are not modified.
        """
        unique = []
        seen_urls = set()

        for profile in profiles:
            url = profile.get("linkedin_url", "")
            if not url:
                continue
            if isinstance(url, str):
                key = url.split("?", 1)[0].rstrip("/").lower()
            else:
                key = url
            if key not in seen_urls:
                seen_urls.add(key)
                unique.append(profile)

        return unique

# Cell-completion marker: printed once the class definition above has executed.
print("✅ Pipeline class defined")

In [None]:
async def find_ai_companies(pipeline, num_results: int = 50) -> List[Dict]:
    """Search Exa for AI-company coverage and extract one record per hit.

    Args:
        pipeline: Object exposing ``exa`` (Exa client) and
            ``deduplicate_companies``; also forwarded to
            ``extract_company_data``.
        num_results: Upper bound on the number of companies returned. The
            per-query result count is this value split evenly across the
            built-in queries (at least one per query).

    Returns:
        De-duplicated company dicts, at most ``num_results`` of them.
    """

    print("🔍 Finding AI companies with Exa...")

    if num_results <= 0:
        return []

    queries = [
        "AI startups Series A funding 2023 2024 artificial intelligence",
        "machine learning companies venture capital investment TechCrunch",
        "generative AI startups GPT LLM computer vision funding",
        "AI unicorn companies billion valuation OpenAI competitor",
        "artificial intelligence robotics automation startup funding"
    ]

    # Integer division gives 0 when num_results < len(queries); always ask for
    # at least one hit per query.
    per_query = max(1, num_results // len(queries))

    all_companies = []

    for i, query in enumerate(queries):
        print(f"  Query {i+1}/{len(queries)}: {query[:50]}...")

        try:
            result = pipeline.exa.search_and_contents(
                query,
                type="neural",
                use_autoprompt=True,
                num_results=per_query,
                text={"max_characters": 2000},
                include_domains=[
                    "techcrunch.com", "venturebeat.com", "crunchbase.com",
                    "pitchbook.com", "bloomberg.com", "reuters.com"
                ]
            )
        except Exception as e:
            print(f"    Error: {e}")
            continue

        for item in result.results:
            # Handle every hit on its own so that one bad page does not
            # discard the remaining results of this query.
            try:
                company = await extract_company_data(
                    pipeline, item.text or "", item.url, item.title or ""
                )
            except Exception as e:
                print(f"    Error: {e}")
                continue
            if company:
                all_companies.append(company)

    # Deduplicate and return best companies
    unique_companies = pipeline.deduplicate_companies(all_companies)
    print(f"✅ Found {len(unique_companies)} unique AI companies")
    return unique_companies[:num_results]

async def extract_company_data(pipeline, content: str, url: str, title: str) -> Optional[Dict]:
    """Ask the chat model (gpt-4o-mini) to turn one article into a company record.

    Args:
        pipeline: Object exposing ``openai_client`` (OpenAI client).
        content: Article text; only the first 1500 characters are sent.
            ``None`` is treated as empty text.
        url: Source URL, stored on the record as ``source_url``.
        title: Article title included in the prompt; ``None`` is treated as "".

    Returns:
        The parsed JSON object with ``source_url`` and ``extraction_date``
        (ISO timestamp of this call) added, or ``None`` when the request fails
        or the reply contains no parseable JSON object.
    """

    # Search hits can arrive without body text or title; slicing None would
    # raise before the try block below is even entered.
    content = content or ""
    title = title or ""

    prompt = f"""
    Extract company information from this content. Return valid JSON only:
    
    Content: {content[:1500]}
    Title: {title}
    
    {{
        "name": "exact company name",
        "description": "what the company does in 1-2 sentences",
        "founded_year": "YYYY as integer or null",
        "funding_amount_millions": "funding amount in millions USD as number or null",
        "funding_stage": "seed/series-a/series-b/series-c/ipo or null",
        "founders": ["founder names if mentioned"],
        "location": "city, country if mentioned",
        "ai_focus": "specific AI area like NLP, computer vision, robotics, etc",
        "website": "company website if mentioned"
    }}
    """

    try:
        response = pipeline.openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            # JSON mode: the reply must be a single JSON object (the prompt
            # already mentions JSON, which this mode requires).
            response_format={"type": "json_object"},
        )

        raw = response.choices[0].message.content or ""
        # Parse only the outermost {...} span so a reply wrapped in Markdown
        # fences or surrounded by prose is still accepted.
        start, end = raw.find("{"), raw.rfind("}")
        if start == -1 or end < start:
            raise ValueError("model reply contains no JSON object")

        result = json.loads(raw[start:end + 1])
        result["source_url"] = url
        result["extraction_date"] = datetime.now().isoformat()
        return result

    except Exception as e:
        # Best-effort step: report the failure and let the caller skip this hit.
        print(f"    Extraction failed for {url}: {e}")
        return None

# Cell-completion marker: printed once the two functions above are defined.
print("✅ Company finder functions defined")

In [None]:
async def find_linkedin_profiles(pipeline, companies: List[Dict]) -> List[Dict]:
    """Collect LinkedIn profiles for every company, one company at a time.

    Each profile returned by ``get_company_profiles`` is updated in place with
    all key/value pairs of its company before being added to the result.
    """

    print("🔍 Finding LinkedIn profiles...")

    total = len(companies)
    collected = []

    for position, company in enumerate(companies, start=1):
        label = company.get('name', 'Unknown')
        print(f"  Company {position}/{total}: {label}")

        company_profiles = await get_company_profiles(pipeline, company)
        for entry in company_profiles:
            # Copy the company fields onto the profile record.
            entry.update(company)
            collected.append(entry)

    print(f"✅ Found {len(collected)} LinkedIn profiles")
    return collected

async def get_company_profiles(pipeline, company: Dict) -> List[Dict]:
    """Search LinkedIn (through Exa) for one company's founders and executives.

    Args:
        pipeline: Object exposing ``exa`` plus the helper methods
            ``extract_name_from_linkedin_url``, ``extract_role_from_title``
            and ``deduplicate_profiles``.
        company: Company record; only its ``name`` is used here.

    Returns:
        De-duplicated profile dicts with the keys ``person_name``,
        ``linkedin_url``, ``title`` and ``role``. Empty when the company has
        no name or nothing was found.
    """

    company_name = company.get("name", "")
    if not company_name:
        return []

    queries = [
        f"{company_name} CEO founder LinkedIn profile",
        f"{company_name} CTO chief technology officer LinkedIn",
        f'site:linkedin.com/in "{company_name}" founder CEO CTO'
    ]

    profiles = []

    for query in queries:
        try:
            result = pipeline.exa.search(
                query,
                type="keyword",
                include_domains=["linkedin.com"],
                num_results=5
            )
        except Exception as e:
            print(f"    Profile search failed: {e}")
            continue

        for item in result.results:
            # Per-hit handling: one malformed hit must not drop the others.
            try:
                url = item.url or ""
                # Only personal profile pages (/in/<slug>) are of interest.
                if "/in/" not in url:
                    continue
                profiles.append({
                    "person_name": pipeline.extract_name_from_linkedin_url(url),
                    "linkedin_url": url,
                    "title": item.title,
                    # A hit may have no title; classify it from "" instead of None.
                    "role": pipeline.extract_role_from_title(item.title or "")
                })
            except Exception as e:
                print(f"    Skipped profile hit: {e}")
                continue

    return pipeline.deduplicate_profiles(profiles)

In [None]:
def _funding_as_float(value) -> float:
    """Best-effort numeric conversion; missing or unparseable values count as 0."""
    try:
        return float(value or 0)
    except (TypeError, ValueError):
        return 0.0


async def analyze_markets(pipeline, companies: List[Dict]) -> Dict:
    """Group companies by ``ai_focus`` and build per-sector market statistics.

    Args:
        pipeline: Forwarded unchanged to ``get_sector_metrics``.
        companies: Company records; the keys read are ``ai_focus``,
            ``funding_amount_millions``, ``name`` and ``founded_year``.

    Returns:
        Dict keyed by sector name. Each value holds the metrics returned by
        ``get_sector_metrics`` plus ``company_count``,
        ``avg_funding_millions``, ``total_funding_millions``,
        ``top_companies`` (first five names) and ``founded_years``.
    """

    print("📊 Analyzing markets...")

    # Group by AI focus area
    sectors = {}
    for company in companies:
        # .get's default only covers a missing key; an explicit null or empty
        # value must fall back to the default bucket as well.
        focus = company.get("ai_focus") or "General AI"
        sectors.setdefault(focus, []).append(company)

    market_analysis = {}

    for sector, sector_companies in sectors.items():
        print(f"  Analyzing {sector} ({len(sector_companies)} companies)")

        market_data = await get_sector_metrics(pipeline, sector)
        if not isinstance(market_data, dict):
            market_data = {}

        # Calculate sector stats. Funding values come from extracted JSON and
        # may be strings or null, hence the tolerant conversion.
        funding_amounts = [
            _funding_as_float(c.get("funding_amount_millions"))
            for c in sector_companies
        ]

        market_analysis[sector] = {
            **market_data,
            "company_count": len(sector_companies),
            "avg_funding_millions": sum(funding_amounts) / len(funding_amounts) if funding_amounts else 0,
            "total_funding_millions": sum(funding_amounts),
            "top_companies": [str(c.get("name") or "Unknown") for c in sector_companies[:5]],
            "founded_years": [c.get("founded_year") for c in sector_companies if c.get("founded_year")]
        }

    print(f"✅ Analyzed {len(market_analysis)} sectors")
    return market_analysis

async def get_sector_metrics(pipeline, sector: str) -> Dict:
    """Gather research text for one sector and have metrics extracted from it.

    Args:
        pipeline: Object exposing ``exa``; also forwarded to
            ``extract_market_metrics``.
        sector: Sector name interpolated into the three search queries.

    Returns:
        Whatever ``extract_market_metrics`` returns for the collected text.
    """

    queries = [
        f"{sector} market size 2024 billion revenue industry report",
        f"{sector} growth rate CAGR forecast venture capital investment",
        f"{sector} competitive landscape market leaders analysis"
    ]

    all_content = []

    for query in queries:
        try:
            result = pipeline.exa.search_and_contents(
                query,
                type="neural",
                use_autoprompt=True,
                num_results=3,
                text={"max_characters": 2000},
                include_domains=[
                    "mckinsey.com", "bcg.com", "statista.com",
                    "grandviewresearch.com", "marketsandmarkets.com"
                ]
            )

            # Keep only hits that actually carry body text: None or empty
            # entries would break the "\n".join(...) done on this list later.
            all_content.extend(item.text for item in result.results if item.text)

        except Exception as e:
            # Best effort: a failed query just contributes no content.
            print(f"    Error: {e}")
            continue

    # Extract metrics with the chat model
    return await extract_market_metrics(pipeline, all_content, sector)

async def extract_market_metrics(pipeline, content_list: List[str], sector: str) -> Dict:
    """Extract market metrics for ``sector`` from research text via gpt-4o.

    Args:
        pipeline: Object exposing ``openai_client`` (OpenAI client).
        content_list: Research text snippets; the first three non-empty ones
            are joined and truncated to 3000 characters for the prompt.
        sector: Sector name used in the prompt.

    Returns:
        The parsed JSON object from the model. When there is no usable
        content, the request fails, or the reply holds no parseable JSON
        object, a fallback dict with null metrics, ``market_stage`` "unknown"
        and ``confidence_score`` 1 is returned instead.
    """

    # Built fresh on every call so callers can mutate the result safely.
    fallback = {
        "market_size_billion": None,
        "cagr_percent": None,
        "market_stage": "unknown",
        "key_trends": [],
        "geographic_focus": [],
        "confidence_score": 1
    }

    # Drop None/blank entries: joining None raises, and blanks waste a slot.
    usable = [text for text in content_list if isinstance(text, str) and text.strip()]
    if not usable:
        # Nothing to extract from; asking the model anyway could only yield
        # invented figures.
        return fallback

    combined_content = "\n".join(usable[:3])  # Use top 3 sources

    prompt = f"""
    Extract market metrics for {sector} from this research content. Return valid JSON only:
    
    {combined_content[:3000]}
    
    {{
        "market_size_billion": "market size in billions USD as number or null",
        "cagr_percent": "growth rate percentage as number or null",
        "market_stage": "emerging/growth/mature",
        "key_trends": ["trend1", "trend2", "trend3"],
        "geographic_focus": ["North America", "Europe", "Asia"],
        "confidence_score": "data quality score 1-10 as number"
    }}
    """

    try:
        response = pipeline.openai_client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            # JSON mode: the reply must be a single JSON object (the prompt
            # already mentions JSON, which this mode requires).
            response_format={"type": "json_object"},
        )

        raw = response.choices[0].message.content or ""
        # Parse only the outermost {...} span so fenced or prose-wrapped
        # replies are still accepted.
        start, end = raw.find("{"), raw.rfind("}")
        if start == -1 or end < start:
            raise ValueError("model reply contains no JSON object")

        return json.loads(raw[start:end + 1])

    except Exception as e:
        print(f"    Metric extraction failed for {sector}: {e}")
        return fallback

In [None]:
async def run_complete_pipeline(pipeline, num_companies: int = 50) -> tuple[Optional[pd.DataFrame], Optional[Dict]]:
    """Run company discovery, profile lookup and market analysis end to end.

    Files written to the current working directory:
    ``checkpoint_companies.json``, ``checkpoint_profiles.json``,
    ``enhanced_ai_companies.csv`` and ``market_analysis.json``.

    Args:
        pipeline: Pipeline object forwarded to every step.
        num_companies: Passed to ``find_ai_companies`` as its result limit.

    Returns:
        ``(df, market_analysis)`` on success, where ``df`` has one row per
        profile with a ``market_data`` column; ``(None, None)`` if any step
        raises (the error is printed, not re-raised).
    """

    print("🚀 Starting Enhanced AI Company Pipeline")
    print("=" * 60)

    start_time = time.time()

    try:
        # Step 1: Find AI companies
        companies = await find_ai_companies(pipeline, num_companies)

        # Save checkpoint
        with open("checkpoint_companies.json", "w") as f:
            json.dump(companies, f, indent=2)
        print(f"💾 Saved companies checkpoint")

        # Step 2: Find LinkedIn profiles
        profiles = await find_linkedin_profiles(pipeline, companies)

        # Save checkpoint
        with open("checkpoint_profiles.json", "w") as f:
            json.dump(profiles, f, indent=2)
        print(f"💾 Saved profiles checkpoint")

        # Step 3: Market analysis
        market_analysis = await analyze_markets(pipeline, companies)

        # Step 4: Combine everything
        final_dataset = []
        for profile in profiles:
            sector = profile.get("ai_focus", "General AI")
            sector_data = market_analysis.get(sector)
            if sector_data is None and not sector:
                # .get's default does not apply to an explicit null/empty
                # ai_focus; such profiles belong to the default bucket.
                sector_data = market_analysis.get("General AI")
            profile["market_data"] = sector_data or {}
            final_dataset.append(profile)

        # Save results
        df = pd.DataFrame(final_dataset)
        df.to_csv("enhanced_ai_companies.csv", index=False)

        with open("market_analysis.json", "w") as f:
            json.dump(market_analysis, f, indent=2)

        execution_time = time.time() - start_time

        print("\n🎉 Pipeline Complete!")
        print(f"⏱️  Execution time: {execution_time:.1f} seconds")
        print(f"🏢 Companies found: {len(companies)}")
        print(f"👤 Profiles found: {len(profiles)}")
        print(f"📊 Sectors analyzed: {len(market_analysis)}")
        print(f"💾 Data saved to: enhanced_ai_companies.csv")

        return df, market_analysis

    except Exception as e:
        # Deliberate catch-all so a failed run reports instead of raising
        # inside the notebook; callers must check for the None result.
        print(f"❌ Pipeline error: {e}")
        return None, None

In [None]:
# Initialize pipeline
pipeline = AIPipeline(EXA_API_KEY, OPENAI_API_KEY)

# Run complete pipeline (adjust num_companies as needed)
df, market_data = await run_complete_pipeline(pipeline, num_companies=30)

if df is not None:
    print("\n📋 Sample Results:")
    display_cols = ['name', 'person_name', 'role', 'ai_focus', 'funding_amount_millions']
    available_cols = [col for col in display_cols if col in df.columns]
    print(df[available_cols].head())
else:
    print("❌ Pipeline failed to complete")

In [None]:
def _as_number(value):
    """Return ``value`` as a float, or None when it is missing or non-numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def _metric_text(value, prefix="", suffix=""):
    """Format a metric for display; null/non-numeric values print as N/A."""
    if _as_number(value) is None:
        return "N/A"
    return f"{prefix}{value}{suffix}"


# Print a per-sector report followed by overall totals. Metric values come from
# model output and may be null or strings, so everything numeric goes through
# _as_number before it is formatted or summed.
if market_data:
    print("📊 MARKET ANALYSIS SUMMARY")
    print("=" * 50)

    for sector, data in market_data.items():
        sector_funding = _as_number(data.get('total_funding_millions')) or 0.0
        sector_avg = _as_number(data.get('avg_funding_millions')) or 0.0

        print(f"\n🎯 {str(sector).upper()}")
        print(f"   Companies: {data.get('company_count', 0)}")
        # dict.get's default only covers a missing key; the metrics can hold
        # explicit nulls, which would otherwise print as "$NoneB" / "None%".
        print(f"   Market Size: {_metric_text(data.get('market_size_billion'), '$', 'B')}")
        print(f"   Growth Rate: {_metric_text(data.get('cagr_percent'), '', '%')}")
        print(f"   Stage: {data.get('market_stage') or 'Unknown'}")
        print(f"   Total Funding: ${sector_funding:.1f}M")
        print(f"   Avg Funding: ${sector_avg:.1f}M")

        trends = data.get('key_trends')
        if isinstance(trends, (list, tuple)) and trends:
            print(f"   Key Trends: {', '.join(str(t) for t in trends[:3])}")

        leaders = data.get('top_companies')
        if isinstance(leaders, (list, tuple)) and leaders:
            print(f"   Top Companies: {', '.join(str(c) for c in leaders[:3])}")

    # Overall stats
    total_companies = sum(data.get('company_count', 0) for data in market_data.values())
    total_market_size = sum(
        _as_number(data.get('market_size_billion')) or 0 for data in market_data.values()
    )
    total_funding = sum(
        _as_number(data.get('total_funding_millions')) or 0 for data in market_data.values()
    )

    print(f"\n🌟 OVERALL SUMMARY")
    print(f"   Total Companies: {total_companies}")
    print(f"   Combined Market Size: ${total_market_size:.1f}B")
    print(f"   Total Funding Tracked: ${total_funding:.1f}M")
    print(f"   Sectors Analyzed: {len(market_data)}")
else:
    print("❌ No market data available")