In [3]:
import requests
import json
import time
import re
from typing import Dict, List, Optional
from datetime import datetime

class BrightDataLinkedInNameScraper:
    """Small client for the Bright Data datasets v3 API (name-based LinkedIn discovery).

    Every HTTP request is sent with ``request_timeout`` so a stalled
    connection cannot block the caller indefinitely. Methods report problems
    by printing a message and returning an error value rather than raising.
    """

    def __init__(self, api_token: str, dataset_id: str = "gd_l1viktl72bvl7bjuj0",
                 request_timeout: float = 30.0):
        """
        Initialize LinkedIn name-based scraper with Bright Data API

        Args:
            api_token: Your Bright Data API token
            dataset_id: Your LinkedIn scraper dataset ID
            request_timeout: Seconds to wait on each HTTP request before giving up
        """
        self.api_token = api_token
        self.dataset_id = dataset_id
        self.request_timeout = request_timeout
        self.headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json"
        }
        self.base_url = "https://api.brightdata.com/datasets/v3"

    def trigger_name_discovery(self, people: List[Dict[str, str]],
                             additional_params: Optional[Dict] = None) -> Dict:
        """
        Trigger LinkedIn profile discovery using names

        Args:
            people: List of dictionaries with 'first_name' and 'last_name'
            additional_params: Optional additional search parameters (company, location, etc.);
                merged into the query string and may override the defaults

        Returns:
            API response with job details including snapshot_id, or a dict
            with an "error" key when validation or the request fails
        """
        # Validate input before touching the network
        if not people:
            return {"error": "No people provided for discovery"}
        for person in people:
            if (not isinstance(person, dict)
                    or 'first_name' not in person
                    or 'last_name' not in person):
                return {"error": "Each person must have 'first_name' and 'last_name'"}

        api_url = f"{self.base_url}/trigger"
        params = {
            "dataset_id": self.dataset_id,
            "include_errors": "true",
            "type": "discover_new",
            "discover_by": "name"
        }

        # Add any additional search parameters
        if additional_params:
            params.update(additional_params)

        print(f"🔍 Triggering LinkedIn name discovery for {len(people)} people...")
        for i, person in enumerate(people, 1):
            name_display = f"{person['first_name']} {person['last_name']}"
            # Add company/location if provided in person data
            if 'company' in person:
                name_display += f" (Company: {person['company']})"
            if 'location' in person:
                name_display += f" (Location: {person['location']})"
            print(f"   {i}. {name_display}")

        print(f"API URL: {api_url}")
        print(f"Dataset ID: {self.dataset_id}")
        print(f"Search parameters: {params}")

        try:
            response = requests.post(api_url, headers=self.headers, json=people,
                                     params=params, timeout=self.request_timeout)

            print(f"Response status: {response.status_code}")

            if response.status_code in (200, 201, 202):
                result = response.json()
                print("✅ Name discovery job triggered successfully!")
                print(f"Snapshot ID: {result.get('snapshot_id')}")
                return result

            print(f"❌ Request failed: {response.status_code}")
            print(f"Response: {response.text}")
            return {"error": f"HTTP {response.status_code}", "details": response.text}

        except Exception as e:
            # Best-effort boundary: network, timeout and JSON errors are all
            # reported to the caller as an error dict instead of raising.
            print(f"❌ Error triggering discovery: {e}")
            return {"error": str(e)}

    def wait_for_completion(self, snapshot_id: str, max_wait: int = 600, check_interval: int = 20) -> bool:
        """
        Wait for a name discovery job to complete
        Name discovery typically takes longer than URL scraping

        Args:
            snapshot_id: The snapshot ID to wait for
            max_wait: Maximum wait time in seconds (default 10 minutes)
            check_interval: Check interval in seconds

        Returns:
            True once the snapshot endpoint answers 200; False on timeout or
            when the API rejects the credentials (401/403)
        """
        start_time = time.time()
        print(f"⏳ Waiting for discovery job {snapshot_id} to complete...")
        print(f"💡 Discovery jobs may take longer - timeout after {max_wait} seconds")

        # Loop-invariant request details
        # NOTE(review): this polls the snapshot download URL itself; a lighter
        # status endpoint may exist in the API - confirm against the docs.
        url = f"{self.base_url}/snapshot/{snapshot_id}"
        params = {"format": "json"}

        attempts = 0
        while time.time() - start_time < max_wait:
            attempts += 1

            try:
                response = requests.get(url, headers=self.headers, params=params,
                                        timeout=self.request_timeout)
                elapsed = int(time.time() - start_time)

                print(f"Attempt {attempts}: Status {response.status_code} ({elapsed}s elapsed)")

                if response.status_code == 200:
                    print(f"✅ Discovery job {snapshot_id} is ready!")
                    return True
                elif response.status_code == 202:
                    print("⏳ Job still processing... (discovery can take several minutes)")
                elif response.status_code == 404:
                    print("⏳ Job not found yet, still initializing...")
                elif response.status_code in (401, 403):
                    # Retrying cannot fix rejected credentials; stop polling now
                    print(f"❌ Authorization failed ({response.status_code}) - check your API token")
                    print(f"Response: {response.text[:200]}...")
                    return False
                else:
                    print(f"❌ Unexpected status: {response.status_code}")
                    print(f"Response: {response.text[:200]}...")

            except Exception as e:
                # Transient network errors should not abort the wait
                elapsed = int(time.time() - start_time)
                print(f"⏳ Error checking job status ({elapsed}s elapsed): {e}")

            # Never sleep past the deadline
            remaining = max_wait - (time.time() - start_time)
            if remaining <= 0:
                break
            time.sleep(max(0, min(check_interval, remaining)))

        print(f"⏰ Timeout reached after {max_wait} seconds")
        print(f"💡 Job may still be running - you can check later with snapshot ID: {snapshot_id}")
        return False

    def download_results(self, snapshot_id: str) -> Optional[List[Dict]]:
        """
        Download discovered profile data from a completed job

        Args:
            snapshot_id: The snapshot ID to download from

        Returns:
            List of discovered profiles or None if failed
        """
        url = f"{self.base_url}/snapshot/{snapshot_id}"
        params = {"format": "json"}

        print(f"📡 Downloading discovery results from snapshot: {snapshot_id}")

        try:
            response = requests.get(url, headers=self.headers, params=params,
                                    timeout=self.request_timeout)

            if response.status_code == 200:
                data = response.json()
                print("✅ Successfully downloaded discovery data!")

                # Handle different response formats: a bare list, a dict
                # wrapping the list under 'data'/'results', or a single record
                if isinstance(data, list):
                    return data
                if isinstance(data, dict):
                    if 'data' in data:
                        return data['data']
                    if 'results' in data:
                        return data['results']
                    return [data]
                return []

            if response.status_code == 202:
                print("⏳ Snapshot still processing... try again later")
                return None

            print(f"❌ Download failed: {response.status_code}")
            print(f"Response: {response.text[:200]}...")
            return None

        except Exception as e:
            print(f"❌ Error downloading results: {e}")
            return None

    def get_snapshots(self, status: str = "ready") -> Dict:
        """Get snapshots with specific status for troubleshooting

        Returns the decoded JSON body on HTTP 200, otherwise an empty dict.
        """
        url = f"{self.base_url}/snapshots"
        params = {
            "dataset_id": self.dataset_id,
            "status": status
        }

        try:
            response = requests.get(url, headers=self.headers, params=params,
                                    timeout=self.request_timeout)
            if response.status_code == 200:
                return response.json()

            print(f"Snapshots request failed: {response.status_code}")
            return {}
        except Exception as e:
            print(f"Error getting snapshots: {e}")
            return {}


# Leading number with optional thousands separators, decimals and a K/M
# suffix, e.g. "500+", "1,234", "1.2K". The lookahead stops a following word
# ("500 members") from being read as a suffix.
_COUNT_PATTERN = re.compile(r"\s*(\d[\d,]*(?:\.\d+)?)\s*([kKmM]?)(?![A-Za-z])")
_COUNT_SUFFIX_FACTOR = {"": 1, "k": 1000, "m": 1000000}


def _parse_count(value) -> int:
    """Best-effort conversion of a connection/follower count to an int (0 if unparseable)."""
    found = _COUNT_PATTERN.match(str(value))
    if not found:
        return 0
    try:
        number = float(found.group(1).replace(",", ""))
    except ValueError:
        return 0
    return int(round(number * _COUNT_SUFFIX_FACTOR[found.group(2).lower()]))


def _has_current_company_match(profile: Dict) -> bool:
    """Return True when the profile's '_company_matches' contains a 'Current:' entry."""
    matches = profile.get('_company_matches') or []
    return any(isinstance(match, str) and 'Current:' in match for match in matches)


def _score_profile(profile: Dict):
    """Compute the 'comprehensive' score for one profile; returns (score, breakdown lines)."""
    score = 0
    score_details = []

    # Current company match (highest priority)
    if _has_current_company_match(profile):
        score += 50
        score_details.append("Current company match (+50)")

    # Experience quality: two points per listed entry
    experience = profile.get('experience', [])
    if isinstance(experience, list) and experience:
        score += len(experience) * 2
        score_details.append(f"Experience entries (+{len(experience) * 2})")

    # Profile completeness
    about = profile.get('about')
    if isinstance(about, str) and len(about) > 100:
        score += 10
        score_details.append("Detailed about section (+10)")

    # Connection count (indication of active profile)
    if _parse_count(profile.get('connections', '0')) > 100:
        score += 15
        score_details.append("High connections (+15)")

    # Followers count
    if _parse_count(profile.get('followers', '0')) > 500:
        score += 10
        score_details.append("High followers (+10)")

    return score, score_details


def select_best_profile_match(profiles: List[Dict], criteria: str = "comprehensive") -> Dict:
    """
    Select the single best profile match from filtered results

    Args:
        profiles: List of profiles that already match the company filter
        criteria: Selection criteria - "comprehensive", "current_company", "most_recent", "highest_quality"

    Returns:
        Single best matching profile ({} when ``profiles`` is empty). With the
        "comprehensive" criteria every profile is annotated in place with
        '_selection_score' and '_score_details'; ties go to the earliest profile.
    """
    if not profiles:
        return {}

    if len(profiles) == 1:
        return profiles[0]

    print(f"   Selection criteria: {criteria}")

    if criteria == "comprehensive":
        best_profile = None
        best_score = None

        for profile in profiles:
            score, score_details = _score_profile(profile)
            profile['_selection_score'] = score
            profile['_score_details'] = score_details
            # Strict '>' keeps the first profile among equal scores
            if best_score is None or score > best_score:
                best_profile, best_score = profile, score

        print(f"   Best match score: {best_profile['_selection_score']}")
        print(f"   Score breakdown: {', '.join(best_profile['_score_details'])}")

        return best_profile

    if criteria == "current_company":
        # First profile with a current-company match, else the first profile
        for profile in profiles:
            if _has_current_company_match(profile):
                return profile
        return profiles[0]

    if criteria == "most_recent":
        # Selecting by most recent experience would require parsing dates;
        # this simplified version returns the first profile.
        return profiles[0]

    if criteria == "highest_quality":
        # NOTE(review): filter_quality_profiles is not defined in the visible
        # part of this file - confirm it exists before using this criteria.
        quality_profiles = filter_quality_profiles(profiles, min_quality_score=1)
        if quality_profiles:
            return quality_profiles[0]  # presumably already sorted by quality - verify
        return profiles[0]

    return profiles[0]  # Default fallback for unknown criteria


def _company_name_text(value) -> str:
    """Normalize a company field (string, dict with 'name', or other) to text; '' when empty."""
    if isinstance(value, dict):
        value = value.get('name')
    if not value:
        return ''
    return value if isinstance(value, str) else str(value)


def filter_profiles_by_company_regex(profiles: List[Dict], company_pattern: str,
                                   case_sensitive: bool = False) -> List[Dict]:
    """
    Filter profiles to only include those where the company pattern matches
    current company or any past experience company names

    The pattern is applied with ``re.search``, so it matches anywhere in the
    company name unless anchored with ``^``/``$``. Matching profiles are
    annotated in place with a '_company_matches' list of "Current: ..." /
    "Experience: ..." strings.

    Args:
        profiles: List of discovered profiles
        company_pattern: Regex pattern to match company names (e.g., "Grant.*" or "Grant")
        case_sensitive: Whether the regex match should be case sensitive

    Returns:
        Filtered list of profiles matching the company pattern. The input is
        returned unchanged when it or the pattern is empty; an invalid
        pattern yields an empty list.
    """
    if not profiles or not company_pattern:
        return profiles

    # Compile regex pattern
    flags = 0 if case_sensitive else re.IGNORECASE
    try:
        pattern = re.compile(company_pattern, flags)
    except re.error as e:
        print(f"❌ Invalid regex pattern '{company_pattern}': {e}")
        return []

    matched_profiles = []

    print(f"🔍 Filtering profiles with company regex: '{company_pattern}'")
    print(f"   Case sensitive: {case_sensitive}")

    for profile in profiles:
        # Skip entries that are not profile records instead of crashing on .get
        if not isinstance(profile, dict):
            continue

        match_details = []

        # Current company: 'current_company' may be a dict or a plain string;
        # fall back to the alternative 'current_company_name' field.
        current_company_name = (
            _company_name_text(profile.get('current_company'))
            or _company_name_text(profile.get('current_company_name'))
        )
        if current_company_name and pattern.search(current_company_name):
            match_details.append(f"Current: {current_company_name}")

        # Experience companies
        experience = profile.get('experience', [])
        if isinstance(experience, list):
            for exp in experience:
                if not isinstance(exp, dict):
                    continue
                exp_company = _company_name_text(exp.get('company'))
                if exp_company and pattern.search(exp_company):
                    match_details.append(f"Experience: {exp_company}")

        # If profile matched, add it with match info
        if match_details:
            profile['_company_matches'] = match_details
            matched_profiles.append(profile)
            name = profile.get('name', 'Unknown')
            print(f"   ✅ {name} - Matches: {', '.join(match_details)}")

    print("📊 Company filtering results:")
    print(f"   Total profiles searched: {len(profiles)}")
    print(f"   Profiles matching '{company_pattern}': {len(matched_profiles)}")

    return matched_profiles


def discover_linkedin_profiles_by_names_with_company_filter(
    api_token: str,
    dataset_id: str,
    people: List[Dict[str, str]],
    company_regex_pattern: str,
    additional_params: Optional[Dict] = None,
    case_sensitive: bool = False,
    select_best_only: bool = False
) -> Optional[List[Dict]]:
    """
    Run the full workflow: trigger name discovery, wait, download, then
    filter the downloaded profiles by a company regex.

    Args:
        api_token: Your Bright Data API token
        dataset_id: Your dataset ID
        people: List of dictionaries with 'first_name' and 'last_name'
        company_regex_pattern: Regex pattern to match company names
        additional_params: Optional global search parameters; a 'company'
            entry is dropped before the API call because filtering is done
            locally with the regex
        case_sensitive: Whether company name matching should be case sensitive
        select_best_only: When True, return a one-element list holding only
            the best-scoring match

    Returns:
        List of filtered profile data matching company pattern, or None when
        any stage fails or nothing matches
    """
    client = BrightDataLinkedInNameScraper(api_token, dataset_id)

    print("🔍 BRIGHT DATA LINKEDIN NAME DISCOVERY WITH COMPANY FILTERING")
    print("Using OFFICIAL API endpoints for name-based search + regex filtering")
    print("=" * 70)
    print(f"Company filter pattern: '{company_regex_pattern}'")
    print(f"Case sensitive: {case_sensitive}")

    # Work on a copy so the caller's dict is untouched; the API-side company
    # filter is removed to get broader results for the local regex.
    query_params = additional_params.copy() if additional_params else {}
    if 'company' in query_params:
        dropped = query_params.pop('company')
        print(f"📝 Removed API company filter '{dropped}' - will use regex instead")

    job = client.trigger_name_discovery(people, query_params)
    if job.get("error"):
        print("❌ Failed to trigger discovery job")
        print(f"Error details: {job}")
        return None

    snapshot_id = job.get("snapshot_id")
    if not snapshot_id:
        print("❌ No snapshot ID received from API")
        return None

    print(f"🎯 Discovery job started with snapshot ID: {snapshot_id}")

    # Stage 2: wait for the job
    print("\n⏳ WAITING FOR DISCOVERY COMPLETION...")
    if not client.wait_for_completion(snapshot_id):
        print("❌ Discovery job did not complete within timeout")
        return None

    # Stage 3: download
    print("\n📥 DOWNLOADING RESULTS...")
    downloaded = client.download_results(snapshot_id)
    if not downloaded:
        print("❌ No profiles discovered or download failed")
        return None

    print(f"✅ Successfully discovered {len(downloaded)} profiles!")

    # Stage 4: local regex filter
    print("\n🔍 APPLYING COMPANY REGEX FILTERING...")
    matching = filter_profiles_by_company_regex(
        downloaded,
        company_regex_pattern,
        case_sensitive
    )

    if not matching:
        print(f"❌ No profiles found matching company pattern '{company_regex_pattern}'")
        print("💡 Consider:")
        print("   - Using a broader regex pattern (e.g., 'Grant.*' instead of 'Grant')")
        print("   - Making the search case-insensitive")
        print("   - Checking if the company appears in different formats")
        return None

    if not select_best_only:
        print(f"🎯 Final results: {len(matching)} profiles match company pattern!")
        return matching

    print(f"🎯 Found {len(matching)} matching profiles, selecting best match...")
    winner = select_best_profile_match(matching)
    print(f"✅ Selected best match: {winner.get('name', 'Unknown')}")
    return [winner]  # Wrapped in a list so both modes return the same shape


def display_company_filtered_profiles(profiles: List[Dict], max_display: int = 10):
    """Print a summary of company-filtered profiles, including their match details.

    At most ``max_display`` profiles are printed; a trailing line reports how
    many were left out. Nothing is returned.
    """
    if not profiles:
        print("No profiles to display")
        return

    total = len(profiles)

    print("\n📋 COMPANY-FILTERED LINKEDIN PROFILES")
    print("=" * 65)
    print(f"Total matching profiles: {total}")

    for number, entry in enumerate(profiles[:max_display], start=1):
        linkedin_id = entry.get('id', entry.get('linkedin_id', 'N/A'))

        print(f"\n👤 PROFILE {number}:")
        print(f"   Name: {entry.get('name', 'N/A')}")
        print(f"   LinkedIn ID: {linkedin_id}")
        print(f"   Location: {entry.get('city', 'N/A')}")

        # Match annotations written by the company filter
        matches = entry.get('_company_matches', [])
        print(f"   🎯 Company Matches: {', '.join(matches)}")

        # Current employer: nested dict when available, flat fields otherwise
        employer = entry.get('current_company', {})
        if isinstance(employer, dict):
            employer_name = employer.get('name', 'N/A')
            job_title = employer.get('title', 'N/A')
        else:
            employer_name = entry.get('current_company_name', 'N/A')
            job_title = entry.get('position', 'N/A')

        print(f"   Current Company: {employer_name}")
        print(f"   Current Position: {job_title}")

        # Up to two experience entries; numbering follows list position
        history = entry.get('experience', [])
        if isinstance(history, list) and history:
            print("   Recent Experience:")
            for slot, role in enumerate(history[:2], start=1):
                if not isinstance(role, dict):
                    continue
                role_title = role.get('title', 'N/A')
                role_company = role.get('company', 'N/A')
                period = f"{role.get('start_date', '')} - {role.get('end_date', 'Present')}"
                print(f"     {slot}. {role_title} at {role_company} ({period})")

        # Profile URL
        link = entry.get('url', entry.get('linkedin_url', 'N/A'))
        print(f"   Profile URL: {link}")

    if total > max_display:
        print(f"\n... and {total - max_display} more matching profiles")


def main():
    """Demonstrate LinkedIn name discovery with company regex filtering.

    Reads the API token from the BRIGHTDATA_API_TOKEN environment variable,
    runs the discovery workflow for the configured people, prints the
    matching profiles and writes them to a timestamped JSON file in the
    current directory.
    """
    import os  # local import: only this entry point reads the environment

    # Configuration: the token comes from the environment so that no
    # credential has to be stored in the source file.
    API_TOKEN = os.environ.get("BRIGHTDATA_API_TOKEN", "")
    DATASET_ID = "gd_l1viktl72bvl7bjuj0"

    print("🔍 LINKEDIN NAME DISCOVERY WITH COMPANY REGEX FILTERING")
    print("=" * 65)

    if not API_TOKEN:
        # Without a token every request would be rejected; stop before
        # making any network call.
        print("❌ No API token configured - set the BRIGHTDATA_API_TOKEN environment variable")
        return

    # Example: Search for people by name
    people_to_discover = [
        {"first_name": "Chandreyee", "last_name": "Mukherjee"}
    ]

    # Company regex pattern - modify this to match your needs.
    # The filter applies the pattern with re.search, so it matches anywhere
    # in the company name unless anchored:
    # "Grant"       - company name contains "Grant"
    # ".*Grant.*"   - same as "Grant"
    # "^Grant"      - company name starts with "Grant"
    # "^Grant$"     - company name is exactly "Grant"
    # "Grant|Thornton" - contains either word
    # Matching is case-insensitive unless case_sensitive=True is passed below.
    COMPANY_REGEX_PATTERN = ".*Grant.*"  # This will match any company containing "Grant"

    # Optional: Add other search parameters (but remove company filter - we'll use regex instead)
    additional_search_params = {
        "location": "India",
        # Don't include "company" here - we'll filter with regex after getting results
    }

    print("\n🎯 SEARCH CONFIGURATION:")
    print(f"   People to find: {len(people_to_discover)}")
    print(f"   Company regex pattern: '{COMPANY_REGEX_PATTERN}'")
    print(f"   Additional filters: {additional_search_params}")

    # Run the discovery with company filtering
    results = discover_linkedin_profiles_by_names_with_company_filter(
        API_TOKEN,
        DATASET_ID,
        people_to_discover,
        COMPANY_REGEX_PATTERN,
        additional_search_params,
        case_sensitive=False,  # Set to True if you want case-sensitive matching
        select_best_only=True  # Set to False to keep every matching profile
    )

    if not results:
        # None/empty covers both "job failed" and "nothing matched"
        print("\n❌ No matching profiles returned (the job failed or nothing matched the company pattern)")
        print("\n🔧 TROUBLESHOOTING TIPS:")
        print("   1. Try a broader regex pattern (e.g., '.*Grant.*')")
        print("   2. Make the search case-insensitive")
        print("   3. Check if company names appear in different formats")
        print("   4. Remove location filters to get more initial results")
        return

    # Display filtered profiles
    display_company_filtered_profiles(results)

    # Save results; one timestamp so the file name and metadata agree
    now = datetime.now()
    filename = f"linkedin_company_filtered_{now.strftime('%Y%m%d_%H%M%S')}.json"

    metadata = {
        "discovery_timestamp": now.isoformat(),
        "company_regex_pattern": COMPANY_REGEX_PATTERN,
        "total_matching_profiles": len(results),
        "search_parameters": {
            "people_searched": people_to_discover,
            "additional_params": additional_search_params
        }
    }

    data_package = {
        "metadata": metadata,
        "filtered_profiles": results
    }

    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data_package, f, indent=2, ensure_ascii=False)

    print(f"\n💾 Filtered results saved to: {filename}")
    print(f"✅ Discovery completed successfully with {len(results)} matching profiles!")

# Run the demo workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

🔍 LINKEDIN NAME DISCOVERY WITH COMPANY REGEX FILTERING

🎯 SEARCH CONFIGURATION:
   People to find: 1
   Company regex pattern: '.*Grant.*'
   Additional filters: {'location': 'India'}
🔍 BRIGHT DATA LINKEDIN NAME DISCOVERY WITH COMPANY FILTERING
Using OFFICIAL API endpoints for name-based search + regex filtering
Company filter pattern: '.*Grant.*'
Case sensitive: False
🔍 Triggering LinkedIn name discovery for 1 people...
   1. Chandreyee Mukherjee
API URL: https://api.brightdata.com/datasets/v3/trigger
Dataset ID: gd_l1viktl72bvl7bjuj0
Search parameters: {'dataset_id': 'gd_l1viktl72bvl7bjuj0', 'include_errors': 'true', 'type': 'discover_new', 'discover_by': 'name', 'location': 'India'}
Response status: 200
✅ Name discovery job triggered successfully!
Snapshot ID: sd_mfxufg46106clnjf5z
🎯 Discovery job started with snapshot ID: sd_mfxufg46106clnjf5z

⏳ WAITING FOR DISCOVERY COMPLETION...
⏳ Waiting for discovery job sd_mfxufg46106clnjf5z to complete...
💡 Discovery jobs may take longer - t