In [4]:
import requests
import json
import time
from typing import Dict, List, Optional
from datetime import datetime

class BrightDataLinkedInNameScraper:
    """Client for the Bright Data dataset API that discovers LinkedIn profiles by name.

    Typical flow: ``trigger_name_discovery`` starts a job and yields a snapshot
    ID, ``wait_for_completion`` polls until that snapshot is ready, and
    ``download_results`` fetches the data. Progress is reported with ``print``
    and failures are reported through return values instead of being raised.
    """

    def __init__(self, api_token: str, dataset_id: str = "gd_l1viktl72bvl7bjuj0",
                 timeout: float = 30):
        """
        Initialize LinkedIn name-based scraper with Bright Data API

        Args:
            api_token: Your Bright Data API token
            dataset_id: Your LinkedIn scraper dataset ID
            timeout: Per-request timeout in seconds passed to ``requests``.
                Without one, a stalled connection would block forever.
        """
        self.api_token = api_token
        self.dataset_id = dataset_id
        self.timeout = timeout
        self.headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json"
        }
        self.base_url = "https://api.brightdata.com/datasets/v3"

    def trigger_name_discovery(self, people: List[Dict[str, str]],
                             additional_params: Optional[Dict] = None) -> Dict:
        """
        Trigger LinkedIn profile discovery using names

        Args:
            people: List of dictionaries with 'first_name' and 'last_name'
            additional_params: Optional extra query parameters; they are merged
                over the defaults, so they can also override them.

        Returns:
            The parsed JSON response on HTTP 200/201/202 (expected to carry
            'snapshot_id'), otherwise a dict with an 'error' key (plus
            'details' for HTTP failures).
        """
        # Validate input before spending an API call on a request that cannot work.
        if not people:
            return {"error": "At least one person is required"}
        for person in people:
            if 'first_name' not in person or 'last_name' not in person:
                return {"error": "Each person must have 'first_name' and 'last_name'"}

        api_url = f"{self.base_url}/trigger"
        params = {
            "dataset_id": self.dataset_id,
            "include_errors": "true",
            "type": "discover_new",
            "discover_by": "name"
        }

        # Add any additional search parameters
        if additional_params:
            params.update(additional_params)

        print(f"🔍 Triggering LinkedIn name discovery for {len(people)} people...")
        for i, person in enumerate(people, 1):
            name_display = f"{person['first_name']} {person['last_name']}"
            # Add company/location if provided in person data
            if 'company' in person:
                name_display += f" (Company: {person['company']})"
            if 'location' in person:
                name_display += f" (Location: {person['location']})"
            print(f"   {i}. {name_display}")

        print(f"API URL: {api_url}")
        print(f"Dataset ID: {self.dataset_id}")
        print(f"Search parameters: {params}")

        try:
            response = requests.post(api_url, headers=self.headers, json=people,
                                     params=params, timeout=self.timeout)

            print(f"Response status: {response.status_code}")

            if response.status_code in [200, 201, 202]:
                result = response.json()
                print("✅ Name discovery job triggered successfully!")
                print(f"Snapshot ID: {result.get('snapshot_id')}")
                return result

            print(f"❌ Request failed: {response.status_code}")
            print(f"Response: {response.text}")
            return {"error": f"HTTP {response.status_code}", "details": response.text}

        except Exception as e:
            # Boundary handler: network errors, timeouts and malformed JSON all
            # become an error dict so callers only have to check for 'error'.
            print(f"❌ Error triggering discovery: {e}")
            return {"error": str(e)}

    def wait_for_completion(self, snapshot_id: str, max_wait: int = 600, check_interval: int = 20) -> bool:
        """
        Wait for a name discovery job to complete
        Name discovery typically takes longer than URL scraping

        Args:
            snapshot_id: The snapshot ID to wait for
            max_wait: Maximum wait time in seconds (default 10 minutes)
            check_interval: Check interval in seconds

        Returns:
            True as soon as the snapshot endpoint answers 200; False when
            max_wait elapses first or the API rejects the credentials (401/403).
        """
        # Monotonic clock: elapsed-time math must not be affected by wall-clock changes.
        start_time = time.monotonic()
        print(f"⏳ Waiting for discovery job {snapshot_id} to complete...")
        print(f"💡 Discovery jobs may take longer - timeout after {max_wait} seconds")

        url = f"{self.base_url}/snapshot/{snapshot_id}"
        params = {"format": "json"}

        attempts = 0
        while time.monotonic() - start_time < max_wait:
            attempts += 1

            try:
                response = requests.get(url, headers=self.headers, params=params,
                                        timeout=self.timeout)
                elapsed = int(time.monotonic() - start_time)

                print(f"Attempt {attempts}: Status {response.status_code} ({elapsed}s elapsed)")

                if response.status_code == 200:
                    print(f"✅ Discovery job {snapshot_id} is ready!")
                    return True
                elif response.status_code == 202:
                    print("⏳ Job still processing... (discovery can take several minutes)")
                elif response.status_code == 404:
                    print("⏳ Job not found yet, still initializing...")
                elif response.status_code in (401, 403):
                    # Rejected credentials will not fix themselves; stop polling now
                    # instead of burning the whole timeout.
                    print(f"❌ Authorization failed ({response.status_code}) - check your API token")
                    print(f"Response: {response.text[:200]}...")
                    return False
                else:
                    print(f"❌ Unexpected status: {response.status_code}")
                    print(f"Response: {response.text[:200]}...")

            except Exception as e:
                # Transient failures (including request timeouts) are retried
                # on the next poll.
                elapsed = int(time.monotonic() - start_time)
                print(f"⏳ Error checking job status ({elapsed}s elapsed): {e}")

            # Never sleep past the deadline.
            remaining = max_wait - (time.monotonic() - start_time)
            if remaining <= 0:
                break
            time.sleep(min(check_interval, remaining))

        print(f"⏰ Timeout reached after {max_wait} seconds")
        print(f"💡 Job may still be running - you can check later with snapshot ID: {snapshot_id}")
        return False

    def download_results(self, snapshot_id: str) -> Optional[List[Dict]]:
        """
        Download discovered profile data from a completed job

        Args:
            snapshot_id: The snapshot ID to download from

        Returns:
            List of discovered profiles, or None if the snapshot is not ready,
            the request failed, or an exception occurred.
        """
        url = f"{self.base_url}/snapshot/{snapshot_id}"
        params = {"format": "json"}

        print(f"📡 Downloading discovery results from snapshot: {snapshot_id}")

        try:
            response = requests.get(url, headers=self.headers, params=params,
                                    timeout=self.timeout)

            if response.status_code == 200:
                data = response.json()
                print("✅ Successfully downloaded discovery data!")

                # Handle different response formats: a bare list, a dict that
                # wraps the list under 'data' or 'results', or a single record.
                if isinstance(data, list):
                    return data
                if isinstance(data, dict):
                    if 'data' in data:
                        return data['data']
                    if 'results' in data:
                        return data['results']
                    return [data]
                return []

            if response.status_code == 202:
                print("⏳ Snapshot still processing... try again later")
                return None

            print(f"❌ Download failed: {response.status_code}")
            print(f"Response: {response.text[:200]}...")
            return None

        except Exception as e:
            print(f"❌ Error downloading results: {e}")
            return None

    def get_snapshots(self, status: str = "ready") -> Dict:
        """Get snapshots with specific status for troubleshooting.

        Returns the parsed JSON response on HTTP 200, otherwise an empty dict.
        """
        url = f"{self.base_url}/snapshots"
        params = {
            "dataset_id": self.dataset_id,
            "status": status
        }

        try:
            response = requests.get(url, headers=self.headers, params=params,
                                    timeout=self.timeout)
            if response.status_code == 200:
                return response.json()
            print(f"Snapshots request failed: {response.status_code}")
            return {}
        except Exception as e:
            print(f"Error getting snapshots: {e}")
            return {}


def discover_linkedin_profiles_by_names(api_token: str, dataset_id: str,
                                       people: List[Dict[str, str]],
                                       additional_params: Optional[Dict] = None) -> Optional[List[Dict]]:
    """
    Run the full name-based discovery workflow: trigger, wait, download.

    Args:
        api_token: Your Bright Data API token
        dataset_id: Your dataset ID
        people: List of dictionaries with 'first_name' and 'last_name'
                (may also carry 'company' / 'location')
        additional_params: Optional global search parameters

    Returns:
        The downloaded profiles, or None when triggering fails, no snapshot ID
        comes back, or the download yields nothing.
    """
    client = BrightDataLinkedInNameScraper(api_token, dataset_id)

    for banner_line in (
        "🔍 BRIGHT DATA LINKEDIN NAME DISCOVERY",
        "Using OFFICIAL API endpoints for name-based search",
        "=" * 55,
    ):
        print(banner_line)

    # Step 1: start the job.
    job = client.trigger_name_discovery(people, additional_params)
    if job.get("error"):
        print("❌ Failed to trigger discovery job")
        print(f"Error details: {job}")
        return None

    snapshot_id = job.get("snapshot_id")
    if not snapshot_id:
        print("❌ No snapshot ID received from API")
        return None

    print(f"🎯 Discovery job started with snapshot ID: {snapshot_id}")

    # Step 2: poll until the snapshot is ready or the wait times out.
    print("\n⏳ WAITING FOR DISCOVERY COMPLETION...")
    if not client.wait_for_completion(snapshot_id):
        print("❌ Discovery job did not complete within timeout")
        print(f"💡 You can manually check this snapshot later: {snapshot_id}")

        # The job may have finished right after the last poll, so try once more.
        print("\n🔄 Attempting download anyway...")
        late_results = client.download_results(snapshot_id)
        if not late_results:
            return None
        print(f"✅ Found {len(late_results)} profiles after timeout!")
        return late_results

    # Step 3: fetch the finished snapshot.
    print("\n📥 DOWNLOADING RESULTS...")
    found = client.download_results(snapshot_id)
    if not found:
        print("❌ No profiles discovered or download failed")
        return None

    print(f"✅ Successfully discovered {len(found)} profiles!")
    return found


def filter_quality_profiles(profiles: List[Dict], min_quality_score: int = 3) -> List[Dict]:
    """
    Score each profile by how much usable data it carries and keep the good ones.

    Every profile dict is annotated in place with a '_quality_score' key
    (0-10), including those that end up being filtered out.

    Args:
        profiles: List of discovered profiles
        min_quality_score: Lowest score a profile may have and still be kept

    Returns:
        The kept profiles, ordered from highest to lowest score
    """
    if not profiles:
        return []

    placeholders = ('N/A', '--', None)
    kept = []

    for entry in profiles:
        name = entry.get('name')
        company = entry.get('current_company_name')
        role = entry.get('position')
        summary = entry.get('about')
        jobs = entry.get('experience')
        schools = entry.get('education')
        follower_count = entry.get('followers')

        # (points, condition) pairs, evaluated in the same order as the scoring rules.
        rules = [
            (1, bool(name) and len(name) > 3),
            (2, bool(company) and company not in placeholders),
            (2, bool(role) and role not in placeholders),
            (2, bool(summary) and len(summary) > 50),
            (1, bool(jobs) and len(jobs) > 0),
            (1, bool(schools) and len(schools) > 0),
            (1, bool(follower_count) and str(follower_count).isdigit()),
        ]
        score = sum(points for points, satisfied in rules if satisfied)

        # Record the score on the profile itself, whether or not it is kept.
        entry['_quality_score'] = score

        if score < min_quality_score:
            continue
        kept.append(entry)

    # Best profiles first; ties keep their original relative order.
    return sorted(kept, key=lambda item: item.get('_quality_score', 0), reverse=True)

def _print_flagged_field(label: str, value) -> None:
    """Print one text field, marking placeholder or missing values as empty."""
    if value in ['N/A', '--', None, '']:
        print(f"   {label}: ❌ {value} (EMPTY)")
    else:
        print(f"   {label}: ✅ {value}")


def _print_numeric_field(label: str, value) -> None:
    """Print one count field, marking anything that is not all digits as empty."""
    if str(value).isdigit():
        print(f"   {label}: ✅ {value}")
    else:
        print(f"   {label}: ❌ {value} (EMPTY)")


def _print_profile_details(index: int, profile: Dict) -> None:
    """Print the detail section for a single profile."""
    quality_score = profile.get('_quality_score', 0)
    print(f"\n👤 PROFILE {index} (Quality Score: {quality_score}/10):")
    print(f"   Name: {profile.get('name', 'N/A')}")
    print(f"   LinkedIn ID: {profile.get('linkedin_id', 'N/A')}")
    print(f"   Location: {profile.get('city', 'N/A')} {profile.get('state', '')} {profile.get('country', '')}")

    # Highlight empty or low-quality fields
    _print_flagged_field("Current Company", profile.get('current_company_name', 'N/A'))
    _print_flagged_field("Position", profile.get('position', 'N/A'))
    _print_flagged_field("Industry", profile.get('industry', 'N/A'))

    _print_numeric_field("Followers", profile.get('followers', 'N/A'))
    _print_numeric_field("Connections", profile.get('connections', 'N/A'))

    # About section (truncated to 150 characters)
    about = profile.get('about', '')
    if about:
        about_preview = about[:150] + "..." if len(about) > 150 else about
        print(f"   About: {about_preview}")

    # Education: only the first entry is shown
    education = profile.get('education', [])
    if education:
        edu = education[0]
        school = edu.get('title', edu.get('school', 'N/A'))
        degree = edu.get('degree', '')
        if degree:
            print(f"   Education: {degree} at {school}")
        else:
            print(f"   Education: {school}")

    # Experience: only the first entry is shown
    experience = profile.get('experience', [])
    if experience:
        exp = experience[0]
        print(f"   Latest Experience: {exp.get('title', 'N/A')} at {exp.get('company', 'N/A')}")

    print(f"   Profile URL: {profile.get('url', profile.get('linkedin_url', 'N/A'))}")

    # Match confidence (if available)
    if 'match_score' in profile or 'confidence' in profile:
        score = profile.get('match_score', profile.get('confidence', 'N/A'))
        print(f"   Match Confidence: {score}")


def display_discovered_profiles(profiles: List[Dict], max_display: int = 5, show_quality_filter: bool = True):
    """Print discovered profiles in a readable layout, optionally quality-filtered.

    Args:
        profiles: Profile dicts to display. With show_quality_filter they are
            scored via filter_quality_profiles, which adds '_quality_score'
            to each dict in place.
        max_display: Maximum number of profiles printed in detail.
        show_quality_filter: When True, print a quality summary, show only
            profiles scoring at least 3 (or all of them if none qualify), and
            list up to three of the filtered-out profiles at the end.

    Returns:
        None. All output goes to stdout.
    """
    if not profiles:
        print("No profiles to display")
        return

    empty_profiles = []
    showing_quality_only = False

    if show_quality_filter:
        quality_profiles = filter_quality_profiles(profiles, min_quality_score=3)
        # The filter returns the same dict objects it was given, so partition by
        # identity: O(n) instead of a dict-equality scan for every profile.
        kept_ids = {id(p) for p in quality_profiles}
        empty_profiles = [p for p in profiles if id(p) not in kept_ids]

        print("\n📋 LINKEDIN PROFILE ANALYSIS")
        print("=" * 65)
        print(f"   Total profiles found: {len(profiles)}")
        print(f"   High-quality profiles: {len(quality_profiles)}")
        print(f"   Low-quality/empty profiles: {len(empty_profiles)}")

        if quality_profiles:
            print(f"\n✅ HIGH-QUALITY PROFILES ({len(quality_profiles)} profiles)")
            print("=" * 65)
            profiles_to_show = quality_profiles
            showing_quality_only = True
        else:
            print("\n⚠️  NO HIGH-QUALITY PROFILES FOUND - SHOWING ALL RESULTS")
            print("=" * 65)
            profiles_to_show = profiles
    else:
        print(f"\n📋 DISCOVERED LINKEDIN PROFILES ({len(profiles)} total)")
        print("=" * 65)
        profiles_to_show = profiles

    for i, profile in enumerate(profiles_to_show[:max_display], 1):
        _print_profile_details(i, profile)

    remaining = len(profiles_to_show) - max_display
    if remaining > 0:
        # Only call them "quality" profiles when the filtered list is what was shown.
        noun = "quality profiles" if showing_quality_only else "profiles"
        print(f"\n... and {remaining} more {noun}")

    # Show summary of filtered out profiles
    if empty_profiles:
        print(f"\n❌ FILTERED OUT {len(empty_profiles)} LOW-QUALITY PROFILES:")
        for i, profile in enumerate(empty_profiles[:3], 1):
            name = profile.get('name', 'Unknown')
            linkedin_id = profile.get('linkedin_id', 'N/A')
            quality_score = profile.get('_quality_score', 0)
            print(f"   {i}. {name} (ID: {linkedin_id}, Quality: {quality_score}/10)")

        if len(empty_profiles) > 3:
            print(f"   ... and {len(empty_profiles) - 3} more low-quality profiles")


def save_discovery_results(results: List[Dict], search_info: Optional[Dict] = None,
                           output_dir: Optional[str] = None) -> str:
    """Save discovery results to a timestamped JSON file with summary metadata.

    Args:
        results: Discovered profile dicts; written under 'discovered_profiles'.
        search_info: Optional description of the search; stored as
            metadata['search_parameters'] when provided.
        output_dir: Optional directory for the file (created if missing).
            When omitted the file goes to the current working directory.

    Returns:
        Path of the written file (just the file name when output_dir is omitted).
    """
    import os

    # One clock reading so the file name and the metadata timestamp always agree.
    now = datetime.now()
    timestamp = now.strftime("%Y%m%d_%H%M%S")
    filename = f"linkedin_name_discovery_{timestamp}.json"
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        filename = os.path.join(output_dir, filename)

    def count_distinct(field: str) -> int:
        # Number of distinct non-empty values of `field` across all profiles.
        return len({p.get(field) for p in results if p.get(field)})

    metadata = {
        "discovery_timestamp": now.isoformat(),
        "total_profiles_found": len(results),
        "unique_companies": count_distinct('current_company_name'),
        "unique_locations": count_distinct('city'),
        "unique_industries": count_distinct('industry'),
        "scraper_type": "bright_data_name_discovery"
    }

    if search_info:
        metadata["search_parameters"] = search_info

    data_package = {
        "metadata": metadata,
        "discovered_profiles": results
    }

    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data_package, f, indent=2, ensure_ascii=False)

    print(f"\n💾 Discovery results saved to: {filename}")
    return filename


def main():
    """Demonstrate LinkedIn name discovery end to end.

    Reads the API token from the BRIGHTDATA_API_TOKEN environment variable,
    runs a sample discovery, then displays, saves and summarizes the results.
    When nothing is found it prints troubleshooting tips and checks API
    connectivity by listing ready snapshots.
    """
    import os
    from collections import Counter

    # Configuration: the token comes from the environment so no secret has to
    # be written into the source file.
    API_TOKEN = os.environ.get("BRIGHTDATA_API_TOKEN", "")
    DATASET_ID = "gd_l1viktl72bvl7bjuj0"

    print("🔍 BRIGHT DATA LINKEDIN NAME DISCOVERY SCRAPER")
    print("Find LinkedIn profiles by searching names")
    print("=" * 65)

    if not API_TOKEN:
        # Every request would be rejected without a token, so stop before
        # making any API call.
        print("\n❌ No API token configured - set the BRIGHTDATA_API_TOKEN environment variable")
        return

    # Example: Search for people by name
    people_to_discover = [
        {"first_name": "Shashikant", "last_name": "Zarekar"}
    ]

    # Optional filters to narrow the search; remove a key to disable that filter.
    additional_search_params = {
        "company": "Tech",  # Filter by company keywords
        "location": "India",  # Filter by location
        # "industry": "Technology"  # Uncomment to filter by industry
    }

    print("\n🎯 SEARCH CONFIGURATION:")
    print(f"   People to find: {len(people_to_discover)}")
    if additional_search_params:
        print(f"   Additional filters: {additional_search_params}")
    else:
        print("   Additional filters: None (will find all matches)")

    # Run the discovery
    results = discover_linkedin_profiles_by_names(
        API_TOKEN,
        DATASET_ID,
        people_to_discover,
        additional_search_params
    )

    if results:
        # Display discovered profiles
        display_discovered_profiles(results)

        # Save results
        search_info = {
            "people_searched": people_to_discover,
            "additional_params": additional_search_params
        }
        saved_file = save_discovery_results(results, search_info)

        # Show discovery statistics
        print("\n📊 DISCOVERY STATISTICS:")
        print(f"   Total profiles found: {len(results)}")

        # Analyze companies
        companies = [p.get('current_company_name') for p in results if p.get('current_company_name')]
        if companies:
            top_companies = Counter(companies).most_common(5)
            print(f"   Top companies found: {[name for name, _ in top_companies]}")

        # Analyze locations
        locations = [p.get('city') for p in results if p.get('city')]
        if locations:
            print(f"   Locations represented: {len(set(locations))}")

        # Analyze industries
        industries = [p.get('industry') for p in results if p.get('industry')]
        if industries:
            print(f"   Industries represented: {len(set(industries))}")

        print("\n✅ Discovery completed successfully!")
        print(f"   Results saved to: {saved_file}")

    else:
        print("\n❌ No profiles discovered")
        print("\n🔧 TROUBLESHOOTING TIPS:")
        print("   1. Try more common names")
        print("   2. Remove location/company filters if used")
        print("   3. Check that your dataset supports name discovery")
        print("   4. Wait longer - discovery jobs can take 10+ minutes")

        # Test API connectivity
        scraper = BrightDataLinkedInNameScraper(API_TOKEN, DATASET_ID)
        print("\n🧪 Testing API connectivity...")
        snapshots = scraper.get_snapshots("ready")
        if snapshots:
            # The response may be a dict wrapping the list under 'data' or a bare list.
            snapshot_count = len(snapshots.get('data', [])) if isinstance(snapshots, dict) else len(snapshots)
            print(f"✅ API working - found {snapshot_count} ready snapshots")
        else:
            print("❌ API connection issues")

# Run the demo workflow only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

🔍 BRIGHT DATA LINKEDIN NAME DISCOVERY SCRAPER
Find LinkedIn profiles by searching names

🎯 SEARCH CONFIGURATION:
   People to find: 1
   Additional filters: {'company': 'Tech', 'location': 'India'}
🔍 BRIGHT DATA LINKEDIN NAME DISCOVERY
Using OFFICIAL API endpoints for name-based search
🔍 Triggering LinkedIn name discovery for 1 people...
   1. Shashikant Zarekar
API URL: https://api.brightdata.com/datasets/v3/trigger
Dataset ID: gd_l1viktl72bvl7bjuj0
Search parameters: {'dataset_id': 'gd_l1viktl72bvl7bjuj0', 'include_errors': 'true', 'type': 'discover_new', 'discover_by': 'name', 'company': 'Tech', 'location': 'India'}
Response status: 200
✅ Name discovery job triggered successfully!
Snapshot ID: sd_mfxtcd8g2mo9wi9shd
🎯 Discovery job started with snapshot ID: sd_mfxtcd8g2mo9wi9shd

⏳ WAITING FOR DISCOVERY COMPLETION...
⏳ Waiting for discovery job sd_mfxtcd8g2mo9wi9shd to complete...
💡 Discovery jobs may take longer - timeout after 600 seconds
Attempt 1: Status 202 (0s elapsed)
⏳ Job st