In [None]:
import requests
import json
import time
from typing import Dict, List, Optional
from datetime import datetime

class BrightDataLinkedInScraper:
    """Small client for the Bright Data Datasets v3 API used to scrape LinkedIn profiles.

    The client can trigger a scraping job, poll until its snapshot is
    downloadable, list snapshots by status and download snapshot contents.
    All methods are best-effort: network and decoding problems are printed and
    reported through the return value (error dict, empty dict, ``None`` or
    ``False``) instead of being raised.
    """

    # HTTP statuses for which the trigger call is treated as accepted.
    _TRIGGER_OK_STATUSES = (200, 201, 202)

    def __init__(self, api_token: str, dataset_id: str = "gd_l1viktl72bvl7bjuj0",
                 request_timeout: float = 30.0):
        """
        Initialize with Bright Data API token and your LinkedIn scraper dataset ID

        Args:
            api_token: Your Bright Data API token
            dataset_id: Your LinkedIn scraper dataset ID (e.g., gd_l1viktl72bvl7bjuj0)
            request_timeout: Seconds to wait for each HTTP request before giving
                up. Without a timeout ``requests`` may block indefinitely.
        """
        self.api_token = api_token
        self.dataset_id = dataset_id
        self.request_timeout = request_timeout
        self.headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json"
        }
        self.base_url = "https://api.brightdata.com/datasets/v3"

    # ------------------------------------------------------------------
    # Pure helpers (no network access)
    # ------------------------------------------------------------------

    @staticmethod
    def _normalize_records(data) -> List[Dict]:
        """Coerce a decoded snapshot payload into a list of records.

        A list is returned unchanged; a dict is unwrapped through its 'data' or
        'results' key when present, otherwise wrapped in a single-item list;
        anything else yields an empty list.
        """
        if isinstance(data, list):
            return data
        if isinstance(data, dict):
            for key in ("data", "results"):
                if key in data:
                    return data[key]
            return [data]
        return []

    @staticmethod
    def _parse_text(text: str, format_type: str):
        """Decode a non-JSON-document response body according to ``format_type``.

        'csv' is parsed into one dict per row, 'ndjson'/'jsonl' into one object
        per non-blank line; any other value is parsed as a single JSON document.

        Raises:
            ValueError: If the body cannot be decoded in the requested format.
        """
        fmt = (format_type or "json").lower()
        if fmt == "csv":
            # Local imports: the file's import block does not provide csv/io.
            import csv
            import io
            try:
                return [dict(row) for row in csv.DictReader(io.StringIO(text))]
            except csv.Error as exc:
                raise ValueError(f"Invalid CSV payload: {exc}") from exc
        if fmt in ("ndjson", "jsonl"):
            # Split on "\n" only: str.splitlines() would also break on
            # separators such as U+2028 that may legally occur inside JSON strings.
            return [json.loads(line) for line in text.split("\n") if line.strip()]
        return json.loads(text)

    def _decode_response(self, response, format_type: str):
        """Decode an HTTP response body in the requested format."""
        if (format_type or "json").lower() == "json":
            # response.json() applies requests' own JSON encoding detection.
            return response.json()
        return self._parse_text(response.text, format_type)

    @staticmethod
    def _snapshot_entries(payload) -> List[Dict]:
        """Extract the snapshot dicts from a get_snapshots() result.

        Accepts either a bare list or a dict carrying the list under 'data';
        non-dict entries are dropped.
        """
        if isinstance(payload, dict):
            entries = payload.get("data")
        else:
            entries = payload
        if not isinstance(entries, list):
            return []
        return [entry for entry in entries if isinstance(entry, dict)]

    @staticmethod
    def _snapshot_id_of(entry: Dict) -> Optional[str]:
        """Return the snapshot's identifier ('id', falling back to 'snapshot_id')."""
        return entry.get("id") or entry.get("snapshot_id")

    @classmethod
    def _latest_snapshot(cls, payload) -> Optional[Dict]:
        """Pick the most recently created snapshot that has a usable ID.

        Snapshots are compared on their 'created' value; when that field is
        missing or equal everywhere the first entry in API order wins.
        Returns None when no entry carries an ID.
        """
        candidates = [entry for entry in cls._snapshot_entries(payload)
                      if cls._snapshot_id_of(entry)]
        if not candidates:
            return None
        try:
            # max() returns the first maximal element, so ties keep API order.
            return max(candidates, key=lambda entry: entry.get("created") or "")
        except TypeError:
            # 'created' values of mixed, unorderable types: keep API order.
            return candidates[0]

    # ------------------------------------------------------------------
    # API calls
    # ------------------------------------------------------------------

    def trigger_scraping(self, profile_urls: List[str]) -> Dict:
        """
        Trigger LinkedIn profile scraping through the /datasets/v3/trigger endpoint

        Args:
            profile_urls: List of LinkedIn profile URLs to scrape

        Returns:
            The decoded API response on success. On failure a dict with an
            "error" key (plus "details" holding the response text for HTTP
            errors). An empty ``profile_urls`` is rejected without calling the API.
        """
        if not profile_urls:
            print("❌ No profile URLs provided")
            return {"error": "No profile URLs provided"}

        # The request body is a JSON array of {"url": ...} objects
        url_data = [{"url": url} for url in profile_urls]

        api_url = f"{self.base_url}/trigger"
        params = {
            "dataset_id": self.dataset_id,
            "format": "json",
            "uncompressed_webhook": "true"
        }

        print("🚀 Triggering scraping job...")
        print(f"API URL: {api_url}")
        print(f"Dataset ID: {self.dataset_id}")
        print(f"URLs to scrape: {len(profile_urls)}")

        try:
            response = requests.post(
                api_url,
                headers=self.headers,
                json=url_data,
                params=params,
                timeout=self.request_timeout
            )

            print(f"Response status: {response.status_code}")

            if response.status_code not in self._TRIGGER_OK_STATUSES:
                print(f"❌ Request failed: {response.status_code}")
                print(f"Response text: {response.text}")
                return {"error": f"HTTP {response.status_code}", "details": response.text}

            result = response.json()

        except (requests.RequestException, ValueError) as e:
            # ValueError covers a success status whose body is not valid JSON.
            print(f"❌ Error triggering scraping: {e}")
            return {"error": str(e)}

        print("✅ Scraping job triggered successfully!")
        print(f"Response: {result}")
        return result

    def check_job_progress(self, snapshot_id: Optional[str] = None) -> Dict:
        """
        Check the progress of scraping jobs

        The progress endpoint is tried first; if it does not answer with HTTP
        200 (or the request fails) the list of "running" snapshots is returned
        instead.

        Args:
            snapshot_id: Optional snapshot to query. When given, the request
                goes to /progress/{snapshot_id}; when omitted, the bare
                /progress/ URL is used as before.
                NOTE(review): Bright Data documents progress as a per-snapshot
                endpoint, which is presumably why the bare URL "might not
                work" — confirm against the API reference.
        """
        progress_url = f"{self.base_url}/progress/"
        if snapshot_id:
            progress_url += str(snapshot_id)

        try:
            response = requests.get(progress_url, headers=self.headers,
                                    timeout=self.request_timeout)
            if response.status_code == 200:
                return response.json()
        except (requests.RequestException, ValueError) as e:
            print(f"Progress endpoint failed: {e}")

        # Fallback: Check snapshots for status
        print("Checking snapshots instead...")
        return self.get_snapshots("running")

    def get_snapshots(self, status: str = "ready") -> Dict:
        """
        Get snapshots with specific status

        Args:
            status: 'ready', 'running', 'failed', etc.

        Returns:
            The decoded API response, or an empty dict when the request fails
            or does not answer with HTTP 200.
        """
        url = f"{self.base_url}/snapshots"
        params = {
            "dataset_id": self.dataset_id,
            "status": status
        }

        try:
            response = requests.get(url, headers=self.headers, params=params,
                                    timeout=self.request_timeout)
            if response.status_code == 200:
                return response.json()
            print(f"Snapshots request failed: {response.status_code}")
            return {}
        except (requests.RequestException, ValueError) as e:
            print(f"Error getting snapshots: {e}")
            return {}

    def download_snapshot(self, snapshot_id: Optional[str] = None, format_type: str = "json") -> Optional[List[Dict]]:
        """
        Download scraped data for a snapshot

        Endpoints used:
        - {base_url}/snapshot/{snapshot_id}?format=... (tried first)
        - https://api.brightdata.com/datasets/snapshots/{snapshot_id}/download
          (tried when the first one answers with an unexpected status)

        Args:
            snapshot_id: Specific snapshot ID, or None to use the most recently
                created "ready" snapshot
            format_type: Data format. 'json' and 'csv' are decoded ('csv' into
                one dict per row); 'ndjson'/'jsonl' are decoded line by line.

        Returns:
            A list of records, or None when the snapshot is still processing
            (HTTP 202), no snapshot is available, or the download fails.
        """
        if not snapshot_id:
            latest = self._latest_snapshot(self.get_snapshots("ready"))
            if latest is None:
                print("❌ No snapshot IDs found in ready snapshots")
                return None

            latest_snapshot_id = self._snapshot_id_of(latest)
            print(f"📥 Using latest snapshot ID: {latest_snapshot_id}")
            # The ID is non-empty here, so this recursion terminates after one level.
            return self.download_snapshot(latest_snapshot_id, format_type)

        url = f"{self.base_url}/snapshot/{snapshot_id}"
        params = {"format": format_type}

        print(f"📡 Downloading from: {url}")
        try:
            response = requests.get(url, headers=self.headers, params=params,
                                    timeout=self.request_timeout)

            if response.status_code == 200:
                records = self._normalize_records(
                    self._decode_response(response, format_type))
                print("✅ Successfully downloaded data!")
                return records

            if response.status_code == 202:
                print("⏳ Snapshot still processing... waiting a bit longer")
                return None

            print(f"❌ Download failed: {response.status_code} - {response.text}")

            # Second attempt through the alternative download endpoint
            alt_url = f"https://api.brightdata.com/datasets/snapshots/{snapshot_id}/download"
            print(f"🔄 Trying alternative endpoint: {alt_url}")

            response = requests.get(alt_url, headers=self.headers, params=params,
                                    timeout=self.request_timeout)
            if response.status_code == 200:
                data = self._decode_response(response, format_type)
                return data if isinstance(data, list) else [data]

            print(f"❌ Alternative endpoint also failed: {response.status_code}")
            return None

        except (requests.RequestException, ValueError) as e:
            print(f"❌ Error downloading data: {e}")
            return None

    def wait_for_specific_completion(self, snapshot_id: str, max_wait: int = 120, check_interval: int = 10) -> bool:
        """
        Wait for a SPECIFIC scraping job to complete using its snapshot ID

        The snapshot endpoint is polled until it answers with HTTP 200.

        Args:
            snapshot_id: The specific snapshot ID to wait for
            max_wait: Maximum wait time in seconds
            check_interval: Check interval in seconds

        Returns:
            True once the snapshot endpoint answers 200; False on timeout or
            when no snapshot ID was supplied.
        """
        if not snapshot_id:
            # Polling ".../snapshot/None" can never succeed; fail immediately.
            print("❌ No snapshot ID provided, nothing to wait for")
            return False

        # monotonic() is immune to wall-clock adjustments during the wait.
        start_time = time.monotonic()
        deadline = start_time + max_wait
        print(f"⏳ Waiting for specific job {snapshot_id} to complete...")
        print(f"💡 Will timeout after {max_wait} seconds")

        url = f"{self.base_url}/snapshot/{snapshot_id}"
        params = {"format": "json"}

        attempts = 0
        while time.monotonic() < deadline:
            attempts += 1

            try:
                response = requests.get(url, headers=self.headers, params=params,
                                        timeout=self.request_timeout)
                elapsed = int(time.monotonic() - start_time)

                print(f"Attempt {attempts}: Status {response.status_code} ({elapsed}s elapsed)")

                if response.status_code == 200:
                    print(f"✅ Job {snapshot_id} is ready!")
                    return True
                elif response.status_code == 202:
                    print("⏳ Job still processing...")
                elif response.status_code == 404:
                    print("⏳ Job not found yet, still initializing...")
                else:
                    print(f"❌ Unexpected status: {response.status_code}")
                    print(f"Response: {response.text[:200]}...")

            except requests.RequestException as e:
                elapsed = int(time.monotonic() - start_time)
                print(f"⏳ Error checking job status ({elapsed}s elapsed): {e}")

            # Never sleep past the deadline: the loop would exit right after anyway.
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(max(0.0, min(check_interval, remaining)))

        print(f"⏰ Timeout reached after {max_wait} seconds")
        return False

    def fallback_download_latest(self) -> Optional[List[Dict]]:
        """
        Fallback method to download the most recent data if specific job fails

        Returns:
            Records of the most recently created "ready" snapshot, or None when
            no usable snapshot exists or its download fails.
        """
        print("🔄 Falling back to download latest available data...")

        ready_snapshots = self.get_snapshots("ready")

        if not self._snapshot_entries(ready_snapshots):
            print("❌ No ready snapshots available")
            return None

        latest_snapshot = self._latest_snapshot(ready_snapshots)
        if latest_snapshot is None:
            print("❌ No snapshot IDs found in ready snapshots")
            return None

        latest_id = self._snapshot_id_of(latest_snapshot)
        print(f"📥 Trying latest snapshot: {latest_id}")
        print(f"   Created: {latest_snapshot.get('created', 'unknown')}")

        return self.download_snapshot(latest_id)

def scrape_linkedin_profiles_complete(api_token: str, dataset_id: str, profile_urls: List[str]) -> Optional[List[Dict]]:
    """
    Complete LinkedIn scraping workflow: trigger a job, wait for it, download it

    If the triggered job yields no data (it times out, its download is empty,
    or the trigger response carries no snapshot ID), the most recent "ready"
    snapshot of the dataset is downloaded instead. That fallback data may
    belong to an earlier job.

    Args:
        api_token: Your Bright Data API token
        dataset_id: Your dataset ID (e.g., gd_l1viktl72bvl7bjuj0)
        profile_urls: List of LinkedIn profile URLs to scrape

    Returns:
        Scraped profile data, or None when triggering fails or no data could
        be retrieved at all
    """
    scraper = BrightDataLinkedInScraper(api_token, dataset_id)

    print("🔍 BRIGHT DATA LINKEDIN SCRAPER")
    print("Using OFFICIAL API endpoints from documentation")
    print("=" * 55)

    # Step 1: Trigger scraping
    print(f"\n1️⃣ Triggering scraping for {len(profile_urls)} profiles...")
    for i, url in enumerate(profile_urls, 1):
        print(f"   {i}. {url}")

    trigger_result = scraper.trigger_scraping(profile_urls)

    # A non-dict payload cannot carry a snapshot ID, so treat it like an error.
    if not isinstance(trigger_result, dict) or trigger_result.get("error"):
        print("❌ Failed to trigger scraping job")
        return None

    # Get the specific snapshot ID from the trigger response
    new_snapshot_id = trigger_result.get("snapshot_id")
    print(f"🎯 New job snapshot ID: {new_snapshot_id}")

    if new_snapshot_id:
        # Step 2: Wait for SPECIFIC job completion (with shorter timeout)
        print("\n2️⃣ Waiting for the specific job to complete...")
        job_completed = scraper.wait_for_specific_completion(new_snapshot_id, max_wait=120)

        if job_completed:
            # Step 3: Download results from the SPECIFIC snapshot
            print(f"\n3️⃣ Downloading data from specific job: {new_snapshot_id}")
            results = scraper.download_snapshot(new_snapshot_id)

            if results:
                print(f"✅ Successfully downloaded {len(results)} profiles from the new job!")
                return results

            print("❌ No data found in the specific job")
        else:
            print("❌ Specific job did not complete in time")
            print("💡 Job might still be running in the background")
    else:
        # Without an ID there is nothing to poll; waiting would only burn the
        # whole timeout on a snapshot URL that cannot exist.
        print("❌ Trigger response did not include a snapshot ID")

    # Fallback: Try to get the most recent data available
    print("\n🔄 FALLBACK: Trying to get most recent available data...")
    fallback_results = scraper.fallback_download_latest()

    if fallback_results:
        print(f"✅ Downloaded {len(fallback_results)} profiles from latest snapshot")
        print("⚠️  Note: This might be from a previous job, not the one just triggered")
        return fallback_results

    print("❌ No data available at all")
    return None

def display_profile_data(profiles: List[Dict]):
    """Print a readable preview of scraped profiles.

    Only the first three profiles are printed in detail; a trailing line
    reports how many more were omitted. Missing scalar fields are shown as
    'N/A'; the about text is cut to 150 characters, and only the first
    education and experience entries are shown.
    """
    preview_limit = 3

    # (label, profile key) pairs printed for every profile, in this order
    labelled_fields = (
        ("Name", "name"),
        ("LinkedIn ID", "linkedin_id"),
        ("Location", "city"),
        ("Current Company", "current_company_name"),
        ("Position", "position"),
        ("Followers", "followers"),
        ("Connections", "connections"),
    )

    if not profiles:
        print("No profiles to display")
        return

    total = len(profiles)
    print(f"\n📋 SCRAPED LINKEDIN PROFILES ({total} total)")
    print("=" * 60)

    for number, person in enumerate(profiles[:preview_limit], start=1):
        print(f"\n👤 PROFILE {number}:")
        for label, key in labelled_fields:
            print(f"   {label}: {person.get(key, 'N/A')}")

        # About section, shortened when it is long
        summary = person.get('about', '')
        if summary:
            if len(summary) > 150:
                summary = summary[:150] + "..."
            print(f"   About: {summary}")

        # First education entry, if any
        schools = person.get('education')
        if schools:
            first_school = schools[0]
            print(f"   Education: {first_school.get('title', 'N/A')}")

        # First experience entry, if any
        jobs = person.get('experience')
        if jobs:
            first_job = jobs[0]
            print(f"   Experience: {first_job.get('title', 'N/A')} at {first_job.get('company', 'N/A')}")

        print(f"   Profile URL: {person.get('url', 'N/A')}")

    hidden = total - preview_limit
    if hidden > 0:
        print(f"\n... and {hidden} more profiles")

def main():
    """Run the scraping workflow for a fixed URL list, then report and save the results.

    The API token is taken from ``API_TOKEN`` below or, when that is left
    empty, from the ``BRIGHTDATA_API_TOKEN`` environment variable. On success
    the profiles are previewed, written to a timestamped JSON file in the
    current directory and summarised; otherwise troubleshooting hints are
    printed and API connectivity is probed.
    """
    # Local imports: these names are not part of the file's import block.
    import os
    from collections import Counter

    # Configuration - Replace with your actual values
    API_TOKEN = ""  # Leave empty to read BRIGHTDATA_API_TOKEN from the environment
    DATASET_ID = "gd_l1viktl72bvl7bjuj0"  # Your LinkedIn scraper dataset ID

    if not API_TOKEN:
        API_TOKEN = os.environ.get("BRIGHTDATA_API_TOKEN", "").strip()
    if not API_TOKEN:
        # Every request would be sent with an empty bearer token; stop early
        # with an actionable message instead.
        print("❌ No API token configured")
        print("   Set API_TOKEN in main() or export BRIGHTDATA_API_TOKEN")
        return

    # URLs to scrape
    profile_urls = [
        "https://www.linkedin.com/in/ketanarun/",
        "https://www.linkedin.com/in/aravind-srinivas-16051987/"
        # Add more URLs here as needed
    ]

    # Run complete scraping workflow
    results = scrape_linkedin_profiles_complete(API_TOKEN, DATASET_ID, profile_urls)

    if results:
        # Display the scraped data
        display_profile_data(results)

        # One clock reading so the filename and the metadata agree
        now = datetime.now()
        filename = f"linkedin_profiles_{now.strftime('%Y%m%d_%H%M%S')}.json"

        data_package = {
            "metadata": {
                "scraped_at": now.isoformat(),
                "total_profiles": len(results),
                # Only the last 4 characters are kept: the output file may be
                # shared, so it must not carry a large part of the secret.
                "api_token_used": "..." + API_TOKEN[-4:],
                "dataset_id": DATASET_ID
            },
            "profiles": results
        }

        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data_package, f, indent=2, ensure_ascii=False)

        print(f"\n💾 Data saved to: {filename}")

        # Show statistics
        companies = [p.get('current_company_name') for p in results if p.get('current_company_name')]
        locations = [p.get('city') for p in results if p.get('city')]

        print("\n📊 QUICK STATS:")
        print(f"   Total profiles: {len(results)}")
        print(f"   Unique companies: {len(set(companies))}")
        print(f"   Unique locations: {len(set(locations))}")

        if companies:
            top_companies = Counter(companies).most_common(3)
            print(f"   Top companies: {[comp for comp, count in top_companies]}")

    else:
        print("\n❌ No data retrieved")
        print("\n🔧 TROUBLESHOOTING STEPS:")
        print("1. Verify your API token is correct")
        print("2. Check that dataset_id matches your scraper")
        print("3. Make sure URLs are valid LinkedIn profiles")
        print("4. Try with a smaller number of URLs first")

        # Test API connectivity
        scraper = BrightDataLinkedInScraper(API_TOKEN, DATASET_ID)
        print("\n🧪 Testing API connectivity...")
        snapshots = scraper.get_snapshots("ready")
        if snapshots:
            snapshot_count = 0
            if isinstance(snapshots, dict) and snapshots.get('data'):
                snapshot_count = len(snapshots['data'])
            elif isinstance(snapshots, list):
                snapshot_count = len(snapshots)
            print(f"✅ API connection working - found {snapshot_count} ready snapshots")
        else:
            print("❌ API connection issues")

# Run the scraping workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

🔍 BRIGHT DATA LINKEDIN SCRAPER
Using OFFICIAL API endpoints from documentation

1️⃣ Triggering scraping for 2 profiles...
   1. https://www.linkedin.com/in/ketanarun/
   2. https://www.linkedin.com/in/aravind-srinivas-16051987/
🚀 Triggering scraping job...
API URL: https://api.brightdata.com/datasets/v3/trigger
Dataset ID: gd_l1viktl72bvl7bjuj0
URLs to scrape: 2
Response status: 200
✅ Scraping job triggered successfully!
Response: {'snapshot_id': 'sd_mfxrzseufz76qseoo'}
🎯 New job snapshot ID: sd_mfxrzseufz76qseoo

2️⃣ Waiting for the specific job to complete...
⏳ Waiting for specific job sd_mfxrzseufz76qseoo to complete...
💡 Will timeout after 120 seconds
Attempt 1: Status 202 (0s elapsed)
⏳ Job still processing...
Attempt 2: Status 202 (10s elapsed)
⏳ Job still processing...
Attempt 3: Status 202 (20s elapsed)
⏳ Job still processing...
Attempt 4: Status 200 (31s elapsed)
✅ Job sd_mfxrzseufz76qseoo is ready!

3️⃣ Downloading data from specific job: sd_mfxrzseufz76qseoo
📡 Downloading fr