In [None]:
!pip install langchain-brightdata

Collecting langchain-brightdata
  Downloading langchain_brightdata-0.1.3-py3-none-any.whl.metadata (6.0 kB)
Downloading langchain_brightdata-0.1.3-py3-none-any.whl (12 kB)
Installing collected packages: langchain-brightdata
Successfully installed langchain-brightdata-0.1.3


In [None]:

import requests
import json
import time
from datetime import datetime

def test_brightdata_direct_api(url, dataset_id, api_key):
    """
    Test Bright Data's direct API endpoint (not LangChain).

    Posts ``[{"url": url}]`` to the synchronous ``/datasets/v3/scrape``
    endpoint first. If that call errors out or returns a non-2xx status,
    the same payload is posted to the asynchronous ``/datasets/v3/trigger``
    endpoint. Whatever the last stage produced (data, trigger response or
    error text) is written to disk with ``save_results``.

    Args:
        url: Page URL to scrape.
        dataset_id: Bright Data dataset identifier sent as ``dataset_id``.
        api_key: Token sent in the ``Authorization: Bearer`` header.

    Returns:
        Whatever ``save_results`` returns: the name of the JSON file that
        was written, or ``None`` when the file could not be saved.
    """
    # API endpoints
    scrape_endpoint = "https://api.brightdata.com/datasets/v3/scrape"
    trigger_endpoint = "https://api.brightdata.com/datasets/v3/trigger"

    headers = {
        'Authorization': f'Bearer {api_key}',
        'Content-Type': 'application/json'
    }

    payload = [{"url": url}]

    # Both endpoints take the same query parameters, so build them once.
    params = {
        'dataset_id': dataset_id,
        'format': 'json'
    }

    print("🚀 Testing Bright Data Direct API")
    print(f"🔗 URL: {url}")
    print(f"📊 Dataset ID: {dataset_id}")
    print("=" * 60)

    # Method 1: Try synchronous scraping
    print("\n🧪 Method 1: Synchronous Scraping")
    try:
        print("⏳ Sending request...")
        # NOTE(review): Bright Data documents a ~1 minute server-side limit
        # for /scrape, after which it answers 202 with a snapshot_id. A 60s
        # client timeout races that limit (a previous run died with
        # "Read timed out"), so wait a little longer than the server does.
        # Confirm the limit against the current API docs.
        response = requests.post(
            scrape_endpoint,
            headers=headers,
            params=params,
            json=payload,
            timeout=90
        )

        print(f"📊 Status Code: {response.status_code}")
        print(f"📏 Response Size: {len(response.content)} bytes")

        if response.status_code == 202:
            # The job is still running server-side; the body is a pointer to
            # the snapshot, not scraped data. Save it as an asynchronous
            # result instead of mislabelling it, and do not trigger a second
            # (duplicate) collection below.
            pending = response.json()
            print("⏳ Scrape still in progress - continuing as an asynchronous job")
            if isinstance(pending, dict) and 'snapshot_id' in pending:
                print(f"📋 Snapshot ID: {pending['snapshot_id']}")
                print("💡 Use this ID to retrieve results later")
            return save_results(url, dataset_id, pending, "asynchronous")

        if response.ok:
            data = response.json()
            print("✅ Synchronous scraping successful!")
            return save_results(url, dataset_id, data, "synchronous")

        print(f"❌ Synchronous failed: {response.text}")

    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"❌ Synchronous error: {str(e)}")

    # Method 2: Try asynchronous trigger
    print("\n🧪 Method 2: Asynchronous Trigger")
    try:
        print("⏳ Triggering collection...")
        response = requests.post(
            trigger_endpoint,
            headers=headers,
            params=params,
            json=payload,
            timeout=30
        )

        print(f"📊 Status Code: {response.status_code}")

        if not response.ok:
            print(f"❌ Trigger failed: {response.text}")
            return save_results(url, dataset_id, {"error": response.text}, "error")

        result = response.json()
        print("✅ Collection triggered successfully!")
        print(f"📄 Response: {result}")

        if 'snapshot_id' in result:
            snapshot_id = result['snapshot_id']
            print(f"📋 Snapshot ID: {snapshot_id}")
            print("💡 Use this ID to retrieve results later")
            return save_results(url, dataset_id, result, "asynchronous")

        return save_results(url, dataset_id, result, "trigger_response")

    except (requests.RequestException, ValueError) as e:
        print(f"❌ Trigger error: {str(e)}")
        return save_results(url, dataset_id, {"error": str(e)}, "error")

def save_results(url, dataset_id, data, method):
    """
    Save results to a JSON file in the current working directory.

    The filename has the form
    ``brightdata_<method>_<sanitised url>_<YYYYmmdd_HHMMSS>.json``. The URL
    part has its scheme removed and every character that is not a word
    character or ``-`` replaced by ``_``, so ports, query strings and
    fragments cannot produce an invalid filename. It is also capped at 100
    characters so very long URLs stay within filesystem name limits.

    Args:
        url: The URL that was scraped; stored verbatim in the file.
        dataset_id: Dataset identifier; stored verbatim in the file.
        data: JSON-serialisable payload to store under ``"data"``.
        method: Short label for how the data was obtained; used in both the
            filename and the saved record.

    Returns:
        The filename written, or ``None`` if writing failed.
    """
    import re  # local import so this cell does not depend on a new top-level import

    # Take the time once so the filename and the "scraped_at" field agree.
    now = datetime.now()
    timestamp = now.strftime("%Y%m%d_%H%M%S")

    without_scheme = url.replace('https://', '').replace('http://', '')
    # Per-character replacement keeps names for plain URLs identical to the
    # previous '/' and '.' substitution while also neutralising ':', '?', '&', ...
    url_clean = re.sub(r'[^\w-]', '_', without_scheme)[:100]
    filename = f"brightdata_{method}_{url_clean}_{timestamp}.json"

    result_data = {
        "url": url,
        "dataset_id": dataset_id,
        "method": method,
        "scraped_at": now.isoformat(),
        "scraper": "Bright Data Direct API",
        "data": data
    }

    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(result_data, f, indent=2, ensure_ascii=False)

        print(f"💾 Results saved to: {filename}")
        return filename
    except Exception as e:
        # Best effort: report the failure and let the caller see None.
        print(f"❌ Failed to save: {str(e)}")
        return None

def test_dataset_info(dataset_id, api_key):
    """
    Probe a few candidate endpoints for information about a dataset.

    Each candidate URL is requested in order with a bearer-token header and
    a 10 second timeout. The JSON body of the first response with a
    successful status is printed and returned. A failed or erroring
    candidate is reported and the next one is tried.

    Returns the decoded JSON of the first successful response, or ``None``
    when no candidate succeeded.
    """
    print(f"\n🔍 Testing Dataset Info: {dataset_id}")

    # Candidate info endpoints, tried in this order
    api_root = "https://api.brightdata.com/datasets"
    candidate_urls = (
        f"{api_root}/v3/dataset/{dataset_id}",
        f"{api_root}/v3/info?dataset_id={dataset_id}",
        f"{api_root}/{dataset_id}/info",
    )

    auth_headers = {
        'Authorization': f'Bearer {api_key}',
        'Content-Type': 'application/json',
    }

    for candidate in candidate_urls:
        try:
            print(f"📡 Trying: {candidate}")
            reply = requests.get(candidate, headers=auth_headers, timeout=10)
            print(f"   Status: {reply.status_code}")

            if not reply.ok:
                # Show only the start of the body; error pages can be long.
                print(f"   ❌ {reply.text[:100]}")
                continue

            details = reply.json()
            print("   ✅ Dataset info retrieved!")
            print(f"   📄 Info: {json.dumps(details, indent=2)}")
            return details

        except Exception as err:
            print(f"   ❌ Error: {str(err)}")

    return None

def main():
    """
    Main function to test Bright Data.

    Reads the API key from the ``BRIGHTDATA_API_KEY`` environment variable
    so the secret never has to be written into the notebook. Runs the
    dataset-info probe, then the scrape test, and prints a summary with
    follow-up hints.
    """
    import os  # local import so this cell does not depend on a new top-level import

    # Configuration
    URL = "https://gemengserv.com"
    DATASET_ID = "gd_m6gjtfmeh43we6cqc"  # From your original URL
    API_KEY = os.environ.get("BRIGHTDATA_API_KEY", "")

    print("🌟 Bright Data Direct API Test")
    print("=" * 60)

    if not API_KEY:
        # Keep going (the run still shows how the API responds) but make the
        # likely cause of the failures obvious.
        print("⚠️ BRIGHTDATA_API_KEY is not set - requests will be sent without a valid API key")

    # Test dataset info first (the function prints what it finds)
    test_dataset_info(DATASET_ID, API_KEY)

    # Test scraping
    result_file = test_brightdata_direct_api(URL, DATASET_ID, API_KEY)

    print("\n" + "=" * 60)
    print("📋 TEST SUMMARY")
    print("=" * 60)

    if result_file:
        print(f"✅ Test completed - check {result_file}")
        print("\n💡 Next steps:")
        print("   - If you got a snapshot_id, wait a few minutes then retrieve results")
        print("   - Check your Bright Data dashboard for collection status")
        print("   - Verify your dataset supports the target website")
    else:
        print("❌ Test failed - no results saved")
        print("\n🔧 Troubleshooting:")
        print("   - Verify your API key is correct")
        print("   - Check if dataset_id is valid for your account")
        print("   - Ensure the website is supported by your dataset")
        print("   - Check if you have sufficient credits")

if __name__ == "__main__":
    main()

🌟 Bright Data Direct API Test

🔍 Testing Dataset Info: gd_m6gjtfmeh43we6cqc
📡 Trying: https://api.brightdata.com/datasets/v3/dataset/gd_m6gjtfmeh43we6cqc
   Status: 404
   ❌ <!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Error</title>
</head>
<body>
<
📡 Trying: https://api.brightdata.com/datasets/v3/info?dataset_id=gd_m6gjtfmeh43we6cqc
   Status: 404
   ❌ <!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Error</title>
</head>
<body>
<
📡 Trying: https://api.brightdata.com/datasets/gd_m6gjtfmeh43we6cqc/info
   Status: 404
   ❌ <!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Error</title>
</head>
<body>
<
🚀 Testing Bright Data Direct API
🔗 URL: https://gemengserv.com
📊 Dataset ID: gd_m6gjtfmeh43we6cqc

🧪 Method 1: Synchronous Scraping
⏳ Sending request...
❌ Synchronous error: HTTPSConnectionPool(host='api.brightdata.com', port=443): Read timed out. (read timeout=60)

🧪 Method 2: Asynchronous Trigger
⏳ Triggering collection..

In [None]:
# Run the script above first and wait for it to print a snapshot ID, then run the script below with that ID.

In [None]:
import requests
import json
import time
from datetime import datetime

def retrieve_scraping_results(snapshot_id, api_key, max_wait_minutes=10, poll_interval_seconds=30):
    """
    Retrieve results from Bright Data using snapshot ID.

    Phase 1 polls ``/datasets/v3/progress/<snapshot_id>`` until the
    collection reports completion or failure, the wait budget runs out, or
    the status check itself fails. Phase 2 then tries a list of download
    URLs in order and saves the first successful response through
    ``save_results``.

    Args:
        snapshot_id: Snapshot identifier returned when the collection was
            triggered.
        api_key: Token sent in the ``Authorization: Bearer`` header.
        max_wait_minutes: Upper bound on how long to keep polling.
        poll_interval_seconds: Pause between polls while the collection is
            still running.

    Returns:
        The filename returned by ``save_results`` for the downloaded data
        (or for the failure status), or ``None`` if nothing could be
        downloaded.
    """
    print(f"🔍 Retrieving results for snapshot: {snapshot_id}")
    print(f"⏰ Started at: {datetime.now()}")
    print("=" * 60)

    # Endpoint that reports collection progress (not the data itself)
    progress_endpoint = f"https://api.brightdata.com/datasets/v3/progress/{snapshot_id}"

    headers = {
        'Authorization': f'Bearer {api_key}',
        'Content-Type': 'application/json'
    }

    start_time = time.time()
    max_wait_seconds = max_wait_minutes * 60

    while True:
        elapsed_time = time.time() - start_time

        if elapsed_time > max_wait_seconds:
            print(f"⏰ Timeout after {max_wait_minutes} minutes")
            break

        print(f"📡 Checking status... ({elapsed_time:.0f}s elapsed)")

        try:
            response = requests.get(progress_endpoint, headers=headers, timeout=30)
            if not response.ok:
                print(f"❌ Status check failed: {response.status_code} - {response.text}")
                break
            status_data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print(f"❌ Error checking status: {str(e)}")
            break

        print(f"📊 Status: {json.dumps(status_data, indent=2)}")

        status = status_data.get('status') if isinstance(status_data, dict) else None

        if status == 'running':
            print(f"⏳ Still processing... waiting {poll_interval_seconds} seconds")
            time.sleep(poll_interval_seconds)
            continue

        if status in ('ready', 'done'):
            # The API reports a finished snapshot as "ready" (seen in a real
            # run); "done" is kept for backward compatibility.
            print("✅ Collection completed!")
        elif status == 'failed':
            print("❌ Collection failed!")
            return save_results(snapshot_id, status_data, "failed")
        else:
            # Unknown or missing status: try to download results anyway
            print("🔄 Attempting to download results...")
        break

    # Try to download the actual data
    print("\n📥 Downloading scraped data...")

    # Try different download endpoints
    download_urls = [
        f"https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}?format=json",
        f"https://api.brightdata.com/datasets/v3/download/{snapshot_id}",
        f"https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}"
    ]

    for download_url in download_urls:
        print(f"🔗 Trying: {download_url}")

        # Handle errors per URL so one bad endpoint does not stop the rest
        # from being tried.
        try:
            response = requests.get(download_url, headers=headers, timeout=60)
        except requests.RequestException as e:
            print(f"❌ Download error: {str(e)}")
            continue

        if response.status_code == 202:
            # NOTE(review): Bright Data appears to answer 202 while the
            # snapshot is still being built; the body is then a status
            # message, not data, so it must not be saved as "completed".
            # Confirm against the API docs.
            print(f"   ⏳ Snapshot not ready yet: {response.text[:100]}")
            break

        if not response.ok:
            print(f"   ❌ Failed: {response.status_code} - {response.text[:100]}")
            continue

        print(f"✅ Download successful! ({len(response.content)} bytes)")

        # Try to parse as JSON
        try:
            scraped_data = response.json()
        except ValueError:
            # Save as text if not JSON
            print("📄 Non-JSON data retrieved")
            return save_results(snapshot_id, response.text, "completed_text")

        print("📄 JSON data retrieved successfully!")
        return save_results(snapshot_id, scraped_data, "completed")

    return None

def save_results(snapshot_id, data, status):
    """
    Write retrieved results to a timestamped JSON file and print a preview.

    The file is named
    ``brightdata_results_<snapshot_id>_<status>_<YYYYmmdd_HHMMSS>.json`` and
    holds the snapshot id, the status label, the retrieval time and the data.

    After a successful write a short preview is printed. For a dict, the
    preview appears only when the text ``data`` occurs in its string form,
    and is cut at 500 characters when that string form is longer than 500.
    For a non-empty list, the item count is printed, plus the keys of the
    first item when it is a dict.

    Returns the filename, or ``None`` if anything in the save or preview
    step raised.
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"brightdata_results_{snapshot_id}_{status}_{stamp}.json"

    record = dict(
        snapshot_id=snapshot_id,
        status=status,
        retrieved_at=datetime.now().isoformat(),
        data=data,
    )

    try:
        with open(filename, 'w', encoding='utf-8') as out:
            json.dump(record, out, indent=2, ensure_ascii=False)

        print(f"💾 Results saved to: {filename}")

        # Show preview of data
        if isinstance(data, dict):
            if 'data' in str(data):
                print("\n📋 Data Preview:")
                pretty = json.dumps(data, indent=2)
                if len(str(data)) > 500:
                    print(pretty[:500] + "...")
                else:
                    print(pretty)
        elif isinstance(data, list) and data:
            print(f"\n📋 Retrieved {len(data)} items")
            first_item = data[0]
            if isinstance(first_item, dict):
                print(f"📄 Sample item keys: {list(first_item.keys())}")

    except Exception as err:
        print(f"❌ Failed to save: {str(err)}")
        return None

    return filename

def main():
    """
    Main function to retrieve results.

    The API key is read from the ``BRIGHTDATA_API_KEY`` environment variable
    instead of being written into the source, so the secret is not stored or
    shared with the notebook. The snapshot ID can be overridden with
    ``BRIGHTDATA_SNAPSHOT_ID`` and otherwise defaults to the ID from the
    previous run.
    """
    import os  # local import so this cell does not depend on a new top-level import

    # Configuration
    SNAPSHOT_ID = os.environ.get("BRIGHTDATA_SNAPSHOT_ID", "s_mfba6mg9u5klm6dae")  # From your previous run
    API_KEY = os.environ.get("BRIGHTDATA_API_KEY", "")

    print("🌟 Bright Data Results Retriever")
    print("=" * 60)

    if not API_KEY:
        # Without a key every request would be rejected, so stop early with
        # a clear instruction instead of a series of HTTP errors.
        print("❌ No API key configured")
        print("   - Set the BRIGHTDATA_API_KEY environment variable and run again")
        return

    # Retrieve results
    result_file = retrieve_scraping_results(SNAPSHOT_ID, API_KEY, max_wait_minutes=5)

    print("\n" + "=" * 60)
    print("📋 RETRIEVAL SUMMARY")
    print("=" * 60)

    if result_file:
        print("✅ Results retrieved successfully!")
        print(f"📄 Check file: {result_file}")
        print("\n🎉 You now have the scraped data from gemengserv.com!")
    else:
        print("❌ Failed to retrieve results")
        print("\n💡 Troubleshooting:")
        print("   - The scraping might still be in progress")
        print("   - Check your Bright Data dashboard")
        print("   - Try again in a few minutes")
        print(f"   - Use snapshot ID: {SNAPSHOT_ID}")

if __name__ == "__main__":
    main()

🌟 Bright Data Results Retriever
🔍 Retrieving results for snapshot: s_mfba6mg9u5klm6dae
⏰ Started at: 2025-09-08 15:38:52.872854
📡 Checking status... (0s elapsed)
📊 Status: {
  "status": "ready",
  "snapshot_id": "s_mfba6mg9u5klm6dae",
  "dataset_id": "gd_m6gjtfmeh43we6cqc",
  "records": 1,
  "errors": 0,
  "collection_duration": 53411
}
🔄 Attempting to download results...

📥 Downloading scraped data...
🔗 Trying: https://api.brightdata.com/datasets/v3/snapshot/s_mfba6mg9u5klm6dae?format=json
✅ Download successful! (384284 bytes)
📄 JSON data retrieved successfully!
💾 Results saved to: brightdata_results_s_mfba6mg9u5klm6dae_completed_20250908_153855.json

📋 Retrieved 1 items
📄 Sample item keys: ['markdown', 'url', 'html2text', 'page_html', 'ld_json', 'page_title', 'timestamp', 'input']

📋 RETRIEVAL SUMMARY
✅ Results retrieved successfully!
📄 Check file: brightdata_results_s_mfba6mg9u5klm6dae_completed_20250908_153855.json

🎉 You now have the scraped data from gemengserv.com!
