In [10]:
from dotenv import load_dotenv
from typing import Optional, List, Tuple, Any
import os
import time
import requests

# Load variables from a local .env file into the process environment.
# override=True is passed so values from .env take precedence over variables
# that are already set.
load_dotenv(override=True)

# API credential, read from the environment rather than hard-coded.
# NOTE: os.getenv returns None when BRIGHTDATA_API_TOKEN is unset, in which
# case the Authorization header below becomes the literal "Bearer None".
BRIGHTDATA_TOKEN = os.getenv("BRIGHTDATA_API_TOKEN")

# Bright Data endpoint root and the dataset IDs used as defaults by the
# fetch_* helpers defined in the following cells.
BASE_URL = "https://api.brightdata.com"
INSTAGRAM_DATASET_ID = "gd_l1vikfch901nx3by4"
INSTAGRAM_POSTS_DATASET_ID = "gd_lk5ns7kz21pck8jpis"
INSTAGRAM_REELS_DATASET_ID = "gd_lyclm20il4r5helnj"

# Headers shared by every request issued from this notebook.
headers = {}
headers["Authorization"] = f"Bearer {BRIGHTDATA_TOKEN}"
headers["Content-Type"] = "application/json"


In [5]:
def fetch_instagram_profile_snapshot(
    user_name: str,
    dataset_id: str = INSTAGRAM_DATASET_ID,
    base_url: str = BASE_URL,
    poll_interval: int = 5,
    timeout: int = 300,
) -> Any:
    """
    Trigger an Instagram snapshot, wait until it is fully processed,
    and download the resulting JSON data.

    Parameters
    ----------
    user_name : str
        Instagram username without '@', e.g. "zoobarcelona".
    dataset_id : str
        Bright Data dataset ID for Instagram.
    base_url : str
        Base Bright Data API URL.
    poll_interval : int
        Time in seconds between progress checks.
    timeout : int
        Maximum time in seconds to wait for the snapshot to become ready
        and downloadable. The per-request HTTP timeouts (30s / 60s) are
        separate from this budget.

    Returns
    -------
    Any
        Parsed JSON body of the snapshot download response.

    Raises
    ------
    requests.HTTPError
        If the trigger, progress or download request returns an error status.
    RuntimeError
        If the trigger response has no ``snapshot_id`` or the snapshot
        reports status "failed" / "error".
    TimeoutError
        If the snapshot is not ready and downloadable within ``timeout`` seconds.
    """
    trigger_body = {
        "input": [{"user_name": user_name}],
    }

    trigger_params = {
        "dataset_id": dataset_id,
        "notify": "false",
        "include_errors": "true",
        "type": "discover_new",
        "discover_by": "user_name",
    }

    # 1. Trigger the snapshot creation
    trigger_resp = requests.post(
        f"{base_url}/datasets/v3/trigger",
        headers=headers,
        params=trigger_params,
        json=trigger_body,
        timeout=30,
    )
    # Printed before raise_for_status so the body of an error response is visible.
    print("Trigger raw text:", trigger_resp.text)
    trigger_resp.raise_for_status()

    trigger_json = trigger_resp.json()
    snapshot_id = trigger_json.get("snapshot_id")
    if not snapshot_id:
        raise RuntimeError(f"No snapshot_id in trigger response: {trigger_json}")

    print("Trigger response:", trigger_resp.status_code, trigger_json)
    print("Snapshot ID:", snapshot_id)

    # 2. Poll snapshot progress until it is ready
    progress_url = f"{base_url}/datasets/v3/progress/{snapshot_id}"
    # Monotonic clock: the deadline is not affected by system clock adjustments.
    deadline = time.monotonic() + timeout

    while True:
        r = requests.get(progress_url, headers=headers, timeout=30)
        print("Progress raw:", r.text[:300])
        r.raise_for_status()

        j = r.json()
        status = j.get("status")
        print("Current status:", status)

        if status in {"done", "completed", "ready"}:
            print("Snapshot ready!")
            break

        if status in {"failed", "error"}:
            raise RuntimeError(f"Snapshot failed with status={status}: {j}")

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            raise TimeoutError(f"Timeout waiting for snapshot {snapshot_id}, last status={status}")

        # Never sleep past the deadline, so `timeout` is honoured instead of
        # being overshot by up to one full poll interval.
        time.sleep(min(poll_interval, remaining))

    # 3. Download the snapshot contents as JSON
    download_url = f"{base_url}/datasets/v3/snapshot/{snapshot_id}"
    while True:
        resp = requests.get(
            download_url,
            headers=headers,
            params={"format": "json"},
            timeout=60,
        )
        resp.raise_for_status()

        # NOTE(review): Bright Data's download endpoint is documented to answer
        # 202 while the snapshot is still being built - confirm against the
        # current API reference. raise_for_status() does not raise on 202, so
        # without this check a "not ready" body would be parsed as the data.
        if resp.status_code != 202:
            break

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            raise TimeoutError(
                f"Timeout waiting for snapshot {snapshot_id} download, last HTTP status=202"
            )
        time.sleep(min(poll_interval, remaining))

    data = resp.json()
    print(f"Downloaded {len(data) if isinstance(data, list) else 'some'} records")

    return data


In [6]:
def fetch_instagram_posts_by_url(
    url: str,
    start_date: str,
    end_date: str,
    num_of_posts: Optional[int] = None,
    post_type: Optional[str] = None,
    posts_to_not_include: Optional[List[str]] = None,
    dataset_id: str = INSTAGRAM_POSTS_DATASET_ID,
    base_url: str = BASE_URL,
    poll_interval: int = 5,
    timeout: int = 300,
) -> Any:
    """
    Trigger an Instagram posts snapshot for a single profile URL, wait
    until it is fully processed, and download the resulting JSON data.

    Parameters
    ----------
    url : str
        Full Instagram profile URL, e.g. "https://www.instagram.com/meta/".
    start_date : str
        Start date filter for posts, as a string, e.g. "01-01-2025".
        (Use the same format as configured in your Bright Data dataset.)
    end_date : str
        End date filter for posts, e.g. "03-01-2025".
    num_of_posts : int, optional
        Maximum number of posts to fetch. If None, the field is omitted
        from the request so the dataset default is used.
    post_type : str, optional
        Post type filter, e.g. "Post", "Reel", or "" for all. If None, the
        field is omitted from the request.
    posts_to_not_include : list of str, optional
        List of post IDs that should be excluded. Omitted from the request
        when None or empty.
    dataset_id : str
        Bright Data dataset ID for Instagram posts.
    base_url : str
        Base Bright Data API URL.
    poll_interval : int
        Time in seconds between progress checks.
    timeout : int
        Maximum time in seconds to wait for the snapshot to become ready
        and downloadable. The per-request HTTP timeouts (30s / 60s) are
        separate from this budget.

    Returns
    -------
    Any
        Parsed JSON body of the snapshot download response.

    Raises
    ------
    requests.HTTPError
        If the trigger, progress or download request returns an error status.
    RuntimeError
        If the trigger response has no ``snapshot_id`` or the snapshot
        reports status "failed" / "error".
    TimeoutError
        If the snapshot is not ready and downloadable within ``timeout`` seconds.
    """
    input_payload: dict = {
        "url": url,
        "start_date": start_date,
        "end_date": end_date,
    }

    # Optional filters are only sent when the caller supplied them.
    if num_of_posts is not None:
        input_payload["num_of_posts"] = num_of_posts

    if post_type is not None:
        input_payload["post_type"] = post_type

    if posts_to_not_include:
        input_payload["posts_to_not_include"] = posts_to_not_include

    trigger_body = {
        "input": [input_payload],
    }

    trigger_params = {
        "dataset_id": dataset_id,
        "notify": "false",
        "include_errors": "true",
        "type": "discover_new",
        "discover_by": "url",
    }

    # 1. Trigger the snapshot creation
    trigger_resp = requests.post(
        f"{base_url}/datasets/v3/trigger",
        headers=headers,
        params=trigger_params,
        json=trigger_body,
        timeout=30,
    )
    # Printed before raise_for_status so the body of an error response is visible.
    print("Trigger raw text:", trigger_resp.text)
    trigger_resp.raise_for_status()

    trigger_json = trigger_resp.json()
    snapshot_id = trigger_json.get("snapshot_id")
    if not snapshot_id:
        raise RuntimeError(
            f"No snapshot_id returned by Bright Data. Response: {trigger_json}"
        )

    print("Trigger response status:", trigger_resp.status_code)
    print("Snapshot ID:", snapshot_id)

    # 2. Poll snapshot progress until it is ready
    progress_url = f"{base_url}/datasets/v3/progress/{snapshot_id}"
    # Monotonic clock: the deadline is not affected by system clock adjustments.
    deadline = time.monotonic() + timeout

    while True:
        progress_resp = requests.get(progress_url, headers=headers, timeout=30)
        print("Progress raw:", progress_resp.text[:300])
        progress_resp.raise_for_status()

        progress_json = progress_resp.json()
        status = progress_json.get("status")
        print("Current status:", status)

        if status in {"done", "completed", "ready"}:
            print("Snapshot is ready!")
            break

        if status in {"failed", "error"}:
            raise RuntimeError(
                f"Snapshot failed with status='{status}'. Response: {progress_json}"
            )

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            raise TimeoutError(
                f"Snapshot {snapshot_id} timed out after {timeout}s. Last status: {status}"
            )

        # Never sleep past the deadline, so `timeout` is honoured instead of
        # being overshot by up to one full poll interval.
        time.sleep(min(poll_interval, remaining))

    # 3. Download the snapshot contents as JSON
    download_url = f"{base_url}/datasets/v3/snapshot/{snapshot_id}"
    while True:
        download_resp = requests.get(
            download_url,
            headers=headers,
            params={"format": "json"},
            timeout=60,
        )
        download_resp.raise_for_status()

        # NOTE(review): Bright Data's download endpoint is documented to answer
        # 202 while the snapshot is still being built - confirm against the
        # current API reference. raise_for_status() does not raise on 202, so
        # without this check a "not ready" body would be parsed as the data.
        if download_resp.status_code != 202:
            break

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            raise TimeoutError(
                f"Snapshot {snapshot_id} was not downloadable within {timeout}s (HTTP 202)."
            )
        time.sleep(min(poll_interval, remaining))

    data = download_resp.json()
    print(
        f"Downloaded {len(data) if isinstance(data, list) else 'unknown number of'} records."
    )

    return data

In [None]:
def fetch_instagram_reels_by_url(
    url: str,
    start_date: str = "",
    end_date: str = "",
    dataset_id: str = INSTAGRAM_REELS_DATASET_ID,
    base_url: str = BASE_URL,
    poll_interval: int = 5,
    timeout: int = 300,
) -> Any:
    """
    Trigger an Instagram Reels snapshot for a single profile URL, wait
    until it is fully processed, and download the resulting JSON data.

    Parameters
    ----------
    url : str
        Full Instagram profile URL, e.g. "https://www.instagram.com/espn".
    start_date : str
        Optional start date filter for reels. Format must match the dataset.
        If empty string, no filter is applied.
    end_date : str
        Optional end date filter for reels. If empty string, no filter is applied.
    dataset_id : str
        Bright Data dataset ID for Instagram Reels.
    base_url : str
        Base Bright Data API URL.
    poll_interval : int
        Time in seconds between progress checks.
    timeout : int
        Maximum time in seconds to wait for the snapshot to become ready
        and downloadable. The per-request HTTP timeouts (30s / 60s) are
        separate from this budget.

    Returns
    -------
    Any
        Parsed JSON body of the snapshot download response.

    Raises
    ------
    requests.HTTPError
        If the trigger, progress or download request returns an error status.
    RuntimeError
        If the trigger response has no ``snapshot_id`` or the snapshot
        reports status "failed" / "error".
    TimeoutError
        If the snapshot is not ready and downloadable within ``timeout`` seconds.
    """
    # Both date fields are always sent, even when they are empty strings.
    input_payload = {
        "url": url,
        "start_date": start_date,
        "end_date": end_date,
    }

    trigger_body = {
        "input": [input_payload],
    }

    trigger_params = {
        "dataset_id": dataset_id,
        "notify": "false",
        "include_errors": "true",
        "type": "discover_new",
        "discover_by": "url",
    }

    # 1. Trigger the snapshot
    trigger_resp = requests.post(
        f"{base_url}/datasets/v3/trigger",
        headers=headers,
        params=trigger_params,
        json=trigger_body,
        timeout=30,
    )
    # Printed before raise_for_status so the body of an error response is visible.
    print("Trigger raw text:", trigger_resp.text)
    trigger_resp.raise_for_status()

    trigger_json = trigger_resp.json()
    snapshot_id = trigger_json.get("snapshot_id")
    if not snapshot_id:
        raise RuntimeError(
            f"No snapshot_id returned by Bright Data. Response: {trigger_json}"
        )

    print("Trigger response status:", trigger_resp.status_code)
    print("Snapshot ID:", snapshot_id)

    # 2. Poll until ready
    progress_url = f"{base_url}/datasets/v3/progress/{snapshot_id}"
    # Monotonic clock: the deadline is not affected by system clock adjustments.
    deadline = time.monotonic() + timeout

    while True:
        progress_resp = requests.get(progress_url, headers=headers, timeout=30)
        print("Progress raw:", progress_resp.text[:300])
        progress_resp.raise_for_status()

        progress_json = progress_resp.json()
        status = progress_json.get("status")
        print("Current status:", status)

        if status in {"done", "completed", "ready"}:
            print("Snapshot is ready!")
            break

        if status in {"failed", "error"}:
            raise RuntimeError(
                f"Snapshot failed with status='{status}'. Response: {progress_json}"
            )

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            raise TimeoutError(
                f"Snapshot {snapshot_id} timed out after {timeout}s. Last status: {status}"
            )

        # Never sleep past the deadline, so `timeout` is honoured instead of
        # being overshot by up to one full poll interval.
        time.sleep(min(poll_interval, remaining))

    # 3. Download the snapshot contents
    download_url = f"{base_url}/datasets/v3/snapshot/{snapshot_id}"
    while True:
        download_resp = requests.get(
            download_url,
            headers=headers,
            params={"format": "json"},
            timeout=60,
        )
        download_resp.raise_for_status()

        # NOTE(review): Bright Data's download endpoint is documented to answer
        # 202 while the snapshot is still being built - confirm against the
        # current API reference. raise_for_status() does not raise on 202, so
        # without this check a "not ready" body would be parsed as the data.
        if download_resp.status_code != 202:
            break

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            raise TimeoutError(
                f"Snapshot {snapshot_id} was not downloadable within {timeout}s (HTTP 202)."
            )
        time.sleep(min(poll_interval, remaining))

    data = download_resp.json()
    print(
        f"Downloaded {len(data) if isinstance(data, list) else 'unknown number of'} records."
    )

    return data


In [None]:
# Profile-level snapshot for a sample account. The call blocks while it
# triggers the snapshot, polls its progress and downloads the result.
data = fetch_instagram_profile_snapshot(user_name="zoobarcelona")


In [9]:
# Example query: regular posts from the @meta profile over a fixed window
# of roughly three months (dates are MM-DD-YYYY strings).
posts_data = fetch_instagram_posts_by_url(
    "https://www.instagram.com/meta/",
    "08-21-2025",
    "11-21-2025",
    post_type="Post",
)
posts_data

Trigger raw text: {"snapshot_id":"sd_mi8os4es253rl0tzzf"}
Trigger response status: 200
Snapshot ID: sd_mi8os4es253rl0tzzf
Progress raw: {"status":"running","snapshot_id":"sd_mi8os4es253rl0tzzf","dataset_id":"gd_lk5ns7kz21pck8jpis"}
Current status: running
Progress raw: {"status":"running","snapshot_id":"sd_mi8os4es253rl0tzzf","dataset_id":"gd_lk5ns7kz21pck8jpis"}
Current status: running
Progress raw: {"status":"running","snapshot_id":"sd_mi8os4es253rl0tzzf","dataset_id":"gd_lk5ns7kz21pck8jpis"}
Current status: running
Progress raw: {"status":"running","snapshot_id":"sd_mi8os4es253rl0tzzf","dataset_id":"gd_lk5ns7kz21pck8jpis"}
Current status: running
Progress raw: {"status":"ready","snapshot_id":"sd_mi8os4es253rl0tzzf","dataset_id":"gd_lk5ns7kz21pck8jpis","records":9,"errors":0,"collection_duration":18739}
Current status: ready
Snapshot is ready!
Downloaded 9 records.


[{'url': 'https://www.instagram.com/p/DQEyuMKDoqk',
  'user_posted': 'oakleymeta',
  'description': 'Oakley Meta Vanguard is here. Link in bio to order the full collection.',
  'num_comments': 328,
  'date_posted': '2025-10-21T14:37:11.000Z',
  'likes': 6050,
  'photos': ['https://scontent-dfw5-2.cdninstagram.com/v/t51.2885-15/567924795_17900581518295039_5438479003989676902_n.jpg?stp=dst-jpg_e35_p1080x1080_sh0.08_tt6&_nc_ht=scontent-dfw5-2.cdninstagram.com&_nc_cat=107&_nc_oc=Q6cZ2QGX1BPlHSfYVhAe2U8fSt_3_E2TRQR1gPwPkkYZgzVqMpKrNhJmqS5r5aFSjC6xskA&_nc_ohc=ArVVSflhkDgQ7kNvwHIlxlg&_nc_gid=FHt0jn5fY0O4wOk1CaCr2A&edm=ANTKIIoBAAAA&ccb=7-5&oh=00_AfiOB253DoRtAGEC4vSSR6wXVDO3h8997QMM14uxC6GpYA&oe=69262207&_nc_sid=d885a2',
   'https://scontent-dfw5-2.cdninstagram.com/v/t51.2885-15/565326631_17900581533295039_3928103440686233608_n.jpg?stp=dst-jpg_e35_p1080x1080_sh0.08_tt6&_nc_ht=scontent-dfw5-2.cdninstagram.com&_nc_cat=107&_nc_oc=Q6cZ2QGX1BPlHSfYVhAe2U8fSt_3_E2TRQR1gPwPkkYZgzVqMpKrNhJmqS5r5aFSjC6x