# Instagram Influencer Quality Analysis Notebook

This notebook is used to collect and analyze real Instagram data for evaluating influencer quality.  
It pulls information directly from Bright Data’s scraping datasets and allows you to inspect profiles, posts, reels and comments at scale.

The goal of this notebook is to:
- retrieve fresh data for any Instagram creator,
- analyze posting patterns, engagement and audience signals,
- compare influencers across multiple metrics,
- and validate whether their content looks authentic or artificially boosted.

All scraping calls go through Bright Data’s Dataset API, and the helper functions below provide a clean interface for triggering snapshots and downloading results.


In [None]:
from dotenv import load_dotenv
from typing import Optional, List, Any, Dict
import os
import time
import requests

# Load environment variables through python-dotenv before the token is read.
# NOTE(review): override=True presumably lets .env values win over variables
# already set in the process environment — confirm against python-dotenv docs.
load_dotenv(override=True)

# API token taken from the environment; None when the variable is not set.
BRIGHTDATA_TOKEN = os.environ.get("BRIGHTDATA_API_TOKEN")

# Root URL used to build every Bright Data endpoint in this notebook.
BASE_URL = "https://api.brightdata.com"

# Dataset IDs used as defaults by the fetch_* helpers (profiles, posts,
# reels, and comments respectively).
INSTAGRAM_DATASET_ID = "gd_l1vikfch901nx3by4"
INSTAGRAM_POSTS_DATASET_ID = "gd_lk5ns7kz21pck8jpis"
INSTAGRAM_REELS_DATASET_ID = "gd_lyclm20il4r5helnj"
INSTAGRAM_COMMENTS_DATASET_ID = "gd_ltppn085pokosxh13"

# Headers attached to every request made by the snapshot helper.
headers = {
    "Authorization": f"Bearer {BRIGHTDATA_TOKEN}",
    "Content-Type": "application/json",
}


## Bright Data Helper Functions for Instagram Scraping

This section defines a small set of reusable helper functions that interact with Bright Data’s Instagram datasets.  
Each function triggers a snapshot, polls until it is ready, and then downloads the final JSON results.

Included functions:
- `fetch_instagram_profile_snapshot()` — collect profile metadata by username  
- `fetch_instagram_posts_by_url()` — collect posts within a date range  
- `fetch_instagram_reels_by_url()` — collect Reels within a date range  
- `fetch_instagram_comments_by_url()` — collect comments for a specific post or reel  

All functions use a shared internal helper `_run_brightdata_snapshot()` to avoid duplication and keep the logic consistent across datasets.


In [None]:
def _run_brightdata_snapshot(
    trigger_body: Dict[str, Any],
    trigger_params: Dict[str, Any],
    base_url: str,
    poll_interval: int,
    timeout: int,
) -> Any:
    """
    Internal helper to trigger a Bright Data snapshot, poll for completion,
    and download the resulting JSON data.

    Requests are authenticated with the module-level ``headers`` dict.

    Parameters
    ----------
    trigger_body : dict
        JSON body sent to the trigger endpoint.
    trigger_params : dict
        Query-string parameters sent to the trigger endpoint.
    base_url : str
        Base Bright Data API URL.
    poll_interval : int
        Time in seconds between progress checks.
    timeout : int
        Maximum time in seconds to wait for the snapshot to finish.

    Returns
    -------
        data - parsed JSON from the snapshot download.

    Raises
    ------
    RuntimeError
        If the trigger response has no ``snapshot_id`` or the snapshot
        reports a "failed" / "error" status.
    TimeoutError
        If the snapshot is not ready within ``timeout`` seconds.
    requests.HTTPError
        If any of the HTTP calls returns an error status.
    """
    # 1. Trigger the snapshot
    trigger_resp = requests.post(
        f"{base_url}/datasets/v3/trigger",
        headers=headers,
        params=trigger_params,
        json=trigger_body,
        timeout=30,
    )
    # Print the raw body before raise_for_status() so that the API's error
    # message is visible even when the request fails.
    print("Trigger raw text:", trigger_resp.text)
    trigger_resp.raise_for_status()

    trigger_json = trigger_resp.json()
    snapshot_id = trigger_json.get("snapshot_id")
    if not snapshot_id:
        raise RuntimeError(f"No snapshot_id in trigger response: {trigger_json}")

    print("Trigger response status:", trigger_resp.status_code)
    print("Snapshot ID:", snapshot_id)

    # 2. Poll snapshot progress until it is ready
    progress_url = f"{base_url}/datasets/v3/progress/{snapshot_id}"
    # A monotonic clock keeps the deadline immune to system clock changes.
    deadline = time.monotonic() + timeout

    while True:
        progress_resp = requests.get(progress_url, headers=headers, timeout=30)
        print("Progress raw:", progress_resp.text[:300])
        progress_resp.raise_for_status()

        progress_json = progress_resp.json()
        status = progress_json.get("status")
        print("Current status:", status)

        if status in {"done", "completed", "ready"}:
            print("Snapshot is ready!")
            break

        if status in {"failed", "error"}:
            raise RuntimeError(
                f"Snapshot failed with status='{status}'. Response: {progress_json}"
            )

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            raise TimeoutError(
                f"Snapshot {snapshot_id} timed out after {timeout}s. Last status: {status}"
            )

        # Never sleep past the deadline: the last wait is shortened so the
        # overall timeout is honoured instead of overshooting by poll_interval.
        time.sleep(min(poll_interval, remaining))

    # 3. Download the snapshot contents as JSON
    download_url = f"{base_url}/datasets/v3/snapshot/{snapshot_id}"
    download_resp = requests.get(
        download_url,
        headers=headers,
        params={"format": "json"},
        timeout=60,
    )
    download_resp.raise_for_status()

    data = download_resp.json()
    print(
        f"Downloaded {len(data) if isinstance(data, list) else 'unknown number of'} records."
    )

    return data


def fetch_instagram_profile_snapshot(
    user_name: str,
    dataset_id: str = INSTAGRAM_DATASET_ID,
    base_url: str = BASE_URL,
    poll_interval: int = 5,
    timeout: int = 300,
):
    """
    Collect an Instagram profile snapshot for one username and return the
    downloaded JSON once Bright Data has finished processing it.

    Parameters
    ----------
    user_name : str
        Instagram username without '@', e.g. "zoobarcelona".
    dataset_id : str
        Bright Data dataset ID for Instagram.
    base_url : str
        Base Bright Data API URL.
    poll_interval : int
        Time in seconds between progress checks.
    timeout : int
        Maximum time in seconds to wait for snapshot to finish.

    Returns
    -------
        data - parsed JSON from the snapshot.
    """
    # Query-string options for the trigger call: discovery keyed on username.
    query = dict(
        dataset_id=dataset_id,
        notify="false",
        include_errors="true",
        type="discover_new",
        discover_by="user_name",
    )

    # The shared helper does the trigger / poll / download cycle.
    return _run_brightdata_snapshot(
        trigger_body={"input": [{"user_name": user_name}]},
        trigger_params=query,
        base_url=base_url,
        poll_interval=poll_interval,
        timeout=timeout,
    )


def fetch_instagram_posts_by_url(
    url: str,
    start_date: str,
    end_date: str,
    num_of_posts: Optional[int] = None,
    post_type: Optional[str] = None,
    posts_to_not_include: Optional[List[str]] = None,
    dataset_id: str = INSTAGRAM_POSTS_DATASET_ID,
    base_url: str = BASE_URL,
    poll_interval: int = 5,
    timeout: int = 300,
) -> Any:
    """
    Collect the posts of a single Instagram profile within a date range and
    return the downloaded JSON once the snapshot has been processed.

    Parameters
    ----------
    url : str
        Full Instagram profile URL, e.g. "https://www.instagram.com/meta/".
    start_date : str
        Start date filter for posts, as a string, e.g. "01-01-2025".
        (Use the same format as configured in your Bright Data dataset.)
    end_date : str
        End date filter for posts, e.g. "03-01-2025".
    num_of_posts : int, optional
        Maximum number of posts to fetch. Omitted from the request when None.
    post_type : str, optional
        Post type filter, e.g. "Post", "Reel", or "" for all. Omitted from
        the request when None.
    posts_to_not_include : list of str, optional
        List of post IDs that should be excluded. Omitted from the request
        when None or empty.
    dataset_id : str
        Bright Data dataset ID for Instagram posts.
    base_url : str
        Base Bright Data API URL.
    poll_interval : int
        Time in seconds between progress checks.
    timeout : int
        Maximum time in seconds to wait for snapshot to finish.

    Returns
    -------
        data - parsed JSON from the snapshot.
    """
    # Mandatory fields first; optional filters are appended only when given.
    payload: dict = {"url": url, "start_date": start_date, "end_date": end_date}
    payload.update(
        (field, value)
        for field, value in (
            ("num_of_posts", num_of_posts),
            ("post_type", post_type),
        )
        if value is not None
    )
    if posts_to_not_include:
        payload["posts_to_not_include"] = posts_to_not_include

    # Query-string options for the trigger call: discovery keyed on URL.
    query = dict(
        dataset_id=dataset_id,
        notify="false",
        include_errors="true",
        type="discover_new",
        discover_by="url",
    )

    return _run_brightdata_snapshot(
        trigger_body={"input": [payload]},
        trigger_params=query,
        base_url=base_url,
        poll_interval=poll_interval,
        timeout=timeout,
    )


def fetch_instagram_reels_by_url(
    url: str,
    start_date: str = "",
    end_date: str = "",
    dataset_id: str = INSTAGRAM_REELS_DATASET_ID,
    base_url: str = BASE_URL,
    poll_interval: int = 5,
    timeout: int = 300,
):
    """
    Collect the Reels of a single Instagram profile and return the
    downloaded JSON once the snapshot has been processed.

    Parameters
    ----------
    url : str
        Full Instagram profile URL, e.g. "https://www.instagram.com/espn".
    start_date : str
        Optional start date filter for reels. Format must match the dataset.
        If empty string, no filter is applied.
    end_date : str
        Optional end date filter for reels. If empty string, no filter is applied.
    dataset_id : str
        Bright Data dataset ID for Instagram Reels.
    base_url : str
        Base Bright Data API URL.
    poll_interval : int
        Time in seconds between progress checks.
    timeout : int
        Maximum time in seconds to wait for snapshot to finish.

    Returns
    -------
    data - parsed JSON from the snapshot.
    """
    # Both date fields are always sent, even when they are empty strings.
    body = {
        "input": [
            {"url": url, "start_date": start_date, "end_date": end_date},
        ],
    }

    # Query-string options for the trigger call: discovery keyed on URL.
    query = dict(
        dataset_id=dataset_id,
        notify="false",
        include_errors="true",
        type="discover_new",
        discover_by="url",
    )

    return _run_brightdata_snapshot(
        trigger_body=body,
        trigger_params=query,
        base_url=base_url,
        poll_interval=poll_interval,
        timeout=timeout,
    )

def fetch_instagram_comments_by_url(
    url: str,
    dataset_id: str = INSTAGRAM_COMMENTS_DATASET_ID,
    base_url: str = BASE_URL,
    poll_interval: int = 5,
    timeout: int = 300,
) -> Any:
    """
    Collect the comments of a single Instagram post or reel and return the
    downloaded JSON once the snapshot has been processed.

    Parameters
    ----------
    url : str
        Full Instagram post or reel URL, e.g.
        "https://www.instagram.com/cats_of_instagram/reel/C4GLo_eLO2e/".
    dataset_id : str
        Bright Data dataset ID for Instagram comments.
    base_url : str
        Base Bright Data API URL.
    poll_interval : int
        Time in seconds between progress checks.
    timeout : int
        Maximum time in seconds to wait for snapshot to finish.

    Returns
    -------
        data - parsed JSON from the snapshot.
    """
    # Unlike the profile/posts/reels helpers, no "type" / "discover_by"
    # options are sent here: the input URL is passed on its own.
    query = dict(
        dataset_id=dataset_id,
        notify="false",
        include_errors="true",
    )

    return _run_brightdata_snapshot(
        trigger_body={"input": [{"url": url}]},
        trigger_params=query,
        base_url=base_url,
        poll_interval=poll_interval,
        timeout=timeout,
    )

## Example calls

```
data = fetch_instagram_profile_snapshot("zoobarcelona")

posts_data = fetch_instagram_posts_by_url(
    url="https://www.instagram.com/zoobarcelona/",
    start_date="08-21-2025",  # example: three months before end_date
    end_date="11-21-2025",
    post_type="Post",
)

reels_data = fetch_instagram_reels_by_url(
    url="https://www.instagram.com/zoobarcelona",
    start_date="10-21-2025",
    end_date="11-21-2025",
)

comments_data = fetch_instagram_comments_by_url(
    url="https://www.instagram.com/cats_of_instagram/reel/C4GLo_eLO2e/"
)

```

In [None]:
import json
from urllib.parse import urlparse

# Instagram profile URLs to collect. The list is passed to collect_profiles(),
# which derives a username from each URL's path, so entries with and without
# a trailing slash are both accepted.
URLS = [
    "https://www.instagram.com/biancafrombrooklyn",
    "https://www.instagram.com/emscakesntreats",
    "https://www.instagram.com/aya_eats_",
    "https://www.instagram.com/bigdoughenergy/",
    "https://www.instagram.com/sorteddelightsby_lini",
    "https://www.instagram.com/breadology101",

    "https://www.instagram.com/theclevercarrot",
    "https://www.instagram.com/BrooklynSourdough",
    "https://www.instagram.com/riseandloaf_sourdoughco",
    "https://www.instagram.com/BlondieandRye",
    "https://www.instagram.com/Maurizio",
    "https://www.instagram.com/october_farms",
    "https://www.instagram.com/the.sourdough.baker",
    "https://www.instagram.com/bookroad.sourdough.co",
    "https://www.instagram.com/giasbatch",
    "https://www.instagram.com/amybakesbread",

    "https://www.instagram.com/artisanbryan",
    "https://www.instagram.com/thebreadahead",
    "https://www.instagram.com/nyc.breadgirl",
    "https://www.instagram.com/oliver_the_baker",
]


def extract_username(url: str) -> str:
    """
    Extract the username from an Instagram profile URL.

    The username is the first non-empty segment of the URL path, so
    trailing slashes, query strings and profile sub-pages are ignored:
    "https://www.instagram.com/name/reels/?hl=en" -> "name".

    Parameters
    ----------
    url : str
        Instagram profile URL.

    Returns
    -------
    str
        The username, or "" when the URL path has no segments.
    """
    # urlparse() already separates the query string and fragment from the
    # path; dropping empty pieces handles leading, trailing and doubled slashes.
    segments = [part for part in urlparse(url).path.split("/") if part]
    return segments[0] if segments else ""


def collect_profiles(urls):
    """
    Fetch profile snapshots for the unique usernames found in ``urls``.

    Usernames are de-duplicated in first-seen order, so profiles are fetched
    (and therefore returned) in the same order on every run. A failure for
    one username is printed and does not stop the remaining fetches.

    Parameters
    ----------
    urls : iterable of str
        Instagram profile URLs.

    Returns
    -------
    list
        One entry per username whose snapshot came back non-empty; each
        entry is whatever fetch_instagram_profile_snapshot() returned.
    """
    # dict.fromkeys() de-duplicates like a set but keeps insertion order.
    unique_usernames = dict.fromkeys(extract_username(u) for u in urls)

    results = []
    for username in unique_usernames:
        if not username:
            # A URL without a path yields "", which cannot be looked up.
            print("Skipping URL with no username in its path.")
            continue

        try:
            print(f"Fetching: {username}")
            profile = fetch_instagram_profile_snapshot(username)
        except Exception as e:
            # Deliberately broad: this is a best-effort batch, so any failure
            # for one profile is reported and the loop moves on.
            print(f"Error fetching {username}: {e}")
        else:
            if profile:
                results.append(profile)

    return results


# Fetch a snapshot for every profile listed in URLS.
profiles = collect_profiles(URLS)

# Persist the raw results as pretty-printed UTF-8 JSON; ensure_ascii=False
# keeps non-ASCII characters readable instead of \u-escaping them.
with open("profiles.json", "w", encoding="utf-8") as f:
    json.dump(
        profiles,
        f,
        ensure_ascii=False,
        indent=2,
    )

# len(profiles) counts the entries returned by collect_profiles().
print(f"Saved {len(profiles)} profiles.")


In [None]:
import pandas as pd
from typing import List, Dict, Any, Optional


def _pick_existing_key(sample: Dict[str, Any], candidates: List[str]) -> Optional[str]:
    """
    Try to find the first key in candidates that exists in the sample dict.
    Returns the key name or None.
    """
    for key in candidates:
        if key in sample:
            return key
    return None


def build_posts_dataframe(posts_data: List[Dict[str, Any]]) -> pd.DataFrame:
    """
    Convert raw Bright Data Instagram posts JSON into a normalized DataFrame.

    All original columns are kept; the following columns are added or
    normalized:
      - posted_at:  content timestamp from 'date_posted' (tz-naive UTC)
      - scraped_at: snapshot timestamp from 'timestamp' (tz-naive UTC)
      - likes:      numeric, from 'likes'
      - comments:   numeric, from 'num_comments'
      - caption:    text from 'description' (missing values become "")
      - caption_length: length of caption text
      - post_url:   string copy of 'url' (only when 'url' is present)
      - post_id:    'post_id' cast to string (only when present)
      - content_type: 'content_type' cast to string (only when present)
      - followers:  numeric 'followers' (only when present)

    When a timestamp source column is missing the target column is NaT;
    when 'likes' or 'num_comments' is missing the target column is NaN.
    Unparseable values are coerced to NaT / NaN rather than raising.

    Parameters
    ----------
    posts_data : list of dict
        Post records as downloaded from the snapshot.

    Returns
    -------
    pd.DataFrame
        One row per input record.

    Raises
    ------
    ValueError
        If ``posts_data`` is empty.
    """
    if not posts_data:
        raise ValueError("posts_data is empty")

    df = pd.DataFrame(posts_data)

    # Timestamps: parse as UTC, then drop the tz so the columns are tz-naive.
    # 'date_posted' is when the post was published; 'timestamp' is when
    # Bright Data collected the record.
    for target, source in (("posted_at", "date_posted"), ("scraped_at", "timestamp")):
        if source in df.columns:
            df[target] = pd.to_datetime(
                df[source], utc=True, errors="coerce"
            ).dt.tz_convert(None)
        else:
            df[target] = pd.NaT

    # Basic engagement metrics. A missing source column is filled with NaN
    # (float dtype) rather than None (object dtype) so that arithmetic such
    # as likes + comments keeps working on the result.
    for target, source in (("likes", "likes"), ("comments", "num_comments")):
        if source in df.columns:
            df[target] = pd.to_numeric(df[source], errors="coerce")
        else:
            df[target] = float("nan")

    # Caption text and its length
    if "description" in df.columns:
        df["caption"] = df["description"].fillna("").astype(str)
    else:
        df["caption"] = ""

    df["caption_length"] = df["caption"].str.len()

    # Basic identifiers / URLs
    if "url" in df.columns:
        df["post_url"] = df["url"].astype(str)

    for column in ("post_id", "content_type"):
        if column in df.columns:
            df[column] = df[column].astype(str)

    if "followers" in df.columns:
        df["followers"] = pd.to_numeric(df["followers"], errors="coerce")

    return df

def summarize_post_engagement(
    df: pd.DataFrame,
    followers_count: int | None = None,
    days: int = 90,
) -> dict:
    """
    Compute simple engagement stats for the last N days.

    Uses the 'posted_at' column (tz-naive) as the post timestamp.
    """
    if "posted_at" not in df.columns:
        raise ValueError("DataFrame must contain 'posted_at' column")

    cutoff = datetime.utcnow() - timedelta(days=days)

    recent = df[df["posted_at"] >= cutoff].copy()

    if recent.empty:
        return {
            "posts_in_period": 0,
            "avg_likes": 0,
            "avg_comments": 0,
            "engagement_rate_avg": None,
        }

    avg_likes = recent["likes"].mean()
    avg_comments = recent["comments"].mean()

    if followers_count is None and "followers" in recent.columns:
        followers_count = recent["followers"].dropna().iloc[-1] if not recent["followers"].dropna().empty else None

    if followers_count and followers_count > 0:
        recent["engagement_rate"] = (recent["likes"] + recent["comments"]) / followers_count
        engagement_rate_avg = recent["engagement_rate"].mean()
    else:
        engagement_rate_avg = None

    return {
        "posts_in_period": int(len(recent)),
        "avg_likes": float(avg_likes) if pd.notna(avg_likes) else 0,
        "avg_comments": float(avg_comments) if pd.notna(avg_comments) else 0,
        "engagement_rate_avg": float(engagement_rate_avg) if engagement_rate_avg is not None else None,
    }

In [None]:
import matplotlib.pyplot as plt

# NOTE(review): `posts_df` is not defined in the cells shown above — presumably
# it is the result of build_posts_dataframe(); confirm before running this cell.
# Sort by publication time so the line is drawn in chronological order.
df_plot = posts_df.sort_values("posted_at")

# Single line chart of likes per post against the post timestamp.
plt.figure(figsize=(10, 4))
plt.plot(df_plot["posted_at"], df_plot["likes"])
plt.title("Likes over time")
plt.xlabel("Date")
plt.ylabel("Likes")
# Rotate the x tick labels by 45 degrees.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
