# Data Profiling Dashboard

This notebook loads data from your workspace's GCS data collection and generates a **ydata-profiling** report.

**Configure** `GCS_BUCKET` and `FILE_NAME` in the next cell for your data, then run all cells.

In [None]:
# --- Workspace data source settings ---------------------------------------
# Edit these three values to point the notebook at a different object.

# Object name inside the bucket.
FILE_NAME = "MUP_DPR_RY25_P04_V10_DY23_Geo.csv"
# Declared format of the object above.
FILE_FORMAT = "csv"
# Bucket that holds the object.
# NOTE(review): "bucker" may be a typo for "bucket" -- confirm against the real
# bucket name before changing it, since this string is used verbatim.
GCS_BUCKET = "my-gcs-experimentation-bucker-wb-steady-parsnip-7109"

# Echo the resolved location so a misconfiguration is obvious before download.
print(f"Will load: gs://{GCS_BUCKET}/{FILE_NAME}")

In [None]:
# Install dependencies if needed
import importlib
import subprocess
import sys


def _pip_install(package):
    """Install ``package`` with pip into the interpreter running this kernel.

    Uses ``sys.executable -m pip`` so the install targets the same environment
    the kernel imports from. Raises ``subprocess.CalledProcessError`` if pip
    exits with a non-zero status.
    """
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])
    # Modules installed while the interpreter is running may be missed by the
    # import system's cached finders; refresh them so the retry import below
    # can see the freshly installed package.
    importlib.invalidate_caches()


# Each dependency is imported first and only installed when the import fails,
# so environments that already provide the packages do no pip work.
try:
    from google.cloud import storage
except ImportError:
    _pip_install("google-cloud-storage")
    from google.cloud import storage
try:
    from ydata_profiling import ProfileReport
except ImportError:
    _pip_install("ydata-profiling")
    from ydata_profiling import ProfileReport

import pandas as pd
import numpy as np

# Numpy compatibility patch
# If this cell is re-run, np.asarray is already our wrapper; recover the real
# function from the marker attribute so wrappers never stack on each other.
orig = getattr(np.asarray, "_original_asarray", np.asarray)


def patched(a, dtype=None, order=None, copy=None, **kw):
    """Call the original ``np.asarray``, tolerating an unsupported ``copy`` argument.

    Parameters mirror ``np.asarray``; any extra keyword arguments are passed
    through unchanged. ``copy`` is forwarded only when the caller supplied it.
    If that call raises ``TypeError``, the call is retried once without
    ``copy``. A ``TypeError`` raised by the call without ``copy`` propagates
    to the caller.
    """
    if copy is not None:
        try:
            return orig(a, dtype=dtype, order=order, copy=copy, **kw)
        except TypeError:
            # Fall through and retry without the ``copy`` keyword.
            pass
    return orig(a, dtype=dtype, order=order, **kw)


# Remember the unwrapped function on the wrapper so a re-run can find it.
patched._original_asarray = orig
np.asarray = patched
print("Libraries ready.")

In [None]:
# Load data from GCS
import os
import tempfile

# Accept a bare bucket name as well as "gs://bucket" or "gs://bucket/" values.
bucket_name = GCS_BUCKET.replace("gs://", "").strip().strip("/")
client = storage.Client()
bucket = client.bucket(bucket_name)
blob = bucket.blob(FILE_NAME)

# Object names can contain "/"-separated prefixes; keep only the final
# component so the download does not target a directory that does not exist.
path = os.path.join(tempfile.gettempdir(), os.path.basename(FILE_NAME))
blob.download_to_filename(path)

# Pick the pandas reader from the configured FILE_FORMAT instead of always
# assuming CSV.
readers = {
    "csv": pd.read_csv,
    "json": pd.read_json,
    "parquet": pd.read_parquet,
}
file_format = FILE_FORMAT.strip().lower()
if file_format not in readers:
    raise ValueError(
        f"Unsupported FILE_FORMAT {FILE_FORMAT!r}; expected one of {sorted(readers)}"
    )
df = readers[file_format](path)

print(f"Loaded {len(df)} rows, {len(df.columns)} columns")
df.head(10)

In [None]:
# Build the profiling report for the loaded frame, write it to disk as HTML,
# then render the interactive widget view as the cell's output.
report_options = dict(
    title="Data Profiling Report",
    explorative=True,
    minimal=False,
    progress_bar=True,
)
profile = ProfileReport(df, **report_options)

profile.to_file("data_profile_report.html")
print("Report saved as data_profile_report.html")

profile.to_widgets()