<a href="https://colab.research.google.com/github/workjuanmejia/MaestriaIA/blob/main/Project_Cancer_Uterino.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install requests pandas python-slugify

import os, math, time, requests, pandas as pd
from urllib.parse import urlparse
from slugify import slugify

# ----- Opcional: token de Socrata para mayores cuotas/velocidad -----
# Coloca tu token si lo tienes (Settings > Environment en Colab o aquí directamente)
SOCRATA_APP_TOKEN = os.getenv("SOCRATA_APP_TOKEN", "")  # ej. "abcd1234..."

# ======== HELPERS ========

def socrata_to_csv(domain: str, dataset_id: str, out_name: str, limit=50000, sleep_s=0.8, select=None, where=None):
    """
    Descarga TODO el dataset Socrata (SODA) con paginación y lo guarda en CSV.
    domain: 'www.datos.gov.co'
    dataset_id: como 'jba4-yke'
    out_name: nombre de archivo CSV a guardar.
    """
    base = f"https://{domain}/resource/{dataset_id}.json"
    headers = {"X-App-Token": SOCRATA_APP_TOKEN} if SOCRATA_APP_TOKEN else {}
    offset = 0
    frames = []
    while True:
        params = {"$limit": limit, "$offset": offset}
        if select: params["$select"] = select
        if where:  params["$where"] = where

        r = requests.get(base, headers=headers, params=params, timeout=60)
        r.raise_for_status()
        chunk = r.json()
        if not chunk:
            break
        frames.append(pd.DataFrame(chunk))
        offset += limit
        # cortesía para no saturar
        time.sleep(sleep_s)

    if not frames:
        print(f"[Socrata] Sin datos: {domain}/{dataset_id}")
        return

    df = pd.concat(frames, ignore_index=True)
    # Normaliza nombres
    df.columns = [slugify(c, separator="_") for c in df.columns]
    df.to_csv(out_name, index=False, encoding="utf-8-sig")
    print(f"✅ [Socrata] Guardado {out_name} | filas: {len(df)} | cols: {len(df.columns)}")
    return df


def ckan_dataset_slug_to_resource_ids(domain: str, dataset_slug: str):
    """
    Dado un dataset SLUG en CKAN (p.ej. 'morbilidad-cancer-de-cuello-uterino'),
    trae los resources (UUIDs) usando package_show.
    """
    url = f"https://{domain}/api/3/action/package_show"
    r = requests.get(url, params={"id": dataset_slug}, timeout=60)
    r.raise_for_status()
    pkg = r.json()
    if not pkg.get("success"):
        raise RuntimeError(f"CKAN package_show falló: {pkg}")
    resources = pkg["result"].get("resources", [])
    # Filtrar solo recursos con datastore activo (tabulares)
    return [res for res in resources if res.get("datastore_active")]


def ckan_resource_to_csv(domain: str, resource_id: str, out_name: str, limit=50000, sleep_s=0.8, fields=None, filters=None):
    """
    Descarga TODO el resource CKAN mediante datastore_search con paginación.
    """
    base = f"https://{domain}/api/3/action/datastore_search"
    offset = 0
    frames = []
    while True:
        params = {"resource_id": resource_id, "limit": limit, "offset": offset}
        if fields:  params["fields"]  = fields
        if filters: params["filters"] = filters  # JSON string si lo usas

        r = requests.get(base, params=params, timeout=120)
        r.raise_for_status()
        data = r.json()
        if not data.get("success"):
            raise RuntimeError(f"CKAN datastore_search falló: {data}")
        records = data["result"]["records"]
        if not records:
            break
        frames.append(pd.DataFrame.from_records(records))
        offset += limit
        time.sleep(sleep_s)

    if not frames:
        print(f"[CKAN] Sin datos: {domain} | resource {resource_id}")
        return

    df = pd.concat(frames, ignore_index=True)
    df.columns = [slugify(c, separator="_") for c in df.columns]
    df.to_csv(out_name, index=False, encoding="utf-8-sig")
    print(f"✅ [CKAN] Guardado {out_name} | filas: {len(df)} | cols: {len(df.columns)}")
    return df


In [12]:
!git clone https://github.com/workjuanmejia/MaestriaIA.git
%cd MaestriaIA

Cloning into 'MaestriaIA'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 6 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (6/6), done.
/content/MaestriaIA/MaestriaIA


In [None]:
import pandas as pd
import requests

# ========================
# 1. Endpoints Socrata (datos.gov.co)
# ========================
urls_socrata = {
    "Pereira": "https://www.datos.gov.co/resource/utgq-6fdm.json",
    "Nacional": "https://www.datos.gov.co/resource/jba4-yyke.json",
    "Cali": "https://www.datos.gov.co/resource/wqyy-ev4z.json"
}

for nombre, url in urls_socrata.items():
    df = pd.read_json(url)
    df.to_csv(f"{nombre}.csv", index=False, encoding="utf-8-sig")
    print(f"✅ Guardado {nombre}.csv con {len(df)} registros")


# ========================
# 2. Dataset Bogotá (CKAN)
# ========================
# Aquí no hay API SODA; toca usar CKAN -> "datastore_search"
# Primero revisa el resource_id en la página:
# https://datosabiertos.bogota.gov.co/dataset/morbilidad-cancer-de-cuello-uterino
# busca el "resource_id" (UUID largo, ej: 5a8b61d0-xxxx-xxxx-xxxx-xxxxxxxx)

resource_id = "<RESOURCE_ID_BOGOTA>"  # 👈 Pegar el resource_id real
url_ckan = f"https://datosabiertos.bogota.gov.co/api/3/action/datastore_search?resource_id={resource_id}&limit=50000"

r = requests.get(url_ckan)
data = r.json()

if data["success"]:
    df_bogota = pd.DataFrame(data["result"]["records"])
    df_bogota.to_csv("Bogota.csv", index=False, encoding="utf-8-sig")
    print(f"✅ Guardado Bogota.csv con {len(df_bogota)} registros")
else:
    print("⚠️ No se pudo acceder al recurso de Bogotá")




✅ Guardado Pereira.csv con 1000 registros


HTTPError: HTTP Error 400: Bad Request