# Data Analytics: flujo end-to-end (Google Colab)

Este notebook replica el flujo del original y está preparado para ejecutarse directamente en Google Colab.

Pasos:
1. Clonar el repositorio (para habilitar imports 'app.*').
2. Instalar dependencias con !pip.
3. Configurar rutas en /content y sys.path.
4. Generar dataset (idempotente) y cargar en pandas.
5. Chequear integridad.
6. Calcular métricas y guardar JSON.
7. Análisis exploratorio y guardar JSON.
8. Visualizaciones con Plotly.

Ejecutá las celdas en orden (Runtime > Run all).

In [None]:
# 0) Contexto de ejecución y utilidades
import sys
import os
import json
import importlib
import subprocess
from pathlib import Path

# Colab is detected either through its release-tag environment variable or
# through the already-loaded 'google.colab' module.
IN_COLAB = os.environ.get("COLAB_RELEASE_TAG") is not None or "google.colab" in sys.modules

# Everything lives under /content on Colab and under the working directory otherwise.
if IN_COLAB:
    BASE_DIR = Path("/content")
else:
    BASE_DIR = Path.cwd()

REPO_URL = "https://github.com/vhcontre/data-analytics-con-python.git"
REPO_DIR = BASE_DIR.joinpath("data-analytics-con-python")
BACKEND_ROOT = REPO_DIR.joinpath("backend")

# Report the resolved execution context, one "label: value" pair per line.
print(
    f"Python: {sys.version.split()[0]}",
    f"IN_COLAB: {IN_COLAB}",
    f"REPO_DIR: {REPO_DIR}",
    f"BACKEND_ROOT: {BACKEND_ROOT}",
    sep="\n",
)

In [None]:
# 1) Clone the repository (only when it is missing) so 'app.*' can be imported
if REPO_DIR.exists():
    print("Repositorio ya presente; saltando clonación.")
else:
    print("Clonando repositorio...")
    # Shallow clone (--depth 1); check=True raises if git exits with an error.
    subprocess.run(
        ["git", "clone", "--depth", "1", REPO_URL, str(REPO_DIR)],
        check=True,
    )

# Quick presence report for the folders the later cells rely on
for folder_name in ("app", "datasets", "reports"):
    folder = BACKEND_ROOT / folder_name
    status = "exists" if folder.exists() else "missing"
    print(f"{folder} {status}")

In [None]:
# 2) Install the required dependencies
# pip is run through the kernel's own interpreter (sys.executable -m pip) so the
# packages are installed into the environment this notebook is executing in.
# The requirements file is located via BACKEND_ROOT instead of a hard-coded
# /content path, so the cell also works outside Colab.
import importlib
import subprocess
import sys

_requirements = BACKEND_ROOT / "requirements.txt"
for _pip_args in (["-U", "pip"], ["-r", str(_requirements)]):
    # Output is captured so that a failure can be shown in the notebook;
    # a failed install is reported but does not abort the cell.
    _result = subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", *_pip_args],
        capture_output=True,
        text=True,
        check=False,
    )
    if _result.returncode != 0:
        print(
            f"pip install {' '.join(_pip_args)} falló "
            f"(código {_result.returncode}): {_result.stderr.strip()}"
        )

# Quick check that the key libraries can be imported
for mod in ("numpy", "pandas", "plotly"):
    try:
        m = importlib.import_module(mod)
        print(f"{mod}=={getattr(m, '__version__', 'unknown')}")
    except Exception as e:
        # Deliberately broad: this is only a report, and an import can fail
        # with more than ImportError.
        print(f"Falta o falla {mod}: {e}")

In [None]:
# 3) Put the backend on sys.path so 'app.*' imports resolve, then set up paths
if not any(entry == str(BACKEND_ROOT) for entry in sys.path):
    sys.path.insert(0, str(BACKEND_ROOT))

DATASETS_DIR = BACKEND_ROOT.joinpath("datasets")
REPORTS_DIR = BACKEND_ROOT.joinpath("reports")
CSV_PATH = DATASETS_DIR.joinpath("product_dataset.csv")

# Create both output folders (and any missing parents); no error if they exist.
DATASETS_DIR.mkdir(exist_ok=True, parents=True)
REPORTS_DIR.mkdir(exist_ok=True, parents=True)

print(
    f"DATASETS_DIR: {DATASETS_DIR}",
    f"REPORTS_DIR: {REPORTS_DIR}",
    f"CSV_PATH: {CSV_PATH}",
    sep="\n",
)

In [None]:
# 4) Generate the dataset (idempotent) and load it into pandas
import os
import subprocess
import sys
from pathlib import Path

import pandas as pd

# Re-derive the path variables when this cell is run on its own.
try:
    CSV_PATH
except NameError:
    # Same base-directory rule as the setup cell: /content on Colab,
    # the current working directory elsewhere.
    _in_colab = (
        "google.colab" in sys.modules
        or os.environ.get("COLAB_RELEASE_TAG") is not None
    )
    _base_dir = Path("/content") if _in_colab else Path.cwd()
    REPO_DIR = _base_dir / "data-analytics-con-python"
    BACKEND_ROOT = REPO_DIR / "backend"
    if str(BACKEND_ROOT) not in sys.path:
        sys.path.insert(0, str(BACKEND_ROOT))
    DATASETS_DIR = BACKEND_ROOT / "datasets"
    DATASETS_DIR.mkdir(parents=True, exist_ok=True)
    CSV_PATH = DATASETS_DIR / "product_dataset.csv"

# Generate only when the CSV is missing. The module is invoked through its CLI
# (its main() does not accept positional args); cwd=BACKEND_ROOT lets
# "-m app.scripts..." resolve the 'app' package.
if CSV_PATH.exists():
    print("Dataset ya existe; saltando generación.")
else:
    subprocess.run(
        [
            sys.executable, "-m", "app.scripts.create_product_dataset",
            "--num-samples", "200", "--seed", "42", "--out", str(CSV_PATH),
        ],
        cwd=str(BACKEND_ROOT),
        check=True,
    )

# Load. An explicit exception is used instead of `assert`, which is stripped
# when Python runs with -O.
if not CSV_PATH.exists():
    raise FileNotFoundError(f"No se encontró el CSV en {CSV_PATH}")
df = pd.read_csv(CSV_PATH)
print("Dataset cargado:", df.shape)
df.head()

In [None]:
# 5) Verify dataset integrity (non-strict mode so the flow can continue)
import os
import sys
from pathlib import Path

# Paths are resolved BEFORE importing 'app.*': when this cell is run on its
# own, BACKEND_ROOT is not on sys.path yet and the import would fail before
# the fallback below had a chance to add it.
try:
    CSV_PATH
except NameError:
    # Same base-directory rule as the setup cell: /content on Colab,
    # the current working directory elsewhere.
    _in_colab = (
        "google.colab" in sys.modules
        or os.environ.get("COLAB_RELEASE_TAG") is not None
    )
    _base_dir = Path("/content") if _in_colab else Path.cwd()
    REPO_DIR = _base_dir / "data-analytics-con-python"
    BACKEND_ROOT = REPO_DIR / "backend"
    if str(BACKEND_ROOT) not in sys.path:
        sys.path.insert(0, str(BACKEND_ROOT))
    DATASETS_DIR = BACKEND_ROOT / "datasets"
    DATASETS_DIR.mkdir(parents=True, exist_ok=True)
    CSV_PATH = DATASETS_DIR / "product_dataset.csv"

from app.scripts.check_dataset_integrity import main as chk_main

# The script's main() is called in-process with CLI-style arguments.
exit_code = chk_main(["--path", str(CSV_PATH)])
print("Integrity check exit code:", exit_code)

In [None]:
# 6) Compute metrics and save them as JSON
import json
import os
import sys
from pathlib import Path

# Paths are resolved BEFORE importing 'app.*': when this cell is run on its
# own, BACKEND_ROOT is not on sys.path yet and the import would fail before
# the fallback below had a chance to add it.
try:
    REPORTS_DIR
    CSV_PATH
except NameError:
    # Same base-directory rule as the setup cell: /content on Colab,
    # the current working directory elsewhere.
    _in_colab = (
        "google.colab" in sys.modules
        or os.environ.get("COLAB_RELEASE_TAG") is not None
    )
    _base_dir = Path("/content") if _in_colab else Path.cwd()
    REPO_DIR = _base_dir / "data-analytics-con-python"
    BACKEND_ROOT = REPO_DIR / "backend"
    if str(BACKEND_ROOT) not in sys.path:
        sys.path.insert(0, str(BACKEND_ROOT))
    REPORTS_DIR = BACKEND_ROOT / "reports"
    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    DATASETS_DIR = BACKEND_ROOT / "datasets"
    DATASETS_DIR.mkdir(parents=True, exist_ok=True)
    CSV_PATH = DATASETS_DIR / "product_dataset.csv"

from app.scripts.calculate_product_metrics import main as met_main

# REPORTS_DIR is always bound at this point; Path() gives type checkers a
# definite Path and also accepts a plain string.
_REPORTS_DIR: Path = Path(REPORTS_DIR)

metrics_path = _REPORTS_DIR / "metrics.json"
# The script's main() is called in-process with CLI-style arguments; the JSON
# file it is asked to write is read back below.
_ = met_main(["--path", str(CSV_PATH), "--json-out", str(metrics_path)])

metrics = json.loads(metrics_path.read_text(encoding="utf-8"))
metrics

In [None]:
# 7) Exploratory analysis, saved as JSON
import json
import os
import sys
from pathlib import Path

# Paths are resolved BEFORE importing 'app.*': when this cell is run on its
# own, BACKEND_ROOT is not on sys.path yet and the import would fail before
# the fallback below had a chance to add it.
try:
    REPORTS_DIR
    CSV_PATH
except NameError:
    # Same base-directory rule as the setup cell: /content on Colab,
    # the current working directory elsewhere.
    _in_colab = (
        "google.colab" in sys.modules
        or os.environ.get("COLAB_RELEASE_TAG") is not None
    )
    _base_dir = Path("/content") if _in_colab else Path.cwd()
    REPO_DIR = _base_dir / "data-analytics-con-python"
    BACKEND_ROOT = REPO_DIR / "backend"
    if str(BACKEND_ROOT) not in sys.path:
        sys.path.insert(0, str(BACKEND_ROOT))
    REPORTS_DIR = BACKEND_ROOT / "reports"
    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    DATASETS_DIR = BACKEND_ROOT / "datasets"
    DATASETS_DIR.mkdir(parents=True, exist_ok=True)
    CSV_PATH = DATASETS_DIR / "product_dataset.csv"

from app.scripts.exploratory_analysis import main as exp_main

# REPORTS_DIR is always bound at this point; Path() gives type checkers a
# definite Path and also accepts a plain string.
_REPORTS_DIR: Path = Path(REPORTS_DIR)

expl_summary_path = _REPORTS_DIR / "exploratory_summary.json"
# The script's main() is called in-process with CLI-style arguments; the JSON
# file it is asked to write is read back below.
_ = exp_main(["--path", str(CSV_PATH), "--json-out", str(expl_summary_path)])

summary = json.loads(expl_summary_path.read_text(encoding="utf-8"))
summary

In [None]:
# 8) Visualizations with Plotly
import json
import os
import sys
from pathlib import Path

import pandas as pd
import plotly.express as px

# Resolve BACKEND_ROOT on its own first: both fallbacks below need it, and the
# `metrics` fallback must not depend on the `df` fallback having run.
try:
    BACKEND_ROOT
except NameError:
    # Same base-directory rule as the setup cell: /content on Colab,
    # the current working directory elsewhere.
    _in_colab = (
        "google.colab" in sys.modules
        or os.environ.get("COLAB_RELEASE_TAG") is not None
    )
    _base_dir = Path("/content") if _in_colab else Path.cwd()
    REPO_DIR = _base_dir / "data-analytics-con-python"
    BACKEND_ROOT = REPO_DIR / "backend"
    if str(BACKEND_ROOT) not in sys.path:
        sys.path.insert(0, str(BACKEND_ROOT))

# Reload the dataset when this cell is run on its own.
try:
    df
except NameError:
    DATASETS_DIR = BACKEND_ROOT / "datasets"
    CSV_PATH = DATASETS_DIR / "product_dataset.csv"
    df = pd.read_csv(CSV_PATH)

# Reload the metrics from disk when available; otherwise fall back to an empty
# dict so the metrics chart is simply skipped.
try:
    metrics
except NameError:
    REPORTS_DIR = BACKEND_ROOT / "reports"
    metrics_path = REPORTS_DIR / "metrics.json"
    if metrics_path.exists():
        metrics = json.loads(metrics_path.read_text(encoding="utf-8"))
    else:
        metrics = {}

# Product count per category (missing categories are counted too)
if "Category" in df.columns:
    counts = df["Category"].value_counts(dropna=False).reset_index()
    # Column names are set explicitly rather than relying on reset_index() naming.
    counts.columns = ["Category", "count"]
    fig1 = px.bar(counts, x="Category", y="count", title="Conteo de productos por categoría")
    fig1.show()

# Global metrics; rows whose value is missing from `metrics` are dropped.
# NOTE(review): "Cost" plots `total_cost` next to two averages — confirm this
# is intended and that the scales are comparable.
avg_df = pd.DataFrame([
    {"metric": "BaseYield", "value": metrics.get("average_base_yield")},
    {"metric": "Cost", "value": metrics.get("total_cost")},
    {"metric": "EnvironmentalImpact", "value": metrics.get("average_environmental_impact")},
]).dropna()
if not avg_df.empty:
    fig2 = px.bar(avg_df, x="metric", y="value", title="Métricas globales")
    fig2.show()

Notas:
- Si ves errores de importación, re-ejecutá las celdas 0–3 (la celda 0 define las rutas que usan las demás).
- Los archivos generados se guardan en `/content/data-analytics-con-python/backend/datasets` y `/content/data-analytics-con-python/backend/reports`.
- Podés descargar los JSON desde el panel de archivos de Colab.
- Si el repo es privado, reemplazá la celda de clonación por autenticación con token de GitHub.