# Data Analytics: flujo end-to-end

> Este notebook orquesta el pipeline completo: generar dataset sintético, verificar integridad, calcular métricas, análisis exploratorio y visualizar resultados.

## Pasos
1. Setup de rutas y entorno (Python >= 3.11 con numpy, pandas y plotly).
2. Generar el dataset si no existe (CSV en `backend/datasets/product_dataset.csv`).
3. Chequear integridad del dataset (salida por consola).
4. Calcular métricas y guardar JSON en `backend/reports/metrics.json`.
5. Análisis exploratorio y guardar JSON en `backend/reports/exploratory_summary.json`.
6. Visualizaciones rápidas con Plotly.

Consejo: ejecutá las celdas de arriba hacia abajo. Si cambiás el entorno, reiniciá el kernel y volvé a ejecutar todas las celdas.

In [None]:
# Notebook bootstrap: shared imports plus backend-root discovery.
import json
import sys
from pathlib import Path

# Search the working directory and its two nearest ancestors for the folder
# that contains 'app', so the notebook works when launched from any of them.
CWD = Path.cwd()
candidates = [CWD, CWD.parent, CWD.parent.parent]
# The first candidate holding an 'app' entry wins; otherwise fall back to CWD.
BACKEND_ROOT = next((root for root in candidates if (root / "app").exists()), CWD)

# Put the backend root at the front of sys.path (only once) so that the
# 'app' package can be imported by the following cells.
backend_root_str = str(BACKEND_ROOT)
if backend_root_str not in sys.path:
    sys.path.insert(0, backend_root_str)

# Input and output folders sit directly under the backend root and are
# created on demand.
DATASETS_DIR, REPORTS_DIR = BACKEND_ROOT / "datasets", BACKEND_ROOT / "reports"
for folder in (DATASETS_DIR, REPORTS_DIR):
    folder.mkdir(parents=True, exist_ok=True)
CSV_PATH = DATASETS_DIR / "product_dataset.csv"

print(f"BACKEND_ROOT: {BACKEND_ROOT}")
print(f"CSV_PATH: {CSV_PATH}")

In [None]:
# 0) Quick check of the environment and package versions
import sys as _sys
import importlib

# Names of the modules that could not be imported; filled in by _try_import.
missing = []


def _try_import(mod):
    """Import ``mod`` by name and print its version.

    Args:
        mod: Module name handed to ``importlib.import_module``.

    Returns:
        True when the import succeeds. False when it raises, in which case
        the failure is printed and ``mod`` is appended to the module-level
        ``missing`` list.
    """
    try:
        m = importlib.import_module(mod)
    except Exception as e:
        # Broad catch kept on purpose: the message covers both a missing
        # package and one that is installed but fails while importing.
        print(f"Falta o falla {mod}: {e}")
        missing.append(mod)
        return False
    else:
        # Modules without a __version__ attribute are reported as 'unknown'.
        print(f"{mod}=={getattr(m, '__version__', 'unknown')}")
        return True


mods = ["numpy", "pandas", "plotly"]
# Plain loop rather than a throwaway list comprehension: the calls are made
# only for their side effects (printing and filling ``missing``).
for _mod_name in mods:
    _try_import(_mod_name)
print(f"Python: {_sys.version}")

# Print an installation hint only when at least one import failed.
if missing:
    print("\nSugerencia: instalá paquetes faltantes en tu entorno activo (PowerShell):")
    print("  .\\.venv\\Scripts\\Activate.ps1  # si usás venv")
    print("  pip install -r backend/requirements.txt")

In [None]:
# 1) Build the dataset once and load it into a DataFrame
import pandas as pd
from app.scripts.create_product_dataset import main as gen_main

# Generation is skipped when the CSV is already on disk, so re-running this
# cell does not overwrite an existing dataset.
dataset_missing = not CSV_PATH.exists()
if dataset_missing:
    gen_args = ["--num-samples", "200", "--seed", "42", "--out", str(CSV_PATH)]
    _ = gen_main(gen_args)

# Read the CSV and show its first rows as the cell output.
df = pd.read_csv(CSV_PATH)
df.head()

In [None]:
# 2) Run the dataset integrity check and print the code it returns
from app.scripts.check_dataset_integrity import main as chk_main

# NOTE(review): the original comment described this as a non-strict mode that
# lets the flow continue, but only --path is passed here; confirm against the
# defaults of check_dataset_integrity.
chk_args = ["--path", str(CSV_PATH)]
exit_code = chk_main(chk_args)
print(f"Integrity check exit code: {exit_code!s}")

In [None]:
# 3) Compute the metrics, store them as JSON and load them back
import json
from app.scripts.calculate_product_metrics import main as met_main

# Make sure the reports folder exists before the script writes into it.
REPORTS_DIR.mkdir(parents=True, exist_ok=True)
metrics_path = REPORTS_DIR / "metrics.json"
met_args = ["--path", str(CSV_PATH), "--json-out", str(metrics_path)]
_ = met_main(met_args)

# Parse the JSON file that was requested via --json-out; the resulting
# object is shown as the cell output.
with metrics_path.open(encoding="utf-8") as fh:
    metrics = json.load(fh)
metrics

In [None]:
# 4) Run the exploratory analysis, store its summary as JSON and load it back
from app.scripts.exploratory_analysis import main as exp_main

expl_summary_path = REPORTS_DIR / "exploratory_summary.json"
exp_args = ["--path", str(CSV_PATH), "--json-out", str(expl_summary_path)]
_ = exp_main(exp_args)

# Parse the JSON file that was requested via --json-out; the resulting
# object is shown as the cell output.
raw_summary = expl_summary_path.read_text(encoding="utf-8")
summary = json.loads(raw_summary)
summary

In [None]:
# 5) Plotly visualisations
import plotly.express as px

# Bar chart with the number of rows per category; drawn only when the
# dataset has a "Category" column. Missing values get their own bar.
if "Category" in df.columns:
    category_counts = df["Category"].value_counts(dropna=False)
    counts = category_counts.reset_index()
    # Fix the column names explicitly instead of relying on reset_index's.
    counts.columns = ["Category", "count"]
    fig1 = px.bar(
        counts,
        x="Category",
        y="count",
        title="Conteo de productos por categoría",
    )
    fig1.show()

# Global metrics: chart label -> key looked up in the metrics dictionary.
# NOTE(review): "Cost" is fed from "total_cost" while the other two bars use
# "average_*" keys; confirm whether an average cost key was intended.
metric_keys = {
    "BaseYield": "average_base_yield",
    "Cost": "total_cost",
    "EnvironmentalImpact": "average_environmental_impact",
}
avg_df = pd.DataFrame(
    [{"metric": label, "value": metrics.get(key)} for label, key in metric_keys.items()]
).dropna()  # rows whose metric is absent (None) are dropped
if not avg_df.empty:
    fig2 = px.bar(avg_df, x="metric", y="value", title="Métricas globales")
    fig2.show()