In [1]:
# Instalar las librerías necesarias.

!pip install snowflake-connector-python pyarrow requests pandas matplotlib seaborn
print("=" * 80)
print("Paquetes instalados correctamente.")
print("=" * 80)

Paquetes instalados correctamente.


In [2]:
# Verificar que todas las variables de ambiente necesarias estén configuradas.

import os
import requests
from io import StringIO
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
import snowflake.connector

print("=" * 80)
print("CONFIGURACIÓN DE AMBIENTE - PROYECTO 3")
print("=" * 80)

# Environment variables this project declares as mandatory.
required_vars = [
    'SNOWFLAKE_ACCOUNT',
    'SNOWFLAKE_DATABASE',
    'SNOWFLAKE_SCHEMA_RAW',
    'SNOWFLAKE_SCHEMA_ANALYTICS',
    'SNOWFLAKE_WAREHOUSE',
    'SNOWFLAKE_USER',
    'SNOWFLAKE_PASSWORD',
    'SNOWFLAKE_ROLE',
    'TAXI_ZONE_URL'
]

# A variable counts as missing when it is unset or set to an empty string.
missing_vars = [var for var in required_vars if not os.getenv(var)]
if missing_vars:
    print(f"ERROR: Faltan variables de ambiente: {', '.join(missing_vars)}")
    print("Por favor configura tu archivo .env correctamente.")
    print("=" * 80)
    # Fail fast: without this, "Run All" would continue and the later cells
    # would die with an unrelated KeyError on os.environ[...] instead.
    raise EnvironmentError(
        f"Faltan variables de ambiente: {', '.join(missing_vars)}"
    )

print("Todas las variables de ambiente requeridas están configuradas.")
print("=" * 80)

CONFIGURACIÓN DE AMBIENTE - PROYECTO 3
Todas las variables de ambiente requeridas están configuradas.


In [3]:
# INITIAL AUDIT - table state BEFORE cleaning.
#
# Stores the starting row count of ANALYTICS.OBT_TRIPS in the audit file that
# the cleaning cells append to and the final audit cell reads back.

print("\n" + "=" * 80)
print("AUDITORÍA INICIAL - ESTADO ANTES DE LIMPIEZA")
print("=" * 80)

conn = snowflake.connector.connect(
    user=os.environ["SNOWFLAKE_USER"],
    password=os.environ["SNOWFLAKE_PASSWORD"],
    account=os.environ["SNOWFLAKE_ACCOUNT"],
    warehouse=os.environ["SNOWFLAKE_WAREHOUSE"],
    database=os.environ["SNOWFLAKE_DATABASE"],
    role=os.environ["SNOWFLAKE_ROLE"]
)
cursor = conn.cursor()
try:
    # Row count BEFORE any cleaning rule is applied
    cursor.execute("SELECT COUNT(*) FROM ANALYTICS.OBT_TRIPS")
    total_before_cleaning = cursor.fetchone()[0]

    print(f"Total de registros: {total_before_cleaning:,}")

    # Mode 'w' truncates the file: re-running this cell resets the audit
    # trail, including any DELETED_* entries written by later cells.
    with open('/tmp/cleaning_audit.txt', 'w') as f:
        f.write(f"BEFORE_CLEANING={total_before_cleaning}\n")
finally:
    # Release the Snowflake session even if the query or file write fails
    cursor.close()
    conn.close()
print("=" * 80)


AUDITORÍA INICIAL - ESTADO ANTES DE LIMPIEZA
Total de registros: 889,971,027


In [4]:
# VALIDATION AND CLEANING 1 - NULLs in critical columns.
#
# Reports the number of NULLs per critical column and deletes every row of
# ANALYTICS.OBT_TRIPS that has a NULL in at least one of them. The number of
# deleted rows is appended to the audit file as DELETED_NULLS.

print("\n" + "=" * 80)
print("VALIDACIÓN 1: NULOS EN COLUMNAS CRÍTICAS")
print("=" * 80)

conn = snowflake.connector.connect(
    user=os.environ["SNOWFLAKE_USER"],
    password=os.environ["SNOWFLAKE_PASSWORD"],
    account=os.environ["SNOWFLAKE_ACCOUNT"],
    warehouse=os.environ["SNOWFLAKE_WAREHOUSE"],
    database=os.environ["SNOWFLAKE_DATABASE"],
    role=os.environ["SNOWFLAKE_ROLE"]
)
cursor = conn.cursor()
try:
    # Per-column NULL counts, plus the number of ROWS with at least one NULL.
    # The two differ when a row has several NULL columns, so the per-column
    # counts must not be summed to predict how many rows will be deleted.
    cursor.execute("""
        SELECT
            SUM(CASE WHEN PICKUP_DATETIME IS NULL THEN 1 ELSE 0 END) AS null_pickup,
            SUM(CASE WHEN DROPOFF_DATETIME IS NULL THEN 1 ELSE 0 END) AS null_dropoff,
            SUM(CASE WHEN PU_LOCATION_ID IS NULL THEN 1 ELSE 0 END) AS null_pu_location,
            SUM(CASE WHEN DO_LOCATION_ID IS NULL THEN 1 ELSE 0 END) AS null_do_location,
            SUM(CASE WHEN TOTAL_AMOUNT IS NULL THEN 1 ELSE 0 END) AS null_total_amount,
            SUM(CASE WHEN SERVICE_TYPE IS NULL THEN 1 ELSE 0 END) AS null_service_type,
            SUM(CASE WHEN TRIP_DISTANCE IS NULL THEN 1 ELSE 0 END) AS null_trip_distance,
            COUNT(*) AS total,
            SUM(CASE WHEN PICKUP_DATETIME IS NULL
                       OR DROPOFF_DATETIME IS NULL
                       OR PU_LOCATION_ID IS NULL
                       OR DO_LOCATION_ID IS NULL
                       OR TOTAL_AMOUNT IS NULL
                       OR SERVICE_TYPE IS NULL
                       OR TRIP_DISTANCE IS NULL
                     THEN 1 ELSE 0 END) AS rows_with_nulls
        FROM ANALYTICS.OBT_TRIPS
    """)

    # SUM() over an empty table returns NULL (None); normalise to 0 so the
    # arithmetic and the ',' number formatting below cannot fail.
    row = [value or 0 for value in cursor.fetchone()]
    total = row[7]
    rows_with_nulls = row[8]

    null_counts = {
        'PICKUP_DATETIME': row[0],
        'DROPOFF_DATETIME': row[1],
        'PU_LOCATION_ID': row[2],
        'DO_LOCATION_ID': row[3],
        'TOTAL_AMOUNT': row[4],
        'SERVICE_TYPE': row[5],
        'TRIP_DISTANCE': row[6]
    }

    print(f"\nTotal de registros: {total:,}\n")
    for col, count in null_counts.items():
        # Guard the percentage against an empty table
        pct = (count / total * 100) if total > 0 else 0
        print(f"  {col}: {count:,} nulos ({pct:.4f}%)")

    # CLEANING: delete rows with a NULL in any critical column
    if rows_with_nulls > 0:
        print(f"\nAPLICANDO LIMPIEZA: Eliminando {rows_with_nulls:,} registros con nulos...")

        cursor.execute("""
            DELETE FROM ANALYTICS.OBT_TRIPS
            WHERE PICKUP_DATETIME IS NULL
               OR DROPOFF_DATETIME IS NULL
               OR PU_LOCATION_ID IS NULL
               OR DO_LOCATION_ID IS NULL
               OR TOTAL_AMOUNT IS NULL
               OR SERVICE_TYPE IS NULL
               OR TRIP_DISTANCE IS NULL
        """)

        rows_deleted = cursor.rowcount
        conn.commit()

        print(f"Eliminados: {rows_deleted:,} registros.")

        # Record the deletion in the audit trail
        with open('/tmp/cleaning_audit.txt', 'a') as f:
            f.write(f"DELETED_NULLS={rows_deleted}\n")
    else:
        print("\nVALIDACIÓN EXITOSA: No hay nulos en columnas críticas.")
finally:
    # Release the Snowflake session even if a statement fails
    cursor.close()
    conn.close()
print("=" * 80)


VALIDACIÓN 1: NULOS EN COLUMNAS CRÍTICAS

Total de registros: 889,971,027

  PICKUP_DATETIME: 0 nulos (0.0000%)
  DROPOFF_DATETIME: 0 nulos (0.0000%)
  PU_LOCATION_ID: 0 nulos (0.0000%)
  DO_LOCATION_ID: 0 nulos (0.0000%)
  TOTAL_AMOUNT: 0 nulos (0.0000%)
  SERVICE_TYPE: 0 nulos (0.0000%)
  TRIP_DISTANCE: 0 nulos (0.0000%)

VALIDACIÓN EXITOSA: No hay nulos en columnas críticas.


In [5]:
# VALIDATION AND CLEANING 2 - Negative distances.
#
# Counts and deletes rows of ANALYTICS.OBT_TRIPS with TRIP_DISTANCE < 0 and
# appends the number of deleted rows to the audit file as
# DELETED_NEG_DISTANCE.

print("\n" + "=" * 80)
print("VALIDACIÓN 2: DISTANCIAS NEGATIVAS")
print("=" * 80)

conn = snowflake.connector.connect(
    user=os.environ["SNOWFLAKE_USER"],
    password=os.environ["SNOWFLAKE_PASSWORD"],
    account=os.environ["SNOWFLAKE_ACCOUNT"],
    warehouse=os.environ["SNOWFLAKE_WAREHOUSE"],
    database=os.environ["SNOWFLAKE_DATABASE"],
    role=os.environ["SNOWFLAKE_ROLE"]
)
cursor = conn.cursor()
try:
    # Identify negative distances
    cursor.execute("""
        SELECT COUNT(*)
        FROM ANALYTICS.OBT_TRIPS
        WHERE TRIP_DISTANCE < 0
    """)
    neg_distance = cursor.fetchone()[0]

    cursor.execute("SELECT COUNT(*) FROM ANALYTICS.OBT_TRIPS")
    total = cursor.fetchone()[0]

    # Guard the percentage against an empty table (avoids ZeroDivisionError)
    neg_distance_pct = neg_distance / total * 100 if total else 0.0

    print(f"\nTotal de registros: {total:,}")
    print(f"Distancias negativas: {neg_distance:,} ({neg_distance_pct:.4f}%)")

    # CLEANING: delete negative distances
    if neg_distance > 0:
        print("\nAPLICANDO LIMPIEZA: Eliminando registros con distancias negativas...")

        cursor.execute("""
            DELETE FROM ANALYTICS.OBT_TRIPS
            WHERE TRIP_DISTANCE < 0
        """)

        rows_deleted = cursor.rowcount
        conn.commit()

        print(f"Eliminados: {rows_deleted:,} registros.")

        # Record the deletion in the audit trail
        with open('/tmp/cleaning_audit.txt', 'a') as f:
            f.write(f"DELETED_NEG_DISTANCE={rows_deleted}\n")
    else:
        print("\nVALIDACIÓN EXITOSA: No hay distancias negativas")
finally:
    # Release the Snowflake session even if a statement fails
    cursor.close()
    conn.close()
print("=" * 80)


VALIDACIÓN 2: DISTANCIAS NEGATIVAS

Total de registros: 889,971,027
Distancias negativas: 30,971 (0.0035%)

APLICANDO LIMPIEZA: Eliminando registros con distancias negativas...
Eliminados: 30,971 registros.


In [6]:
# VALIDATION AND CLEANING 3 - Negative durations.
#
# Counts and deletes rows of ANALYTICS.OBT_TRIPS with TRIP_DURATION_MIN < 0
# and appends the number of deleted rows to the audit file as
# DELETED_NEG_DURATION.
#
# NOTE(review): the sample printed by the date-consistency check in this
# notebook shows TRIP_DURATION_MIN as None for rows whose pickup is after the
# dropoff, so `TRIP_DURATION_MIN < 0` may never match those rows -- confirm
# how the column is populated upstream.

print("\n" + "=" * 80)
print("VALIDACIÓN 3: DURACIONES NEGATIVAS")
print("=" * 80)

conn = snowflake.connector.connect(
    user=os.environ["SNOWFLAKE_USER"],
    password=os.environ["SNOWFLAKE_PASSWORD"],
    account=os.environ["SNOWFLAKE_ACCOUNT"],
    warehouse=os.environ["SNOWFLAKE_WAREHOUSE"],
    database=os.environ["SNOWFLAKE_DATABASE"],
    role=os.environ["SNOWFLAKE_ROLE"]
)
cursor = conn.cursor()
try:
    # Identify negative durations
    cursor.execute("""
        SELECT COUNT(*)
        FROM ANALYTICS.OBT_TRIPS
        WHERE TRIP_DURATION_MIN < 0
    """)
    neg_duration = cursor.fetchone()[0]

    cursor.execute("SELECT COUNT(*) FROM ANALYTICS.OBT_TRIPS")
    total = cursor.fetchone()[0]

    # Guard the percentage against an empty table (avoids ZeroDivisionError)
    neg_duration_pct = neg_duration / total * 100 if total else 0.0

    print(f"\nTotal de registros: {total:,}")
    print(f"Duraciones negativas: {neg_duration:,} ({neg_duration_pct:.4f}%)")

    # CLEANING: delete negative durations
    if neg_duration > 0:
        print("\nAPLICANDO LIMPIEZA: Eliminando registros con duraciones negativas...")

        cursor.execute("""
            DELETE FROM ANALYTICS.OBT_TRIPS
            WHERE TRIP_DURATION_MIN < 0
        """)

        rows_deleted = cursor.rowcount
        conn.commit()

        print(f"Eliminados: {rows_deleted:,} registros.")

        # Record the deletion in the audit trail
        with open('/tmp/cleaning_audit.txt', 'a') as f:
            f.write(f"DELETED_NEG_DURATION={rows_deleted}\n")
    else:
        print("\nVALIDACIÓN EXITOSA: No hay duraciones negativas.")
finally:
    # Release the Snowflake session even if a statement fails
    cursor.close()
    conn.close()
print("=" * 80)


VALIDACIÓN 3: DURACIONES NEGATIVAS

Total de registros: 889,940,056
Duraciones negativas: 0 (0.0000%)

VALIDACIÓN EXITOSA: No hay duraciones negativas.


In [7]:
# VALIDATION AND CLEANING 4 - Negative amounts.
#
# Counts and deletes rows of ANALYTICS.OBT_TRIPS with TOTAL_AMOUNT < 0 and
# appends the number of deleted rows to the audit file as DELETED_NEG_AMOUNT.

print("\n" + "=" * 80)
print("VALIDACIÓN 4: MONTOS NEGATIVOS")
print("=" * 80)

conn = snowflake.connector.connect(
    user=os.environ["SNOWFLAKE_USER"],
    password=os.environ["SNOWFLAKE_PASSWORD"],
    account=os.environ["SNOWFLAKE_ACCOUNT"],
    warehouse=os.environ["SNOWFLAKE_WAREHOUSE"],
    database=os.environ["SNOWFLAKE_DATABASE"],
    role=os.environ["SNOWFLAKE_ROLE"]
)
cursor = conn.cursor()
try:
    # Identify negative amounts
    print("\n--- Identificando montos negativos ---")
    cursor.execute("""
        SELECT COUNT(*)
        FROM ANALYTICS.OBT_TRIPS
        WHERE TOTAL_AMOUNT < 0
    """)
    neg_amount = cursor.fetchone()[0]

    cursor.execute("SELECT COUNT(*) FROM ANALYTICS.OBT_TRIPS")
    total = cursor.fetchone()[0]

    # Guard the percentage against an empty table (avoids ZeroDivisionError)
    neg_amount_pct = neg_amount / total * 100 if total else 0.0

    print(f"Total de registros: {total:,}")
    print(f"Montos negativos: {neg_amount:,} ({neg_amount_pct:.4f}%)")

    # CLEANING: delete negative amounts
    if neg_amount > 0:
        print("\nAPLICANDO LIMPIEZA: Eliminando registros con montos negativos...")

        cursor.execute("""
            DELETE FROM ANALYTICS.OBT_TRIPS
            WHERE TOTAL_AMOUNT < 0
        """)

        rows_deleted = cursor.rowcount
        conn.commit()

        print(f"Eliminados: {rows_deleted:,} registros.")

        # Record the deletion in the audit trail
        with open('/tmp/cleaning_audit.txt', 'a') as f:
            f.write(f"DELETED_NEG_AMOUNT={rows_deleted}\n")
    else:
        print("\nVALIDACIÓN EXITOSA: No hay montos negativos.")
finally:
    # Release the Snowflake session even if a statement fails
    cursor.close()
    conn.close()
print("=" * 80)


VALIDACIÓN 4: MONTOS NEGATIVOS

--- Identificando montos negativos ---
Total de registros: 889,940,056
Montos negativos: 2,648,359 (0.2976%)

APLICANDO LIMPIEZA: Eliminando registros con montos negativos...
Eliminados: 2,648,359 registros.


In [8]:
# VALIDATION AND CLEANING 5 - Inconsistent dates.
#
# Counts rows of ANALYTICS.OBT_TRIPS whose pickup timestamp is later than the
# dropoff timestamp, prints up to five examples, deletes them and appends the
# number of deleted rows to the audit file as DELETED_BAD_DATES.

print("\n" + "=" * 80)
print("VALIDACIÓN 5: FECHAS INCONSISTENTES (PICKUP > DROPOFF)")
print("=" * 80)

conn = snowflake.connector.connect(
    user=os.environ["SNOWFLAKE_USER"],
    password=os.environ["SNOWFLAKE_PASSWORD"],
    account=os.environ["SNOWFLAKE_ACCOUNT"],
    warehouse=os.environ["SNOWFLAKE_WAREHOUSE"],
    database=os.environ["SNOWFLAKE_DATABASE"],
    role=os.environ["SNOWFLAKE_ROLE"]
)
cursor = conn.cursor()
try:
    # Identify inconsistent dates
    print("\n--- Identificando fechas inconsistentes ---")
    cursor.execute("""
        SELECT COUNT(*)
        FROM ANALYTICS.OBT_TRIPS
        WHERE PICKUP_DATETIME > DROPOFF_DATETIME
    """)
    bad_dates = cursor.fetchone()[0]

    cursor.execute("SELECT COUNT(*) FROM ANALYTICS.OBT_TRIPS")
    total = cursor.fetchone()[0]

    # Guard the percentage against an empty table (avoids ZeroDivisionError)
    bad_dates_pct = bad_dates / total * 100 if total else 0.0

    print(f"Total de registros: {total:,}")
    print(f"Fechas inconsistentes: {bad_dates:,} ({bad_dates_pct:.4f}%)")

    if bad_dates > 0:
        # Show a few examples before they are removed. There is no ORDER BY,
        # so which five rows are shown is up to the engine.
        print("\n--- Muestra de registros con fechas inconsistentes ---")
        cursor.execute("""
            SELECT
                PICKUP_DATETIME,
                DROPOFF_DATETIME,
                TRIP_DURATION_MIN,
                SERVICE_TYPE
            FROM ANALYTICS.OBT_TRIPS
            WHERE PICKUP_DATETIME > DROPOFF_DATETIME
            LIMIT 5
        """)

        print("\nPickup | Dropoff | Duración | Servicio")
        print("-" * 90)
        for row in cursor.fetchall():
            print(f"{row[0]} | {row[1]} | {row[2]} min | {row[3]}")

        # CLEANING: delete rows with inconsistent dates
        print("\nAPLICANDO LIMPIEZA: Eliminando registros con fechas inconsistentes...")

        cursor.execute("""
            DELETE FROM ANALYTICS.OBT_TRIPS
            WHERE PICKUP_DATETIME > DROPOFF_DATETIME
        """)

        rows_deleted = cursor.rowcount
        conn.commit()

        print(f"Eliminados: {rows_deleted:,} registros.")

        # Record the deletion in the audit trail
        with open('/tmp/cleaning_audit.txt', 'a') as f:
            f.write(f"DELETED_BAD_DATES={rows_deleted}\n")
    else:
        print("\nVALIDACIÓN EXITOSA: No hay fechas inconsistentes.")
finally:
    # Release the Snowflake session even if a statement fails
    cursor.close()
    conn.close()
print("=" * 80)


VALIDACIÓN 5: FECHAS INCONSISTENTES (PICKUP > DROPOFF)

--- Identificando fechas inconsistentes ---
Total de registros: 887,291,697
Fechas inconsistentes: 87,985 (0.0099%)

--- Muestra de registros con fechas inconsistentes ---

Pickup | Dropoff | Duración | Servicio
------------------------------------------------------------------------------------------
2015-02-16 17:35:35 | 2015-02-16 17:34:51 | None min | yellow
2015-02-23 10:05:08 | 2015-02-23 10:04:21 | None min | yellow
2015-01-21 11:16:20 | 2015-01-21 11:15:47 | None min | yellow
2015-01-21 11:07:35 | 2015-01-21 11:06:54 | None min | yellow
2015-02-23 21:17:12 | 2015-02-23 21:16:20 | None min | yellow

APLICANDO LIMPIEZA: Eliminando registros con fechas inconsistentes...
Eliminados: 87,985 registros.


In [9]:
# VALIDATION AND CLEANING 6 - Negative numeric values.
#
# Reports, per numeric column, how many rows of ANALYTICS.OBT_TRIPS hold a
# negative value, deletes every row with at least one negative value and
# appends the number of deleted rows to the audit file as
# DELETED_NEGATIVE_VALUES.
import time

print("\n" + "=" * 80)
print("VALIDACIÓN 6: VALORES NUMÉRICOS NEGATIVOS")
print("=" * 80)

# Start the stopwatch
start_time = time.time()

# Column name -> label used in the report. This is the single source of truth
# for the per-column counts, the combined count and the DELETE predicate, so
# the three statements cannot drift apart.
non_negative_columns = {
    "TRIP_DISTANCE": "Trip Distance",
    "FARE_AMOUNT": "Fare Amount",
    "TIP_AMOUNT": "Tip Amount",
    "TOLLS_AMOUNT": "Tolls Amount",
    "TOTAL_AMOUNT": "Total Amount",
    "EXTRA": "Extra",
    "MTA_TAX": "MTA Tax",
    "IMPROVEMENT_SURCHARGE": "Improvement Surcharge",
    "CONGESTION_SURCHARGE": "Congestion Surcharge",
}

# The column names are constants defined just above (never external input),
# so interpolating them into the SQL text is safe.
per_column_counts = ",\n            ".join(
    f"SUM(CASE WHEN {column} < 0 THEN 1 ELSE 0 END) AS neg_{column.lower()}"
    for column in non_negative_columns
)
any_negative = "\n           OR ".join(
    f"{column} < 0" for column in non_negative_columns
)

conn = snowflake.connector.connect(
    user=os.environ["SNOWFLAKE_USER"],
    password=os.environ["SNOWFLAKE_PASSWORD"],
    account=os.environ["SNOWFLAKE_ACCOUNT"],
    warehouse=os.environ["SNOWFLAKE_WAREHOUSE"],
    database=os.environ["SNOWFLAKE_DATABASE"],
    role=os.environ["SNOWFLAKE_ROLE"]
)
cursor = conn.cursor()
try:
    # Identify negative values in each column (one table scan)
    print("\n--- Identificando valores negativos ---")
    cursor.execute(f"""
        SELECT
            {per_column_counts},
            COUNT(*) AS total
        FROM ANALYTICS.OBT_TRIPS
    """)

    # SUM() over an empty table returns NULL (None); normalise to 0 so the
    # arithmetic and the ',' number formatting below cannot fail.
    result = [value or 0 for value in cursor.fetchone()]
    total = result[-1]

    print(f"Total de registros: {total:,}")
    # zip() stops after the last label, leaving out the trailing COUNT(*)
    for label, count in zip(non_negative_columns.values(), result):
        share = count / total * 100 if total else 0.0
        print(f"  - {label} negativos: {count:,} ({share:.4f}%)")

    # Count rows with at least one negative value
    cursor.execute(f"""
        SELECT COUNT(*)
        FROM ANALYTICS.OBT_TRIPS
        WHERE {any_negative}
    """)
    neg_records = cursor.fetchone()[0]
    neg_records_pct = neg_records / total * 100 if total else 0.0

    print(f"\nTotal de registros con al menos un valor negativo: {neg_records:,} ({neg_records_pct:.4f}%)")

    # CLEANING: delete rows with negative values
    if neg_records > 0:
        print("\nAPLICANDO LIMPIEZA: Eliminando registros con valores negativos...")

        cursor.execute(f"""
            DELETE FROM ANALYTICS.OBT_TRIPS
            WHERE {any_negative}
        """)

        rows_deleted = cursor.rowcount
        conn.commit()

        print(f"Eliminados: {rows_deleted:,} registros.")

        # Record the deletion in the audit trail
        with open('/tmp/cleaning_audit.txt', 'a') as f:
            f.write(f"DELETED_NEGATIVE_VALUES={rows_deleted}\n")
    else:
        print("\nVALIDACIÓN EXITOSA: No hay valores negativos.")

    # Elapsed time
    elapsed_seconds = int(time.time() - start_time)
finally:
    # Release the Snowflake session even if a statement fails
    cursor.close()
    conn.close()

print(f"\nValidación completada en {elapsed_seconds} segundos.")
print("=" * 80)


VALIDACIÓN 6: VALORES NUMÉRICOS NEGATIVOS

--- Identificando valores negativos ---
Total de registros: 887,203,712
  - Trip Distance negativos: 0 (0.0000%)
  - Fare Amount negativos: 1,355,676 (0.1528%)
  - Tip Amount negativos: 49 (0.0000%)
  - Tolls Amount negativos: 9 (0.0000%)
  - Total Amount negativos: 0 (0.0000%)
  - Extra negativos: 3,372 (0.0004%)
  - MTA Tax negativos: 134 (0.0000%)
  - Improvement Surcharge negativos: 32 (0.0000%)
  - Congestion Surcharge negativos: 10 (0.0000%)

Total de registros con al menos un valor negativo: 1,359,194 (0.1532%)

APLICANDO LIMPIEZA: Eliminando registros con valores negativos...
Eliminados: 1,359,194 registros.

Validación completada en 274 segundos


In [10]:
# VALIDATION AND CLEANING 7 - Dates outside the accepted year range.
#
# Reports how many rows of ANALYTICS.OBT_TRIPS have a pickup or dropoff year
# outside [MIN_YEAR, MAX_YEAR], deletes them and appends the number of deleted
# rows to the audit file as DELETED_INVALID_DATES.
import time

# Inclusive range of accepted years. Defined once so that the SQL predicates
# and the printed messages always agree.
MIN_YEAR = 2015
MAX_YEAR = 2025

print("\n" + "=" * 80)
print(f"VALIDACIÓN 7: FECHAS FUERA DEL RANGO {MIN_YEAR}-{MAX_YEAR}")
print("=" * 80)

# Start the stopwatch
start_time = time.time()

# The bounds are integer constants defined just above (never external
# input), so interpolating them into the SQL text is safe.
pickup_out_of_range = (
    f"YEAR(PICKUP_DATETIME) < {MIN_YEAR} OR YEAR(PICKUP_DATETIME) > {MAX_YEAR}"
)
dropoff_out_of_range = (
    f"YEAR(DROPOFF_DATETIME) < {MIN_YEAR} OR YEAR(DROPOFF_DATETIME) > {MAX_YEAR}"
)
any_out_of_range = f"{pickup_out_of_range} OR {dropoff_out_of_range}"

conn = snowflake.connector.connect(
    user=os.environ["SNOWFLAKE_USER"],
    password=os.environ["SNOWFLAKE_PASSWORD"],
    account=os.environ["SNOWFLAKE_ACCOUNT"],
    warehouse=os.environ["SNOWFLAKE_WAREHOUSE"],
    database=os.environ["SNOWFLAKE_DATABASE"],
    role=os.environ["SNOWFLAKE_ROLE"]
)
cursor = conn.cursor()
try:
    # Identify rows with dates outside the range
    print(f"\n--- Identificando fechas fuera del rango {MIN_YEAR}-{MAX_YEAR} ---")
    cursor.execute(f"""
        SELECT
            COUNT(*) AS total_records,
            SUM(CASE WHEN {pickup_out_of_range} THEN 1 ELSE 0 END) AS invalid_pickup,
            SUM(CASE WHEN {dropoff_out_of_range} THEN 1 ELSE 0 END) AS invalid_dropoff,
            MIN(YEAR(PICKUP_DATETIME)) AS min_pickup_year,
            MAX(YEAR(PICKUP_DATETIME)) AS max_pickup_year,
            MIN(YEAR(DROPOFF_DATETIME)) AS min_dropoff_year,
            MAX(YEAR(DROPOFF_DATETIME)) AS max_dropoff_year
        FROM ANALYTICS.OBT_TRIPS
    """)

    result = cursor.fetchone()
    total = result[0]
    # SUM() over an empty table returns NULL (None); normalise to 0
    invalid_pickup = result[1] or 0
    invalid_dropoff = result[2] or 0

    # Guard the percentages against an empty table
    invalid_pickup_pct = invalid_pickup / total * 100 if total else 0.0
    invalid_dropoff_pct = invalid_dropoff / total * 100 if total else 0.0

    print(f"Total de registros: {total:,}")
    print(f"Rango de años PICKUP_DATETIME: {result[3]} - {result[4]}")
    print(f"Rango de años DROPOFF_DATETIME: {result[5]} - {result[6]}")
    print(f"\nRegistros con PICKUP_DATETIME fuera del rango: {invalid_pickup:,} ({invalid_pickup_pct:.4f}%)")
    print(f"Registros con DROPOFF_DATETIME fuera del rango: {invalid_dropoff:,} ({invalid_dropoff_pct:.4f}%)")

    # Count rows where either date is outside the range
    cursor.execute(f"""
        SELECT COUNT(*)
        FROM ANALYTICS.OBT_TRIPS
        WHERE {any_out_of_range}
    """)
    invalid_records = cursor.fetchone()[0]
    invalid_records_pct = invalid_records / total * 100 if total else 0.0

    print(f"\nTotal de registros con fechas fuera del rango: {invalid_records:,} ({invalid_records_pct:.4f}%)")

    # CLEANING: delete rows with dates outside the range
    if invalid_records > 0:
        print(f"\nAPLICANDO LIMPIEZA: Eliminando registros con fechas fuera del rango {MIN_YEAR}-{MAX_YEAR}...")

        cursor.execute(f"""
            DELETE FROM ANALYTICS.OBT_TRIPS
            WHERE {any_out_of_range}
        """)

        rows_deleted = cursor.rowcount
        conn.commit()

        print(f"Eliminados: {rows_deleted:,} registros.")

        # Record the deletion in the audit trail
        with open('/tmp/cleaning_audit.txt', 'a') as f:
            f.write(f"DELETED_INVALID_DATES={rows_deleted}\n")

        # Re-check the year range after the cleanup
        cursor.execute("""
            SELECT
                MIN(YEAR(PICKUP_DATETIME)) AS min_pickup_year,
                MAX(YEAR(PICKUP_DATETIME)) AS max_pickup_year,
                MIN(YEAR(DROPOFF_DATETIME)) AS min_dropoff_year,
                MAX(YEAR(DROPOFF_DATETIME)) AS max_dropoff_year
            FROM ANALYTICS.OBT_TRIPS
        """)

        new_range = cursor.fetchone()
        print(f"\nNuevo rango de años PICKUP_DATETIME: {new_range[0]} - {new_range[1]}")
        print(f"Nuevo rango de años DROPOFF_DATETIME: {new_range[2]} - {new_range[3]}")
    else:
        print(f"\nVALIDACIÓN EXITOSA: Todas las fechas están dentro del rango {MIN_YEAR}-{MAX_YEAR}.")

    # Elapsed time
    elapsed_seconds = int(time.time() - start_time)
finally:
    # Release the Snowflake session even if a statement fails
    cursor.close()
    conn.close()

print(f"\nValidación completada en {elapsed_seconds} segundos.")
print("=" * 80)


VALIDACIÓN 7: FECHAS FUERA DEL RANGO 2015-2025

--- Identificando fechas fuera del rango 2015-2025 ---
Total de registros: 885,844,518
Rango de años PICKUP_DATETIME: 2001 - 2098
Rango de años DROPOFF_DATETIME: 2001 - 2253

Registros con PICKUP_DATETIME fuera del rango: 3,452 (0.0004%)
Registros con DROPOFF_DATETIME fuera del rango: 3,335 (0.0004%)

Total de registros con fechas fuera del rango: 3,464 (0.0004%)

APLICANDO LIMPIEZA: Eliminando registros con fechas fuera del rango 2015-2025...
Eliminados: 3,464 registros.

Nuevo rango de años PICKUP_DATETIME: 2015 - 2025
Nuevo rango de años DROPOFF_DATETIME: 2015 - 2025

Validación completada en 336 segundos.


In [14]:
# FINAL AUDIT - table state AFTER cleaning.
#
# Reads the audit file written by the earlier cells, prints how many rows
# each rule deleted, compares the row count before and after, re-checks a set
# of quality rules on ANALYTICS.OBT_TRIPS and prints descriptive statistics.
# AFTER_CLEANING and TOTAL_DELETED are appended to the audit file.

import time

print("\n" + "=" * 80)
print("AUDITORÍA FINAL - ESTADO DESPUÉS DE LIMPIEZA")
print("=" * 80)

# Start the stopwatch
start_time = time.time()

conn = snowflake.connector.connect(
    user=os.environ["SNOWFLAKE_USER"],
    password=os.environ["SNOWFLAKE_PASSWORD"],
    account=os.environ["SNOWFLAKE_ACCOUNT"],
    warehouse=os.environ["SNOWFLAKE_WAREHOUSE"],
    database=os.environ["SNOWFLAKE_DATABASE"],
    role=os.environ["SNOWFLAKE_ROLE"]
)
cursor = conn.cursor()
try:
    # Final row count AFTER cleaning
    cursor.execute("SELECT COUNT(*) FROM ANALYTICS.OBT_TRIPS")
    total_after_cleaning = cursor.fetchone()[0]

    # Load the audit trail (KEY=integer per line). When a key appears more
    # than once (a cell was re-run) the last value wins.
    audit_data = {}
    try:
        with open('/tmp/cleaning_audit.txt', 'r') as f:
            for line in f:
                # partition() splits on the first '=' only, so a stray '='
                # in the value cannot break the unpacking
                key, separator, value = line.strip().partition('=')
                if not separator:
                    continue
                try:
                    audit_data[key] = int(value)
                except ValueError:
                    # A malformed line must not abort the whole audit
                    print(f"ADVERTENCIA: Línea de auditoría ignorada: {line.strip()!r}")
    except FileNotFoundError:
        print("ADVERTENCIA: No se encontró archivo de auditoría.")

    # Audit values; a rule that deleted nothing wrote no entry, hence the 0
    total_before = audit_data.get('BEFORE_CLEANING', 0)
    deleted_nulls = audit_data.get('DELETED_NULLS', 0)
    deleted_neg_dist = audit_data.get('DELETED_NEG_DISTANCE', 0)
    deleted_neg_dur = audit_data.get('DELETED_NEG_DURATION', 0)
    deleted_neg_amt = audit_data.get('DELETED_NEG_AMOUNT', 0)
    deleted_bad_dates = audit_data.get('DELETED_BAD_DATES', 0)
    deleted_negative_values = audit_data.get('DELETED_NEGATIVE_VALUES', 0)
    deleted_invalid_dates = audit_data.get('DELETED_INVALID_DATES', 0)

    # Total deleted according to the before/after row counts
    total_deleted = total_before - total_after_cleaning if total_before > 0 else 0

    # Total deleted according to the per-rule audit entries
    deleted_by_rules = (
        deleted_nulls
        + deleted_neg_dist
        + deleted_neg_dur
        + deleted_neg_amt
        + deleted_bad_dates
        + deleted_negative_values
        + deleted_invalid_dates
    )

    # Summary
    print(f"\n{'='*80}")
    print("RESUMEN DE LIMPIEZA")
    print(f"{'='*80}")
    print(f"\nREGISTROS ANTES DE LIMPIEZA: {total_before:,}")
    print("\nREGISTROS ELIMINADOS POR REGLA:")
    print(f"  1. Nulos en campos críticos:          {deleted_nulls:,}")
    print(f"  2. Distancias negativas:               {deleted_neg_dist:,}")
    print(f"  3. Duraciones negativas:               {deleted_neg_dur:,}")
    print(f"  4. Montos negativos (TOTAL_AMOUNT):    {deleted_neg_amt:,}")
    print(f"  5. Fechas inconsistentes (PU > DO):    {deleted_bad_dates:,}")
    print(f"  6. Valores numéricos negativos:        {deleted_negative_values:,}")
    print(f"  7. Fechas fuera del rango 2015-2025:   {deleted_invalid_dates:,}")

    # Separator
    print(f"\n{'-'*80}")

    # Totals
    if total_before > 0:
        pct_deleted = (total_deleted / total_before) * 100
        pct_remaining = (total_after_cleaning / total_before) * 100

        print(f"TOTAL ELIMINADO:                       {total_deleted:,} ({pct_deleted:.4f}%)")
        print(f"REGISTROS DESPUÉS DE LIMPIEZA:         {total_after_cleaning:,} ({pct_remaining:.2f}%)")

        # The two totals only agree when every cleaning cell ran exactly
        # once against the recorded baseline; flag any discrepancy.
        if deleted_by_rules != total_deleted:
            print(
                f"ADVERTENCIA: La suma por regla ({deleted_by_rules:,}) "
                f"no coincide con el total eliminado ({total_deleted:,})."
            )
    else:
        print("ADVERTENCIA: No se encontró el total antes de limpieza.")
        print(f"REGISTROS DESPUÉS DE LIMPIEZA:         {total_after_cleaning:,}")

    # Re-check data quality after cleaning
    print(f"\n{'='*80}")
    print("VERIFICACIÓN DE CALIDAD POST-LIMPIEZA")
    print(f"{'='*80}")

    cursor.execute("""
        SELECT
            SUM(CASE WHEN PICKUP_DATETIME IS NULL THEN 1 ELSE 0 END) AS null_pickup,
            SUM(CASE WHEN DROPOFF_DATETIME IS NULL THEN 1 ELSE 0 END) AS null_dropoff,
            SUM(CASE WHEN TRIP_DISTANCE < 0 THEN 1 ELSE 0 END) AS neg_distance,
            SUM(CASE WHEN TRIP_DURATION_MIN < 0 THEN 1 ELSE 0 END) AS neg_duration,
            SUM(CASE WHEN TOTAL_AMOUNT < 0 THEN 1 ELSE 0 END) AS neg_amount,
            SUM(CASE WHEN PICKUP_DATETIME > DROPOFF_DATETIME THEN 1 ELSE 0 END) AS bad_dates,
            MIN(YEAR(PICKUP_DATETIME)) AS min_year,
            MAX(YEAR(PICKUP_DATETIME)) AS max_year,
            COUNT(*) AS total
        FROM ANALYTICS.OBT_TRIPS
    """)

    verification = cursor.fetchone()
    # SUM() over an empty table returns NULL (None); normalise the six
    # counters to 0 so the ',' number formatting cannot fail.
    violation_counts = [value or 0 for value in verification[:6]]

    print("\nVerificación de nulos y valores inválidos:")
    print(f"  - Nulos en PICKUP_DATETIME:            {violation_counts[0]:,}")
    print(f"  - Nulos en DROPOFF_DATETIME:           {violation_counts[1]:,}")
    print(f"  - Distancias negativas:                {violation_counts[2]:,}")
    print(f"  - Duraciones negativas:                {violation_counts[3]:,}")
    print(f"  - Montos negativos:                    {violation_counts[4]:,}")
    print(f"  - Fechas inconsistentes:               {violation_counts[5]:,}")
    print(f"  - Rango de años:                       {verification[6]} - {verification[7]}")

    # Descriptive statistics
    print(f"\n{'='*80}")
    print("ESTADÍSTICAS DESCRIPTIVAS")
    print(f"{'='*80}")

    cursor.execute("""
        SELECT
            ROUND(AVG(TRIP_DISTANCE), 2) AS avg_distance,
            ROUND(AVG(TRIP_DURATION_MIN), 2) AS avg_duration,
            ROUND(AVG(TOTAL_AMOUNT), 2) AS avg_amount,
            ROUND(AVG(PASSENGER_COUNT), 2) AS avg_passengers,
            COUNT(DISTINCT SERVICE_TYPE) AS service_types,
            COUNT(DISTINCT PU_BOROUGH) AS pu_boroughs,
            COUNT(DISTINCT DO_BOROUGH) AS do_boroughs
        FROM ANALYTICS.OBT_TRIPS
    """)

    stats = cursor.fetchone()

    print("\nPromedios:")
    print(f"  - Distancia promedio:                  {stats[0]} millas")
    print(f"  - Duración promedio:                   {stats[1]} minutos")
    print(f"  - Monto promedio:                      ${stats[2]}")
    # avg_passengers was queried but previously never reported
    print(f"  - Pasajeros promedio:                  {stats[3]}")

    print("\nVariedad:")
    print(f"  - Tipos de servicio:                   {stats[4]}")
    print(f"  - Boroughs de origen:                  {stats[5]}")
    print(f"  - Boroughs de destino:                 {stats[6]}")

    # Append the final result to the audit trail
    with open('/tmp/cleaning_audit.txt', 'a') as f:
        f.write(f"AFTER_CLEANING={total_after_cleaning}\n")
        f.write(f"TOTAL_DELETED={total_deleted}\n")

    # Elapsed time
    elapsed_seconds = int(time.time() - start_time)
finally:
    # Release the Snowflake session even if a statement fails
    cursor.close()
    conn.close()

print(f"\n{'='*80}")
print(f"Auditoría completada en {elapsed_seconds} segundos.")
print(f"{'='*80}")

print("\n" + "=" * 80)
print("NOTEBOOK 04_VALIDACIONES_Y_EXPLORACION.IPYNB COMPLETADO")
print("=" * 80)


AUDITORÍA FINAL - ESTADO DESPUÉS DE LIMPIEZA

RESUMEN DE LIMPIEZA

REGISTROS ANTES DE LIMPIEZA: 889,971,027

REGISTROS ELIMINADOS POR REGLA:
  1. Nulos en campos críticos:          0
  2. Distancias negativas:               30,971
  3. Duraciones negativas:               0
  4. Montos negativos (TOTAL_AMOUNT):    2,648,359
  5. Fechas inconsistentes (PU > DO):    87,985
  6. Valores numéricos negativos:        1,359,194
  7. Fechas fuera del rango 2015-2025:   3,464

--------------------------------------------------------------------------------
TOTAL ELIMINADO:                       4,129,973 (0.4641%)
REGISTROS DESPUÉS DE LIMPIEZA:         885,841,054 (99.54%)

VERIFICACIÓN DE CALIDAD POST-LIMPIEZA

Verificación de nulos y valores inválidos:
  - Nulos en PICKUP_DATETIME:            0
  - Nulos en DROPOFF_DATETIME:           0
  - Distancias negativas:                0
  - Duraciones negativas:                0
  - Montos negativos:                    0
  - Fechas inconsistentes:   