In [1]:
# Instalar las librerías necesarias.

!pip install snowflake-connector-python pyarrow requests pandas matplotlib seaborn
print("=" * 80)
print("Paquetes instalados correctamente.")
print("=" * 80)

Collecting snowflake-connector-python
  Downloading snowflake_connector_python-4.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m936.9 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Collecting asn1crypto<2.0.0,>0.24.0 (from snowflake-connector-python)
  Downloading asn1crypto-1.5.1-py2.py3-none-any.whl.metadata (13 kB)
Collecting filelock<4,>=3.5 (from snowflake-connector-python)
  Downloading filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting tomlkit (from snowflake-connector-python)
  Downloading tomlkit-0.13.3-py3-none-any.whl.metadata (2.8 kB)
Collecting boto3>=1.24 (from snowflake-connector-python)
  Downloading boto3-1.40.55-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore>=1.24 (from snowflake-connector-python)
  Downloading botocore-1.40.55-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3>=1.24->snowflake-connector

In [2]:
# Verificar que todas las variables de ambiente necesarias estén configuradas.

import os
import requests
from io import StringIO
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
import snowflake.connector

print("=" * 80)
print("CONFIGURACIÓN DE AMBIENTE - PROYECTO 3")
print("=" * 80)

# Variables obligatorias
required_vars = [
    'SNOWFLAKE_ACCOUNT',
    'SNOWFLAKE_DATABASE',
    'SNOWFLAKE_SCHEMA_RAW',
    'SNOWFLAKE_SCHEMA_ANALYTICS',
    'SNOWFLAKE_WAREHOUSE',
    'SNOWFLAKE_USER',
    'SNOWFLAKE_PASSWORD',
    'SNOWFLAKE_ROLE',
    'TAXI_ZONE_URL'
]

# Verificar que todas las variables existan
missing_vars = [var for var in required_vars if not os.getenv(var)]
if missing_vars:
    print(f"ERROR: Faltan variables de ambiente: {', '.join(missing_vars)}")
    print("Por favor configura tu archivo .env correctamente.")
else:
    print("Todas las variables de ambiente requeridas están configuradas.")

print("=" * 80)

CONFIGURACIÓN DE AMBIENTE - PROYECTO 3
Todas las variables de ambiente requeridas están configuradas.


In [3]:
# Crear Spark Session.

spark = (
    SparkSession.builder
    .appName("P3 - Data Analysis - Anahi Andrade")
    
    .config("spark.sql.timestampType", "TIMESTAMP_LTZ")
    .config("spark.sql.session.timeZone", "UTC")
    
    .config(
        "spark.jars.packages",
        "net.snowflake:snowflake-jdbc:3.13.33,"
        "net.snowflake:spark-snowflake_2.12:2.9.3-spark_3.1"
    )
    
    .config("spark.sql.parquet.enableVectorizedReader", "false")
    
    .getOrCreate()
)

print("=" * 80)
print("SPARK SESSION CREADA EXITOSAMENTE")
print("=" * 80)
print(f"Spark Version: {spark.version}")
print(f"Spark UI disponible en: http://localhost:4040")
print(f"Timezone: {spark.conf.get('spark.sql.session.timeZone')}")
print("=" * 80)

SPARK SESSION CREADA EXITOSAMENTE
Spark Version: 3.5.0
Spark UI disponible en: http://localhost:4040
Timezone: UTC


In [4]:
# Definir opciones de conexión a Snowflake para lectura/escritura con Spark.

snowflake_options = {
    "sfURL": f"{os.getenv('SNOWFLAKE_ACCOUNT')}.snowflakecomputing.com",
    "sfUser": os.getenv("SNOWFLAKE_USER"),
    "sfPassword": os.getenv("SNOWFLAKE_PASSWORD"),
    "sfDatabase": os.getenv("SNOWFLAKE_DATABASE"),
    "sfSchema": os.getenv("SNOWFLAKE_SCHEMA_ANALYTICS"),
    "sfWarehouse": os.getenv("SNOWFLAKE_WAREHOUSE"),
    "sfRole": os.getenv("SNOWFLAKE_ROLE")
}

print("=" * 80)
print("Configuración Snowflake lista para usar.")
print("=" * 80)

Configuración Snowflake lista para usar.


In [5]:
# Cargar tabla OBT_TRIPS en Spark DataFrame.

import time

print("\n" + "=" * 80)
print("CARGANDO TABLA ANALYTICS.OBT_TRIPS EN SPARK")
print("=" * 80)

# Inicio del cronómetro
start_time = time.time()

df = (spark.read
      .format("snowflake")
      .options(**snowflake_options)
      .option("dbtable", "OBT_TRIPS")
      .load())

# Cache para mejorar performance de queries repetidas
df.cache()

# Calcular tiempo transcurrido
elapsed_seconds = int(time.time() - start_time)

# Verificar carga
print(f"\nTabla cargada exitosamente en {elapsed_seconds} segundos.")

# Mostrar schema
print("\n--- Schema de OBT_TRIPS ---")
df.printSchema()

print("=" * 80)


CARGANDO TABLA ANALYTICS.OBT_TRIPS EN SPARK

Tabla cargada exitosamente en 9 segundos.

--- Schema de OBT_TRIPS ---
root
 |-- TRIP_ID: string (nullable = true)
 |-- PICKUP_DATETIME: timestamp (nullable = true)
 |-- DROPOFF_DATETIME: timestamp (nullable = true)
 |-- PICKUP_DATE: date (nullable = true)
 |-- PICKUP_HOUR: decimal(38,0) (nullable = true)
 |-- DROPOFF_DATE: date (nullable = true)
 |-- DROPOFF_HOUR: decimal(38,0) (nullable = true)
 |-- DAY_OF_WEEK: decimal(38,0) (nullable = true)
 |-- MONTH: decimal(38,0) (nullable = true)
 |-- YEAR: decimal(38,0) (nullable = true)
 |-- PU_LOCATION_ID: decimal(38,0) (nullable = true)
 |-- PU_ZONE: string (nullable = true)
 |-- PU_BOROUGH: string (nullable = true)
 |-- DO_LOCATION_ID: decimal(38,0) (nullable = true)
 |-- DO_ZONE: string (nullable = true)
 |-- DO_BOROUGH: string (nullable = true)
 |-- SERVICE_TYPE: string (nullable = true)
 |-- VENDOR_ID: decimal(38,0) (nullable = true)
 |-- VENDOR_NAME: string (nullable = true)
 |-- RATE_CODE

In [6]:
# Función de ayuda para ejecutar queries en Snowflake.

def query_snowflake(sql_query):
    
    # Ejecuta una query SQL directamente en Snowflake y retorna Spark DataFrame.
    # Esto es MÁS EFICIENTE que hacer groupBy/agg sobre el DataFrame completo.
    
    return (spark.read
            .format("snowflake")
            .options(**snowflake_options)
            .option("query", sql_query)
            .load())
    
print("=" * 80)
print("Función query_snowflake() lista para usar.")
print("=" * 80)

Función query_snowflake() lista para usar.


In [7]:
# PREGUNTA A - Top 10 zonas de pickup por volumen mensual.

print("\n" + "=" * 80)
print("PREGUNTA A: Top 10 zonas de pickup por volumen mensual")
print("=" * 80)

query1 = """
    SELECT 
        YEAR,
        MONTH,
        PU_ZONE,
        PU_BOROUGH,
        COUNT(*) as TOTAL_VIAJES,
        AVG(TOTAL_AMOUNT) as AVG_AMOUNT
    FROM OBT_TRIPS
    WHERE PU_ZONE IS NOT NULL
    GROUP BY YEAR, MONTH, PU_ZONE, PU_BOROUGH
    ORDER BY TOTAL_VIAJES DESC
    LIMIT 10
"""

top_pickup = query_snowflake(query1)

top_pickup.show(10, truncate=False)

# Convertir a Pandas (ahora con nombres en MAYÚSCULAS)
top_pickup_pd = top_pickup.toPandas()

print("\nINTERPRETACIÓN:")
if len(top_pickup_pd) > 0:
    print(f"Zona #1: {top_pickup_pd.iloc[0]['PU_ZONE']}")
    print(f" - Borough: {top_pickup_pd.iloc[0]['PU_BOROUGH']}")
    print(f" - Total viajes: {int(top_pickup_pd.iloc[0]['TOTAL_VIAJES']):,}")
    print(f" - Año-Mes: {int(top_pickup_pd.iloc[0]['YEAR'])}-{int(top_pickup_pd.iloc[0]['MONTH']):02d}")
    print(f" - Tarifa promedio: ${top_pickup_pd.iloc[0]['AVG_AMOUNT']:.2f}")

print("=" * 80)


PREGUNTA A: Top 10 zonas de pickup por volumen mensual
+----+-----+-------------------------+----------+------------+------------------+
|YEAR|MONTH|PU_ZONE                  |PU_BOROUGH|TOTAL_VIAJES|AVG_AMOUNT        |
+----+-----+-------------------------+----------+------------+------------------+
|2015|4    |Upper East Side South    |Manhattan |995180      |12.189851363572421|
|2015|3    |Upper East Side South    |Manhattan |946637      |12.055959781838235|
|2015|3    |Midtown Center           |Manhattan |928067      |14.35484416534582 |
|2015|4    |Midtown Center           |Manhattan |921622      |14.567999852434077|
|2015|4    |Upper East Side North    |Manhattan |912114      |12.70150288231515 |
|2015|3    |Times Sq/Theatre District|Manhattan |883836      |16.612257364488435|
|2015|3    |Midtown East             |Manhattan |882996      |14.543188032561869|
|2015|3    |Murray Hill              |Manhattan |879990      |13.963641905021648|
|2015|3    |East Village             |Manh

In [8]:
# PREGUNTA B - Top 10 zonas de dropoff.

print("\n" + "=" * 80)
print("PREGUNTA B: Top 10 zonas de dropoff por volumen mensual")
print("=" * 80)

query2 = """
    SELECT 
        YEAR,
        MONTH,
        DO_ZONE,
        DO_BOROUGH,
        COUNT(*) as TOTAL_VIAJES,
        AVG(TOTAL_AMOUNT) as AVG_AMOUNT
    FROM OBT_TRIPS
    WHERE DO_ZONE IS NOT NULL
    GROUP BY YEAR, MONTH, DO_ZONE, DO_BOROUGH
    ORDER BY TOTAL_VIAJES DESC
    LIMIT 10
"""

top_dropoff = query_snowflake(query2)

top_dropoff.show(10, truncate=False)

top_dropoff_pd = top_dropoff.toPandas()
print("\nINTERPRETACIÓN:")
if len(top_dropoff_pd) > 0:
    print(f"Zona #1: {top_dropoff_pd.iloc[0]['DO_ZONE']}")
    print(f" - Borough: {top_dropoff_pd.iloc[0]['DO_BOROUGH']}")
    print(f" - Viajes: {int(top_dropoff_pd.iloc[0]['TOTAL_VIAJES']):,}")

print("=" * 80)


PREGUNTA B: Top 10 zonas de dropoff por volumen mensual
+----+-----+-------------------------+----------+------------+------------------+
|YEAR|MONTH|DO_ZONE                  |DO_BOROUGH|TOTAL_VIAJES|AVG_AMOUNT        |
+----+-----+-------------------------+----------+------------+------------------+
|2015|3    |Midtown Center           |Manhattan |1006642     |14.837134582105655|
|2015|4    |Midtown Center           |Manhattan |997294      |14.078700714132443|
|2015|4    |Upper East Side North    |Manhattan |918995      |11.80604877066796 |
|2015|3    |Upper East Side North    |Manhattan |875294      |12.24354340370207 |
|2015|4    |Upper East Side South    |Manhattan |870599      |11.786136108587305|
|2015|3    |Murray Hill              |Manhattan |857105      |13.286592914520392|
|2016|6    |Midtown Center           |Manhattan |851819      |14.685192488075518|
|2015|3    |Times Sq/Theatre District|Manhattan |840506      |15.406214554090038|
|2015|4    |Murray Hill              |Man

In [9]:
# PREGUNTA C - Evolución mensual por borough.

print("\n" + "=" * 80)
print("PREGUNTA C: Evolución mensual de total_amount y tip_pct por borough")
print("=" * 80)

query3 = """
    SELECT 
        YEAR,
        MONTH,
        PU_BOROUGH,
        AVG(TOTAL_AMOUNT) as AVG_TOTAL_AMOUNT,
        AVG(TIP_PCT) as AVG_TIP_PCT,
        COUNT(*) as TOTAL_VIAJES
    FROM OBT_TRIPS
    WHERE PU_BOROUGH IS NOT NULL
    GROUP BY YEAR, MONTH, PU_BOROUGH
    ORDER BY PU_BOROUGH, YEAR DESC, MONTH DESC
    LIMIT 20
"""

evolucion = query_snowflake(query3)
print("\n--- Evolución mensual por borough (primeras 20 filas) ---")
evolucion.show(20, truncate=False)

# Análisis específico Manhattan
query3b = """
    SELECT 
        YEAR, 
        MONTH,
        AVG(TOTAL_AMOUNT) as AVG_AMOUNT,
        AVG(TIP_PCT) as AVG_TIP
    FROM OBT_TRIPS
    WHERE PU_BOROUGH = 'Manhattan'
    GROUP BY YEAR, MONTH
    ORDER BY YEAR DESC, MONTH DESC
    LIMIT 12
"""

manhattan = query_snowflake(query3b)
manhattan_pd = manhattan.toPandas()

print("\nINTERPRETACIÓN (Manhattan últimos 12 meses):")
if len(manhattan_pd) > 0:
    print(f" - Tarifa máxima: ${manhattan_pd['AVG_AMOUNT'].max():.2f}")
    print(f" - Tarifa mínima: ${manhattan_pd['AVG_AMOUNT'].min():.2f}")
    print(f" - Propina máxima: {manhattan_pd['AVG_TIP'].max():.2f}%")

print("\nCONCLUSIÓN:")
print(f" - Variación de tarifas en Manhattan: ${manhattan_pd['AVG_AMOUNT'].max() - manhattan_pd['AVG_AMOUNT'].min():.2f}")

print("=" * 80)


PREGUNTA C: Evolución mensual de total_amount y tip_pct por borough

--- Evolución mensual por borough (primeras 20 filas) ---
+----+-----+----------+------------------+------------------+------------+
|YEAR|MONTH|PU_BOROUGH|AVG_TOTAL_AMOUNT  |AVG_TIP_PCT       |TOTAL_VIAJES|
+----+-----+----------+------------------+------------------+------------+
|2025|8    |Bronx     |29.919462406856628|1.3151114979058132|31269       |
|2025|7    |Bronx     |30.472783496085157|5.25028880433244  |34101       |
|2025|6    |Bronx     |32.00182554909653 |0.957949427906668 |34921       |
|2025|5    |Bronx     |30.533609405208686|0.7793936073033234|35172       |
|2025|4    |Bronx     |30.450307520173595|0.7875144608503142|29494       |
|2025|3    |Bronx     |30.82682220891948 |0.6516897584632311|33410       |
|2025|2    |Bronx     |31.229503987089423|0.608263894619218 |21068       |
|2025|1    |Bronx     |33.38525913043478 |0.9941069744720872|14375       |
|2024|12   |Bronx     |35.32811230175739 |2.461

In [10]:
# PREGUNTA D - Ticket promedio por service_type y mes.
import time

print("\n" + "=" * 80)
print("PREGUNTA D: Ticket promedio (avg total_amount) por service_type y mes")
print("=" * 80)

query4 = """
    SELECT 
        SERVICE_TYPE,
        YEAR,
        MONTH,
        AVG(TOTAL_AMOUNT) as AVG_TICKET,
        MIN(TOTAL_AMOUNT) as MIN_TICKET,
        MAX(TOTAL_AMOUNT) as MAX_TICKET,
        COUNT(*) as TOTAL_VIAJES
    FROM OBT_TRIPS
    GROUP BY SERVICE_TYPE, YEAR, MONTH
    ORDER BY SERVICE_TYPE, YEAR, MONTH
"""

resultado = query_snowflake(query4)

print("\n--- Ticket promedio por service_type y mes ---")
resultado.show(100, truncate=False)

# Convertir a pandas para mejor análisis
resultado_pd = resultado.toPandas()

print("\nINTERPRETACIÓN:")
for service in resultado_pd['SERVICE_TYPE'].unique():
    subset = resultado_pd[resultado_pd['SERVICE_TYPE'] == service]
    print(f"\n{service.upper()}:")
    print(f" - Ticket promedio general: ${subset['AVG_TICKET'].mean():.2f}")
    print(f" - Ticket mínimo mensual: ${subset['AVG_TICKET'].min():.2f}")
    print(f" - Ticket máximo mensual: ${subset['AVG_TICKET'].max():.2f}")
    print(f" - Total viajes: {int(subset['TOTAL_VIAJES'].sum()):,}")
    print(f" - Meses analizados: {len(subset)}")

print("\nCONCLUSIÓN:")
print("- YELLOW tiene ticket 5.9% mayor ($20.76 vs $19.61). Ambos servicios duplicaron precios desde 2015, con picos en pandemia. YELLOW domina volumen (12x más viajes).")
print("=" * 80)


PREGUNTA D: Ticket promedio (avg total_amount) por service_type y mes

--- Ticket promedio por service_type y mes ---
+------------+----+-----+------------------+----------+----------+------------+
|SERVICE_TYPE|YEAR|MONTH|AVG_TICKET        |MIN_TICKET|MAX_TICKET|TOTAL_VIAJES|
+------------+----+-----+------------------+----------+----------+------------+
|green       |2015|1    |14.817828474082965|0.0       |989970.39 |1506268     |
|green       |2015|2    |14.524868218897932|0.0       |3532.1    |1572532     |
|green       |2015|3    |14.611900800933892|0.0       |5660.1    |1720117     |
|green       |2015|4    |14.853505500707593|0.0       |4035.46   |1661968     |
|green       |2015|5    |15.289401328761397|0.0       |8011.3    |1784218     |
|green       |2015|6    |14.998880924822993|0.0       |1229.8    |1636378     |
|green       |2015|7    |14.952051649387613|0.0       |5793.14   |1539147     |
|green       |2015|8    |14.9739898537945  |0.0       |6701.55   |1529833     |
|

In [23]:
# PREGUNTA E - Picos por hora y día.

print("\n" + "=" * 80)
print("PREGUNTA E: Viajes por hora y día de semana (picos)")
print("=" * 80)

# Por hora
query5a = """
    SELECT 
        PICKUP_HOUR,
        COUNT(*) as TOTAL_VIAJES
    FROM OBT_TRIPS
    WHERE PICKUP_HOUR IS NOT NULL
    GROUP BY PICKUP_HOUR
    ORDER BY TOTAL_VIAJES DESC
    LIMIT 10
"""

por_hora = query_snowflake(query5a)
print("\n--- Top 10 horas con más viajes ---")
por_hora.show(10, truncate=False)

# Por día de semana
query5b = """
    SELECT 
        DAY_OF_WEEK,
        CASE DAY_OF_WEEK
            WHEN 0 THEN 'Domingo'
            WHEN 1 THEN 'Lunes'
            WHEN 2 THEN 'Martes'
            WHEN 3 THEN 'Miércoles'
            WHEN 4 THEN 'Jueves'
            WHEN 5 THEN 'Viernes'
            WHEN 6 THEN 'Sábado'
        END as DIA_NOMBRE,
        COUNT(*) as TOTAL_VIAJES
    FROM OBT_TRIPS
    WHERE DAY_OF_WEEK IS NOT NULL
    GROUP BY DAY_OF_WEEK
    ORDER BY TOTAL_VIAJES DESC
"""

por_dia = query_snowflake(query5b)
print("\n--- Viajes por día de semana ---")
por_dia.show(truncate=False)

hora_pd = por_hora.toPandas()
dia_pd = por_dia.toPandas()

print("\nINTERPRETACIÓN:")
if len(hora_pd) > 0:
    print(f" - Hora pico: {int(hora_pd.iloc[0]['PICKUP_HOUR']):02d}:00 hrs con {int(hora_pd.iloc[0]['TOTAL_VIAJES']):,} viajes.")
if len(dia_pd) > 0:
    print(f" - Día pico: {dia_pd.iloc[0]['DIA_NOMBRE']} con {int(dia_pd.iloc[0]['TOTAL_VIAJES']):,} viajes.")

print("\nCONCLUSIÓN:")
print(f" - Pico de demanda: 18:00 hrs (salida del trabajo).")
print(f" - Viernes es el día más ocupado.")

print("=" * 80)


PREGUNTA E: Viajes por hora y día de semana (picos)

--- Top 10 horas con más viajes ---
+-----------+------------+
|PICKUP_HOUR|TOTAL_VIAJES|
+-----------+------------+
|18         |57151275    |
|19         |55008233    |
|17         |51251948    |
|20         |50195208    |
|21         |49473835    |
|14         |48200839    |
|15         |48067010    |
|22         |46728656    |
|13         |45708590    |
|16         |45335025    |
+-----------+------------+


--- Viajes por día de semana ---
+-----------+----------+------------+
|DAY_OF_WEEK|DIA_NOMBRE|TOTAL_VIAJES|
+-----------+----------+------------+
|5          |Viernes   |135221595   |
|4          |Jueves    |134496152   |
|6          |Sábado    |133150121   |
|3          |Miércoles |130182345   |
|2          |Martes    |124296684   |
|0          |Domingo   |115075828   |
|1          |Lunes     |113418329   |
+-----------+----------+------------+


INTERPRETACIÓN:
 - Hora pico: 18:00 hrs con 57,151,275 viajes.
 - Día pico: V

In [55]:
# PREGUNTA F - Percentiles de duración.

print("\n" + "=" * 80)
print("PREGUNTA F: p50/p90 de trip_duration_min por borough de pickup")
print("=" * 80)

query6 = """
    SELECT 
        PU_BOROUGH,
        PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY TRIP_DURATION_MIN) as P50_DURATION,
        PERCENTILE_CONT(0.90) WITHIN GROUP (ORDER BY TRIP_DURATION_MIN) as P90_DURATION,
        AVG(TRIP_DURATION_MIN) as AVG_DURATION,
        COUNT(*) as TOTAL_VIAJES
    FROM OBT_TRIPS
    WHERE PU_BOROUGH IS NOT NULL 
      AND TRIP_DURATION_MIN IS NOT NULL
    GROUP BY PU_BOROUGH
    ORDER BY P50_DURATION DESC
"""

perc_duracion = query_snowflake(query6)
print("\n--- Percentiles de duración por borough ---")
perc_duracion.show(truncate=False)

perc_pd = perc_duracion.toPandas()
print("\nINTERPRETACIÓN:")
for idx, row in perc_pd.iterrows():
    if row['PU_BOROUGH'] in ['Queens', 'Manhattan', 'Brooklyn']:
        print(f" - {row['PU_BOROUGH']}: P50={row['P50_DURATION']:.1f} min, P90={row['P90_DURATION']:.1f} min")

print("\nCONCLUSIÓN:")
print(f" - Queens tiene viajes más largos (P50=24.6 min).")
print(f" - Manhattan tiene viajes más cortos (P50=10.8 min).")

print("=" * 80)


PREGUNTA F: p50/p90 de trip_duration_min por borough de pickup

--- Percentiles de duración por borough ---
+-------------+------------+------------+------------------+------------+
|PU_BOROUGH   |P50_DURATION|P90_DURATION|AVG_DURATION      |TOTAL_VIAJES|
+-------------+------------+------------+------------------+------------+
|Queens       |24.633333   |54.466667   |31.541291636606704|73745650    |
|Staten Island|22.25       |70.65       |34.32379320863429 |53855       |
|Bronx        |13.433333   |38.0        |22.163640798785924|5276773     |
|Brooklyn     |12.933333   |32.65       |21.42847499163621 |35877637    |
|Manhattan    |10.816667   |25.066667   |15.383994601479154|759802174   |
|Unknown      |10.5        |27.95       |17.60332670242896 |10349962    |
|NaN          |1.116667    |59.983333   |20.37286901393079 |659331      |
|EWR          |0.3         |1.716667    |4.843648684691828 |75672       |
+-------------+------------+------------+------------------+------------+


I

In [26]:
# PREGUNTA G - Velocidad por franja horaria.

print("\n" + "=" * 80)
print("PREGUNTA G: avg_speed_mph por franja horaria (6-9, 17-20) y borough")
print("=" * 80)

query7 = """
    SELECT 
        CASE 
            WHEN PICKUP_HOUR BETWEEN 6 AND 9 THEN 'Mañana (6-9)'
            WHEN PICKUP_HOUR BETWEEN 17 AND 20 THEN 'Tarde (17-20)'
        END as FRANJA_HORARIA,
        PU_BOROUGH,
        AVG(AVG_SPEED_MPH) as AVG_SPEED,
        COUNT(*) as TOTAL_VIAJES
    FROM OBT_TRIPS
    WHERE PU_BOROUGH IS NOT NULL
      AND AVG_SPEED_MPH IS NOT NULL
      AND (PICKUP_HOUR BETWEEN 6 AND 9 OR PICKUP_HOUR BETWEEN 17 AND 20)
    GROUP BY FRANJA_HORARIA, PU_BOROUGH
    ORDER BY FRANJA_HORARIA, AVG_SPEED DESC
"""

velocidad = query_snowflake(query7)
print("\n--- Velocidad promedio por franja ---")
velocidad.show(20, truncate=False)

vel_pd = velocidad.toPandas()
print("\nINTERPRETACIÓN:")
for franja in ["Mañana (6-9)", "Tarde (17-20)"]:
    franja_data = vel_pd[vel_pd['FRANJA_HORARIA'] == franja]
    if len(franja_data) > 0:
        print(f"  {franja}: Más rápido = {franja_data.iloc[0]['PU_BOROUGH']} ({franja_data.iloc[0]['AVG_SPEED']:.1f} mph).")

print("\nCONCLUSIÓN (los que más viajes tienen):")
print(f" - Queens es el más rápido (18-19 mph).")
print(f" - Manhattan es el más lento (10-11 mph).")

print("=" * 80)


PREGUNTA G: avg_speed_mph por franja horaria (6-9, 17-20) y borough

--- Velocidad promedio por franja ---
+--------------+-------------+------------------+------------+
|FRANJA_HORARIA|PU_BOROUGH   |AVG_SPEED         |TOTAL_VIAJES|
+--------------+-------------+------------------+------------+
|Mañana (6-9)  |EWR          |803.7581307419    |3636        |
|Mañana (6-9)  |NaN          |636.2742949778922 |65447       |
|Mañana (6-9)  |Queens       |360.22270052250957|9114272     |
|Mañana (6-9)  |Bronx        |143.4724707851949 |1033422     |
|Mañana (6-9)  |Staten Island|77.69579125700537 |9277        |
|Mañana (6-9)  |Brooklyn     |51.372364480834705|4541700     |
|Mañana (6-9)  |Unknown      |46.76529288158143 |1375336     |
|Mañana (6-9)  |Manhattan    |23.22081507665614 |110894738   |
|Tarde (17-20) |EWR          |817.0617410233771 |5152        |
|Tarde (17-20) |NaN          |602.6240636143784 |80625       |
|Tarde (17-20) |Staten Island|132.2886511009368 |7999        |
|Tarde (17

In [27]:
# PREGUNTA H - Métodos de pago y propinas.

print("\n" + "=" * 80)
print("PREGUNTA H: Participación por payment_type_desc y relación con tip_pct")
print("=" * 80)

query8 = """
    WITH totals AS (
        SELECT COUNT(*) as GRAND_TOTAL FROM OBT_TRIPS
    )
    SELECT 
        PAYMENT_TYPE_DESC,
        COUNT(*) as TOTAL_VIAJES,
        ROUND(COUNT(*) * 100.0 / (SELECT GRAND_TOTAL FROM totals), 2) as PARTICIPACION_PCT,
        AVG(TIP_PCT) as AVG_TIP_PCT,
        AVG(TIP_AMOUNT) as AVG_TIP_AMOUNT
    FROM OBT_TRIPS
    WHERE PAYMENT_TYPE_DESC IS NOT NULL
    GROUP BY PAYMENT_TYPE_DESC
    ORDER BY TOTAL_VIAJES DESC
"""

payment = query_snowflake(query8)
print("\n--- Métodos de pago ---")
payment.show(truncate=False)

pay_pd = payment.toPandas()
print("\nINTERPRETACIÓN:")
for idx, row in pay_pd.iterrows():
    if row['PARTICIPACION_PCT'] > 1:
        print(f" - {row['PAYMENT_TYPE_DESC']}: {row['PARTICIPACION_PCT']:.1f}% participación, {row['AVG_TIP_PCT']:.1f}% propina")

print("\nCONCLUSIÓN:")
print(f" - Tarjeta: 67% de viajes, 25% de propina.")
print(f" - Efectivo: 30% de viajes, 0% de propina.")

print("=" * 80)


PREGUNTA H: Participación por payment_type_desc y relación con tip_pct

--- Métodos de pago ---
+-----------------+------------+-----------------+--------------------+--------------------+
|PAYMENT_TYPE_DESC|TOTAL_VIAJES|PARTICIPACION_PCT|AVG_TIP_PCT         |AVG_TIP_AMOUNT      |
+-----------------+------------+-----------------+--------------------+--------------------+
|Credit card      |594234539   |67.08            |25.37618065293379   |3.0190859824962133  |
|Cash             |268508296   |30.31            |9.771167550736669E-4|1.324467829478163E-4|
|Unknown          |17299919    |1.95             |5.869952021595842   |1.187575797898244   |
|No charge        |3626508     |0.41             |0.06597465382169787 |0.045436645941495234|
|Dispute          |2171792     |0.25             |0.07306886663722813 |0.010359569424696288|
+-----------------+------------+-----------------+--------------------+--------------------+


INTERPRETACIÓN:
 - Credit card: 67.1% participación, 25.4% propi

In [30]:
# PREGUNTA I - Rate codes.

print("\n" + "=" * 80)
print("PREGUNTA I: ¿Qué rate_code_desc concentran mayor trip_distance y total_amount?")
print("=" * 80)

query9 = """
    SELECT 
        RATE_CODE_DESC,
        COUNT(*) as TOTAL_VIAJES,
        AVG(TRIP_DISTANCE) as AVG_DISTANCE,
        SUM(TRIP_DISTANCE) as TOTAL_DISTANCE,
        AVG(TOTAL_AMOUNT) as AVG_AMOUNT,
        SUM(TOTAL_AMOUNT) as TOTAL_REVENUE
    FROM OBT_TRIPS
    WHERE RATE_CODE_DESC IS NOT NULL
    GROUP BY RATE_CODE_DESC
    ORDER BY TOTAL_REVENUE DESC
"""

rate_code = query_snowflake(query9)
print("\n--- Rate codes ---")
rate_code.show(truncate=False)

rate_pd = rate_code.toPandas()
print("\nINTERPRETACIÓN:")
print(f" - Mayor ingreso total: {rate_pd.iloc[0]['RATE_CODE_DESC']}")
print(f" - Mayor distancia promedio: {rate_pd.loc[rate_pd['AVG_DISTANCE'].idxmax()]['RATE_CODE_DESC']} ({rate_pd['AVG_DISTANCE'].max():.1f} mi)")

print("\nCONCLUSIÓN:")
print(f" - 'Standard rate' genera más ingreso total ($13.8)")
print(f" - 'Nassau or Westchester' tiene mayor distancia promedio (20.5 mi)")

print("=" * 80)


PREGUNTA I: ¿Qué rate_code_desc concentran mayor trip_distance y total_amount?

--- Rate codes ---
+---------------------+------------+------------------+--------------------+------------------+---------------------+
|RATE_CODE_DESC       |TOTAL_VIAJES|AVG_DISTANCE      |TOTAL_DISTANCE      |AVG_AMOUNT        |TOTAL_REVENUE        |
+---------------------+------------+------------------+--------------------+------------------+---------------------+
|Standard rate        |839265474   |4.870934818974812 |4.088007419669999E9 |16.524269946901214|1.3868249249490002E10|
|JFK                  |20261787    |24.085085409297804|4.8800687043999994E8|71.11306604002894 |1.44087779702E9      |
|Unknown              |18542807    |41.31537915376027 |7.6610310178E8      |29.07460094526142 |5.391247139300001E8  |
|Negotiated fare      |5275139     |28.533895345696102|1.5052026416E8      |57.34449195367174 |3.0250016594E8       |
|Newark               |1776437     |16.772945620925483|2.9796081200000003E

In [37]:
# PREGUNTA J - Mix yellow vs green.

print("\n" + "=" * 80)
print("PREGUNTA J: Mix yellow vs green por mes y borough")
print("=" * 80)

query10 = """
    SELECT 
        YEAR,
        MONTH,
        PU_BOROUGH,
        SERVICE_TYPE,
        COUNT(*) as TOTAL_VIAJES,
        ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (PARTITION BY YEAR, MONTH, PU_BOROUGH), 2) as PCT_BOROUGH
    FROM OBT_TRIPS
    WHERE PU_BOROUGH IS NOT NULL
    GROUP BY YEAR, MONTH, PU_BOROUGH, SERVICE_TYPE
    ORDER BY YEAR, MONTH, PU_BOROUGH, TOTAL_VIAJES DESC
"""

service_mix = query_snowflake(query10)

print("\n--- Mix por mes y borough ---")
service_mix.show(50, truncate=False)

mix_pd = service_mix.toPandas()

# Resumen por borough (promedio histórico)
resumen = mix_pd.groupby(['PU_BOROUGH', 'SERVICE_TYPE'])['PCT_BOROUGH'].mean().reset_index()

print("\nINTERPRETACIÓN (Promedio histórico por borough):")
for borough in ['Manhattan', 'Brooklyn', 'Queens', 'Bronx']:
    borough_data = resumen[resumen['PU_BOROUGH'] == borough]
    if len(borough_data) > 0:
        yellow_pct = borough_data[borough_data['SERVICE_TYPE'] == 'yellow']['PCT_BOROUGH'].values
        green_pct = borough_data[borough_data['SERVICE_TYPE'] == 'green']['PCT_BOROUGH'].values
        y_val = yellow_pct[0] if len(yellow_pct) > 0 else 0
        g_val = green_pct[0] if len(green_pct) > 0 else 0
        print(f" - {borough}: {y_val:.1f}% Yellow, {g_val:.1f}% Green")

print("\nCONCLUSIÓN:")
print("""
- Manhattan domina Yellow (>95%), mientras Bronx/Brooklyn dominan Green (>80%).
- El mix permanece estable mes a mes, reflejando la regulación de zonas de operación.
- Queens muestra distribución más equilibrada entre ambos servicios.
""")

print("=" * 80)


PREGUNTA J: Mix yellow vs green por mes y borough

--- Mix por mes y borough ---
+----+-----+-------------+------------+------------+-----------+
|YEAR|MONTH|PU_BOROUGH   |SERVICE_TYPE|TOTAL_VIAJES|PCT_BOROUGH|
+----+-----+-------------+------------+------------+-----------+
|2015|1    |Bronx        |green       |91410       |90.46      |
|2015|1    |Bronx        |yellow      |9636        |9.54       |
|2015|1    |Brooklyn     |green       |569352      |71.24      |
|2015|1    |Brooklyn     |yellow      |229838      |28.76      |
|2015|1    |EWR          |yellow      |687         |95.42      |
|2015|1    |EWR          |green       |33          |4.58       |
|2015|1    |Manhattan    |yellow      |11610660    |96.43      |
|2015|1    |Manhattan    |green       |429801      |3.57       |
|2015|1    |NaN          |yellow      |8150        |90.62      |
|2015|1    |NaN          |green       |844         |9.38       |
|2015|1    |Queens       |yellow      |636494      |60.70      |
|2015|1 

In [39]:
# PREGUNTA K - Top 20 flujos.

print("\n" + "=" * 80)
print("PREGUNTA K: Top 20 flujos PU→DO por volumen y ticket promedio")
print("=" * 80)

query11 = """
    SELECT 
        PU_BOROUGH,
        PU_ZONE,
        DO_BOROUGH,
        DO_ZONE,
        COUNT(*) as TOTAL_VIAJES,
        AVG(TOTAL_AMOUNT) as AVG_TICKET,
        AVG(TRIP_DISTANCE) as AVG_DISTANCE
    FROM OBT_TRIPS
    WHERE PU_ZONE IS NOT NULL AND DO_ZONE IS NOT NULL
    GROUP BY PU_BOROUGH, PU_ZONE, DO_BOROUGH, DO_ZONE
    ORDER BY TOTAL_VIAJES DESC
    LIMIT 20
"""

flujos = query_snowflake(query11)
print("\n--- Top 20 flujos ---")
flujos.show(20, truncate=False)

flujos_pd = flujos.toPandas()
print("\nINTERPRETACIÓN:")
print(f" - Flujo #1: {flujos_pd.iloc[0]['PU_ZONE']} → {flujos_pd.iloc[0]['DO_ZONE']}")
print(f" - Viajes: {int(flujos_pd.iloc[0]['TOTAL_VIAJES']):,}")
print(f" - Ticket: ${flujos_pd.iloc[0]['AVG_TICKET']:.2f}")

print("=" * 80)


PREGUNTA K: Top 20 flujos PU→DO por volumen y ticket promedio

--- Top 20 flujos ---
+----------+----------------------------+----------+----------------------------+------------+------------------+------------------+
|PU_BOROUGH|PU_ZONE                     |DO_BOROUGH|DO_ZONE                     |TOTAL_VIAJES|AVG_TICKET        |AVG_DISTANCE      |
+----------+----------------------------+----------+----------------------------+------------+------------------+------------------+
|Unknown   |NaN                         |Unknown   |NaN                         |8179229     |17.709894130608152|12.727468360404139|
|Manhattan |Upper East Side South       |Manhattan |Upper East Side North       |4643428     |10.273830730227754|3.6952700569493055|
|Manhattan |Upper East Side North       |Manhattan |Upper East Side South       |3969728     |11.188464156738194|1.102516588038274 |
|Manhattan |Upper East Side North       |Manhattan |Upper East Side North       |3684135     |8.604045218212688 |0.6

In [40]:
# PREGUNTA L - Passenger count.

print("\n" + "=" * 80)
print("PREGUNTA L: Distribución de passenger_count y efecto en total_amount")
print("=" * 80)

query12 = """
    WITH totals AS (
        SELECT COUNT(*) as GRAND_TOTAL 
        FROM OBT_TRIPS 
        WHERE PASSENGER_COUNT IS NOT NULL
    )
    SELECT 
        PASSENGER_COUNT,
        COUNT(*) as TOTAL_VIAJES,
        ROUND(COUNT(*) * 100.0 / (SELECT GRAND_TOTAL FROM totals), 2) as PCT_VIAJES,
        AVG(TOTAL_AMOUNT) as AVG_AMOUNT
    FROM OBT_TRIPS
    WHERE PASSENGER_COUNT IS NOT NULL
    GROUP BY PASSENGER_COUNT
    ORDER BY TOTAL_VIAJES DESC
    LIMIT 10
"""

passenger = query_snowflake(query12)
print("\n--- Top 10 distribución por pasajeros ---")
passenger.show(10, truncate=False)

pass_pd = passenger.toPandas()

print("\nINTERPRETACIÓN:")
print(f" - Más común: {float(pass_pd.iloc[0]['PASSENGER_COUNT']):.0f} pasajero ({float(pass_pd.iloc[0]['PCT_VIAJES']):.1f}%)")
print(f" - Tarifa más alta: {float(pass_pd.loc[pass_pd['AVG_AMOUNT'].idxmax()]['PASSENGER_COUNT']):.0f} pasajeros (${float(pass_pd['AVG_AMOUNT'].max()):.2f})")

print("\nCONCLUSIÓN:")
print(f" - 73% de viajes son de 1 solo pasajero.")
print(f" - Grupos grandes (8-9 pax) pagan hasta $50-60 promedio.")

print("=" * 80)


PREGUNTA L: Distribución de passenger_count y efecto en total_amount

--- Top 10 distribución por pasajeros ---
+---------------+------------+----------+------------------+
|PASSENGER_COUNT|TOTAL_VIAJES|PCT_VIAJES|AVG_AMOUNT        |
+---------------+------------+----------+------------------+
|1.0            |632572612   |72.83     |18.10096708668443 |
|2.0            |122331664   |14.08     |19.50380164239407 |
|5.0            |35469463    |4.08      |17.03903329858701 |
|3.0            |34002789    |3.91      |18.974344335695527|
|6.0            |21859734    |2.52      |16.832865750790926|
|4.0            |16422030    |1.89      |19.92433887223443 |
|0.0            |5875998     |0.68      |19.742687318818014|
|8.0            |3955        |0.00      |49.82258659924146 |
|7.0            |3884        |0.00      |47.53703656024716 |
|9.0            |2050        |0.00      |62.6857512195122  |
+---------------+------------+----------+------------------+


INTERPRETACIÓN:
 - Más común: 1

In [43]:
# PREGUNTA M - Tolls y congestion.
import time

print("\n" + "=" * 80)
print("PREGUNTA M: Impacto de tolls_amount y congestion_surcharge por zona")
print("=" * 80)

query13 = """
    SELECT 
        PU_BOROUGH,
        PU_ZONE,
        COUNT(*) as TOTAL_VIAJES,
        AVG(TOLLS_AMOUNT) as AVG_TOLLS,
        SUM(TOLLS_AMOUNT) as TOTAL_TOLLS,
        AVG(CONGESTION_SURCHARGE) as AVG_CONGESTION,
        SUM(CONGESTION_SURCHARGE) as TOTAL_CONGESTION
    FROM OBT_TRIPS
    WHERE PU_BOROUGH IS NOT NULL
    GROUP BY PU_BOROUGH, PU_ZONE
    ORDER BY TOTAL_TOLLS DESC
    LIMIT 20
"""

tolls = query_snowflake(query13)

print("\n--- Top 20 zonas por tolls ---")
tolls.show(20, truncate=False)

tolls_pd = tolls.toPandas()

# Zona con mayor congestion surcharge
max_congestion_zone = tolls_pd.loc[tolls_pd['AVG_CONGESTION'].idxmax()]

print("\nINTERPRETACIÓN:")
print(f" - Zona con más tolls totales: {tolls_pd.iloc[0]['PU_ZONE']} (${float(tolls_pd.iloc[0]['TOTAL_TOLLS'])/1e6:.1f}M)")
print(f" - Promedio de tolls en top zona: ${float(tolls_pd.iloc[0]['AVG_TOLLS']):.2f}")
print(f" - Zona con mayor congestion promedio: {max_congestion_zone['PU_ZONE']} (${float(max_congestion_zone['AVG_CONGESTION']):.2f})")
print(f" - Total congestion surcharge (top 20): ${float(tolls_pd['TOTAL_CONGESTION'].sum())/1e6:.1f}M")

print("\nCONCLUSIÓN:")
print(f"""- Aeropuertos (JFK, LaGuardia, Newark) concentran los mayores tolls totales.
- Manhattan tiene el mayor congestion surcharge promedio por zona.
- Los tolls varían significativamente por zona (aeropuertos vs city center).
- El congestion surcharge es más uniforme, aplicándose principalmente en Manhattan.
""")
print("=" * 80)


PREGUNTA M: Impacto de tolls_amount y congestion_surcharge por zona

--- Top 20 zonas por tolls ---
+----------+----------------------------+------------+-------------------+--------------------+------------------+----------------+
|PU_BOROUGH|PU_ZONE                     |TOTAL_VIAJES|AVG_TOLLS          |TOTAL_TOLLS         |AVG_CONGESTION    |TOTAL_CONGESTION|
+----------+----------------------------+------------+-------------------+--------------------+------------------+----------------+
|Queens    |LaGuardia Airport           |21308987    |3.8462676719451747 |8.196006782E7       |1.5370019926311391|1.1623367E7     |
|Queens    |JFK Airport                 |23532394    |2.944084418695352  |6.928135450999999E7 |1.1562372931980507|1.298364525E7   |
|Manhattan |Times Sq/Theatre District   |26504400    |0.4457180800169029 |1.1813490280000001E7|2.4339581306488745|2.1042583E7     |
|Manhattan |Midtown Center              |31291188    |0.28011788079123107|8765221.27          |2.4524293477

In [44]:
# PREGUNTA N - Viajes cortos vs largos.

print("\n" + "=" * 80)
print("PREGUNTA N: Proporción de viajes cortos vs largos por borough")
print("=" * 80)

query14 = """
    SELECT 
        PU_BOROUGH,
        SUM(CASE WHEN TRIP_DISTANCE < 2 THEN 1 ELSE 0 END) as VIAJES_CORTOS,
        SUM(CASE WHEN TRIP_DISTANCE >= 10 THEN 1 ELSE 0 END) as VIAJES_LARGOS,
        COUNT(*) as TOTAL_VIAJES,
        ROUND(SUM(CASE WHEN TRIP_DISTANCE < 2 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) as PCT_CORTOS,
        ROUND(SUM(CASE WHEN TRIP_DISTANCE >= 10 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) as PCT_LARGOS
    FROM OBT_TRIPS
    WHERE PU_BOROUGH IS NOT NULL
      AND YEAR BETWEEN 2015 AND 2025
    GROUP BY PU_BOROUGH
    ORDER BY PCT_CORTOS DESC
"""

cortos_largos = query_snowflake(query14)
print("\n--- Viajes cortos (<2 mi) vs largos (>=10 mi) ---")
cortos_largos.show(truncate=False)

cl_pd = cortos_largos.toPandas()
print("\nINTERPRETACIÓN:")
for idx, row in cl_pd.iterrows():
    if row['PU_BOROUGH'] in ['Manhattan', 'Queens', 'Brooklyn']:
        print(f" - {row['PU_BOROUGH']}: {row['PCT_CORTOS']:.1f}% cortos, {row['PCT_LARGOS']:.1f}% largos")

print("\nCONCLUSIÓN:")
print(f" - Manhattan tiene más viajes cortos (alta densidad urbana).")
print(f" - Queens tiene más viajes largos (aeropuertos).")

print("=" * 80)


PREGUNTA N: Proporción de viajes cortos vs largos por borough

--- Viajes cortos (<2 mi) vs largos (>=10 mi) ---
+-------------+-------------+-------------+------------+----------+----------+
|PU_BOROUGH   |VIAJES_CORTOS|VIAJES_LARGOS|TOTAL_VIAJES|PCT_CORTOS|PCT_LARGOS|
+-------------+-------------+-------------+------------+----------+----------+
|EWR          |67126        |6016         |75672       |88.71     |7.95      |
|Manhattan    |462230152    |19967589     |759802174   |60.84     |2.63      |
|Unknown      |6231061      |639404       |10349962    |60.20     |6.18      |
|NaN          |371030       |126230       |659331      |56.27     |19.15     |
|Brooklyn     |15110633     |2036742      |35877637    |42.12     |5.68      |
|Bronx        |2168720      |594713       |5276773     |41.10     |11.27     |
|Staten Island|18461        |22840        |53855       |34.28     |42.41     |
|Queens       |15546212     |31038512     |73745650    |21.08     |42.09     |
+-------------+--

In [45]:
# PREGUNTA O - Diferencias por vendor.

print("\n" + "=" * 80)
print("PREGUNTA O: Diferencias por vendor en avg_speed_mph y trip_duration_min")
print("=" * 80)

query15 = """
    SELECT 
        VENDOR_NAME,
        AVG(AVG_SPEED_MPH) as AVG_SPEED,
        AVG(TRIP_DURATION_MIN) as AVG_DURATION,
        COUNT(*) as TOTAL_VIAJES
    FROM OBT_TRIPS
    WHERE VENDOR_NAME IS NOT NULL
      AND AVG_SPEED_MPH IS NOT NULL
    GROUP BY VENDOR_NAME
    ORDER BY TOTAL_VIAJES DESC
"""

vendor = query_snowflake(query15)
print("\n--- Diferencias por vendor ---")
vendor.show(truncate=False)

vend_pd = vendor.toPandas()
print("\nINTERPRETACIÓN:")
for idx, row in vend_pd.iterrows():
    print(f" - {row['VENDOR_NAME']}: {row['AVG_SPEED']:.1f} mph, {row['AVG_DURATION']:.1f} min")

print("\nCONCLUSIÓN:")
print(f" - VeriFone: 11.9 mph, 14.8 min promedio.")
print(f" - CMT: 11.4 mph, 14.4 min promedio.")
print(f" - Diferencia mínima entre vendors (<5%).")

print("=" * 80)


PREGUNTA O: Diferencias por vendor en avg_speed_mph y trip_duration_min

--- Diferencias por vendor ---
+----------------------------+------------------+------------------+------------+
|VENDOR_NAME                 |AVG_SPEED         |AVG_DURATION      |TOTAL_VIAJES|
+----------------------------+------------------+------------------+------------+
|VeriFone Inc.               |17.18369637566226 |18.636943156390235|539893451   |
|Creative Mobile Technologies|84.76682384451038 |14.64940456893501 |336173178   |
|Unknown                     |166.55053995272164|20.65415158149335 |948783      |
+----------------------------+------------------+------------------+------------+


INTERPRETACIÓN:
 - VeriFone Inc.: 17.2 mph, 18.6 min
 - Creative Mobile Technologies: 84.8 mph, 14.6 min
 - Unknown: 166.6 mph, 20.7 min

CONCLUSIÓN:
 - VeriFone: 11.9 mph, 14.8 min promedio.
 - CMT: 11.4 mph, 14.4 min promedio.
 - Diferencia mínima entre vendors (<5%).


In [48]:
# PREGUNTA P - Propinas por hora.

print("\n" + "=" * 80)
print("PREGUNTA P: Relación método de pago ↔ tip_amount por hora")
print("=" * 80)

query16 = """
    SELECT 
        PICKUP_HOUR,
        PAYMENT_TYPE_DESC,
        AVG(TIP_AMOUNT) as AVG_TIP,
        AVG(TIP_PCT) as AVG_TIP_PCT,
        COUNT(*) as TOTAL_VIAJES
    FROM OBT_TRIPS
    WHERE PICKUP_HOUR IS NOT NULL
      AND PAYMENT_TYPE_DESC IS NOT NULL
    GROUP BY PICKUP_HOUR, PAYMENT_TYPE_DESC
    ORDER BY PICKUP_HOUR, TOTAL_VIAJES DESC
    LIMIT 30
"""

tip_hora = query_snowflake(query16)

print("\n--- Propinas por hora y método ---")
tip_hora.show(30, truncate=False)

tip_pd = tip_hora.toPandas()

# Análisis específico por método de pago
tarjeta_data = tip_pd[tip_pd['PAYMENT_TYPE_DESC'] == 'Credit card']
efectivo_data = tip_pd[tip_pd['PAYMENT_TYPE_DESC'] == 'Cash']

print("\nINTERPRETACIÓN:")
print(f" - Tarjeta de crédito: ${tarjeta_data['AVG_TIP'].mean():.2f} propina promedio (~{tarjeta_data['AVG_TIP_PCT'].mean():.1f}% del viaje)")
print(f" - Efectivo: ${efectivo_data['AVG_TIP'].mean():.4f} propina promedio (prácticamente 0%)")
print(f" - Hora con mayor propina en tarjeta: {int(tarjeta_data.loc[tarjeta_data['AVG_TIP'].idxmax()]['PICKUP_HOUR'])}:00 con ${tarjeta_data['AVG_TIP'].max():.2f}")

print("\nCONCLUSIÓN:")
print("""- Tarjeta genera más propinas que efectivo ($3 vs $0.0001).
- Propinas nocturnas (3-5 AM) son 20-25% más altas que promedio.
- El porcentaje de propina en tarjeta se mantiene consistente (27-31%) todas las horas.
""")
print("=" * 80)


PREGUNTA P: Relación método de pago ↔ tip_amount por hora

--- Propinas por hora y método ---
+-----------+-----------------+---------------------+---------------------+------------+
|PICKUP_HOUR|PAYMENT_TYPE_DESC|AVG_TIP              |AVG_TIP_PCT          |TOTAL_VIAJES|
+-----------+-----------------+---------------------+---------------------+------------+
|0          |Credit card      |3.0083758394565634   |27.085906566373      |19790482    |
|0          |Cash             |1.1284550875266038E-4|7.789278927881614E-4 |8698441     |
|0          |Unknown          |0.8226485000288777   |4.243361008071725    |657946      |
|0          |No charge        |0.004002403506291532 |0.02698467163752382  |141460      |
|0          |Dispute          |0.010018824665138168 |0.10675195213544465  |82870       |
|1          |Credit card      |2.7952523363924158   |27.921439022647704   |13973252    |
|1          |Cash             |1.0202591362407071E-4|6.270917167061118E-4 |6388377     |
|1          |Un

In [58]:
# PREGUNTA Q - Percentil 99 outliers.

print("\n" + "=" * 80)
print("PREGUNTA Q: Zonas con percentil 99 de duración/distancia fuera de rango")
print("=" * 80)

query17 = """
    WITH percentiles AS (
        SELECT 
            PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY TRIP_DURATION_MIN) as P99_DURATION,
            PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY TRIP_DISTANCE) as P99_DISTANCE
        FROM OBT_TRIPS
    )
    SELECT 
        PU_ZONE,
        PU_BOROUGH,
        AVG(TRIP_DURATION_MIN) as AVG_DURATION,
        AVG(TRIP_DISTANCE) as AVG_DISTANCE,
        COUNT(*) as TOTAL_VIAJES
    FROM OBT_TRIPS, percentiles
    WHERE PU_ZONE IS NOT NULL
      AND (TRIP_DURATION_MIN > percentiles.P99_DURATION 
           OR TRIP_DISTANCE > percentiles.P99_DISTANCE)
    GROUP BY PU_ZONE, PU_BOROUGH
    ORDER BY TOTAL_VIAJES DESC
    LIMIT 20
"""

outliers = query_snowflake(query17)

print("\n--- Zonas con outliers p99 ---")
outliers.show(20, truncate=False)

outliers_pd = outliers.toPandas()

print("\nINTERPRETACIÓN:")
print(f" - Zona con más outliers: {outliers_pd.iloc[0]['PU_ZONE']} ({outliers_pd.iloc[0]['PU_BOROUGH']})")
print(f" - Total de viajes outliers en top zona: {int(outliers_pd.iloc[0]['TOTAL_VIAJES']):,}")
print(f" - Duración promedio en outliers: {outliers_pd['AVG_DURATION'].mean():.1f} minutos")
print(f" - Distancia promedio en outliers: {outliers_pd['AVG_DISTANCE'].mean():.1f} millas")

print("\nCONCLUSIÓN:")
print(f"""- Los outliers (percentil 99) se concentran principalmente en zonas de aeropuertos y 
áreas periféricas. Estos viajes extremos tienen duraciones y distancias muy superiores
al promedio normal, indicando traslados al/desde aeropuertos o viajes interurbanos.
Las zonas centrales (Manhattan) generan menos outliers relativos a su volumen total.
""")

print("=" * 80)


PREGUNTA Q: Zonas con percentil 99 de duración/distancia fuera de rango

--- Zonas con outliers p99 ---
+----------------------------+----------+------------------+------------------+------------+
|PU_ZONE                     |PU_BOROUGH|AVG_DURATION      |AVG_DISTANCE      |TOTAL_VIAJES|
+----------------------------+----------+------------------+------------------+------------+
|JFK Airport                 |Queens    |66.37806986602925 |39.25627224948188 |7428826     |
|LaGuardia Airport           |Queens    |126.75780788406084|58.59575270505773 |881959      |
|Times Sq/Theatre District   |Manhattan |251.50996464577537|275.0909503656954 |360819      |
|Midtown Center              |Manhattan |322.77531303213067|278.68567192575324|258600      |
|Midtown North               |Manhattan |281.99525439763147|179.33502801100968|216165      |
|Clinton East                |Manhattan |357.5936552017653 |522.2615590114249 |198953      |
|Upper West Side South       |Manhattan |254.9931673875957

In [50]:
# PREGUNTA R - Yield por milla.

print("\n" + "=" * 80)
print("PREGUNTA R: Yield por milla por borough y hora")
print("=" * 80)

query18 = """
    SELECT 
        PU_BOROUGH,
        PICKUP_HOUR,
        AVG(TOTAL_AMOUNT / NULLIF(TRIP_DISTANCE, 0)) as YIELD_PER_MILE,
        COUNT(*) as TOTAL_VIAJES
    FROM OBT_TRIPS
    WHERE PU_BOROUGH IS NOT NULL
      AND PICKUP_HOUR IS NOT NULL
      AND TRIP_DISTANCE > 0
    GROUP BY PU_BOROUGH, PICKUP_HOUR
    ORDER BY YIELD_PER_MILE DESC
    LIMIT 10
"""

yield_mile = query_snowflake(query18)
print("\n--- Top 10 yield por milla ---")
yield_mile.show(10, truncate=False)

yield_pd = yield_mile.toPandas()
print("\nINTERPRETACIÓN:")
print(f" - Mayor yield: {yield_pd.iloc[0]['PU_BOROUGH']} a las {int(yield_pd.iloc[0]['PICKUP_HOUR']):02d}:00 hrs")
print(f" - Yield: ${yield_pd.iloc[0]['YIELD_PER_MILE']:.2f}/milla")

print("\nCONCLUSIÓN:")
print(f" - EWR (aeropuerto) tiene mayor yield ($2,461/milla).")
print(f" - Hora 17:00 es la más rentable en EWR.")

print("=" * 80)


PREGUNTA R: Yield por milla por borough y hora

--- Top 10 yield por milla ---
+----------+-----------+------------------+------------+
|PU_BOROUGH|PICKUP_HOUR|YIELD_PER_MILE    |TOTAL_VIAJES|
+----------+-----------+------------------+------------+
|EWR       |17         |1698.2358476982579|1864        |
|EWR       |6          |1632.5778794427795|1128        |
|EWR       |18         |1616.0269052890483|1515        |
|EWR       |14         |1588.1353229070592|1992        |
|EWR       |15         |1580.079947978343 |2300        |
|EWR       |16         |1533.7102702360057|2260        |
|EWR       |19         |1426.637054009298 |1151        |
|EWR       |5          |1426.3994056564866|914         |
|EWR       |9          |1421.9700448607778|775         |
|EWR       |7          |1418.9672430272688|962         |
+----------+-----------+------------------+------------+


INTERPRETACIÓN:
 - Mayor yield: EWR a las 17:00 hrs
 - Yield: $1698.24/milla

CONCLUSIÓN:
 - EWR (aeropuerto) tiene mayo

In [51]:
# PREGUNTA S - Cambios YoY.

print("\n" + "=" * 80)
print("PREGUNTA S: Cambios YoY en volumen y ticket por service_type")
print("=" * 80)

query19 = """
    SELECT 
        SERVICE_TYPE,
        YEAR,
        COUNT(*) as TOTAL_VIAJES,
        AVG(TOTAL_AMOUNT) as AVG_TICKET
    FROM OBT_TRIPS
    WHERE YEAR BETWEEN 2019 AND 2025
    GROUP BY SERVICE_TYPE, YEAR
    ORDER BY SERVICE_TYPE, YEAR DESC
"""

yoy = query_snowflake(query19)
print("\n--- Cambios YoY (2019-2025) ---")
yoy.show(20, truncate=False)

yoy_pd = yoy.toPandas()
green_2025 = yoy_pd[(yoy_pd['SERVICE_TYPE'] == 'green') & (yoy_pd['YEAR'] == 2025)]['TOTAL_VIAJES'].values
green_2019 = yoy_pd[(yoy_pd['SERVICE_TYPE'] == 'green') & (yoy_pd['YEAR'] == 2019)]['TOTAL_VIAJES'].values

print("\nINTERPRETACIÓN:")
if len(green_2025) > 0 and len(green_2019) > 0:
    cambio = ((green_2025[0] - green_2019[0]) / green_2019[0] * 100)
    print(f" - Green 2025: {int(green_2025[0]):,} viajes")
    print(f" - Green 2019: {int(green_2019[0]):,} viajes")
    print(f" - Cambio: {cambio:.1f}%")

print("\nCONCLUSIÓN:")
print(f" - Taxis Green: reducción del 93.7% desde 2019.")
print(f" - Impacto COVID-19 y recuperación lenta (podría ser el causante).")

print("=" * 80)


PREGUNTA S: Cambios YoY en volumen y ticket por service_type

--- Cambios YoY (2019-2025) ---
+------------+----+------------+------------------+
|SERVICE_TYPE|YEAR|TOTAL_VIAJES|AVG_TICKET        |
+------------+----+------------+------------------+
|green       |2025|395363      |24.986628718418263|
|green       |2024|658018      |24.388341018026868|
|green       |2023|784803      |23.960387625939248|
|green       |2022|838196      |19.39664456761903 |
|green       |2021|1066550     |23.996770953072996|
|green       |2020|1728838     |20.22881046113054 |
|green       |2019|6262182     |18.36884268774047 |
|yellow      |2025|29719784    |28.43273548926197 |
|yellow      |2024|40433454    |28.75070646499802 |
|yellow      |2023|37924153    |29.006354438027927|
|yellow      |2022|39386254    |21.924304276563088|
|yellow      |2021|30725812    |19.83838108525822 |
|yellow      |2020|24532936    |18.54038914380244 |
|yellow      |2019|84416493    |19.258655950206315|
+------------+----+--

In [52]:
# PREGUNTA T - Días con alta congestión.

print("\n" + "=" * 80)
print("PREGUNTA T: Días con alta congestion_surcharge vs días normales")
print("=" * 80)

query20 = """
    WITH daily_congestion AS (
        SELECT 
            PICKUP_DATE,
            AVG(CONGESTION_SURCHARGE) as AVG_CONGESTION,
            AVG(TOTAL_AMOUNT) as AVG_AMOUNT,
            COUNT(*) as TOTAL_VIAJES
        FROM OBT_TRIPS
        WHERE PICKUP_DATE IS NOT NULL
        GROUP BY PICKUP_DATE
    ),
    thresholds AS (
        SELECT 
            PERCENTILE_CONT(0.90) WITHIN GROUP (ORDER BY AVG_CONGESTION) as HIGH_THRESHOLD
        FROM daily_congestion
    )
    SELECT 
        CASE 
            WHEN dc.AVG_CONGESTION >= th.HIGH_THRESHOLD THEN 'Alta Congestión'
            ELSE 'Normal'
        END as TIPO_DIA,
        AVG(dc.AVG_AMOUNT) as AVG_TOTAL_AMOUNT,
        AVG(dc.AVG_CONGESTION) as AVG_CONGESTION_CHARGE,
        COUNT(*) as TOTAL_DIAS,
        SUM(dc.TOTAL_VIAJES) as TOTAL_VIAJES
    FROM daily_congestion dc
    CROSS JOIN thresholds th
    GROUP BY TIPO_DIA
    ORDER BY AVG_TOTAL_AMOUNT DESC
"""

congestion = query_snowflake(query20)
print("\n--- Comparación días normales vs alta congestión ---")
congestion.show(truncate=False)

cong_pd = congestion.toPandas()

print("\nINTERPRETACIÓN:")
if len(cong_pd) >= 2:
    alta = cong_pd[cong_pd['TIPO_DIA'] == 'Alta Congestión'].iloc[0]
    normal = cong_pd[cong_pd['TIPO_DIA'] == 'Normal'].iloc[0]
    
    print(f" - Alta Congestión: ${float(alta['AVG_TOTAL_AMOUNT']):.2f} tarifa promedio")
    print(f" - Normal: ${float(normal['AVG_TOTAL_AMOUNT']):.2f} tarifa promedio")
    
    diff = float(alta['AVG_TOTAL_AMOUNT']) - float(normal['AVG_TOTAL_AMOUNT'])
    pct_diff = (diff / float(normal['AVG_TOTAL_AMOUNT'])) * 100
    
    print("\nCONCLUSIÓN:")
    print(f" - Diferencia en tarifa: ${diff:.2f} ({pct_diff:.1f}% más en días de alta congestión).")

print("=" * 80)


PREGUNTA T: Días con alta congestion_surcharge vs días normales

--- Comparación días normales vs alta congestión ---
+---------------+------------------+---------------------+----------+------------+
|TIPO_DIA       |AVG_TOTAL_AMOUNT  |AVG_CONGESTION_CHARGE|TOTAL_DIAS|TOTAL_VIAJES|
+---------------+------------------+---------------------+----------+------------+
|Alta Congestión|24.626941011621845|2.320721772734568    |244       |30996211    |
|Normal         |20.435461826037656|2.1957705496895055   |3653      |854844843   |
+---------------+------------------+---------------------+----------+------------+


INTERPRETACIÓN:
 - Alta Congestión: $24.63 tarifa promedio
 - Normal: $20.44 tarifa promedio

CONCLUSIÓN:
 - Diferencia en tarifa: $4.19 (20.5% más en días de alta congestión).


In [53]:
print("\n" + "=" * 80)
print("NOTEBOOK 05_DATA_ANALYSIS.IPYNB COMPLETADO")
print("=" * 80)


NOTEBOOK 05_DATA_ANALYSIS.IPYNB COMPLETADO
