In [109]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import (
    r2_score, mean_absolute_error, mean_squared_error
)

from xgboost import XGBRegressor

import joblib


df = pd.read_csv("../data/clean/tiempos_final.csv")

df.head()

Unnamed: 0,zona,atraccion,tiempo_espera,abierta,ultima_actualizacion,fecha,hora,dia_semana,mes,fin_de_semana,temperatura,humedad,sensacion_termica,codigo_clima,temporada
0,Cartoon Village,A Toda Máquina,5,True,2025-10-30 15:25:08+00:00,2025-10-30,16:26:19,Thursday,10.0,False,17.2,71.0,16.5,2.0,baja
1,Cartoon Village,Academia de Pilotos Baby Looney Tunes,0,False,2025-10-30 15:25:08+00:00,2025-10-30,16:26:19,Thursday,10.0,False,17.2,71.0,16.5,2.0,baja
2,Cartoon Village,Cartoon Carousel,5,True,2025-10-30 15:25:08+00:00,2025-10-30,16:26:19,Thursday,10.0,False,17.2,71.0,16.5,2.0,baja
3,Cartoon Village,Convoy de Camiones,0,False,2025-10-30 15:25:08+00:00,2025-10-30,16:26:19,Thursday,10.0,False,17.2,71.0,16.5,2.0,baja
4,Cartoon Village,Correcaminos Bip Bip,15,True,2025-10-30 15:25:08+00:00,2025-10-30,16:26:19,Thursday,10.0,False,17.2,71.0,16.5,2.0,baja


In [107]:
df.sample(5)

Unnamed: 0,zona,atraccion,tiempo_espera,abierta,ultima_actualizacion,fecha,hora,dia_semana,mes,fin_de_semana,temperatura,humedad,sensacion_termica,codigo_clima,temporada
1251,DC Super Heroes World,Superman La Atracción de Acero,20,True,2025-10-22 11:35:13+00:00,2025-10-22,11:35,Miércoles,10.0,False,18.8,68.0,17.3,2.0,alta
580,Cartoon Village,Tom & Jerry Picnic en el Parque,10,True,2025-11-08 13:25:09+00:00,2025-11-08,14:26:38,Saturday,11.0,True,14.8,44.0,12.3,2.0,baja
787,Old West Territory,Coaster Express,15,,2025-11-08 15:45:11+00:00,2025-11-08,16:47,Saturday,11.0,True,15.4,39.0,12.5,1.0,baja
414,DC Super Heroes World,Batman Gotham City Escape,40,True,2025-11-08 11:45:10+00:00,2025-11-08,12:46,Saturday,11.0,True,13.4,50.0,10.8,2.0,baja
215,Cartoon Village,Wile E. Coyote Zona de Explosión,0,False,2025-11-08 11:20:09+00:00,2025-11-08,12:22:28,Saturday,11.0,True,13.4,50.0,10.8,2.0,baja


In [108]:
print(df['fecha'].unique())


['2025-10-30' '2025-11-08' '2025-11-15' 'fecha' '2025-10-16' '2025-10-17'
 '2025-10-18' '2025-10-22']


In [110]:
df['fecha'] = pd.to_datetime(df['fecha'])
df['dia_semana_num'] = df['fecha'].dt.weekday
df['mes'] = df['fecha'].dt.month

In [111]:
df.drop(columns=["dia_semana"])

Unnamed: 0,zona,atraccion,tiempo_espera,abierta,ultima_actualizacion,fecha,hora,mes,fin_de_semana,temperatura,humedad,sensacion_termica,codigo_clima,temporada,dia_semana_num
0,Cartoon Village,A Toda Máquina,5,True,2025-10-30 15:25:08+00:00,2025-10-30,16:26:19,10,False,17.2,71.0,16.5,2.0,baja,3
1,Cartoon Village,Academia de Pilotos Baby Looney Tunes,0,False,2025-10-30 15:25:08+00:00,2025-10-30,16:26:19,10,False,17.2,71.0,16.5,2.0,baja,3
2,Cartoon Village,Cartoon Carousel,5,True,2025-10-30 15:25:08+00:00,2025-10-30,16:26:19,10,False,17.2,71.0,16.5,2.0,baja,3
3,Cartoon Village,Convoy de Camiones,0,False,2025-10-30 15:25:08+00:00,2025-10-30,16:26:19,10,False,17.2,71.0,16.5,2.0,baja,3
4,Cartoon Village,Correcaminos Bip Bip,15,True,2025-10-30 15:25:08+00:00,2025-10-30,16:26:19,10,False,17.2,71.0,16.5,2.0,baja,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1425,Movie World Studios,Cine Tour,5,True,2025-10-22 14:05:09+00:00,2025-10-22,14:05,10,False,21.7,51.0,19.2,2.0,alta,2
1426,Movie World Studios,Hotel Embrujado,5,True,2025-10-22 14:05:09+00:00,2025-10-22,14:05,10,False,21.7,51.0,19.2,2.0,alta,2
1427,Movie World Studios,Stunt Fall,10,True,2025-10-22 14:05:09+00:00,2025-10-22,14:05,10,False,21.7,51.0,19.2,2.0,alta,2
1428,Old West Territory,Coaster Express,5,True,2025-10-22 14:05:09+00:00,2025-10-22,14:05,10,False,21.7,51.0,19.2,2.0,alta,2


Unnamed: 0,zona,atraccion,tiempo_espera,abierta,ultima_actualizacion,fecha,hora,dia_semana,mes,fin_de_semana,temperatura,humedad,sensacion_termica,codigo_clima,temporada,dia_semana_num
506,Halloween,[HSN] IT Experience,0,False,2025-11-08 11:40:09+00:00,2025-11-08,12:42,Saturday,11,True,13.4,50.0,10.8,2.0,baja,5


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1092 entries, 0 to 1091
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   zona                  1092 non-null   object        
 1   atraccion             1092 non-null   object        
 2   tiempo_espera         1092 non-null   int64         
 3   abierta               846 non-null    object        
 4   ultima_actualizacion  1092 non-null   object        
 5   fecha                 1092 non-null   datetime64[ns]
 6   hora                  1092 non-null   object        
 7   dia_semana            1092 non-null   object        
 8   mes                   1092 non-null   int32         
 9   fin_de_semana         1092 non-null   bool          
 10  temperatura           1092 non-null   float64       
 11  humedad               1092 non-null   float64       
 12  sensacion_termica     1092 non-null   float64       
 13  codigo_clima      

In [112]:
df["hora"] = pd.to_datetime(df["hora"])
df["hora_num"] = df["hora"].dt.hour


  df["hora"] = pd.to_datetime(df["hora"])


In [113]:
df = df.drop(columns=["hora"]) 


In [114]:
df = df.drop(columns=["ultima_actualizacion", "fecha"])

In [115]:
df = df.drop(columns=["abierta"])

In [83]:
df.sample(5)

Unnamed: 0,zona,atraccion,tiempo_espera,dia_semana,mes,fin_de_semana,temperatura,humedad,sensacion_termica,codigo_clima,temporada,dia_semana_num,hora_num
139,Warner Beach,IT Experience,5,Thursday,10,False,17.2,71.0,16.5,2.0,baja,3,16
301,DC Super Heroes World,La Venganza del Enigma,5,Saturday,11,True,13.4,50.0,10.8,2.0,baja,5,12
45,Warner Beach,IT Experience,5,Thursday,10,False,17.2,71.0,16.5,2.0,baja,3,16
641,DC Super Heroes World,The Joker Coches de Choque,10,Saturday,11,True,14.3,46.0,11.9,3.0,baja,5,13
104,Cartoon Village,Looney Tunes Correo Aéreo,5,Thursday,10,False,17.2,71.0,16.5,2.0,baja,3,16


In [116]:
df["fin_de_semana"] = df["fin_de_semana"].astype(int)
df = df.drop(columns=["dia_semana"])


In [98]:
df.sample(5)

Unnamed: 0,zona,atraccion,tiempo_espera,mes,fin_de_semana,temperatura,humedad,sensacion_termica,codigo_clima,temporada,dia_semana_num,hora_num
698,Movie World Studios,Hotel Embrujado,5,11,1,15.2,41.0,12.4,2.0,baja,5,15
255,DC Super Heroes World,The Joker Coches de Choque,5,11,1,13.4,50.0,10.8,2.0,baja,5,12
227,Old West Territory,Cataratas Salvajes,0,11,1,13.4,50.0,10.8,2.0,baja,5,12
539,Cartoon Village,La Aventura de Scooby-Doo,30,11,1,14.8,44.0,12.3,2.0,baja,5,14
586,DC Super Heroes World,Shadows of Arkham,25,11,1,14.8,44.0,12.3,2.0,baja,5,14


In [117]:
df["temporada"] = df["temporada"].map({"baja": 0, "media":1, "alta": 2})

In [118]:
df.sample(5)

Unnamed: 0,zona,atraccion,tiempo_espera,mes,fin_de_semana,temperatura,humedad,sensacion_termica,codigo_clima,temporada,dia_semana_num,hora_num
1265,DC Super Heroes World,Shadows of Arkham,10,10,0,18.8,68.0,17.3,2.0,2,2,11
820,Movie World Studios,Stunt Fall,5,11,1,13.1,83.0,11.9,61.0,0,5,15
946,Cartoon Village,La Captura de Gossamer,5,10,0,22.6,44.0,22.2,1.0,2,4,13
704,Cartoon Village,La Aventura de Scooby-Doo,15,11,1,15.2,41.0,12.4,2.0,0,5,15
1224,Cartoon Village,La Captura de Gossamer,5,10,0,18.8,68.0,17.3,2.0,2,2,11


In [119]:
print(df.dtypes)


zona                  object
atraccion             object
tiempo_espera          int64
mes                    int32
fin_de_semana          int64
temperatura          float64
humedad              float64
sensacion_termica    float64
codigo_clima         float64
temporada              int64
dia_semana_num         int32
hora_num               int32
dtype: object


In [120]:
df.duplicated().sum()

np.int64(887)

In [123]:
df[df.duplicated(keep=False)]

Unnamed: 0,zona,atraccion,tiempo_espera,mes,fin_de_semana,temperatura,humedad,sensacion_termica,codigo_clima,temporada,dia_semana_num,hora_num
0,Cartoon Village,A Toda Máquina,5,10,0,17.2,71.0,16.5,2.0,0,3,16
1,Cartoon Village,Academia de Pilotos Baby Looney Tunes,0,10,0,17.2,71.0,16.5,2.0,0,3,16
2,Cartoon Village,Cartoon Carousel,5,10,0,17.2,71.0,16.5,2.0,0,3,16
3,Cartoon Village,Convoy de Camiones,0,10,0,17.2,71.0,16.5,2.0,0,3,16
5,Cartoon Village,Emergencias Pato Lucas,0,10,0,17.2,71.0,16.5,2.0,0,3,16
...,...,...,...,...,...,...,...,...,...,...,...,...
1409,Movie World Studios,Cine Tour,5,10,0,20.9,58.0,19.3,1.0,2,2,13
1410,Movie World Studios,Hotel Embrujado,5,10,0,20.9,58.0,19.3,1.0,2,2,13
1411,Movie World Studios,Stunt Fall,5,10,0,20.9,58.0,19.3,1.0,2,2,13
1412,Old West Territory,Coaster Express,5,10,0,20.9,58.0,19.3,1.0,2,2,13


In [124]:
# Detectar filas duplicadas considerando todas las columnas
duplicados = df[df.duplicated(keep=False)]

# Mostrar cuántas filas duplicadas hay
print("Número de filas duplicadas:", len(duplicados))

# Si quieres ver exactamente cuáles son las filas duplicadas
print(duplicados)

Número de filas duplicadas: 1220
                     zona                              atraccion  \
0         Cartoon Village                         A Toda Máquina   
1         Cartoon Village  Academia de Pilotos Baby Looney Tunes   
2         Cartoon Village                       Cartoon Carousel   
3         Cartoon Village                     Convoy de Camiones   
5         Cartoon Village                 Emergencias Pato Lucas   
...                   ...                                    ...   
1409  Movie World Studios                              Cine Tour   
1410  Movie World Studios                        Hotel Embrujado   
1411  Movie World Studios                             Stunt Fall   
1412   Old West Territory                        Coaster Express   
1413   Old West Territory                  Los Carros de la Mina   

      tiempo_espera  mes  fin_de_semana  temperatura  humedad  \
0                 5   10              0         17.2     71.0   
1                 0 

In [125]:
# Contar cuántas veces se repite cada fila
conteo_filas = df.groupby(list(df.columns)).size().reset_index(name='conteo')

# Filtrar solo las que se repiten más de una vez
filas_repetidas = conteo_filas[conteo_filas['conteo'] > 1]

print(filas_repetidas)


                   zona              atraccion  tiempo_espera  mes  \
0       Cartoon Village         A Toda Máquina              5   10   
3       Cartoon Village         A Toda Máquina              5   10   
5       Cartoon Village         A Toda Máquina              5   10   
6       Cartoon Village         A Toda Máquina              5   10   
7       Cartoon Village         A Toda Máquina              5   10   
..                  ...                    ...            ...  ...   
534  Old West Territory  Los Carros de la Mina              5   11   
535  Old West Territory  Los Carros de la Mina              5   11   
539  Old West Territory              Río Bravo              0   10   
541  Old West Territory              Río Bravo              0   11   
542  Old West Territory              Río Bravo              0   11   

     fin_de_semana  temperatura  humedad  sensacion_termica  codigo_clima  \
0                0         17.2     71.0               16.5           2.0   
3    

In [126]:
duplicadas_todas = df[df.duplicated(keep=False)]
print(duplicadas_todas)


                     zona                              atraccion  \
0         Cartoon Village                         A Toda Máquina   
1         Cartoon Village  Academia de Pilotos Baby Looney Tunes   
2         Cartoon Village                       Cartoon Carousel   
3         Cartoon Village                     Convoy de Camiones   
5         Cartoon Village                 Emergencias Pato Lucas   
...                   ...                                    ...   
1409  Movie World Studios                              Cine Tour   
1410  Movie World Studios                        Hotel Embrujado   
1411  Movie World Studios                             Stunt Fall   
1412   Old West Territory                        Coaster Express   
1413   Old West Territory                  Los Carros de la Mina   

      tiempo_espera  mes  fin_de_semana  temperatura  humedad  \
0                 5   10              0         17.2     71.0   
1                 0   10              0         17.2 

In [127]:
df[df.duplicated(keep=False)]


Unnamed: 0,zona,atraccion,tiempo_espera,mes,fin_de_semana,temperatura,humedad,sensacion_termica,codigo_clima,temporada,dia_semana_num,hora_num
0,Cartoon Village,A Toda Máquina,5,10,0,17.2,71.0,16.5,2.0,0,3,16
1,Cartoon Village,Academia de Pilotos Baby Looney Tunes,0,10,0,17.2,71.0,16.5,2.0,0,3,16
2,Cartoon Village,Cartoon Carousel,5,10,0,17.2,71.0,16.5,2.0,0,3,16
3,Cartoon Village,Convoy de Camiones,0,10,0,17.2,71.0,16.5,2.0,0,3,16
5,Cartoon Village,Emergencias Pato Lucas,0,10,0,17.2,71.0,16.5,2.0,0,3,16
...,...,...,...,...,...,...,...,...,...,...,...,...
1409,Movie World Studios,Cine Tour,5,10,0,20.9,58.0,19.3,1.0,2,2,13
1410,Movie World Studios,Hotel Embrujado,5,10,0,20.9,58.0,19.3,1.0,2,2,13
1411,Movie World Studios,Stunt Fall,5,10,0,20.9,58.0,19.3,1.0,2,2,13
1412,Old West Territory,Coaster Express,5,10,0,20.9,58.0,19.3,1.0,2,2,13


In [128]:
df.isna().sum()

zona                 0
atraccion            0
tiempo_espera        0
mes                  0
fin_de_semana        0
temperatura          0
humedad              0
sensacion_termica    0
codigo_clima         0
temporada            0
dia_semana_num       0
hora_num             0
dtype: int64

In [130]:
df.sample(5)

Unnamed: 0,zona,atraccion,tiempo_espera,mes,fin_de_semana,temperatura,humedad,sensacion_termica,codigo_clima,temporada,dia_semana_num,hora_num
1231,DC Super Heroes World,Superman La Atracción de Acero,20,10,0,18.8,68.0,17.3,2.0,2,2,11
681,Movie World Studios,Stunt Fall,5,11,1,15.2,41.0,12.4,2.0,0,5,15
625,Movie World Studios,Stunt Fall,5,11,1,14.8,44.0,12.3,2.0,0,5,14
659,DC Super Heroes World,Superman La Atracción de Acero,30,11,1,15.2,41.0,12.4,2.0,0,5,15
1011,Old West Territory,Los Carros de la Mina,5,10,0,23.8,40.0,23.4,1.0,2,4,14


In [137]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Tus columnas categóricas
categorical_features = ['zona', 'atraccion']

# Inicializar OneHotEncoder
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Ajustar y transformar
ohe_array = ohe.fit_transform(df[categorical_features])

# Obtener nombres de las nuevas columnas
ohe_columns = ohe.get_feature_names_out(categorical_features)

# Crear DataFrame con las columnas codificadas
df_ohe = pd.DataFrame(ohe_array, columns=ohe_columns, index=df.index)

# Concatenar con el DataFrame original (opcional: puedes quitar las columnas originales)
df_final = pd.concat([df.drop(columns=categorical_features), df_ohe], axis=1)

# Hacer un sample de 10 filas para ver



In [139]:
df_final.sample()

Unnamed: 0,tiempo_espera,mes,fin_de_semana,temperatura,humedad,sensacion_termica,codigo_clima,temporada,dia_semana_num,hora_num,...,atraccion_Piolín y Silvestre Paseo en Autobús,atraccion_Rápidos ACME,atraccion_Río Bravo,atraccion_Scooby-Doo's Tea Party Mistery,atraccion_Shadows of Arkham,atraccion_Stunt Fall,atraccion_Superman La Atracción de Acero,atraccion_The Joker Coches de Choque,atraccion_Tom & Jerry Picnic en el Parque,atraccion_Wile E. Coyote Zona de Explosión
114,0,10,0,17.2,72.0,16.6,2.0,0,3,17,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
df.sample(5)
