## Importación de librerías

In [1]:
# Data analysis and visualization libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import shapiro, poisson, chisquare, expon, kstest

# Configuration: do not truncate columns when displaying wide DataFrames
pd.set_option('display.max_columns', None)

# Ignore warnings.
# NOTE(review): this filter silences every warning category for the session,
# including deprecation warnings — consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

# Data imputation libraries
from sklearn.impute import SimpleImputer
# Keep this import before IterativeImputer: it enables the experimental estimator
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# JSON handling (the dataset stores some columns as JSON-encoded text)
import json

## Carga y exploración inicial del dataset

In [2]:
# Load the raw destinations dataset; the relative path is resolved against the working directory
df_viajes = pd.read_csv("viajes.csv")

In [3]:
# Preview the first rows of the dataset
df_viajes.head()

Unnamed: 0,id,city,country,region,short_description,latitude,longitude,avg_temp_monthly,ideal_durations,budget_level,culture,adventure,nature,beaches,nightlife,cuisine,wellness,urban,seclusion
0,c54acf38-3029-496b-8c7a-8343ad82785c,Milan,Italy,europe,"Chic streets lined with fashion boutiques, his...",45.464194,9.189635,"{""1"":{""avg"":3.7,""max"":7.8,""min"":0.4},""2"":{""avg...","[""Short trip"",""One week""]",Luxury,5,2,2,1,4,5,3,5,2
1,0bd12654-ed64-424e-a044-7bc574bcf078,Yasawa Islands,Fiji,oceania,"Crystal-clear waters, secluded beaches, and vi...",-17.290947,177.125786,"{""1"":{""avg"":28,""max"":30.8,""min"":25.8},""2"":{""av...","[""Long trip"",""One week""]",Luxury,2,4,5,5,2,3,4,1,5
2,73036cda-9134-46fc-a2c6-807782d59dfb,Whistler,Canada,north_america,Snow-capped peaks and lush forests create a se...,50.11719,-122.954302,"{""1"":{""avg"":-2.5,""max"":0.4,""min"":-5.5},""2"":{""a...","[""Short trip"",""Weekend"",""One week""]",Luxury,3,5,5,2,3,3,4,2,4
3,3872c9c0-6b6e-49e1-9743-f46bfe591b86,Guanajuato,Mexico,north_america,Winding cobblestone streets and colorful facad...,20.9877,-101.0,"{""1"":{""avg"":15.5,""max"":22.8,""min"":8.7},""2"":{""a...","[""Weekend"",""One week"",""Short trip""]",Mid-range,5,3,3,1,3,4,3,4,2
4,e1ebc1b6-8798-422d-847a-22016faff3fd,Surabaya,Indonesia,asia,Bustling streets filled with the aroma of loca...,-7.245972,112.737827,"{""1"":{""avg"":28.1,""max"":32.5,""min"":25.5},""2"":{""...","[""Short trip"",""Weekend""]",Budget,4,3,3,2,3,4,3,4,2


In [4]:
# General information about the dataset: columns, non-null counts and dtypes
df_viajes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560 entries, 0 to 559
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 560 non-null    object 
 1   city               560 non-null    object 
 2   country            560 non-null    object 
 3   region             560 non-null    object 
 4   short_description  560 non-null    object 
 5   latitude           560 non-null    float64
 6   longitude          560 non-null    float64
 7   avg_temp_monthly   560 non-null    object 
 8   ideal_durations    560 non-null    object 
 9   budget_level       560 non-null    object 
 10  culture            560 non-null    int64  
 11  adventure          560 non-null    int64  
 12  nature             560 non-null    int64  
 13  beaches            560 non-null    int64  
 14  nightlife          560 non-null    int64  
 15  cuisine            560 non-null    int64  
 16  wellness           560 non

In [5]:
# Dataset dimensions as (rows, columns)
df_viajes.shape

(560, 19)

## Distribución de variables numéricas y categóricas

In [6]:
# Distribution of the numeric variables: for every numeric column print the
# number of distinct values followed by the relative frequency of each value
numeric_columns = df_viajes.select_dtypes(include='number').columns
for col in numeric_columns:
    column_values = df_viajes[col]
    print('-----------------------------')
    print(f"La distribución de las categorías para la columna {col.upper()}")
    print(column_values.nunique())
    print(column_values.value_counts(normalize=True))

-----------------------------
La distribución de las categorías para la columna LATITUDE
552
latitude
 50.061947    0.003571
 46.813743    0.003571
 40.712728    0.003571
 38.895037    0.003571
 43.213036    0.003571
                ...   
-18.910012    0.001786
-6.159562     0.001786
 46.484302    0.001786
 54.596391    0.001786
-19.047725    0.001786
Name: proportion, Length: 552, dtype: float64
-----------------------------
La distribución de las categorías para la columna LONGITUDE
552
longitude
 19.936856    0.003571
-71.208406    0.003571
-74.006015    0.003571
-77.036543    0.003571
 2.349107     0.003571
                ...   
 47.525581    0.001786
 39.200969    0.001786
 30.732288    0.001786
-5.930183     0.001786
-65.259431    0.001786
Name: proportion, Length: 552, dtype: float64
-----------------------------
La distribución de las categorías para la columna CULTURE
4
culture
4    0.425000
3    0.308929
5    0.230357
2    0.035714
Name: proportion, dtype: float64
---------

In [7]:
# Distribution of the categorical (object-dtype) variables: for every such column
# print the number of distinct values followed by the relative frequency of each one
for col, column_values in df_viajes.select_dtypes(include='object').items():
    print('-----------------------------')
    print(f"La distribución de las categorías para la columna {col.upper()}")
    print(column_values.nunique())
    print(column_values.value_counts(normalize=True))

-----------------------------
La distribución de las categorías para la columna ID
560
id
c54acf38-3029-496b-8c7a-8343ad82785c    0.001786
12aa2887-b59c-4f3d-bad6-7b57dac21879    0.001786
acd5d99c-5303-4f22-b176-f67a87c7e59c    0.001786
29e3c666-7847-4ce2-a1ab-c4036a94b9db    0.001786
e79d9e7b-6fc0-43de-a4fd-df7874e6f754    0.001786
                                          ...   
d9f5aa87-6ce6-4675-b55c-e64635035d90    0.001786
5eb2414d-ccc4-4665-b86a-f463c43a9c1b    0.001786
e90eff18-779e-4ad9-8f75-944a034c0600    0.001786
72a44cc1-8aff-4424-95a0-81afd9aa6db1    0.001786
62caf451-c399-4d64-a770-1efd93a603ef    0.001786
Name: proportion, Length: 560, dtype: float64
-----------------------------
La distribución de las categorías para la columna CITY
559
city
Granada          0.003571
Milan            0.001786
Cuenca           0.001786
Moroni           0.001786
Prague           0.001786
                   ...   
Zanzibar Town    0.001786
Odessa           0.001786
Belfast          0.0017

## Transformación y limpieza de datos

In [8]:
# Convert the JSON-encoded monthly temperature strings into Python dicts.
# Only string values are parsed: once the column already holds dicts, re-running
# this cell is a no-op instead of raising a TypeError from json.loads.
df_viajes['avg_temp_monthly'] = df_viajes['avg_temp_monthly'].apply(
    lambda raw: json.loads(raw) if isinstance(raw, str) else raw
)

In [9]:
# Build a long-format dataframe with the monthly temperatures broken out:
# one record per destination id and month, holding that month's avg / max / min
rows = [
    {
        'id': dest_id,
        'Month': int(month_key),
        'Avg': month_stats['avg'],
        'Max': month_stats['max'],
        'Min': month_stats['min'],
    }
    for dest_id, temps_by_month in zip(df_viajes['id'], df_viajes['avg_temp_monthly'])
    for month_key, month_stats in temps_by_month.items()
]

df_monthly = pd.DataFrame(rows)

In [10]:
# Overview of the new monthly temperature dataframe
df_monthly.head()

Unnamed: 0,id,Month,Avg,Max,Min
0,c54acf38-3029-496b-8c7a-8343ad82785c,1,3.7,7.8,0.4
1,c54acf38-3029-496b-8c7a-8343ad82785c,2,7.1,12.0,2.8
2,c54acf38-3029-496b-8c7a-8343ad82785c,3,10.5,15.5,5.5
3,c54acf38-3029-496b-8c7a-8343ad82785c,4,13.8,18.9,8.7
4,c54acf38-3029-496b-8c7a-8343ad82785c,5,17.9,22.5,13.4


In [11]:
# Descriptive statistics, transposed so that each variable is shown as a row
df_monthly.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Month,6720.0,6.5,3.452309,1.0,3.75,6.5,9.25,12.0
Avg,6720.0,17.966071,8.934916,-22.1,11.8,19.3,25.8,40.2
Max,6720.0,22.688958,9.257218,-14.8,16.7,24.55,30.2,47.0
Min,6720.0,13.747426,9.003137,-23.7,7.2,14.5,21.7,33.4


In [12]:
# Dimensions of the new dataframe as (rows, columns)
df_monthly.shape

(6720, 5)

In [13]:
# Switch the decimal separator of the temperature columns from '.' to ','.
# NOTE(review): each column is cast to str first, so after this cell Min/Max/Avg
# hold text rather than numbers; DataFrame.to_csv(decimal=',') may be an
# alternative that keeps them numeric — confirm what the CSV consumer expects.
for temp_col in ('Min', 'Max', 'Avg'):
    as_text = df_monthly[temp_col].astype(str)
    df_monthly[temp_col] = as_text.str.replace('.', ',', regex=False)

In [14]:
# Check: preview the dataframe after the decimal-separator replacement
df_monthly.head()

Unnamed: 0,id,Month,Avg,Max,Min
0,c54acf38-3029-496b-8c7a-8343ad82785c,1,37,78,4
1,c54acf38-3029-496b-8c7a-8343ad82785c,2,71,120,28
2,c54acf38-3029-496b-8c7a-8343ad82785c,3,105,155,55
3,c54acf38-3029-496b-8c7a-8343ad82785c,4,138,189,87
4,c54acf38-3029-496b-8c7a-8343ad82785c,5,179,225,134


In [15]:
# Save as CSV; index=False leaves the row index out of the file
df_monthly.to_csv("monthly_temp.csv",index=False)

In [16]:
# Parse the JSON-encoded list of ideal durations into real Python lists
df_viajes['ideal_durations_list'] = df_viajes['ideal_durations'].apply(json.loads)

# Durations to evaluate, and the snake_case column name derived from each one
durations = ["Weekend", "Short trip", "One week", "Long trip", "Day trip"]
flag_columns = [label.lower().replace(" ", "_") for label in durations]

# One "Sí"/"No" column per duration, telling whether it appears in the row's list
for duration, col_name in zip(durations, flag_columns):
    df_viajes[col_name] = df_viajes['ideal_durations_list'].apply(
        lambda options, wanted=duration: "Sí" if wanted in options else "No"
    )

# Table holding only the destination id and the new duration flag columns
duration_flags = df_viajes[['id'] + flag_columns]

In [17]:
# Check: preview the dataset including the new duration flag columns
df_viajes.head()

Unnamed: 0,id,city,country,region,short_description,latitude,longitude,avg_temp_monthly,ideal_durations,budget_level,culture,adventure,nature,beaches,nightlife,cuisine,wellness,urban,seclusion,ideal_durations_list,weekend,short_trip,one_week,long_trip,day_trip
0,c54acf38-3029-496b-8c7a-8343ad82785c,Milan,Italy,europe,"Chic streets lined with fashion boutiques, his...",45.464194,9.189635,"{'1': {'avg': 3.7, 'max': 7.8, 'min': 0.4}, '2...","[""Short trip"",""One week""]",Luxury,5,2,2,1,4,5,3,5,2,"[Short trip, One week]",No,Sí,Sí,No,No
1,0bd12654-ed64-424e-a044-7bc574bcf078,Yasawa Islands,Fiji,oceania,"Crystal-clear waters, secluded beaches, and vi...",-17.290947,177.125786,"{'1': {'avg': 28, 'max': 30.8, 'min': 25.8}, '...","[""Long trip"",""One week""]",Luxury,2,4,5,5,2,3,4,1,5,"[Long trip, One week]",No,No,Sí,Sí,No
2,73036cda-9134-46fc-a2c6-807782d59dfb,Whistler,Canada,north_america,Snow-capped peaks and lush forests create a se...,50.11719,-122.954302,"{'1': {'avg': -2.5, 'max': 0.4, 'min': -5.5}, ...","[""Short trip"",""Weekend"",""One week""]",Luxury,3,5,5,2,3,3,4,2,4,"[Short trip, Weekend, One week]",Sí,Sí,Sí,No,No
3,3872c9c0-6b6e-49e1-9743-f46bfe591b86,Guanajuato,Mexico,north_america,Winding cobblestone streets and colorful facad...,20.9877,-101.0,"{'1': {'avg': 15.5, 'max': 22.8, 'min': 8.7}, ...","[""Weekend"",""One week"",""Short trip""]",Mid-range,5,3,3,1,3,4,3,4,2,"[Weekend, One week, Short trip]",Sí,Sí,Sí,No,No
4,e1ebc1b6-8798-422d-847a-22016faff3fd,Surabaya,Indonesia,asia,Bustling streets filled with the aroma of loca...,-7.245972,112.737827,"{'1': {'avg': 28.1, 'max': 32.5, 'min': 25.5},...","[""Short trip"",""Weekend""]",Budget,4,3,3,2,3,4,3,4,2,"[Short trip, Weekend]",Sí,Sí,No,No,No


In [24]:
# Drop the raw/helper columns that are no longer needed.
# errors="ignore" skips any listed column that is not present, so the cell can be
# re-run safely; the result is rebound to df_viajes instead of mutating in place.
# NOTE(review): the duration flag columns created in this notebook are named
# weekend / short_trip / one_week / long_trip / day_trip (no "duration_" prefix),
# so the "duration_*" entries match nothing in the visible code — confirm whether
# they are stale or whether the flag columns were meant to be dropped.
cols_a_eliminar = [
    "avg_temp_monthly", "ideal_durations", "ideal_durations_list",
    "duration_weekend", "duration_short_trip", "duration_one_week",
    "duration_long_trip", "duration_day_trip"
]
df_viajes = df_viajes.drop(columns=cols_a_eliminar, errors="ignore")

In [25]:
# Check: preview the dataset after dropping the helper columns
df_viajes.head()

Unnamed: 0,id,city,country,region,short_description,latitude,longitude,budget_level,culture,adventure,nature,beaches,nightlife,cuisine,wellness,urban,seclusion,weekend,short_trip,one_week,long_trip,day_trip
0,c54acf38-3029-496b-8c7a-8343ad82785c,Milan,Italy,Europe,"Chic streets lined with fashion boutiques, his...",45.464194,9.189635,Luxury,5,2,2,1,4,5,3,5,2,No,Sí,Sí,No,No
1,0bd12654-ed64-424e-a044-7bc574bcf078,Yasawa Islands,Fiji,Oceania,"Crystal-clear waters, secluded beaches, and vi...",-17.290947,177.125786,Luxury,2,4,5,5,2,3,4,1,5,No,No,Sí,Sí,No
2,73036cda-9134-46fc-a2c6-807782d59dfb,Whistler,Canada,North America,Snow-capped peaks and lush forests create a se...,50.11719,-122.954302,Luxury,3,5,5,2,3,3,4,2,4,Sí,Sí,Sí,No,No
3,3872c9c0-6b6e-49e1-9743-f46bfe591b86,Guanajuato,Mexico,North America,Winding cobblestone streets and colorful facad...,20.9877,-101.0,Mid-range,5,3,3,1,3,4,3,4,2,Sí,Sí,Sí,No,No
4,e1ebc1b6-8798-422d-847a-22016faff3fd,Surabaya,Indonesia,Asia,Bustling streets filled with the aroma of loca...,-7.245972,112.737827,Budget,4,3,3,2,3,4,3,4,2,Sí,Sí,No,No,No


In [26]:
# Tidy the region names: underscores become spaces (literal match, no regex),
# then every word is title-cased, e.g. "north_america" -> "North America"
region_with_spaces = df_viajes['region'].str.replace('_', ' ', regex=False)
df_viajes['region'] = region_with_spaces.str.title()

In [27]:
# Verify the changes to the region names
df_viajes.head()

Unnamed: 0,id,city,country,region,short_description,latitude,longitude,budget_level,culture,adventure,nature,beaches,nightlife,cuisine,wellness,urban,seclusion,weekend,short_trip,one_week,long_trip,day_trip
0,c54acf38-3029-496b-8c7a-8343ad82785c,Milan,Italy,Europe,"Chic streets lined with fashion boutiques, his...",45.464194,9.189635,Luxury,5,2,2,1,4,5,3,5,2,No,Sí,Sí,No,No
1,0bd12654-ed64-424e-a044-7bc574bcf078,Yasawa Islands,Fiji,Oceania,"Crystal-clear waters, secluded beaches, and vi...",-17.290947,177.125786,Luxury,2,4,5,5,2,3,4,1,5,No,No,Sí,Sí,No
2,73036cda-9134-46fc-a2c6-807782d59dfb,Whistler,Canada,North America,Snow-capped peaks and lush forests create a se...,50.11719,-122.954302,Luxury,3,5,5,2,3,3,4,2,4,Sí,Sí,Sí,No,No
3,3872c9c0-6b6e-49e1-9743-f46bfe591b86,Guanajuato,Mexico,North America,Winding cobblestone streets and colorful facad...,20.9877,-101.0,Mid-range,5,3,3,1,3,4,3,4,2,Sí,Sí,Sí,No,No
4,e1ebc1b6-8798-422d-847a-22016faff3fd,Surabaya,Indonesia,Asia,Bustling streets filled with the aroma of loca...,-7.245972,112.737827,Budget,4,3,3,2,3,4,3,4,2,Sí,Sí,No,No,No


In [28]:
# Save the final dataframe to a CSV file; index=False leaves the row index out of the file
df_viajes.to_csv("travel_info.csv",index=False)