In [2]:
import pandas as pd

# Read the three raw CSV inputs in one pass; each path maps positionally onto
# the variable in the same position on the left-hand side.
pernoctacions, tourism, clima_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/pernoctacions_2019_2024.csv',
        '../data/tourism_flux.csv',
        '../data/temperature_precipitation.csv',
    )
)

In [2]:
# Parse 'Day' as month/day/year. errors='coerce' turns unparseable values into
# NaT instead of raising; check tourism['Day'].isna().sum() if the input is suspect.
tourism['Day'] = pd.to_datetime(tourism['Day'], format='%m/%d/%Y', errors='coerce')

# Calendar month (as a pandas Period) of every daily row.
day_month = tourism['Day'].dt.to_period('M')

# One row per month with the number of flights summed over that month.
monthly_flights = (
    tourism.groupby(day_month)['Flights']
    .sum()
    .rename('Total Flights')
    .rename_axis('Month')
    .reset_index()
)

# Attach the monthly total to each daily row through a temporary 'Month' key,
# then remove the key again so only the original columns plus the total remain.
flights_df = (
    tourism
    .assign(Month=day_month)
    .merge(monthly_flights, on='Month', how='left')
    .drop(columns=['Month'])
)

# Share of the month's flights that each row represents, in percent.
flights_df['Percentage of Flights'] = (flights_df['Flights'] / flights_df['Total Flights']) * 100


In [None]:
# Distribute each monthly overnight-stay total over the days of that month,
# proportionally to each day's share of the month's flights.
#
# Inputs : pernoctacions (columns 'Data', "Tipologia d'allotjament", 'Pernoctacions')
#          flights_df    (columns 'Day', 'Percentage of Flights')
# Output : resultado_df  (columns 'Day', 'tipus allotjament', 'pernoctacions')

# Bring both date columns to datetime and the monthly totals to numbers.
pernoctacions['Data'] = pd.to_datetime(pernoctacions['Data'], format='%m/%d/%Y')
flights_df['Day'] = pd.to_datetime(flights_df['Day'])
# astype(str) keeps this cell re-runnable: after a first run the column is
# already numeric and a bare `.str` accessor would raise AttributeError.
# regex=False strips the thousands separator as a literal comma.
pernoctacions['Pernoctacions'] = pd.to_numeric(
    pernoctacions['Pernoctacions'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)

# Daily shares, computed once for all months instead of re-filtering
# flights_df for every monthly row. Rows without a parsed date are ignored.
daily_shares = pd.DataFrame({
    '_month': flights_df['Day'].dt.to_period('M'),
    'Day': flights_df['Day'],
    '_pct': flights_df['Percentage of Flights'],
}).dropna(subset=['Day'])
month_pct_total = daily_shares.groupby('_month')['_pct'].transform('sum')
# Normalise so the shares of a month add up to 1. Months whose total is not
# positive get NaN shares and are dropped rather than crashing round().
# .assign() builds a new frame, so no chained-assignment warning is triggered
# and the global `pd.options.mode.chained_assignment = None` is not needed.
daily_shares = (
    daily_shares
    .assign(_share=daily_shares['_pct'] / month_pct_total.where(month_pct_total > 0))
    .dropna(subset=['_share'])
)

# Monthly totals keyed by calendar month. Rows whose date or total is missing
# (e.g. coerced to NaN above) are skipped: int(NaN) would raise ValueError.
tipus_col = "Tipologia d'allotjament"
monthly_totals = (
    pernoctacions[['Data', tipus_col, 'Pernoctacions']]
    .dropna(subset=['Data', 'Pernoctacions'])
    .assign(_month=lambda frame: frame['Data'].dt.to_period('M'))
)

# Pair every monthly total with every day of the same month.
distribuido = monthly_totals.merge(
    daily_shares[['_month', 'Day', '_share']],
    on='_month',
    how='inner',
)

# Build the final DataFrame. The monthly total is truncated to an integer
# before scaling and the product is rounded to the nearest integer, as in the
# original round(int(total) * share). The columns exist even when no month
# matched, so downstream cells can still sort/merge on 'Day'.
resultado_df = pd.DataFrame({
    'Day': distribuido['Day'],
    'tipus allotjament': distribuido[tipus_col],
    'pernoctacions': (
        distribuido['Pernoctacions'].astype('int64') * distribuido['_share']
    ).round().astype('int64'),
})

In [4]:
# Order the daily rows from the most recent day to the oldest.
resultado_df = resultado_df.sort_values(by=['Day'], ascending=[False])

In [5]:
# Collapse the weather readings to one row per 'fecha': average each measure
# over the rows sharing that date and round the result to two decimals.
weather_measures = ['temp_max', 'temp_min', 'precipitacion']
clima_df = (
    clima_df
    .groupby('fecha')
    .agg({measure: 'mean' for measure in weather_measures})
    .round(2)
    .reset_index()
)


In [6]:
# Make sure both join keys are datetime before merging:
# 'Day' on the overnight-stay side, 'fecha' on the weather side.
resultado_df['Day'] = pd.to_datetime(resultado_df['Day'])
clima_df['fecha'] = pd.to_datetime(clima_df['fecha'])

# Left join: every row of resultado_df is kept and receives the weather
# columns of the matching date ('Day' == 'fecha') when one exists.
merged_df = resultado_df.merge(
    clima_df,
    how='left',
    left_on='Day',
    right_on='fecha',
)

In [12]:
# Reshape the long table (one row per day and accommodation type) into one
# row per day with one overnight-stay column per accommodation type.

# Step 1: Remove the `fecha` column (the join key duplicated by the merge).
df = merged_df.drop(columns=['fecha'])

# Step 2: Pivot the data.
# aggfunc='sum' is explicit on purpose: pivot_table defaults to the mean, which
# would average (and so under-count) several rows sharing the same day and
# accommodation type. With one row per combination sum and mean coincide.
index_cols = ["Day", "temp_max", "temp_min", "precipitacion"]
df_pivoted = df.pivot_table(
    index=index_cols,
    columns="tipus allotjament",
    values="pernoctacions",
    aggfunc="sum",
    fill_value=0,  # combinations with no data become 0
).reset_index()

# Step 3: Rename columns.
# The rename is positional: after reset_index the index columns come first,
# followed by one column per accommodation type. Fail with a readable message
# if the number of types is not the three expected here.
# NOTE(review): the mapping assumes the pivoted type columns come out in the
# order Albergs, Habitatge turístic, Hotel - confirm against the source data.
tipus_renamed = [
    "Pernoctacions_Albergs", "Pernoctacions_Habitatge_turístic", "Pernoctacions_Hotel"
]
tipus_found = [col for col in df_pivoted.columns if col not in index_cols]
if len(tipus_found) != len(tipus_renamed):
    raise ValueError(
        f"Expected {len(tipus_renamed)} accommodation types, found {len(tipus_found)}: {tipus_found}"
    )
df_pivoted.columns = index_cols + tipus_renamed

# Step 4: Add `Pernoctacions_Albergs` to `Pernoctacions_Hotel`.
df_pivoted["Pernoctacions_Hotel"] += df_pivoted["Pernoctacions_Albergs"]

# Step 5: Drop `Pernoctacions_Albergs` column.
df_pivoted = df_pivoted.drop(columns=["Pernoctacions_Albergs"])

# Change Day format: store the date as a day-month-year string.
df_pivoted['Day'] = pd.to_datetime(df_pivoted['Day']).dt.strftime('%d-%m-%Y')


In [13]:
# Preview the first five rows of the pivoted table.
df_pivoted.head(5)

Unnamed: 0,Day,temp_max,temp_min,precipitacion,Pernoctacions_Habitatge_turístic,Pernoctacions_Hotel
0,01-01-2021,7.55,4.3,5.65,3500,8700.0
1,02-01-2021,8.55,1.55,0.0,6369,15828.0
2,03-01-2021,9.55,0.85,0.0,6904,17160.0
3,04-01-2021,10.1,0.65,0.0,5825,14478.0
4,05-01-2021,9.4,0.65,0.0,4669,11603.0


In [14]:
# Write the pivoted table to disk; index=False leaves the row index out of the file.
df_pivoted.to_csv(
    '../data/Preprocessed_Data/Tourism_Temp_bcn.csv',
    index=False,
)