NOTE: This notebook is assuming you have the `daily_dataset.csv` inside the folder local_data in data, and the `pernoctacions_2019_2024.csv`, `temperature_precipitation.csv` and `tourism_flux.csv` datasets inside the folder data.

The `daily_dataset.csv` should be exactly how it is when it is being recently downloaded from the AB Datachallenge portal.

In [1]:
import pandas as pd
from collections import Counter
import seaborn as sns
import os
import numpy as np
from matplotlib import pyplot
import matplotlib.pyplot as plt

In [2]:
# Load the datasets
pernoctacions = pd.read_csv('../data/pernoctacions_2019_2024.csv')
tourism = pd.read_csv('../data/tourism_flux.csv')
clima_df = pd.read_csv('../data/temperature_precipitation.csv')

In [3]:
# Convert the 'Day' column to datetime with specified format
tourism['Day'] = pd.to_datetime(tourism['Day'], format='%m/%d/%Y', errors='coerce')

# Group by month and year to calculate total flights per month
monthly_flights = tourism.groupby(tourism['Day'].dt.to_period('M'))['Flights'].sum().reset_index()
monthly_flights.columns = ['Month', 'Total Flights']

# Merge with original DataFrame to calculate percentage
flights_df = tourism.merge(monthly_flights, left_on=tourism['Day'].dt.to_period('M'), right_on='Month', how='left')

# Calculate percentage of flights
flights_df['Percentage of Flights'] = (flights_df['Flights'] / flights_df['Total Flights']) * 100

# Clean up DataFrame
flights_df.drop(columns=['Month'], inplace=True)


In [4]:
# Convertir las fechas en ambos datasets al mismo formato
pernoctacions['Data'] = pd.to_datetime(pernoctacions['Data'], format='%m/%d/%Y')
flights_df['Day'] = pd.to_datetime(flights_df['Day'])
pernoctacions['Pernoctacions'] = pd.to_numeric(pernoctacions['Pernoctacions'].str.replace(',', ''), errors='coerce')

# Iterar sobre los meses de pernoctaciones y distribuir según los porcentajes diarios de vuelos
resultados = []

for _, row in pernoctacions.iterrows():
    # Extraer el mes y año
    mes = row['Data'].month
    anyo = row['Data'].year
    pernoctaciones_mensuales = row['Pernoctacions']
    
    # Filtrar los vuelos de ese mes y año
    vuelos_mes = flights_df[(flights_df['Day'].dt.month == mes) & (flights_df['Day'].dt.year == anyo)]
    
    # Normalizar los porcentajes de vuelos
    vuelos_mes['Percentage of Flights'] = vuelos_mes['Percentage of Flights'] / vuelos_mes['Percentage of Flights'].sum()
    
    # Distribuir las pernoctaciones diarias
    for _, vuelo in vuelos_mes.iterrows():
        pernoctaciones_diarias = round(int(pernoctaciones_mensuales) * vuelo['Percentage of Flights'])
        resultados.append({
            'Day': vuelo['Day'],
            'tipus allotjament': row['Tipologia d\'allotjament'],
            'pernoctacions': pernoctaciones_diarias
        })

# Crear el DataFrame final
resultado_df = pd.DataFrame(resultados)

# Don't print warnings
pd.options.mode.chained_assignment = None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vuelos_mes['Percentage of Flights'] = vuelos_mes['Percentage of Flights'] / vuelos_mes['Percentage of Flights'].sum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vuelos_mes['Percentage of Flights'] = vuelos_mes['Percentage of Flights'] / vuelos_mes['Percentage of Flights'].sum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

ValueError: cannot convert float NaN to integer

In [None]:
resultado_df = resultado_df.sort_values(by='Day', ascending=False)

In [None]:
clima_df = clima_df.groupby('fecha').agg({
    'temp_max': 'mean',
    'temp_min': 'mean',
    'precipitacion': 'mean'
}).round(2).reset_index()


In [None]:
# Convertir la columna 'fecha' a formato datetime
clima_df['fecha'] = pd.to_datetime(clima_df['fecha'])

# Asegurarse de que la columna 'Day' en resultado_df esté en formato datetime
resultado_df['Day'] = pd.to_datetime(resultado_df['Day'])

# Hacer el merge por la columna de fecha (Day en resultado_df y fecha en clima_df)
merged_df = pd.merge(resultado_df, clima_df, left_on='Day', right_on='fecha', how='left')

In [None]:
# Step 1: Remove the `fecha` column
df = merged_df.drop(columns=['fecha'])

# Step 2: Pivot the data
df_pivoted = df.pivot_table(
    index=["Day", "temp_max", "temp_min", "precipitacion"],
    columns="tipus allotjament",
    values="pernoctacions",
    fill_value=0  # Optional: fill missing values with 0
).reset_index()

# Step 3: Rename columns
df_pivoted.columns = [
    "Day", "temp_max", "temp_min", "precipitacion",
    "Pernoctacions_Albergs", "Pernoctacions_Habitatge_turístic", "Pernoctacions_Hotel"
]

# Step 4: Add `Pernoctacions_Albergs` to `Pernoctacions_Hotel`
df_pivoted["Pernoctacions_Hotel"] += df_pivoted["Pernoctacions_Albergs"]

# Step 5: Drop `Pernoctacions_Albergs` column
df_pivoted = df_pivoted.drop(columns=["Pernoctacions_Albergs"])

#Change Day format
df_pivoted['Day'] = pd.to_datetime(df_pivoted['Day']).dt.strftime('%d-%m-%Y')

In [None]:
df_pivoted.head()

In [None]:
# df_pivoted.to_csv('../data/Preprocessed_Data/Tourism_Temp_bcn.csv', index=False)
tourism_temp_bcn = df_pivoted

### End of Martí's code

In [None]:
daily_aigues = pd.read_csv("../data/local_data/daily_dataset.csv", encoding='latin1')
daily_aigues.tail()

In [None]:
daily_aigues = daily_aigues.rename(columns={
    daily_aigues.columns[0]: 'Census Section',
    daily_aigues.columns[1]: 'District',
    daily_aigues.columns[2]: 'Municipality',
    daily_aigues.columns[3]: 'Date',
    daily_aigues.columns[4]: 'Use',
    daily_aigues.columns[5]: 'Number of Meters',
    daily_aigues.columns[6]: 'Accumulated Consumption'
})

daily_aigues['Date'] = pd.to_datetime(daily_aigues['Date'])

daily_aigues.info()

In [None]:
daily_aigues['Use'] = daily_aigues['Use'].replace({
    'Comercial/Comercial/Commercial': 'Commercial',
    'DomÃ¨stic/DomÃ©stico/Domestic': 'Domestic',
    'Industrial/Industrial/Industrial': 'Industrial'
})

daily_aigues['Use'].unique()

In [None]:
daily_aigues['Municipality'].unique()

In [None]:
daily_aigues_bcn = daily_aigues[daily_aigues['Municipality'] == 'BARCELONA'].drop(columns=["Municipality"])

In [None]:
daily_aigues_bcn.tail(20)

In [None]:
daily_aigues_bcn.shape

In [None]:
# tourism_temp_bcn imported from Marti's code above
# tourism_temp_bcn = pd.read_csv("../data/tourism_temp_bcn.csv")
tourism_temp_bcn.head()

In [None]:
tourism_temp_bcn = tourism_temp_bcn.rename(columns={
    tourism_temp_bcn.columns.values[0] : 'Day',
    tourism_temp_bcn.columns.values[1] : 'Max Temperature',
    tourism_temp_bcn.columns.values[2] : 'Min Temperature',
    tourism_temp_bcn.columns.values[3] : 'Precipitation',
    tourism_temp_bcn.columns.values[4] : 'Tourist Accommodations',
    tourism_temp_bcn.columns.values[5] : 'Hotel Overnight Stays'
})

tourism_temp_bcn['Day'] = pd.to_datetime(tourism_temp_bcn['Day'], format="%d-%m-%Y")

In [None]:
tourism_temp_bcn.info()

In [None]:
merged_data = (daily_aigues_bcn.merge(tourism_temp_bcn, left_on='Date', right_on='Day', how='left')).drop(columns=["Day"])

In [None]:
merged_data.shape

In [None]:
merged_data.head(5)

In [None]:
# one_hot_encoded = pd.get_dummies(merged_data)

### End of Yuyan's code

In [None]:
# the column hotel overnight stays has a .0 at the end and it should be int. Let's fix it
merged_data['Hotel Overnight Stays'] = merged_data['Hotel Overnight Stays'].astype(int)

In [None]:
merged_data.isna().sum()

In [None]:
# As the columns census section and district contain some nulls, we will drop the rows with nulls in these columns
merged_data = merged_data.dropna(subset=['Census Section', 'District'])

In [None]:
merged_data.isna().sum()

In [None]:
# plot the accumulated consumption to detect outliers
plt.figure(figsize=(20, 6))
plt.plot(merged_data['Accumulated Consumption'])
plt.title('Accumulated Consumption')
plt.show()

In [None]:
# Drop all the accumulated consumption values below 0
merged_data = merged_data[merged_data['Accumulated Consumption'] > 0]

In [None]:
merged_data.describe()

In [None]:
# There's a couple of observations with the Census Section and the District columns that are <NULL> and >. We will drop these rows
merged_data = merged_data[merged_data['Census Section'] != '<NULL>']
merged_data = merged_data[merged_data['District'] != '>']
merged_cleaned_data = merged_data

In [None]:
# Save CSV data

# Create the directory if it does not exist
os.makedirs('../data/local_data/', exist_ok=True)

# Save the cleaned dataset
merged_cleaned_data.to_csv('../data/local_data/merged_cleaned_data.csv', index=False)