NOTE: This notebook assumes that `daily_dataset.csv` is located in the `local_data` folder inside `data`, and that the `pernoctacions_2019_2024.csv`, `temperature_precipitation.csv` and `tourism_flux.csv` datasets are located directly in the `data` folder.

The `daily_dataset.csv` file must be left unmodified, exactly as it is when freshly downloaded from the AB Datachallenge portal.

In [1]:
import pandas as pd
from collections import Counter
import seaborn as sns
import os
import numpy as np
from matplotlib import pyplot
import matplotlib.pyplot as plt

import warnings
# Install a blanket "ignore" filter: every warning raised from this point on
# (in any later cell, by any library) is silenced for the rest of the session.
warnings.filterwarnings("ignore")


## 1. Preprocessing of daily_dataset

In [2]:
# Step 1: Load and preprocess daily water consumption data
# The raw export is decoded as latin1; the 'Use' labels further down are
# matched against the text exactly as that decoding produces it.
daily_aigues = pd.read_csv("../data/local_data/daily_dataset.csv", encoding='latin1')

# Give the first seven columns English names, addressing them by position
# so the original header text does not have to be spelled out here.
daily_aigues = daily_aigues.rename(columns={
    daily_aigues.columns[position]: english_name
    for position, english_name in enumerate((
        'Census Section',
        'District',
        'Municipality',
        'Date',
        'Use',
        'Number of Meters',
        'Accumulated Consumption',
    ))
})

# Parse the dates and collapse the trilingual 'Use' labels to their English form.
# NOTE(review): the accented key looks like UTF-8 text decoded as latin1 —
# confirm before changing the encoding passed to read_csv above.
daily_aigues = daily_aigues.assign(
    Date=lambda frame: pd.to_datetime(frame['Date']),
    Use=lambda frame: frame['Use'].replace({
        'Comercial/Comercial/Commercial': 'Commercial',
        'DomÃ¨stic/DomÃ©stico/Domestic': 'Domestic',
        'Industrial/Industrial/Industrial': 'Industrial'
    }),
)

# Build the Barcelona subset. The filters run one after another, in this order:
#   1. keep only the BARCELONA municipality (the column is then redundant and dropped)
#   2. discard rows whose Census Section is the '<NULL>' placeholder
#   3. discard rows whose District is the stray '>' value
#   4. keep only strictly positive Accumulated Consumption
#      (zero readings are removed as well as negative ones)
daily_aigues_bcn = (
    daily_aigues
    .loc[daily_aigues['Municipality'] == 'BARCELONA']
    .drop(columns=["Municipality"])
    .loc[lambda frame: frame['Census Section'] != '<NULL>']
    .loc[lambda frame: frame['District'] != '>']
    .loc[lambda frame: frame['Accumulated Consumption'] > 0]
)



## 2. Adding meteorological data

In [3]:
# Step 2: Adding meteorological data (temperature and precipitation)
# Read the weather file, parse 'fecha' as datetime, then collapse all rows that
# share a date into one row holding the mean of each measurement (2 decimals).
clima_df = (
    pd.read_csv('../data/temperature_precipitation.csv')
    .assign(fecha=lambda frame: pd.to_datetime(frame['fecha']))
    .groupby('fecha')
    .agg(dict.fromkeys(['temp_max', 'temp_min', 'precipitacion'], 'mean'))
    .round(2)
    .reset_index()
)

# Left-join the daily weather onto the consumption rows by date, so every
# consumption row is kept; the duplicate key column 'fecha' is dropped afterwards.
merged_data = pd.merge(
    daily_aigues_bcn,
    clima_df,
    how='left',
    left_on='Date',
    right_on='fecha',
).drop(columns='fecha')

# Debug: show the first rows of the merged frame
print("Merged data after adding meteorological data:", merged_data.head(), sep="\n")


Merged data after adding meteorological data:
  Census Section District       Date         Use  Number of Meters  \
0      801901001        1 2021-01-01  Commercial                56   
1      801901001        1 2021-01-01    Domestic               322   
2      801901001        1 2021-01-01  Industrial                10   
3      801901001        1 2021-01-02  Commercial                56   
4      801901001        1 2021-01-02    Domestic               322   

   Accumulated Consumption  temp_max  temp_min  precipitacion  
0                     2710      7.55      4.30           5.65  
1                    10058      7.55      4.30           5.65  
2                   140646      7.55      4.30           5.65  
3                     3573      8.55      1.55           0.00  
4                     9600      8.55      1.55           0.00  


## 3. Adding tourism data

## 4. Save the cleaned dataset

In [None]:
# Step 4: Save the cleaned dataset
cleaned_output_dir = '../data/local_data/'
cleaned_output_path = '../data/local_data/merged_cleaned_data_NEW_p.csv'

# Create the target folder if it is missing, then write the merged frame
# without the DataFrame index column.
os.makedirs(cleaned_output_dir, exist_ok=True)
merged_data.to_csv(cleaned_output_path, index=False)