# Etape 2.2 : Fusion et enrichissement

## Charger les consommations nettoyees (depuis Parquet)

In [None]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Configuration affichage
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Chemins
DATA_DIR = "../data"
OUTPUT_DIR = "../data/output"
PARQUET_DIR = "../data/output/consommations_clean"

In [None]:

# chargement des données
df_consommations = pd.read_parquet(PARQUET_DIR, engine='fastparquet')
print(f"Données chargées : {len(df_consommations):,} lignes")


df_consommations.head(5)

exist
Données chargées : 6,996,788 lignes


Unnamed: 0,batiment_id,timestamp,consommation,unite,timestamp_parsed,consommation_clean,hour,year,month,commune,date,type_energie
0,BAT0001,2023-01-01 02:00:00,0.2,m3,2023-01-01 02:00:00,0.2,2,2023,1,Paris,2023-01-01,eau
1,BAT0139,01/01/2023 18:00,78.93,m3,2023-01-01 18:00:00,78.93,18,2023,1,Toulon,2023-01-01,eau
2,BAT0001,2023-01-01 17:00:00,2.6,m3,2023-01-01 17:00:00,2.6,17,2023,1,Paris,2023-01-01,eau
3,BAT0139,01/01/2023 22:00,121.29,m3,2023-01-01 22:00:00,121.29,22,2023,1,Toulon,2023-01-01,eau
4,BAT0001,01/01/2023 21:00:00,1.85,m3,2023-01-01 21:00:00,1.85,21,2023,1,Paris,2023-01-01,eau


### - Fusionner avec les donnees meteo (sur commune et timestamp arrondi a l'heure)

In [3]:
df_meteo = pd.read_csv("../data/output/meteo_clean.csv" , sep=",")
df_meteo.head(5)
df_join_meteo = pd.merge(
    df_consommations, 
    df_meteo, 
    on=["commune", "timestamp", "month"], 
    how="inner"
)

df_join_meteo.head(5)

Unnamed: 0,batiment_id,timestamp,consommation,unite,timestamp_parsed,consommation_clean,hour,year,month,commune,date,type_energie,temperature_c,humidite_pct,rayonnement_solaire_wm2,vitesse_vent_kmh,precipitation_mm,day,season,weekday
0,BAT0001,2023-01-01 17:00:00,2.6,m3,2023-01-01 17:00:00,2.6,17,2023,1,Paris,2023-01-01,eau,78.3,78.3,534.3,32.0,0.0,1,1,6
1,BAT0002,2023-01-01 09:00:00,1.04,m3,2023-01-01 09:00:00,1.04,9,2023,1,Paris,2023-01-01,eau,49.7,49.7,36.2,18.1,0.0,1,1,6
2,BAT0140,2023-01-01 13:00:00,23.3,m3,2023-01-01 13:00:00,23.3,13,2023,1,Toulon,2023-01-01,eau,60.4,60.4,764.9,16.2,4.2,1,1,6
3,BAT0003,2023-01-01 03:00:00,0.15,m3,2023-01-01 03:00:00,0.15,3,2023,1,Paris,2023-01-01,eau,59.7,59.7,35.4,25.4,0.0,1,1,6
4,BAT0003,2023-01-01 10:00:00,1.74,m3,2023-01-01 10:00:00,1.74,10,2023,1,Paris,2023-01-01,eau,84.8,84.8,582.7,13.3,0.0,1,1,6


### - Fusionner avec le referentiel batiments

In [4]:
df_batiments = pd.read_csv("../data/batiments.csv" , sep=",")
df_batiments.head(5)
df_join_batiments = df_join_meteo.merge(
    df_batiments,
    on=['commune','batiment_id'],
    how='left'
)

df_join_batiments.head(5)

Unnamed: 0,batiment_id,timestamp,consommation,unite,timestamp_parsed,consommation_clean,hour,year,month,commune,date,type_energie,temperature_c,humidite_pct,rayonnement_solaire_wm2,vitesse_vent_kmh,precipitation_mm,day,season,weekday,nom,type,surface_m2,annee_construction,classe_energetique,nb_occupants_moyen
0,BAT0001,2023-01-01 17:00:00,2.6,m3,2023-01-01 17:00:00,2.6,17,2023,1,Paris,2023-01-01,eau,78.3,78.3,534.3,32.0,0.0,1,1,6,Ecole Paris 1,ecole,1926,1978,E,225
1,BAT0002,2023-01-01 09:00:00,1.04,m3,2023-01-01 09:00:00,1.04,9,2023,1,Paris,2023-01-01,eau,49.7,49.7,36.2,18.1,0.0,1,1,6,Ecole Paris 2,ecole,1156,2004,C,402
2,BAT0140,2023-01-01 13:00:00,23.3,m3,2023-01-01 13:00:00,23.3,13,2023,1,Toulon,2023-01-01,eau,60.4,60.4,764.9,16.2,4.2,1,1,6,Gymnase Toulon 140,gymnase,1697,1985,F,121
3,BAT0003,2023-01-01 03:00:00,0.15,m3,2023-01-01 03:00:00,0.15,3,2023,1,Paris,2023-01-01,eau,59.7,59.7,35.4,25.4,0.0,1,1,6,Ecole Paris 3,ecole,1695,2014,D,219
4,BAT0003,2023-01-01 10:00:00,1.74,m3,2023-01-01 10:00:00,1.74,10,2023,1,Paris,2023-01-01,eau,84.8,84.8,582.7,13.3,0.0,1,1,6,Ecole Paris 3,ecole,1695,2014,D,219


### - Fusionner avec les tarifs pour calculer le cout financier

In [None]:
df_tarifs = pd.read_csv("../data/tarifs_energie.csv" , sep=",")
df_tarifs.head(5)

df_join_batiments['date'] = pd.to_datetime(df_join_batiments['date'])
df_tarifs['date_debut'] = pd.to_datetime(df_tarifs['date_debut'])
df_tarifs['date_fin'] = pd.to_datetime(df_tarifs['date_fin'])

df_final = df_join_batiments.merge(
    df_tarifs,
    on='type_energie',
    how='left'
)

# df_final = df_join_batiments.merge(
#     df_tarifs,
#     on=['commune','batiment_id'],
#     how='left'
# )

df_final.head(5)


Unnamed: 0,batiment_id,timestamp,consommation,unite,timestamp_parsed,consommation_clean,hour,year,month,commune,date,type_energie,temperature_c,humidite_pct,rayonnement_solaire_wm2,vitesse_vent_kmh,precipitation_mm,day,season,weekday,nom,type,surface_m2,annee_construction,classe_energetique,nb_occupants_moyen,date_debut,date_fin,tarif_unitaire
0,BAT0001,2023-01-01 17:00:00,2.6,m3,2023-01-01 17:00:00,2.6,17,2023,1,Paris,2023-01-01,eau,78.3,78.3,534.3,32.0,0.0,1,1,6,Ecole Paris 1,ecole,1926,1978,E,225,2023-01-01,2023-12-31,3.5
1,BAT0001,2023-01-01 17:00:00,2.6,m3,2023-01-01 17:00:00,2.6,17,2023,1,Paris,2023-01-01,eau,78.3,78.3,534.3,32.0,0.0,1,1,6,Ecole Paris 1,ecole,1926,1978,E,225,2024-01-01,2024-12-31,3.75
2,BAT0002,2023-01-01 09:00:00,1.04,m3,2023-01-01 09:00:00,1.04,9,2023,1,Paris,2023-01-01,eau,49.7,49.7,36.2,18.1,0.0,1,1,6,Ecole Paris 2,ecole,1156,2004,C,402,2023-01-01,2023-12-31,3.5
3,BAT0002,2023-01-01 09:00:00,1.04,m3,2023-01-01 09:00:00,1.04,9,2023,1,Paris,2023-01-01,eau,49.7,49.7,36.2,18.1,0.0,1,1,6,Ecole Paris 2,ecole,1156,2004,C,402,2024-01-01,2024-12-31,3.75
4,BAT0140,2023-01-01 13:00:00,23.3,m3,2023-01-01 13:00:00,23.3,13,2023,1,Toulon,2023-01-01,eau,60.4,60.4,764.9,16.2,4.2,1,1,6,Gymnase Toulon 140,gymnase,1697,1985,F,121,2023-01-01,2023-12-31,3.5


### Creer des features derivees :


  - Consommation par occupant


In [None]:
df_final['consommation_par_occupant'] = df_final['consommation_clean'] / df_final['nb_occupants_moyen']

  - Consommation par m2

In [None]:
df_final['consommation_par_m2'] = df_final['consommation_clean'] / df_final['surface_m2']

  - Cout journalier, mensuel, annuel

In [None]:
df_final['cout_instantane'] = df_final['consommation_clean'] * df_final['tarif_unitaire']
costs = df_final.groupby('batiment_id')['cout_instantane'].agg(['sum', 'mean'])

  - Indice de performance energetique (IPE)

In [None]:
conso_annuelle = df_final.groupby('batiment_id')['consommation_clean'].transform('sum')
df_final['IPE'] = conso_annuelle / df_final['surface_m2']

  - Ecart a la moyenne de la categorie

In [None]:
df_final['moyenne_type'] = df_final.groupby('type')['consommation_par_m2'].transform('mean')

df_final['ecart_moyenne_categorie_pct'] = ((df_final['consommation_par_m2'] - df_final['moyenne_type']) / df_final['moyenne_type']) * 100

In [None]:
df_final.head(5)

Unnamed: 0,batiment_id,timestamp,consommation,unite,timestamp_parsed,consommation_clean,hour,year,month,commune,date,type_energie,temperature_c,humidite_pct,rayonnement_solaire_wm2,vitesse_vent_kmh,precipitation_mm,day,season,weekday,nom,type,surface_m2,annee_construction,classe_energetique,nb_occupants_moyen,date_debut,date_fin,tarif_unitaire,consommation_par_occupant,consommation_par_m2,cout_instantane,IPE,moyenne_type,ecart_moyenne_categorie_pct
0,BAT0001,2023-01-01 17:00:00,2.6,m3,2023-01-01 17:00:00,2.6,17,2023,1,Paris,2023-01-01,eau,78.3,78.3,534.3,32.0,0.0,1,1,6,Ecole Paris 1,ecole,1926,1978,E,225,2023-01-01,2023-12-31,3.5,0.011556,0.00135,9.1,2932.126282,0.084182,-98.39639
1,BAT0001,2023-01-01 17:00:00,2.6,m3,2023-01-01 17:00:00,2.6,17,2023,1,Paris,2023-01-01,eau,78.3,78.3,534.3,32.0,0.0,1,1,6,Ecole Paris 1,ecole,1926,1978,E,225,2024-01-01,2024-12-31,3.75,0.011556,0.00135,9.75,2932.126282,0.084182,-98.39639
2,BAT0002,2023-01-01 09:00:00,1.04,m3,2023-01-01 09:00:00,1.04,9,2023,1,Paris,2023-01-01,eau,49.7,49.7,36.2,18.1,0.0,1,1,6,Ecole Paris 2,ecole,1156,2004,C,402,2023-01-01,2023-12-31,3.5,0.002587,0.0009,3.64,2046.317993,0.084182,-98.931296
3,BAT0002,2023-01-01 09:00:00,1.04,m3,2023-01-01 09:00:00,1.04,9,2023,1,Paris,2023-01-01,eau,49.7,49.7,36.2,18.1,0.0,1,1,6,Ecole Paris 2,ecole,1156,2004,C,402,2024-01-01,2024-12-31,3.75,0.002587,0.0009,3.9,2046.317993,0.084182,-98.931296
4,BAT0140,2023-01-01 13:00:00,23.3,m3,2023-01-01 13:00:00,23.3,13,2023,1,Toulon,2023-01-01,eau,60.4,60.4,764.9,16.2,4.2,1,1,6,Gymnase Toulon 140,gymnase,1697,1985,F,121,2023-01-01,2023-12-31,3.5,0.192562,0.01373,81.55,6197.985386,0.137677,-90.027272


In [None]:
# Exporter le fichier
df_final.to_csv('../data/output/consommations_enrichies.csv', index=False)