# EDA Données changement climatique - LSH (Longues Séries Homogénéisées)
Séries mensuelles corrigées par homogénéisation statistique pour les paramètres températures minimales (TN), maximales (TX), insolations (IN) ou précipitations (RR).

Liste des fichiers qui vont être étudiés :
- SH_IN : cumul mensuel des durées d'insolation (pas de lien disponible pour l'outre-mer).
- SH_RR_Outremer (cumul mensuel des hauteurs de précipitation en mm et 1/10)
- SH_TN_Outremer (moyenne mensuelle de la température minimale)
- SH_TX_Outremer (moyenne mensuelle de la température maximale sous abri)


In [12]:
import io
import os
import pandas as pd
import requests 
import zipfile

In [13]:
# Read and extract CSV files from a ZIP URL
def extract_csv_to_df(urls, timeout=60):
    """Download a ZIP archive and load every ``.csv`` member it contains.

    Each CSV member is opened twice: once with pandas' default separator to
    capture the leading metadata rows, and once with ``sep=';'`` after
    skipping the first 13 lines to load the measurements.

    Args:
        urls: URL of the ZIP archive (a single URL, despite the plural name).
        timeout: Timeout in seconds forwarded to ``requests.get`` so a stalled
            server cannot block the call forever.

    Returns:
        A tuple ``(array_metadata, array_data)`` of two lists of DataFrames,
        one entry per CSV member, in archive order. Each data frame gets a
        leading ``num_post`` column holding the identifier parsed from the
        metadata.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(urls, timeout=timeout)

    # Fail loudly on HTTP errors: silently returning None here would make the
    # caller crash later on tuple unpacking with an unrelated TypeError.
    response.raise_for_status()

    array_metadata = []
    array_data = []

    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
        for filename in zip_file.namelist():
            if not filename.endswith('.csv'):
                continue

            print(f"Reading {filename}...")

            # We read the metadata: only the first 12 rows are needed, so do
            # not parse the whole file just to discard the rest.
            with zip_file.open(filename) as file:
                df_metadata = pd.read_csv(file, nrows=12)

            # We read the data
            with zip_file.open(filename) as file:
                df_data = pd.read_csv(file, sep=';', skiprows=13)

            # Get the num_post: text after the first '=' of the first
            # metadata row.
            # NOTE(review): pandas consumes the file's first line as the
            # header, so row 0 is the file's second line - confirm that this
            # is the station-number line.
            num_post = df_metadata.iloc[0, 0].split('=')[1].strip()

            # Add a num_post column to trace the origin of the measurements
            df_data.insert(0, 'num_post', num_post)

            array_metadata.append(df_metadata)
            array_data.append(df_data)

    return array_metadata, array_data

In [20]:
# Format the metadata lines into a structured DataFrame
def format_metadata(df_meta):
    """Turn raw ``# key = value`` metadata lines into a one-row DataFrame.

    Only the first column of ``df_meta`` is read. Entries that are not
    strings, do not start with ``#``, or contain neither ``=`` nor ``:`` are
    ignored. Each remaining line is split on the first ``=`` (or, failing
    that, the first ``:``) into a key and a value, both stripped of
    surrounding whitespace.

    Known labels are renamed so the result can be imported into BigQuery;
    unknown labels are kept unchanged.

    Args:
        df_meta: DataFrame whose first column holds the raw metadata lines.

    Returns:
        A DataFrame with a single row and one column per metadata key. The
        frame is also printed.
    """
    # Local import: ``re`` is not part of the notebook's import cell.
    import re

    noms_colonnes_bigquery = {
        "LATITUDE (°)": "LATITUDE_DEG",
        "LONGITUDE(°)": "LONGITUDE_DEG",
        "ALTITUDE (m)": "ALTITUDE_M",
        "ETAT DU POSTE (fermé avec date de fermeture AAAA-MM-JJ ou ouvert à la date de production du fichier le 13/03/2025)": "ETAT_POSTE",
        "Amplitude minimale détectable de la série": "AMPLITUDE_MIN_DETECTABLE",
        "Période homogénéisée": "PERIODE_HOMOGENISEE",
        "Date(s) de rupture(s) d'homogénéité": "DATE_RUPTURE_HOMOGENEITE",
        "Données prolongées si nécessaire jusqu'en 202412 par des donnees mensuelles non homogénéisées : Q_HOM": "Q_HOM_PROLONGEES_202412",
    }

    # "Periode homogénéisée  196501 201312  : Q_HOM" -> "Q_HOM_196501_201312".
    # One pattern covers every start/end pair instead of listing each period.
    periode_q_hom = re.compile(
        r"Periode homogénéisée\s+(\d{6})\s+(\d{6})\s*:\s*Q_HOM"
    )

    data = {}

    for line in df_meta.iloc[:, 0].tolist():
        # Missing cells come back as NaN (a float); they carry no metadata.
        if not isinstance(line, str):
            continue

        line = line.strip()

        # Skip lines not starting with "#"
        if not line.startswith("#"):
            continue

        # Remove the "#" at the beginning
        content = line[1:].strip()

        # Split into key=value or key:value
        if "=" in content:
            key, value = content.split("=", 1)
        elif ":" in content:
            key, value = content.split(":", 1)
        else:
            # No separator: there is nothing to store. Without this branch
            # ``key``/``value`` would be unbound (or stale from the previous
            # line).
            continue

        # Remove extra whitespace
        key = key.strip()
        value = value.strip()

        # Rename keys to ensure the import works correctly on BigQuery
        period = periode_q_hom.fullmatch(key)
        if period:
            key = "Q_HOM_{}_{}".format(*period.groups())
        else:
            key = noms_colonnes_bigquery.get(key, key)

        data[key] = value

    # Convert to DataFrame
    df_meta = pd.DataFrame([data])

    print(df_meta)
    return df_meta

In [21]:
# Scratch check: replace a trailing "(m)" unit marker with an "_M" suffix.
test = "ALTITUDE (m)"
test = test[:-3].strip() + "_M" if test.endswith("(m)") else test

print(test)

ALTITUDE_M


In [22]:
# Save the dataframe into a CSV file
def df_to_csv(df, output_file):
    """Write ``df`` to ``output_file`` as CSV without the index, then report it.

    Args:
        df: DataFrame to serialise.
        output_file: Destination handed to ``DataFrame.to_csv``.
    """
    df.to_csv(path_or_buf=output_file, index=False)
    confirmation = f"✓ Saved in {output_file}"
    print(confirmation)

In [23]:
# Stack all the metadata dataframes
def format_stackv_metadata(array_df_metadata):
    """Format every raw metadata frame and stack the results vertically.

    Args:
        array_df_metadata: Iterable of raw metadata DataFrames, each passed
            to ``format_metadata``.

    Returns:
        The formatted frames concatenated with a fresh index, or an empty
        DataFrame when there is nothing to format.
    """
    formatted = [format_metadata(raw_meta) for raw_meta in array_df_metadata]
    if not formatted:
        return pd.DataFrame()
    return pd.concat(formatted, ignore_index=True)

In [24]:
# Fetch the CSVs, split metadata and data, and combine them
def split_and_combine_csv(key, url):
    """Download one archive and return its stacked metadata and measurements.

    Args:
        key: Name given to the ``VALEUR`` column of the measurements.
        url: URL of the ZIP archive, forwarded to ``extract_csv_to_df``.

    Returns:
        A tuple ``(df_metadata, df_data)``: the formatted metadata of every
        file stacked into one frame, and the measurements of every file
        stacked into one frame with ``VALEUR`` renamed to ``key``.
    """
    raw_metadata_frames, measurement_frames = extract_csv_to_df(url)

    # Metadata is formatted first, before the measurements are stacked.
    stacked_metadata = format_stackv_metadata(raw_metadata_frames)
    stacked_measurements = pd.concat(measurement_frames, ignore_index=True)

    return stacked_metadata, stacked_measurements.rename(columns={"VALEUR": key})

In [25]:
# Driver cell: download each archive and write its metadata and data as CSV.
# Key = name of the file to be generated
#       (also the name given to the "VALEUR" column by split_and_combine_csv)
# URL = link to the zip file
urls = {
    "Precipitations": "https://www.data.gouv.fr/api/1/datasets/r/9617eade-a4ae-4fa1-bbe7-88458aff67b2",
    "Max_Temp": "https://www.data.gouv.fr/api/1/datasets/r/69f2f61e-781c-4132-84c1-dee3e0becbbb",
    "Min_Temp": "https://www.data.gouv.fr/api/1/datasets/r/9cfc5c85-3a23-4a2b-99b6-57ae76ce25b4"
}

# Output goes to a "data" directory next to the current working directory;
# it is created on first use and reused afterwards.
output_dir = os.path.join("..", "data")
os.makedirs(output_dir, exist_ok=True)

for key, url in urls.items():
    # Common path prefix for both files, built from the lower-cased key
    basename = f"{output_dir}/{key.lower()}"
    df_metadata, df_data = split_and_combine_csv(key, url)
    # Two files per key: <key>_metadata.csv and <key>_data.csv
    df_to_csv(df_metadata, output_file=f"{basename}_metadata.csv")
    df_to_csv(df_data, output_file=f"{basename}_data.csv")

Reading SH_MRR097105002.csv...
Reading SH_MRR097107002.csv...
Reading SH_MRR097108003.csv...
Reading SH_MRR097111001.csv...
Reading SH_MRR097115003.csv...
Reading SH_MRR097117002.csv...
Reading SH_MRR097118001.csv...
Reading SH_MRR097124004.csv...
Reading SH_MRR097128008.csv...
Reading SH_MRR097209017.csv...
Reading SH_MRR097304001.csv...
Reading SH_MRR097305001.csv...
Reading SH_MRR097409230.csv...
Reading SH_MRR097410238.csv...
Reading SH_MRR097411150.csv...
Reading SH_MRR097418110.csv...
Reading SH_MRR098715003.csv...
Reading SH_MRR098718001.csv...
Reading SH_MRR098722003.csv...
Reading SH_MRR098722005.csv...
Reading SH_MRR098723001.csv...
Reading SH_MRR098723003.csv...
Reading SH_MRR098723004.csv...
Reading SH_MRR098731002.csv...
Reading SH_MRR098738010.csv...
Reading SH_MRR098747002.csv...
Reading SH_MRR098747013.csv...
Reading SH_MRR098748003.csv...
Reading SH_MRR098748009.csv...
Reading SH_MRR098757001.csv...
Reading SH_MRR098822001.csv...
Reading SH_MRR098832002.csv...
Reading 