In [26]:
from dbfread import DBF
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import datetime



import os
import ee
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
import csv
import datetime
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

import logging
import json

# Configuration du logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

from time import sleep
from pyproj import Transformer


In [27]:
# Load the AfSP layers.
# Attribute table of the soil profiles, read from the DBF file (latin-1 encoded).
profiles = pd.DataFrame(iter(DBF('../data/use/afsp/GIS_Dbf/AfSP012Qry_Profiles.dbf', encoding='latin-1')))

# Point layer of the profiles; expose the geometry coordinates as plain columns.
geo = gpd.read_file('../data/use/afsp/GIS_Shape/AfSP012Qry_GeoPoints.shp')
geo['Longitude'] = geo.geometry.x
geo['Latitude'] = geo.geometry.y

# Join both layers on 'ProfileID'.
merged = pd.merge(profiles, geo[['ProfileID', 'Longitude', 'Latitude']], on='ProfileID', how='left')

# Keep the georeferenced profiles and the four columns used downstream.
# The explicit .copy() makes this an independent frame, so the T_Year
# assignment below is not a write into a slice of `merged`
# (which triggers pandas' SettingWithCopyWarning).
profiles_ = (
    merged
    .dropna(subset=['Longitude', 'Latitude'])
    .loc[:, ['ProfileID', 'Longitude', 'Latitude', 'T_Year']]
    .copy()
)

# Convert T_Year to a nullable integer; unparsable values become <NA>.
profiles_['T_Year'] = pd.to_numeric(profiles_['T_Year'], errors='coerce').astype('Int64')

# Share of profiles per year, in percent (computed before any year filtering).
year_counts = profiles_['T_Year'].value_counts(normalize=True) * 100
print(year_counts)
year_counts = year_counts.reset_index()

# Distinct years present in the georeferenced profiles.
unique_years = profiles_['T_Year'].unique()
print("\nAnnées uniques dans les profils géoréférencés :", unique_years)

# Keep profiles dated 1982 or later; this also removes the -9999 code that
# shows up in the year distribution printed above.
# NOTE(review): 1982 is presumably chosen to match the first Landsat-4 year
# declared in LANDSAT_COLLECTIONS -- confirm.
print("before filtering:", profiles_.shape)
profiles_ = profiles_[profiles_['T_Year'] >= 1982]
print("after filtering year:", profiles_.shape)
available_years = profiles_['T_Year'].unique()

# Drop rows where either coordinate is exactly 0.0 or missing.
# NOTE(review): this also discards points lying exactly on the equator or the
# prime meridian -- confirm that a single 0.0 coordinate always means "unknown".
profiles_ = profiles_[(profiles_['Longitude'] != 0.0) &
                      (profiles_['Latitude'] != 0.0) &
                      (profiles_['Longitude'].notnull()) &
                      (profiles_['Latitude'].notnull())]
print("after filtering coordinates:", profiles_.shape)

# Keep a single profile per (Longitude, Latitude) location.
profiles_ = profiles_.drop_duplicates(subset=['Longitude', 'Latitude'], keep='first')
print("after removing duplicates:", profiles_.shape)

# Rows as [ProfileID, Longitude, Latitude, T_Year] lists, the layout main() expects.
profiles_list_afsp = profiles_[['ProfileID', 'Longitude', 'Latitude', 'T_Year']].values.tolist()



T_Year
-9999    11.881509
1988      8.449792
1989      6.987536
1997      4.289645
1990      4.030648
           ...    
1945      0.053958
1953      0.016187
1942      0.010792
1938      0.005396
1941      0.005396
Name: proportion, Length: 68, dtype: Float64

Années uniques dans les profils géoréférencés : <IntegerArray>
[ 1957,  1963,  1962,  1966,  1960,  1954,  1958,  1965,  1964,  1956,  1952,
  1961,  1969,  1955,  1959,  1968,  1970,  1951,  1967,  1986,  1997,  1980,
  1981,  1984,  1982,  1983,  1987, -9999,  1996,  1995,  1994,  1998,  1999,
  1988,  1990,  1991,  1977,  1989,  1985,  1979,  1992,  1978,  1974,  1976,
  1973,  1975,  1971,  2000,  2004,  1993,  2003,  2002,  1972,  2001,  2006,
  2007,  2008,  1941,  1938,  1945,  2009,  2005,  1942,  1944,  2010,  1946,
  1953,  2011]
Length: 68, dtype: Int64
before filtering: (18533, 4)
after filtering year: (10794, 4)
after filtering coordinates: (10425, 4)
after removing duplicates: (9481, 4)


In [28]:
""" ird_data =  pd.read_csv('../data/use/ird/Data_to_Analyze.csv',)


tr = Transformer.from_crs(32628, 4326, always_xy=True)
ird_data[["lon", "lat"]] = ird_data.apply(
    lambda r: tr.transform(r["X_Centroid"], r["Y_Centroid"]),
    axis=1, result_type="expand"
)

ird_data = ird_data.drop(columns=["X_Centroid", "Y_Centroid"])
ird_data = ird_data.rename(columns={"lon": "Longitude", "lat": "Latitude", "Profile_id": "ProfileID"})
# ird_data = ird_data.drop_duplicates(subset=["geometry"], keep="first")
profiles_ird = ird_data[['ProfileID', 'Longitude', 'Latitude']]
profiles_ird = profiles_ird.drop_duplicates(subset=['Longitude', 'Latitude'], keep='first', ignore_index=True)

profiles_ird["T_Year"] = 2016
profiles_ird """

' ird_data =  pd.read_csv(\'../data/use/ird/Data_to_Analyze.csv\',)\n\n\ntr = Transformer.from_crs(32628, 4326, always_xy=True)\nird_data[["lon", "lat"]] = ird_data.apply(\n    lambda r: tr.transform(r["X_Centroid"], r["Y_Centroid"]),\n    axis=1, result_type="expand"\n)\n\nird_data = ird_data.drop(columns=["X_Centroid", "Y_Centroid"])\nird_data = ird_data.rename(columns={"lon": "Longitude", "lat": "Latitude", "Profile_id": "ProfileID"})\n# ird_data = ird_data.drop_duplicates(subset=["geometry"], keep="first")\nprofiles_ird = ird_data[[\'ProfileID\', \'Longitude\', \'Latitude\']]\nprofiles_ird = profiles_ird.drop_duplicates(subset=[\'Longitude\', \'Latitude\'], keep=\'first\', ignore_index=True)\n\nprofiles_ird["T_Year"] = 2016\nprofiles_ird '

In [29]:
# Load the IRD profiles; coordinates come as projected X_Centroid / Y_Centroid.
ird_data = pd.read_csv('../data/use/ird/all_profiles.csv')

# Reproject every centroid from EPSG:32628 to EPSG:4326 in one vectorised call
# instead of one Python-level call per row. With always_xy=True the result
# order is (x, y), i.e. (longitude, latitude).
tr = Transformer.from_crs(32628, 4326, always_xy=True)
ird_data["Longitude"], ird_data["Latitude"] = tr.transform(
    ird_data["X_Centroid"].to_numpy(),
    ird_data["Y_Centroid"].to_numpy(),
)

ird_data = ird_data.drop(columns=["X_Centroid", "Y_Centroid"])
ird_data = ird_data.rename(columns={"Date": "T_Year"})
ird_data['ProfileID'] = ird_data.index

# One profile per distinct location, renumbered from 0 by ignore_index=True.
profiles_ird = ird_data[['ProfileID', 'Longitude', 'Latitude', 'T_Year']]
profiles_ird = profiles_ird.drop_duplicates(subset=['Longitude', 'Latitude'], keep='first', ignore_index=True)

# Unique ProfileID derived from the new positional index: IRD_1, IRD_2, ...
profiles_ird['ProfileID'] = [f'IRD_{position + 1}' for position in range(len(profiles_ird))]

# (Longitude, Latitude) -> ProfileID lookup.
map_longitude_latitude_profile_id = profiles_ird.set_index(['Longitude', 'Latitude'])['ProfileID'].to_dict()

# Convert T_Year from a date string such as '01/01/2016' to the integer 2016
# (the text after the last '/').
profiles_ird['T_Year'] = profiles_ird['T_Year'].str.split("/").str[-1].astype(int)

# Persist the lookup; each key is written as its (Longitude, Latitude) tuple.
map_longitude_latitude_profile_id_df = pd.DataFrame(list(map_longitude_latitude_profile_id.items()), columns=['Longitude_Latitude', 'ProfileID'])
map_longitude_latitude_profile_id_df.to_csv('../data/use/ird/map_longitude_latitude_profile_id.csv', index=False)



In [30]:
# Load the WoSIS profile table (tab-separated).
wosis = pd.read_csv(
    '../data/use/wosis/WoSIS_2023_December/wosis_202312_profiles.tsv',
    sep='\t',
    low_memory=False,
)
print(wosis.columns)

# Keep complete rows only, rename to the column names shared with the other
# datasets, keep one profile per location and cast the year to int.
# NOTE(review): T_Year is taken from 'fao_publication_year'; confirm that this
# column is the intended date for matching imagery.
wosis_profiles = (
    wosis[['profile_id', 'longitude', 'latitude', 'fao_publication_year']]
    .dropna(axis=0, how='any')
    .rename(columns={
        "profile_id": "ProfileID",
        "longitude": "Longitude",
        "latitude": "Latitude",
        "fao_publication_year": "T_Year",
    })
    .drop_duplicates(subset=['Longitude', 'Latitude'], keep='first', ignore_index=True)
    .astype({'T_Year': int})
)

# Same lower bound on the year as the one applied to the AfSP profiles.
wosis_profiles = wosis_profiles.loc[wosis_profiles['T_Year'] >= 1982]
wosis_profiles


Index(['profile_id', 'profile_code', 'dataset_code', 'site_id',
       'positional_uncertainty', 'country_name', 'longitude', 'latitude',
       'wrb_reference_soil_group_code', 'wrb_reference_soil_group',
       'wrb_prefix_qualifiers', 'wrb_suffix_qualifiers',
       'wrb_principal_qualifiers', 'wrb_supplementary_qualifiers',
       'wrb_publication_year', 'fao_major_group_code', 'fao_major_group',
       'fao_soil_unit_code', 'fao_soil_unit', 'fao_publication_year',
       'usda_order_name', 'usda_suborder', 'usda_subgroup', 'usda_great_group',
       'usda_publication_year'],
      dtype='object')


Unnamed: 0,ProfileID,Longitude,Latitude,T_Year
0,1144360,-15.495200,13.932300,1997
1,1144361,-14.838810,14.565890,1997
2,1144362,-15.515833,14.164722,1997
3,1144363,29.430000,-2.060000,1997
4,1144364,29.590000,-2.070000,1997
...,...,...,...,...
28900,1691332,19.112700,47.627899,1997
28901,1691333,19.962099,47.432400,1997
28902,1691334,20.952499,46.399300,1997
28903,1691335,-1.300000,34.916698,1997


In [31]:
# Portée requise par Earth Engine
# SCOPES = ["https://www.googleapis.com/auth/earthengine.readonly"]



# def authenticate_earth_engine():
    
#     creds = None

#     # Vérifie si token.json existe (authentification précédente)
#     if os.path.exists("token.json"):
#         creds = Credentials.from_authorized_user_file("token.json", SCOPES)

#     # Sinon, lance le flow OAuth pour obtenir un token
#     if not creds or not creds.valid:
#         if creds and creds.expired and creds.refresh_token:
#             creds.refresh(Request())
#         else:
#             flow = InstalledAppFlow.from_client_secrets_file(
#                 "credentials.json", SCOPES
                
#             )
#             creds = flow.run_local_server(port=0)

#         # Sauvegarde le token pour les prochaines fois
#         with open("token.json", "w") as token:
#             token.write(creds.to_json())

#     # Initialisation Earth Engine avec les credentials OAuth
#     ee.Initialize(credentials=creds)
#     print(" Earth Engine authentifié avec succès !")

# # Authentification Earth Engine
# authenticate_earth_engine()


# Earth Engine initialisation
SCOPES = ["https://www.googleapis.com/auth/earthengine.readonly"]

def initialize_earth_engine():
    """Initialise the Earth Engine client from the stored OAuth token.

    Loads the authorised-user credentials from ``token.json`` (relative to the
    current working directory) for ``SCOPES`` and hands them to
    ``ee.Initialize``. Success is logged at INFO level.

    Raises:
        Exception: whatever loading the token or initialising Earth Engine
            raised; it is logged at ERROR level and re-raised unchanged.
    """
    token_file = "token.json"
    try:
        ee.Initialize(credentials=Credentials.from_authorized_user_file(token_file, SCOPES))
    except Exception as exc:
        logger.error(f"Erreur d'initialisation Earth Engine: {exc}")
        raise
    else:
        logger.info("Earth Engine initialisé avec succès")

In [32]:

# Catalogue of the Landsat missions that can be queried.
# For each mission:
#   "years" / "available_years": (first_year, last_year), both inclusive
#   "path": Earth Engine ImageCollection id
#   "bands": band names to extract; names missing from an image are skipped by
#            get_landsat_data
# NOTE(review): Landsat-1 points at the T2 collection while Landsat-2/3 use
# T1 -- confirm this is intentional.
LANDSAT_COLLECTIONS = {
    "Landsat-1": {
        "years": (1972, 1978),
        "available_years": (1972, 1978),
        "path": "LANDSAT/LM01/C02/T2",
        "bands": ["B4", "B5", "B6", "B7", "QA_PIXEL"]
    },
    "Landsat-2": {
        "years": (1975, 1981),
        "available_years": (1975, 1981),
        "path": "LANDSAT/LM02/C02/T1",
        "bands": ["B4", "B5", "B6", "B7", "QA_PIXEL"]
    },
    "Landsat-3": {
        "years": (1978, 1983),
        "available_years": (1978, 1983),
        "path": "LANDSAT/LM03/C02/T1",
        "bands": ["B4", "B5", "B6", "B7", "QA_PIXEL"]
    },
    "Landsat-4": {
        "years": (1982, 1993),
        "available_years": (1982, 1993),
        "path": "LANDSAT/LT04/C02/T1_L2",
        "bands": ["SR_B1", "SR_B2", "SR_B3", "SR_B4", "SR_B5", "SR_B7", "QA_PIXEL"]
    },
    "Landsat-5": {
        "years": (1984, 2013),
        "available_years": (1984, 2013),
        "path": "LANDSAT/LT05/C02/T1_L2",
        "bands": ["SR_B1", "SR_B2", "SR_B3", "SR_B4", "SR_B5", "SR_B7", "QA_PIXEL"]
    },
    "Landsat-7": {
        "years": (1999, 2022),
        "available_years": (1999, 2022),
        "path": "LANDSAT/LE07/C02/T1_L2",
        "bands": ["SR_B1", "SR_B2", "SR_B3", "SR_B4", "SR_B5", "SR_B7", "QA_PIXEL"]
    },
    "Landsat-8": {
        "years": (2013, 2025),  # upper bound follows the latest available date
        "available_years": (2013, 2025),  # upper bound follows the latest available date
        "path": "LANDSAT/LC08/C02/T1_L2",
        # The Level-2 product exposes its thermal band as ST_B10; the former
        # "SR_B10"/"SR_B11" entries are not band names of this collection and
        # were silently dropped by the valid-band filter.
        "bands": ["SR_B1", "SR_B2", "SR_B3", "SR_B4", "SR_B5", "SR_B6", "SR_B7", "ST_B10", "QA_PIXEL"]
    },
    "Landsat-9": {
        "years": (2021, 2025),  # same remark, adjust as needed
        "available_years": (2021, 2025),  # same remark, adjust as needed
        "path": "LANDSAT/LC09/C02/T1_L2",
        # Same band layout as Landsat-8 (thermal band is ST_B10).
        "bands": ["SR_B1", "SR_B2", "SR_B3", "SR_B4", "SR_B5", "SR_B6", "SR_B7", "ST_B10", "QA_PIXEL"]
    }
}

# for sat_name, info in LANDSAT_COLLECTIONS.items():
#     try:
#         collection = ee.ImageCollection(info["path"])
#         # Vérifier si la collection est vide
#         if collection.size().getInfo() == 0:
#             logger.warning(f"Aucune image trouvée pour {sat_name}")
#         else:
#             logger.info(f"Collection {sat_name} initialisée avec succès")
#     except Exception as e:
#         logger.error(f"Erreur lors de l'initialisation de la collection {sat_name}: {str(e)}")


In [33]:


# Initialise the Earth Engine client once; the helpers defined in this cell
# call the `ee` API when they are executed.
initialize_earth_engine()



def get_appropriate_satellite(year):
    """List the Landsat missions whose operating window covers ``year``.

    Args:
        year: Target year; anything ``int()`` accepts. Years before 1972 are
            replaced by 1972 (a debug message is logged).

    Returns:
        list[tuple]: ``(name, path, bands, year)`` for every entry of
        ``LANDSAT_COLLECTIONS`` whose inclusive ``"years"`` range contains the
        (possibly clamped) year, in reverse declaration order, so missions
        declared last come first. Empty when no mission matches.
    """
    effective_year = int(year)
    if effective_year < 1972:
        logger.debug(f"Année {effective_year} avant le début de Landsat, utilisation de 1972 comme proxy")
        effective_year = 1972

    matches = [
        (mission, spec["path"], spec["bands"], effective_year)
        for mission, spec in LANDSAT_COLLECTIONS.items()
        if spec["years"][0] <= effective_year <= spec["years"][1]
    ]
    matches.reverse()
    return matches
      
  




def get_band_info(image, band_name, point):
    """Return the mean value of one band in a small box around ``point``.

    Args:
        image: Earth Engine image to sample.
        band_name: Name of the band to select from ``image``.
        point: Earth Engine geometry; it is buffered by 30 and the bounding
            box of that buffer is used as the reduction region.

    Returns:
        dict | None: ``{band_name: mean}`` (the mean is whatever the server
        reports for that band, possibly ``None``), or ``None`` if anything
        raised; the error is logged as a warning.
    """
    try:
        single_band = image.select(band_name)
        footprint = point.buffer(30).bounds()
        reduced = single_band.reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=footprint,
            scale=30,         # Landsat resolution
            bestEffort=True,  # avoids the maxPixels error
        ).getInfo()
        return {band_name: reduced.get(band_name)}
    except Exception as exc:
        logger.warning(f"Erreur pour la bande {band_name}: {exc}")
        return None


def get_landsat_data(point, target_year):
    """Fetch mean Landsat band values around ``point`` for ``target_year``.

    The missions returned by ``get_appropriate_satellite`` are tried in order
    and the first one with at least one scene over the point during the year
    is used. The scene with the lowest ``CLOUD_COVER`` is reduced to one mean
    value per band over the bounding box of a 30-unit buffer around the point.

    Args:
        point: ``ee.Geometry.Point`` to sample.
        target_year: Year of interest. The year actually searched is the one
            returned by ``get_appropriate_satellite`` (clamped to 1972 for
            earlier years), so the date window always matches the mission.

    Returns:
        tuple: ``(start_date, end_date, sat_name, bands_info, error)``.
        On success ``bands_info`` maps band name to mean value and ``error``
        is ``None``. On failure ``bands_info`` is ``None`` and ``error`` is a
        message; the first three items are ``None`` when the failure happened
        before a mission was selected. ``start_date``/``end_date`` are the
        first and last day of the searched year as ``YYYY-MM-DD`` strings.
    """
    try:
        # Reject placeholder coordinates.
        coords = point.coordinates().getInfo()
        if coords == [0, 0] or None in coords:
            logger.warning("Coordonnées invalides - point ignoré")
            error = "get_landsat_data 1 : Coordonnées invalides - point ignoré"
            return None, None, None, None, error

        available_sat = get_appropriate_satellite(target_year)
        if not available_sat:
            logger.debug(f"Aucun satellite disponible pour l'année {target_year}")
            error = f"get_landsat_data 1 : Aucun satellite disponible pour l'année {target_year}"
            return None, None, None, None, error

        # Search the year the missions were selected for, not the raw target
        # year: for targets before 1972 the two differ and the raw year can
        # never match any scene.
        effective_year = available_sat[0][3]
        start_date = f"{effective_year}-01-01"
        end_date = f"{effective_year}-12-31"
        # filterDate() treats its end as exclusive, so filter up to 1 January
        # of the next year; otherwise scenes acquired on 31 December are lost.
        filter_end = f"{effective_year + 1}-01-01"

        count = 0
        for sat_name, sat_path, target_bands, _ in available_sat:
            logger.debug(f"Recherche {sat_name} ({start_date} à {end_date}) pour {coords}")

            # Scenes over the point within the year, least cloudy first.
            collection = ee.ImageCollection(sat_path) \
                .filterDate(start_date, filter_end) \
                .filterBounds(point) \
                .sort('CLOUD_COVER', True)

            count = collection.size().getInfo()
            if count > 0:
                logger.debug(f"{count} images trouvées pour {sat_name} ({start_date} à {end_date})")
                break

        if count == 0:
            # sat_name is the last mission tried.
            logger.debug(f"Aucune image trouvée pour {sat_name} {start_date}-{end_date}")
            error = f"get_landsat_data 1 : Aucune image trouvée pour  {sat_name} {start_date}-{end_date}"
            return start_date, end_date, sat_name, None, error

        # Least cloudy scene. No truthiness test here: first() returns a lazy
        # server-side object, and count > 0 already guarantees a scene exists.
        image = collection.first()

        # Keep only the requested bands that this scene really has.
        available_bands = image.bandNames().getInfo()
        valid_bands = [b for b in target_bands if b in available_bands]

        if not valid_bands:
            return start_date, end_date, sat_name, None, f"Aucune bande valide trouvée dans {available_bands}"

        try:
            # One server round trip for all bands instead of one per band.
            region = point.buffer(30).bounds()
            stats = image.select(valid_bands).reduceRegion(
                reducer=ee.Reducer.mean(),
                geometry=region,
                scale=30,         # Landsat resolution
                bestEffort=True,  # avoids the maxPixels error
            ).getInfo()
            bands_info = {band: stats.get(band) for band in valid_bands}
        except Exception as e:
            # Keep the best-effort behaviour: retry band by band so that one
            # failing band does not discard the others.
            logger.warning(f"Réduction groupée échouée pour {sat_name}, repli bande par bande: {str(e)}")
            bands_info = {}
            for band in valid_bands:
                info = get_band_info(image, band, point=point)
                if info:
                    bands_info[band] = info[band]

        if not bands_info:
            return start_date, end_date, sat_name, None, "Aucune information de bande récupérée"

        return start_date, end_date, sat_name, bands_info, None
    except Exception as e:
        logger.error(f"Erreur lors de la récupération des données Landsat: {str(e)}")
        return None, None, None, None, f"get_landsat_data 2 : Erreur lors de la récupération des données Landsat: {str(e)}"


def process_row(row):
    """Fetch the Landsat values for one profile row.

    Args:
        row: Sequence ``(profile_id, lon, lat, year)``.

    Returns:
        tuple: ``(metadata, bands_data)``. ``metadata`` is a dict with the
        keys ``ProfileID``, ``Longitude``, ``Latitude``, ``T_Year``,
        ``Start_Date``, ``End_Date`` and ``Satellite``. ``bands_data`` is the
        band -> value dict, or ``{"error": message}`` when no band values were
        retrieved. If the row itself cannot be processed the error is logged
        and ``(None, None)`` is returned, so callers can always unpack two
        values.
    """
    try:
        profile_id, lon, lat, year = row

        # Geographic point for this profile.
        point = ee.Geometry.Point(lon, lat)

        start_date, end_date, sat_name, bands_info, error = get_landsat_data(point, int(year))

        if bands_info is None:
            logger.debug(f"Aucune donnée pour {profile_id}")
            bands_data = {"error": error}
        else:
            bands_data = bands_info

        metadata = {
            'ProfileID': profile_id,
            'Longitude': lon,
            'Latitude': lat,
            'T_Year': year,
            'Start_Date': start_date,
            'End_Date': end_date,
            'Satellite': sat_name,
        }
        return metadata, bands_data

    except Exception as e:
        # row may be malformed (that can be the very reason we are here), so
        # do not index it blindly while reporting the failure.
        label = row[0] if isinstance(row, (list, tuple)) and len(row) > 0 else row
        logger.error(f"Erreur pour le profil {label}: {str(e)}")
        # Same arity as the success path: a bare None made the caller's
        # `result, bands_data = ...` unpacking raise TypeError.
        return None, None

def main(output_csv, profiles_list, limit=None):
    """Extract Landsat values for every profile and append them to a CSV.

    The run is resumable: profiles whose ``ProfileID`` is already present in
    ``output_csv`` are skipped, and new results are concatenated to the
    existing content before the file is rewritten.

    Args:
        output_csv: Path of the result file; created with a header row if it
            does not exist yet.
        profiles_list: Rows ``[ProfileID, Longitude, Latitude, T_Year]``.
        limit: Optional maximum number of not-yet-processed profiles to handle
            in this run; a falsy value means no limit.
    """
    # Create the output file with its header on first use.
    if not os.path.exists(output_csv):
        with open(output_csv, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['ProfileID', 'Longitude', 'Latitude', 'T_Year', 'Start_Date', 'End_Date', 'Satellite'])

    already_get_data = pd.read_csv(output_csv)

    # Skip profiles that are already in the file. A set makes each lookup
    # constant-time instead of scanning the whole list for every profile.
    processed_profiles = set(already_get_data['ProfileID'].tolist())
    profiles_list = [p for p in profiles_list if p[0] not in processed_profiles]
    print(f"Nombre de profils à traiter : {len(profiles_list)}")

    # Optionally process only the first `limit` profiles (e.g. for a test run).
    if limit:
        profiles_list = profiles_list[:limit]

    results = []
    # Parallel processing.
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [executor.submit(process_row, row) for row in profiles_list]

        for future in tqdm(as_completed(futures), total=len(futures), desc="Traitement des profils"):
            outcome = future.result()
            # A failed row comes back as None or (None, None). Unpacking a
            # bare None would raise and throw away every result collected so
            # far, because the file is only written at the end.
            if not outcome:
                continue
            result, bands_data = outcome
            if result:
                results.append({**result, **(bands_data or {})})

    # Save previous and new results together.
    output_df = pd.DataFrame(results)
    data_to_save = pd.concat([already_get_data, output_df], ignore_index=True)

    data_to_save.to_csv(output_csv, index=False)
    logger.info(f"Résultats sauvegardés dans {output_csv}")
    





INFO:__main__:Earth Engine initialisé avec succès


In [35]:
# Run the extraction for one dataset at a time. The commented calls are the
# equivalent runs for the AfSP and WoSIS profile lists built in earlier cells.
# main() skips ProfileIDs already present in the output file, so re-running
# this cell only processes the profiles that are still missing.
# main("lansat_afsp_results_all.csv", profiles_list_afsp)
main("lansat_ird_results_landsat8_3.csv", profiles_ird.values.tolist())
# main("wosis_profiles.csv", wosis_profiles.values.tolist(),)

Nombre de profils à traiter : 1790


Traitement des profils: 100%|██████████| 1790/1790 [13:39<00:00,  2.18it/s]
INFO:__main__:Résultats sauvegardés dans lansat_ird_results_landsat8_3.csv


Nombre de profils à traiter : 1780


Traitement des profils: 100%|██████████| 1780/1780 [18:59<00:00,  1.56it/s]
INFO:__main__:Résultats sauvegardés dans lansat_ird_results_landsat8.csv
