In [1]:
import os
import math
import numpy as np
import pandas as pd
import rasterio
import ee
import warnings
warnings.filterwarnings('ignore')

from rasterio.warp import transform

In [2]:
# Load the tab-separated sPlotOpen header table (one row per vegetation plot).
plots_metadata = pd.read_csv("sPlotOpen_header(2).txt", sep="\t")

In [3]:
# Fill missing data of WorldClim, SoilGrids, and human footprint with nearest non-missing value

def find_nearest_non_missing(data, row, col, no_data_value, max_radius=100):
    """Return the valid value of ``data`` closest to the cell ``(row, col)``.

    Only cells inside the square window of half-width ``max_radius`` (in
    pixels) around ``(row, col)`` are considered. Among those, the one with
    the smallest Euclidean pixel distance is chosen; ties go to the first
    candidate in row-major order.

    Parameters
    ----------
    data : 2-D numpy array
        Raster band to search.
    row, col : int
        Pixel to search around. It may lie outside the raster; only the part
        of the window that overlaps the raster is inspected.
    no_data_value : number or None
        Marker of missing cells. Compared with ``np.isclose(..., atol=0)`` so
        a float32-rounded marker still matches. ``None`` means the raster has
        no marker. NaN cells are always treated as missing.
    max_radius : int, default 100
        Largest half-width of the search window, in pixels.

    Returns
    -------
    Python scalar or None
        The nearest valid value, or ``None`` if the window has no valid cell.
    """
    n_rows, n_cols = data.shape

    def closest_within(radius):
        """Return ``(squared_distance, value)`` of the closest valid cell in
        the window of half-width ``radius``, or ``None`` if there is none."""
        r0, r1 = max(row - radius, 0), min(row + radius + 1, n_rows)
        c0, c1 = max(col - radius, 0), min(col + radius + 1, n_cols)
        if r0 >= r1 or c0 >= c1:
            return None  # the window does not overlap the raster at all

        window = data[r0:r1, c0:c1]
        if no_data_value is None:
            valid = np.ones(window.shape, dtype=bool)
        else:
            valid = ~np.isclose(window, no_data_value, atol=0)
        if np.issubdtype(window.dtype, np.floating):
            # NaN never compares close to anything, so exclude it explicitly.
            valid &= ~np.isnan(window)
        if not valid.any():
            return None

        d_row = np.arange(r0, r1) - row
        d_col = np.arange(c0, c1) - col
        dist2 = d_row[:, None] ** 2 + d_col[None, :] ** 2
        # Push missing cells to the back so argmin only sees valid ones.
        dist2 = np.where(valid, dist2, np.iinfo(dist2.dtype).max)
        i, j = np.unravel_index(np.argmin(dist2), dist2.shape)
        return int(dist2[i, j]), window[i, j].item()

    if max_radius < 1:
        return None

    # Grow the window geometrically instead of re-scanning it pixel by pixel.
    radius = 1
    while True:
        hit = closest_within(radius)
        if hit is not None:
            dist2, value = hit
            # A hit near a window corner can be farther away than a cell just
            # outside the window edge, so widen once to the hit's own distance.
            reach = min(max_radius, int(np.sqrt(dist2)))
            if reach > radius:
                _, value = closest_within(reach)
            return value
        if radius >= max_radius:
            return None  # no valid value found within max_radius
        radius = min(radius * 2, max_radius)

### Metadata

In [4]:
# Split the header table into thematic CSV files. Every file keeps
# PlotObservationID as its first column so the pieces can be re-joined later.
metadata_exports = [
    ("metadata_data.csv", ["Location_uncertainty", "Releve_area"]),
    ("cover_data.csv", [
        'Cover_total',
        'Cover_tree_layer', 'Cover_shrub_layer', 'Cover_herb_layer',
        'Cover_moss_layer', 'Cover_lichen_layer', 'Cover_algae_layer',
        'Cover_litter_layer', 'Cover_bare_rocks', 'Cover_cryptogams',
        'Cover_bare_soil',
    ]),
    ("height_data.csv", [
        'Height_trees_highest', 'Height_trees_lowest',
        'Height_shrubs_highest', 'Height_shrubs_lowest', 'Height_herbs_average',
        'Height_herbs_lowest', 'Height_herbs_highest',
    ]),
]

for export_name, export_columns in metadata_exports:
    plots_metadata[["PlotObservationID"] + export_columns].to_csv(export_name)

In [5]:
# Summary statistics of the cover columns (most layers are sparsely recorded).
plots_metadata.loc[:, [
    "PlotObservationID",
    'Cover_total',
    'Cover_tree_layer', 'Cover_shrub_layer', 'Cover_herb_layer',
    'Cover_moss_layer', 'Cover_lichen_layer', 'Cover_algae_layer',
    'Cover_litter_layer', 'Cover_bare_rocks', 'Cover_cryptogams',
    'Cover_bare_soil',
]].describe()

Unnamed: 0,PlotObservationID,Cover_total,Cover_tree_layer,Cover_shrub_layer,Cover_herb_layer,Cover_moss_layer,Cover_lichen_layer,Cover_algae_layer,Cover_litter_layer,Cover_bare_rocks,Cover_cryptogams,Cover_bare_soil
count,95104.0,19407.0,12094.0,16804.0,29668.0,9681.0,708.0,41.0,3161.0,2747.0,772.0,2745.0
mean,306038.6,76.912042,58.630437,29.137318,55.722856,39.292945,14.303672,16.0,27.209744,22.343284,13.873057,32.281628
std,343545.5,30.178138,27.672273,24.85845,33.367561,34.373927,15.759854,28.407745,30.139559,23.388648,14.330433,26.199666
min,16.0,1.0,0.5,0.5,0.2,1.0,1.0,1.0,1.0,1.0,1.0,0.02
25%,53159.75,60.0,35.0,10.0,25.0,6.0,2.0,1.0,5.0,5.0,5.0,10.0
50%,138963.5,88.0,60.0,22.0,60.0,30.0,8.0,1.0,15.0,10.0,10.0,25.0
75%,487983.8,100.0,80.0,40.0,90.0,70.0,20.0,15.0,40.0,30.0,15.0,55.0
max,1126790.0,990.0,150.0,170.0,199.0,100.0,90.0,100.0,107.0,100.0,90.0,99.0


### Location

In [6]:
# Export the plot coordinates, keyed by PlotObservationID.
plots_metadata.loc[:, ["PlotObservationID", "Longitude", "Latitude"]].to_csv("location_data.csv")

### Topographic

In [None]:
# Project identifier handed to ee.Initialize below.
# NOTE(review): "TOFILL" looks like a placeholder to replace with your own project ID — confirm before running.
project_name = "TOFILL"
# Authenticate with the notebook auth mode, then initialise the Earth Engine client for that project.
ee.Authenticate(auth_mode="notebook")
ee.Initialize(project=project_name)

In [8]:
# Sample elevation, slope and aspect (SRTM 90 m) at every plot location with
# Earth Engine, in batches of `batch_size` points per request.
batch_size = 1000
locations = plots_metadata[["Longitude", "Latitude"]].values
# Positional array of IDs: independent of whatever index plots_metadata carries.
plot_ids = plots_metadata["PlotObservationID"].to_numpy()
num_batches = math.ceil(len(locations) / batch_size)

# Load SRTM DEM dataset and compute slope and aspect.
# The image is the same for every batch, so build it once outside the loop.
dataset = ee.Image('CGIAR/SRTM90_V4')
elevation = dataset.select('elevation')
slope = ee.Terrain.slope(elevation)
aspect = ee.Terrain.aspect(elevation)

# Combine elevation, slope, and aspect into a single image
terrain_image = elevation.addBands(slope).addBands(aspect).rename(['elevation', 'slope', 'aspect'])

all_values = []

print(f"#batches: {num_batches}")

for b in range(num_batches):

    if b % 25 == 0:
        print(f"Batch: {b}")

    start = b * batch_size
    batch_locations = locations[start:start + batch_size]

    # Tag every point with its row position in plots_metadata so each result
    # can be matched back to its plot explicitly, instead of relying on the
    # number and order of the returned features.
    point_list = [
        ee.Feature(ee.Geometry.Point(lon, lat), {"row_position": start + offset})
        for offset, (lon, lat) in enumerate(batch_locations)
    ]
    feature_collection = ee.FeatureCollection(point_list)

    # Sample the image at the feature locations
    sampled_values = terrain_image.reduceRegions(
        collection=feature_collection,
        reducer=ee.Reducer.first(),
        scale=90 # SRTM has a resolution of 90m
    )

    all_values.append(sampled_values.getInfo())

all_results = []
for values in all_values:
    for feature in values["features"]:
        properties = feature["properties"]
        all_results.append({
            "PlotObservationID": plot_ids[int(properties["row_position"])],
            # .get() leaves None (NaN in the frame) where no value was returned.
            "Elevation": properties.get('elevation'),
            "Slope": properties.get('slope'),
            "Aspect": properties.get('aspect'),
        })

df = pd.DataFrame(all_results)

df.to_csv("topographic_data.csv")
df.describe()

#batches: 96
Batch: 0
Batch: 25
Batch: 50
Batch: 75


Unnamed: 0,PlotObservationID,Elevation,Slope,Aspect
count,95104.0,87967.0,87383.0,87383.0
mean,306038.6,942.715086,9.150711,172.811386
std,343545.5,922.071895,10.239486,104.766612
min,16.0,-32.0,0.0,0.0
25%,53159.75,182.0,1.426445,84.978508
50%,138963.5,642.0,4.513407,180.0
75%,487983.8,1521.0,14.366525,265.996155
max,1126790.0,4823.0,73.07415,359.701508


### WorldClim

In [10]:
# Sample the 19 WorldClim bioclimatic rasters at every plot location; missing
# cells are filled from the nearest valid cell of the same raster.
locations = plots_metadata[["Longitude", "Latitude"]].values

worldclim_variables = ['bio_' + str(i+1) for i in range(19)]
worldclim_data = np.zeros((len(locations), len(worldclim_variables)), dtype="float32")

# Missing-data marker used for these rasters; compared with a relative
# tolerance below because the float32 value stored in the file is rounded.
no_data_value = -3.4e+38

for j, wv in enumerate(worldclim_variables):
    print(f"Processing {wv}")
    with rasterio.open(f"worldclim/wc2.1_30s_{wv}.tif") as src:

        data = src.read(1)
        for i, sample in enumerate(src.sample(locations)):
            # src.sample yields one array per point (one entry per band).
            # Take the scalar explicitly: writing a length-1 array into a
            # scalar slot is deprecated in recent NumPy releases.
            val = sample[0]
            if np.isclose(val, no_data_value, atol=0):
                x, y = locations[i]
                row, col = src.index(x, y)
                val = find_nearest_non_missing(data, row, col, no_data_value)
            # The fill returns None when nothing valid is nearby; store NaN.
            worldclim_data[i, j] = np.nan if val is None else val

Processing bio_1
Processing bio_2
Processing bio_3
Processing bio_4
Processing bio_5
Processing bio_6
Processing bio_7
Processing bio_8
Processing bio_9
Processing bio_10
Processing bio_11
Processing bio_12
Processing bio_13
Processing bio_14
Processing bio_15
Processing bio_16
Processing bio_17
Processing bio_18
Processing bio_19


In [11]:
# Label the sampled array with the variable names and attach the plot IDs.
worldclim_data = pd.DataFrame(worldclim_data, columns=worldclim_variables).assign(
    PlotObservationID=plots_metadata["PlotObservationID"]
)
worldclim_data.describe()

Unnamed: 0,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,bio_7,bio_8,bio_9,bio_10,bio_11,bio_12,bio_13,bio_14,bio_15,bio_16,bio_17,bio_18,bio_19,PlotObservationID
count,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0
mean,10.806923,10.536043,41.045025,632.48291,25.329079,-2.378197,27.707275,14.090211,8.738209,18.580229,3.147605,959.717773,135.278122,40.526875,43.368679,361.834045,140.303162,278.454773,201.590958,306038.6
std,8.411316,3.418387,15.306807,323.379333,7.441097,11.006687,9.888825,8.644604,12.23323,6.632763,11.128435,693.381348,102.799438,36.872803,28.418333,270.808167,121.764641,219.86409,184.929123,343545.5
min,-18.066668,2.791667,12.111928,12.146515,3.1,-39.599998,8.0,-17.049999,-33.150002,-0.65,-33.299999,3.0,1.0,0.0,5.065311,2.0,0.0,0.0,0.0,16.0
25%,4.9375,7.925,30.885474,422.292458,19.9,-8.6,21.400002,7.683333,-0.2,13.6,-3.616667,444.0,68.0,11.0,23.48293,183.0,43.0,124.0,70.0,53159.75
50%,10.070833,9.758333,37.620029,625.835663,25.4,-2.0,26.0,13.966666,11.016666,18.4,2.9,791.0,108.0,30.0,33.856127,291.0,107.0,224.0,150.0,138963.5
75%,16.954166,13.075,47.523811,786.124863,31.299999,4.8,34.099998,21.316666,18.216667,23.799999,10.683333,1344.0,175.0,63.0,54.982403,476.0,214.0,387.0,285.0,487983.8
max,29.245834,21.883333,93.131866,1840.884521,44.799999,22.799999,63.799999,33.233334,35.549999,35.549999,27.5,7098.0,1153.0,432.0,147.443298,2863.0,1344.0,2863.0,1980.0,1126790.0


In [12]:
# Persist the WorldClim features.
worldclim_data.to_csv(path_or_buf="worldclim_data.csv")

### SoilGrids

In [13]:
# Sample every SoilGrids raster in soilgrids250/ at the plot locations;
# missing cells are filled from the nearest valid cell of the same raster.
locations = plots_metadata[["Longitude", "Latitude"]].values

# os.listdir returns entries in arbitrary order and may include non-raster
# files; keep only GeoTIFFs and sort them so the column order is deterministic.
soil_files = sorted(
    name for name in os.listdir("soilgrids250")
    if name.lower().endswith((".tif", ".tiff"))
)
soilgrid_data = np.zeros((len(locations), len(soil_files)))
soil_variables = []

for j, soil_file in enumerate(soil_files):
    # The first six characters of the file name are used as the variable code.
    soil_variable = soil_file[:6]
    soil_variables.append(soil_variable)
    print(f"Processing {soil_variable}")
    with rasterio.open(f"soilgrids250/{soil_file}") as src:
        # The missing-data marker differs between the two groups of layers.
        if soil_variable in ["ORCDRC", "CECSOL", "BDTICM", "BLDFIE"]:
            no_data_value = -32768.0
        elif soil_variable in ["PHIHOX", "CLYPPT", "SLTPPT", "SNDPPT"]:
            no_data_value = 255
        else:
            raise ValueError(f"Unknown missing value for {soil_variable}")
        data = src.read(1)
        for i, sample in enumerate(src.sample(locations)):
            # src.sample yields one array per point; take the scalar rather
            # than writing a length-1 array into a scalar slot.
            val = sample[0]
            if val == no_data_value:
                x, y = locations[i]
                row, col = src.index(x, y)
                val = find_nearest_non_missing(data, row, col, no_data_value)
            # The fill returns None when nothing valid is nearby; store NaN.
            soilgrid_data[i, j] = np.nan if val is None else val

Processing BDTICM
Processing BLDFIE
Processing CECSOL
Processing CLYPPT
Processing ORCDRC
Processing PHIHOX
Processing SLTPPT
Processing SNDPPT


In [14]:
# Label the sampled array with the soil variable codes and attach the plot IDs.
soilgrid_data = pd.DataFrame(soilgrid_data, columns=soil_variables).assign(
    PlotObservationID=plots_metadata["PlotObservationID"]
)
soilgrid_data.describe()

Unnamed: 0,BDTICM,BLDFIE,CECSOL,CLYPPT,ORCDRC,PHIHOX,SLTPPT,SNDPPT,PlotObservationID
count,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0
mean,2601.618302,1236.570081,21.931938,20.870247,46.094276,61.429256,30.852971,48.26982,306038.6
std,3319.249711,209.205348,9.960055,8.414922,47.552199,10.514921,10.094515,12.706167,343545.5
min,0.0,176.0,0.0,0.0,0.0,40.0,1.0,7.0,16.0
25%,1130.0,1092.0,16.0,15.0,15.0,53.0,25.0,39.0,53159.75
50%,1708.0,1278.0,20.0,20.0,31.0,59.0,33.0,47.0,138963.5
75%,2695.0,1400.0,26.0,26.0,64.0,69.0,38.0,56.0,487983.8
max,78536.0,1618.0,156.0,60.0,513.0,91.0,78.0,97.0,1126790.0


In [15]:
# Persist the SoilGrids features.
soilgrid_data.to_csv(path_or_buf="soilgrid_data.csv")

### Human footprints

In [17]:
# Column names for the output table and the raster file behind each of them.
human_variables = ["HFP2009", "Built2009", "Croplands2005", "Lights2009", "Navwater2009", "Pasture2009", "Popdensity2010", "Railways", "Roads"]
human_files = ["HFP2009.tif", "Built2009.tif", "croplands2005.tif", "Lights2009.tif", "Navwater2009.tif", "Pasture2009.tif", "Popdensity2010.tif", "Railways.tif", "Roads.tif"]

human_data = np.zeros((len(plots_metadata), 9))

# Plot coordinates, passed to the reprojection below as EPSG:4326 input.
longitudes = plots_metadata["Longitude"].values
latitudes = plots_metadata["Latitude"].values

for j, human_file in enumerate(human_files):
    with rasterio.open(f"human_footprint_venter/Dryadv2/Maps/{human_file}") as src:
        print(f"Processing {human_file}")

        # Reproject the plot coordinates into this raster's own CRS.
        raster_crs = src.crs
        x_coords, y_coords = transform('EPSG:4326', raster_crs, longitudes, latitudes)

        # Sample the raster at each point; where the cell holds the raster's
        # nodata marker, fall back to the nearest valid cell.
        data = src.read(1)
        for i, val in enumerate(src.sample(zip(x_coords, y_coords))):
            sampled = val[0]
            if sampled == src.nodata:
                row, col = src.index(x_coords[i], y_coords[i])
                sampled = find_nearest_non_missing(data, row, col, src.nodata)
            human_data[i, j] = sampled

Processing HFP2009.tif
Processing Built2009.tif
Processing croplands2005.tif
Processing Lights2009.tif
Processing Navwater2009.tif
Processing Pasture2009.tif
Processing Popdensity2010.tif
Processing Railways.tif
Processing Roads.tif


In [18]:
# Build the labelled table; the set_index/reset_index round trip moves
# PlotObservationID to the first column.
human_data = (
    pd.DataFrame(human_data, columns=human_variables)
    .assign(PlotObservationID=plots_metadata["PlotObservationID"])
    .set_index("PlotObservationID")
    .reset_index()
    .fillna(0)  # Only 5 remaining samples have NaNs
)
human_data.describe()

Unnamed: 0,PlotObservationID,HFP2009,Built2009,Croplands2005,Lights2009,Navwater2009,Pasture2009,Popdensity2010,Railways,Roads
count,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0,95104.0
mean,306038.6,10.22876,0.758012,0.994164,1.32107,0.700857,0.549735,3.196827,0.395188,2.312876
std,343545.5,9.261474,2.646812,2.443532,2.758877,1.293931,0.954342,2.695611,1.733599,2.820298
min,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,53159.75,3.629548,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.257526
50%,138963.5,7.839467,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.757507
75%,487983.8,13.296728,0.0,0.0,0.0,0.645898,1.0,5.0,0.0,4.0
max,1126790.0,50.0,10.0,7.0,10.0,4.0,4.0,10.0,8.0,8.0


In [None]:
# Persist the human footprint features.
human_data.to_csv(path_or_buf="human_data.csv")

### SatCLIP

In [None]:
# torch is used throughout this cell but was never imported in the notebook,
# so the cell failed with a NameError on a fresh kernel.
import torch
from satclip.satclip.load import get_satclip

locations = plots_metadata[["Longitude", "Latitude"]].values
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = get_satclip('satclip-vit16-l40.ckpt', device=device) # Only loads location encoder by default
model.eval()
with torch.no_grad():
    # Build the float64 tensor directly: torch.Tensor(...) would first round
    # the coordinates to float32, and .double() cannot restore that precision.
    coordinates = torch.as_tensor(locations, dtype=torch.float64, device=device)
    emb = model(coordinates).detach().cpu()

In [None]:
# Persist the location embeddings as a NumPy array.
np.save(file='satclip_embeddings.npy', arr=emb.numpy())