In [27]:
! pip install soilgrids



In [28]:
from soilgrids import SoilGrids

soil_grids = SoilGrids()

# India bounding box
extent = {
    "west": 68,
    "south": 8,
    "east": 98,
    "north": 36
}

# Required SoilGrids services
layers = [
    "phh2o",   # pH
    "cec",     # cation exchange capacity
    "ocd",     # organic carbon density
    "bdod",    # bulk density
    "sand",    # sand %
    "silt",    # silt %
    "clay",    # clay %
    #"awc"      # available water capacity
]

depth = "0-5cm"

for layer in layers:

    coverage = f"{layer}_{depth}_mean"
    output_file = f"india_{coverage}.tif"

    print(f"Downloading {coverage} ...")

    soil_grids.get_coverage_data(
        service_id=layer,
        coverage_id=coverage,
        west=extent["west"],
        south=extent["south"],
        east=extent["east"],
        north=extent["north"],
        crs="urn:ogc:def:crs:EPSG::4326",
        width=2000,
        height=2000,
        output=output_file
    )

print("All layers downloaded.")


Downloading phh2o_0-5cm_mean ...
Downloading cec_0-5cm_mean ...
Downloading ocd_0-5cm_mean ...
Downloading bdod_0-5cm_mean ...
Downloading sand_0-5cm_mean ...
Downloading silt_0-5cm_mean ...
Downloading clay_0-5cm_mean ...
All layers downloaded.


In [35]:
import rasterio
import numpy as np
import pandas as pd

# Files downloaded earlier
files = {
    "pH": "india_phh2o_0-5cm_mean.tif",
    #"moisture": "india_awc_0-5cm_mean.tif",
    "conductivity_proxy": "india_cec_0-5cm_mean.tif",
    "salinity_proxy": "india_ocd_0-5cm_mean.tif",
    "sand": "india_sand_0-5cm_mean.tif",
    "silt": "india_silt_0-5cm_mean.tif",
    "clay": "india_clay_0-5cm_mean.tif",
    "ocd" : "india_ocd_0-5cm_mean.tif"
}

arrays = {}
transform = None

# Read rasters
for key, path in files.items():
    with rasterio.open(path) as src:
        arrays[key] = src.read(1)
        transform = src.transform

height, width = arrays["pH"].shape

# Create coordinate grid
rows, cols = np.meshgrid(np.arange(height), np.arange(width), indexing='ij')
xs, ys = rasterio.transform.xy(transform, rows, cols)

data = {
    "lat": np.array(ys).flatten(),
    "lon": np.array(xs).flatten()
}

# Add soil features
for key in arrays:
    data[key] = arrays[key].flatten()

df = pd.DataFrame(data)

# Remove invalid pixels
df = df.replace(-32768, np.nan).dropna()



In [36]:
df["tds_proxy"] = (
    df["conductivity_proxy"] * 0.6 +
    df["clay"] * 0.2 +
    df["salinity_proxy"] * 0.2
)


In [37]:
df["moisture_proxy"] = (
    0.5 * df["clay"] +
    0.3 * df["ocd"] -
    0.2 * df["sand"]
)


In [38]:
df.rename(columns={'tds_proxy': 'tds',
                   'conductivity_proxy': 'EC',
                   'salinity_proxy': 'salinity',
                   'moisture_proxy' : 'moisture',
                   'ocd' : 'OC'}, inplace=True)   #matching the sensor coulmns
df.drop(columns = ['sand', 'silt', 'clay', 'lat', 'lon'], inplace = True) #no longer required

In [39]:
print(df.head())
print(df.shape)

     pH     EC  salinity     OC    tds  moisture
0  77.0  220.0     243.0  243.0  241.0     165.3
1  77.0  220.0     218.0  218.0  239.2     168.4
2  77.0  218.0     218.0  218.0  236.0     162.8
3  77.0  224.0     260.0  260.0  247.0     170.7
4  77.0  217.0     264.0  264.0  243.8     173.0
(2612234, 6)


In [40]:
df.to_csv('soil_dataset_final.csv', index=False)