# TerraClimate Data Download & Processing

This notebook automates the download and preprocessing of **TerraClimate** data for a selected region of interest.  
The current setup extracts data for a bounding box (10°–25° N, 115°–50° W) covering the three countries of interest.

## What it does

1. **Download**  
   - Retrieves monthly TerraClimate NetCDF files from the official server.  
   - Variables include temperature, precipitation, evapotranspiration, vapour pressure, wind speed, and others relevant for crop suitability modelling.  
   - Each full dataset for one scenario (e.g. one variable across all years) is approximately **600 MB**.

2. **Crop & Subset**  
   - Crops the global dataset to the bounding box around the three target countries.  
   - Keeps only the required region, reducing file size and computation time.

3. **Process & Save**  
   - Extracts and stores the climatological variables needed for **suitability modelling** of agroforestry systems.  
   - Saves processed files in a structured format for later use.

## Notes

- Ensure that output paths are correctly set before running.  
- Depending on network speed, downloading the full set of variables can take time.  
- Cropped and processed data are much lighter than the full global files.
- Check that the base data directory (`DATA_DIR`) is correctly defined in the `config.py` file in the parent directory.


In [None]:
from pathlib import Path
import sys

# --- Jupyter/IPython provides no __file__, so anchor on the working directory ---
this_dir = Path.cwd().resolve()     # directory the notebook is running from
parent_dir = this_dir.parent        # one level up, where config.py is expected
sys.path[:0] = [str(parent_dir)]    # prepend so `import config` resolves there first


from config import DATA_DIR


In [None]:
import os
import requests
import xarray as xr
from config import DATA_DIR
# ========================
# Settings
# ========================
# Spatial window; both tuples are unpacked into `slice(*bounds)` by the
# download helper, so their order is (start, stop) along each axis.
lat_bounds = (25, 10)
lon_bounds = (-115, -50)

# What to fetch: one file per (variable, year) pair.
variables = ["tmin","tmax","pet", "ppt","aet", "def", "srad", "vap", "ws", "vpd"]
years = range(1985, 2023)

# Where the cropped historical files are written (created if missing).
output_dir = DATA_DIR / "terra_climate"
os.makedirs(output_dir, exist_ok=True)
# ========================
# Helper function
# ========================
def download_and_crop(var, year):
    """Download one historical TerraClimate file, crop it, and save the subset.

    The global NetCDF for ``var``/``year`` is streamed to a temporary file in
    the current working directory, cropped with the module-level
    ``lat_bounds``/``lon_bounds`` and written to ``output_dir`` as
    ``TerraClimate_{var}_{year}_CA.nc``. The temporary file is removed
    afterwards, whether or not the download and crop succeeded.

    Parameters
    ----------
    var : str
        TerraClimate variable short name; used in the URL and as the variable
        name read from the downloaded file.
    year : int
        Year of the file to fetch.

    Returns
    -------
    None
        Any failure is printed and swallowed so a batch run moves on to the
        next file.
    """
    filename = f"TerraClimate_{var}_{year}.nc"
    url = f"http://thredds.northwestknowledge.net:8080/thredds/fileServer/TERRACLIMATE_ALL/data/{filename}"
    print(f"🔄 Downloading {url}")
    try:
        # Stream to disk in chunks rather than holding the whole global file
        # in memory via `response.content`.
        with requests.get(url, stream=True, timeout=120) as response:
            response.raise_for_status()
            with open(filename, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)

        # Open, shift lon, crop, save. The context manager closes the source
        # file before the temp file is deleted below.
        with xr.open_dataset(filename) as raw:
            # Wrap longitudes into [-180, 180); a no-op if they already are.
            shifted = raw.assign_coords(lon=(((raw.lon + 180) % 360) - 180))
            ds_crop = shifted[var].sel(lat=slice(*lat_bounds), lon=slice(*lon_bounds))

            out_path = os.path.join(output_dir, f"TerraClimate_{var}_{year}_CA.nc")
            ds_crop.to_netcdf(out_path, mode="w")
        print(f"✅ Saved to {out_path}")

    except Exception as e:
        # Deliberate best-effort: report and continue with the next file.
        print(f"❌ Failed for {var} {year}: {e}")

    finally:
        # Clean up temp file, including partial downloads left by a failure.
        if os.path.exists(filename):
            os.remove(filename)

# ========================
# Run batch download
# ========================
# Variable-major order: all years of one variable before moving to the next.
for var, year in ((v, y) for v in variables for y in years):
    download_and_crop(var, year)


In [None]:
import xarray as xr
import os
import requests

# ========================
# Settings
# ========================
# Scenario identifiers: `scenario` names the server folder and output files,
# `prefix` is the tag embedded in the remote file names.
scenario = "plus2C"
prefix = "2c"

# Spatial window (unpacked into `slice(*bounds)` by the download helper)
# and the years to fetch.
lat_bounds = (25,10)
lon_bounds = (-115, -50)
years = range(1985, 2015)

# Where the cropped scenario files are written (created if missing).
output_dir = DATA_DIR / "terra_climate_scenarios_ncss/plus2C"
os.makedirs(output_dir, exist_ok=True)


# ========================
# Helper function
# ========================
def download_and_crop(var, year):
    """Download one scenario TerraClimate file, crop it, and save the subset.

    The remote file ``TerraClimate_{prefix}_{var}_{year}.nc`` from the
    ``data_{scenario}`` folder is streamed to a temporary file in the current
    working directory, cropped with the module-level
    ``lat_bounds``/``lon_bounds`` and written to ``output_dir`` as
    ``TerraClimate_{scenario}_{var}_{year}_CA.nc``. The temporary file is
    removed afterwards, whether or not the download and crop succeeded.

    Parameters
    ----------
    var : str
        TerraClimate variable short name; used in the URL and as the variable
        name read from the downloaded file.
    year : int
        Year of the file to fetch.

    Returns
    -------
    None
        Any failure is printed and swallowed so a batch run moves on to the
        next file.
    """
    filename = f"TerraClimate_{prefix}_{var}_{year}.nc"
    url = f"http://thredds.northwestknowledge.net:8080/thredds/fileServer/TERRACLIMATE_ALL/data_{scenario}/{filename}"
    print(f"🔄 Accessing {url}")
    try:
        # Stream to disk in chunks rather than holding the whole global file
        # in memory via `response.content`.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(filename, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)

        # The context manager closes the source file before the temp file is
        # deleted below.
        with xr.open_dataset(filename) as raw:
            # Wrap longitudes into [-180, 180); a no-op if they already are.
            shifted = raw.assign_coords(lon=(((raw.lon + 180) % 360) - 180))
            ds_crop = shifted[var].sel(lat=slice(*lat_bounds), lon=slice(*lon_bounds))
            out_path = os.path.join(output_dir, f"TerraClimate_{scenario}_{var}_{year}_CA.nc")
            ds_crop.to_netcdf(out_path)
        print(f"✅ Saved to {out_path}")
    except Exception as e:
        # Deliberate best-effort: report and continue with the next file.
        print(f"❌ Failed for {var} {year}: {e}")
    finally:
        # Remove the temp file, including partial downloads left by a failure.
        if os.path.exists(filename):
            os.remove(filename)

# ========================
# Run batch download
# ========================
# NOTE: `variables` is not set in this cell; it must already exist in the
# session (the historical-download cell defines it).
# Variable-major order: all years of one variable before moving to the next.
for var, year in ((v, y) for v in variables for y in years):
    download_and_crop(var, year)


In [None]:
import xarray as xr
import os
import requests

# ========================
# Settings
# ========================
# Scenario identifiers: `scenario` names the server folder and output files,
# `prefix` is the tag embedded in the remote file names.
scenario = "plus4C"
prefix = "4c"

# Spatial window (unpacked into `slice(*bounds)` by the download helper)
# and the years to fetch.
lat_bounds = (25,10)
lon_bounds = (-115, -50)
years = range(1985, 2015)

# Where the cropped scenario files are written (created if missing).
output_dir = DATA_DIR / "terra_climate_scenarios_ncss/plus4C"
os.makedirs(output_dir, exist_ok=True)


# ========================
# Helper function
# ========================
def download_and_crop(var, year):
    """Download one scenario TerraClimate file, crop it, and save the subset.

    The remote file ``TerraClimate_{prefix}_{var}_{year}.nc`` from the
    ``data_{scenario}`` folder is streamed to a temporary file in the current
    working directory, cropped with the module-level
    ``lat_bounds``/``lon_bounds`` and written to ``output_dir`` as
    ``TerraClimate_{scenario}_{var}_{year}_CA.nc``. The temporary file is
    removed afterwards, whether or not the download and crop succeeded.

    Parameters
    ----------
    var : str
        TerraClimate variable short name; used in the URL and as the variable
        name read from the downloaded file.
    year : int
        Year of the file to fetch.

    Returns
    -------
    None
        Any failure is printed and swallowed so a batch run moves on to the
        next file.
    """
    filename = f"TerraClimate_{prefix}_{var}_{year}.nc"
    url = f"http://thredds.northwestknowledge.net:8080/thredds/fileServer/TERRACLIMATE_ALL/data_{scenario}/{filename}"
    print(f"🔄 Accessing {url}")
    try:
        # Stream to disk in chunks rather than holding the whole global file
        # in memory via `response.content`.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(filename, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)

        # The context manager closes the source file before the temp file is
        # deleted below.
        with xr.open_dataset(filename) as raw:
            # Wrap longitudes into [-180, 180); a no-op if they already are.
            shifted = raw.assign_coords(lon=(((raw.lon + 180) % 360) - 180))
            ds_crop = shifted[var].sel(lat=slice(*lat_bounds), lon=slice(*lon_bounds))
            out_path = os.path.join(output_dir, f"TerraClimate_{scenario}_{var}_{year}_CA.nc")
            ds_crop.to_netcdf(out_path)
        print(f"✅ Saved to {out_path}")
    except Exception as e:
        # Deliberate best-effort: report and continue with the next file.
        print(f"❌ Failed for {var} {year}: {e}")
    finally:
        # Remove the temp file, including partial downloads left by a failure.
        if os.path.exists(filename):
            os.remove(filename)

# ========================
# Run batch download
# ========================
# NOTE: `variables` is not set in this cell; it must already exist in the
# session (the historical-download cell defines it).
# Variable-major order: all years of one variable before moving to the next.
for var, year in ((v, y) for v in variables for y in years):
    download_and_crop(var, year)


In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import glob
import os

import xarray as xr
import numpy as np
import pandas as pd
import glob
import os

def _mean_at_extreme_quarter(selector, target, years, find_max):
    """Average `target` over years, sampled at each year's extreme `selector` step.

    For every year, the time step where `selector` is largest (``find_max``)
    or smallest is located per grid cell, `target` is read at that step, and
    the per-year results are averaged.
    """
    per_year = []
    for y in years:
        selector_y = selector.sel(time=str(y))
        target_y = target.sel(time=str(y))
        # NaNs (e.g. incomplete leading rolling windows) are replaced by a
        # sentinel so they can never be picked as the extreme.
        if find_max:
            idx = selector_y.fillna(-1e9).argmax(dim="time")
        else:
            idx = selector_y.fillna(1e9).argmin(dim="time")
        per_year.append(target_y.isel(time=idx))
    return xr.concat(per_year, dim="year").mean(dim="year")


def compute_suitability_variables(data_dir, output_path, file_prefix, start_year, end_year):
    """
    Compute suitability variables from TerraClimate or scenario NetCDFs.
    Includes bioclim-style quarterly metrics.

    Parameters
    ----------
    data_dir : str or os.PathLike
        Folder holding the yearly files named
        ``{file_prefix}_{var}_{year}_<suffix>.nc``. A trailing slash is not
        required.
    output_path : str or os.PathLike
        Destination NetCDF file; its parent folder is created if needed.
    file_prefix : str
        Leading part of the input file names (e.g. ``"TerraClimate"``).
    start_year, end_year : int
        Inclusive range of years used for all statistics.

    Raises
    ------
    FileNotFoundError
        If no input files are found for one of the required variables
        (tmin, tmax, ppt, pet, aet, def). Missing ``srad`` files are not
        fatal: ``MeanSRAD`` is then left out of the output.
    """

    # ------------------
    # Helper: load + merge variable
    # ------------------
    def open_merge_var(varname):
        # Join path components properly: plain string concatenation fuses
        # the folder name and the prefix when data_dir is a Path or has no
        # trailing slash.
        pattern = os.path.join(os.fspath(data_dir), f"{file_prefix}_{varname}_*.nc")
        files = sorted(glob.glob(pattern))
        if not files:
            raise FileNotFoundError(f"No files found for {varname} in {data_dir}")

        opened = [xr.open_dataset(f) for f in files]
        try:
            # Load into memory so the file handles can be released right away.
            da = xr.concat([ds[varname] for ds in opened], dim="time").load()
        finally:
            for ds in opened:
                ds.close()

        # Create datetime index. The year is parsed from the file name only,
        # so underscores in the folder path cannot interfere.
        # NOTE(review): assumes the files are contiguous monthly data starting
        # in January of the first file's year — confirm against the inputs.
        start_file_year = int(os.path.basename(files[0]).split("_")[-2])
        time_index = pd.date_range(start=f"{start_file_year}-01-01", periods=da.sizes["time"], freq="MS")
        da = da.assign_coords(time=("time", time_index))

        # Select time range
        return da.sel(time=slice(f"{start_year}-01-01", f"{end_year}-12-31"))

    # ------------------
    # Load variables
    # ------------------
    tmin = open_merge_var("tmin")
    tmax = open_merge_var("tmax")
    ppt = open_merge_var("ppt")
    pet = open_merge_var("pet")
    aet = open_merge_var("aet")
    deficit = open_merge_var("def")
    # Solar radiation is optional: skip MeanSRAD when its files are absent
    # instead of failing the whole computation.
    try:
        srad = open_merge_var("srad")
    except FileNotFoundError as err:
        srad = None
        print(f"⚠️ {err}; MeanSRAD will be omitted")

    # ------------------
    # Monthly climatologies
    # ------------------
    tmin_clim = tmin.groupby("time.month").mean(dim="time")
    tmax_clim = tmax.groupby("time.month").mean(dim="time")
    ppt_clim = ppt.groupby("time.month").mean(dim="time")
    pet_clim = pet.groupby("time.month").mean(dim="time")
    aet_clim = aet.groupby("time.month").mean(dim="time")
    deficit_clim = deficit.groupby("time.month").mean(dim="time")

    # ------------------
    # Core metrics
    # ------------------
    mean_diurnal_range = (tmax_clim - tmin_clim).mean(dim="month")
    temp_ann_range = tmax_clim.max(dim="month") - tmin_clim.min(dim="month")
    isothermality = (mean_diurnal_range / temp_ann_range) * 100
    prec_seasonality = (ppt_clim.std(dim="month") / ppt_clim.mean(dim="month")) * 100

    annual_pet = pet_clim.sum(dim="month")
    annual_aet = aet_clim.sum(dim="month")
    annual_def = deficit_clim.sum(dim="month")

    # ------------------
    # Seasonal extremes (rolling 3-month windows)
    # ------------------
    # Trailing windows: the value at month m covers months m-2..m.
    tavg = (tmax + tmin) / 2
    ppt_roll_sum = ppt.rolling(time=3, center=False).sum()
    tavg_roll_mean = tavg.rolling(time=3, center=False).mean()

    years = np.unique(ppt['time.year'])

    # --- Driest / wettest month: yearly extreme, averaged over years ---
    prec_driest_month = ppt.groupby("time.year").min(dim="time").mean(dim="year")
    prec_wettest_month = ppt.groupby("time.year").max(dim="time").mean(dim="year")

    # --- Mean Temp Driest Quarter ---
    mean_temp_driest_quarter = _mean_at_extreme_quarter(
        ppt_roll_sum, tavg_roll_mean, years, find_max=False)
    # --- Mean Temp Wettest Quarter (bio08) ---
    mean_temp_wettest_quarter = _mean_at_extreme_quarter(
        ppt_roll_sum, tavg_roll_mean, years, find_max=True)
    # --- Precip Warmest Quarter (bio18) ---
    prec_warmest_quarter = _mean_at_extreme_quarter(
        tavg_roll_mean, ppt_roll_sum, years, find_max=True)
    # --- Precip Coldest Quarter (bio19) ---
    prec_coldest_quarter = _mean_at_extreme_quarter(
        tavg_roll_mean, ppt_roll_sum, years, find_max=False)

    # ------------------
    # Combine into dataset
    # ------------------
    var_dict = {
        "MeanDiurnalRange": mean_diurnal_range,
        "Isothermality": isothermality,
        "PrecSeasonality": prec_seasonality,
        "AnnualPET": annual_pet,
        "AnnualAET": annual_aet,
        "AnnualDeficit": annual_def,
    }
    if srad is not None:
        srad_clim = srad.groupby("time.month").mean(dim="time")
        var_dict["MeanSRAD"] = srad_clim.mean(dim="month")
    var_dict.update({
        "PrecDriestMonth": prec_driest_month,
        "PrecWettestMonth": prec_wettest_month,
        "MeanTempDriestQuarter": mean_temp_driest_quarter,
        "MeanTempWettestQuarter": mean_temp_wettest_quarter,
        "PrecWarmestQuarter": prec_warmest_quarter,
        "PrecColdestQuarter": prec_coldest_quarter,
    })

    stack = xr.Dataset(var_dict)

    # ------------------
    # Save
    # ------------------
    # A bare file name has no directory part; makedirs("") would raise.
    out_dir = os.path.dirname(os.fspath(output_path))
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    stack.to_netcdf(output_path)
    print(f"✅ Saved suitability variables to: {output_path}")



# # ===== Example usage =====
# # Historical
# compute_suitability_variables(
#     data_dir="/Users/szelie/data/unu/terra_climate/",
#     output_path="/Users/szelie/data/unu/terra_climate/SuitabilityVariables_1990_2014.nc",
#     file_prefix="TerraClimate",
#     start_year=1990,
#     end_year=2014
# )

# Future (plus4C scenario): inputs are read from, and the result is written
# into, the plus4C scenario folder.
plus4c_job = {
    "data_dir": DATA_DIR / "terra_climate_scenarios_ncss/plus4C/",
    "output_path": DATA_DIR / "terra_climate_scenarios_ncss/plus4C/SuitabilityVariables_plus4C_1990_2014.nc",
    "file_prefix": "TerraClimate_plus4C",
    "start_year": 1990,
    "end_year": 2014,
}
compute_suitability_variables(**plus4c_job)


In [None]:
import xarray as xr
import pandas as pd

# Paths to your datasets
hist_path = DATA_DIR / "terra_climate/SuitabilityVariables_1990_2014.nc"
fut_path  = DATA_DIR / "terra_climate_scenarios_ncss/plus2C/SuitabilityVariables_plus2C_1990_2014.nc"

# Open datasets
hist_ds = xr.open_dataset(hist_path)
fut_ds  = xr.open_dataset(fut_path)

# Compare only variables present in BOTH files, and report the rest instead
# of hitting a KeyError halfway through the loop.
vars_to_compare = [v for v in hist_ds.data_vars if v in fut_ds.data_vars]
unmatched = sorted(set(hist_ds.data_vars) ^ set(fut_ds.data_vars))
if unmatched:
    print(f"⚠️ Skipping variables not present in both datasets: {unmatched}")

# Collect statistics (each reduction is evaluated once and reused)
stat_columns = [
    "Variable",
    "Hist_min", "Fut_min",
    "Hist_max", "Fut_max",
    "Hist_mean", "Fut_mean",
    "Hist_std", "Fut_std",
    "Mean_diff", "Std_ratio",
]
stats = []
for var in vars_to_compare:
    hist = hist_ds[var]
    fut = fut_ds[var]

    hist_mean, fut_mean = float(hist.mean()), float(fut.mean())
    hist_std, fut_std = float(hist.std()), float(fut.std())

    stats.append({
        "Variable": var,
        "Hist_min": float(hist.min()),
        "Fut_min": float(fut.min()),
        "Hist_max": float(hist.max()),
        "Fut_max": float(fut.max()),
        "Hist_mean": hist_mean,
        "Fut_mean": fut_mean,
        "Hist_std": hist_std,
        "Fut_std": fut_std,
        "Mean_diff": fut_mean - hist_mean,
        # Ratio is undefined for a constant historical field.
        "Std_ratio": fut_std / hist_std if hist_std != 0 else None
    })

# Create dataframe and sort by largest mean difference. Passing the column
# list keeps the sort valid even when no variable could be compared.
stats_df = pd.DataFrame(stats, columns=stat_columns).sort_values(by="Mean_diff", key=abs, ascending=False)

# Nice formatting
pd.set_option("display.float_format", lambda x: f"{x:,.2f}")
print(stats_df)


In [None]:
import xarray as xr
import pandas as pd

# Historical suitability-variable file to analyse
nc_path = DATA_DIR / "terra_climate/SuitabilityVariables_1990_2014.nc"

# Read the file and flatten it into one table row per coordinate combination
ds = xr.open_dataset(nc_path)
df = ds.to_dataframe().reset_index()

# Restrict to the data variables and discard rows where any of them is NaN
var_cols = list(ds.data_vars.keys())
df_clean = df[var_cols].dropna()

# Pairwise Pearson correlation between the variables
corr = df_clean.corr(method="pearson")

# Display
print("\n📊 Correlation matrix:\n")
print(corr.round(2))

# List each unordered pair once: only partners that come later in var_cols
corr_threshold = 0.8
print(f"\n🔍 Highly correlated pairs (|r| > {corr_threshold}):")
for i, v1 in enumerate(var_cols):
    for v2 in var_cols[i + 1:]:
        r = corr.loc[v1, v2]
        if abs(r) > corr_threshold:
            print(f"{v1} ↔ {v2}: r = {r:.2f}")
