In [None]:
import sys
sys.path.insert(0,'./scikit-map')
import skmap
print(skmap.__file__)

In [None]:
# Root working directory for the inputs and outputs of this notebook (note the trailing slash).
wd = '/mnt/tupi/WRI/livestock_global_modeling/'

In [None]:
import joblib
# Load the raw livestock census polygons from a joblib dump under `wd`.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files from a trusted source.
livestock_polygons = joblib.load(f'{wd}livestock_census_raw/gpw_livestock.animals_gpw.fao.faostat.malek.2024_polygon.samples_20000101_20231231_go_epsg.4326_v1.lz4')
# Display the loaded object.
livestock_polygons

## Features

In [None]:
import pandas as pd
from pathlib import Path

# Covariate catalogue; its 'type' and 'layername' columns are used in the next cell.
# `wd` already ends with '/', so this path contains a double slash.
livestock_covs = pd.read_csv(f'{wd}/livestock_census_ard/livestock_cov.csv')
livestock_covs

In [None]:
# Layer names of the covariates, split by the catalogue's 'type' column.
static_covs = list(livestock_covs.loc[livestock_covs['type'] == 'static', 'layername'])
temporal_covs = list(livestock_covs.loc[livestock_covs['type'] == 'temporal', 'layername'])

## Modeling

In [None]:
import numpy as np

def rec_diff(vals, th, niter=1):
    """Return the positions whose value jumps too far from one of its predecessors.

    For every lag ``w`` in ``1..niter`` the relative change
    ``|vals[i] - vals[i - w]| / vals[i]`` is compared against ``th * w``. A position
    is reported as soon as any lag exceeds its threshold. The first ``w`` positions
    have no predecessor at lag ``w`` and are never flagged for that lag. Comparisons
    involving NaN evaluate to False, so they never flag a position.

    Returns a 1-D integer array of flagged positions in ascending order.
    """
    series = np.asarray(vals)
    flagged_per_lag = []

    for lag in range(1, niter + 1):
        current, previous = series[lag:], series[:-lag]
        exceeds = (np.abs(current - previous) / current) > th * lag
        # Left-pad with False so every per-lag mask lines up with `vals`.
        flagged_per_lag.append(np.concatenate([np.zeros(lag, dtype=bool), exceeds]))

    return np.flatnonzero(np.any(np.stack(flagged_per_lag), axis=0))

def detect_outliers(vals, th=0.30, niter=2):
    """Return the positions flagged by ``rec_diff`` for threshold ``th`` over ``niter`` lags."""
    return rec_diff(vals, th, niter)

def drop_outlier(vals):
    """Replace detected outliers in ``vals`` with NaN (in place).

    Outliers are the positions returned by ``detect_outliers(vals, 0.3, 3)``.

    Returns a tuple ``(vals, n_outlier)`` with the same array object and the
    number of positions that were blanked.
    """
    outlier_idx = detect_outliers(vals, 0.3, 3)
    n_outlier = len(outlier_idx)

    if n_outlier:
        vals[outlier_idx] = np.nan

    return vals, n_outlier

def drop_breakpoints(vals):
    """Blank out everything before the last abrupt change in ``vals`` (in place).

    Break positions come from ``rec_diff(vals, 0.3, 1)``. When at least one is
    found, all entries before the highest break position are set to NaN.

    Returns a tuple ``(vals, n_breaks)`` where ``n_breaks`` is the number of
    non-NaN entries that were blanked (0 when no break was found).
    """
    break_idx = rec_diff(vals, 0.3, 1)
    if len(break_idx) == 0:
        return vals, 0

    last_break = np.max(break_idx)
    # Count the valid observations that are about to be discarded.
    n_dropped = np.sum(np.logical_not(np.isnan(vals[:last_break])))
    vals[:last_break] = np.nan
    return vals, n_dropped

In [None]:
import fiona
from pathlib import Path

def normalize(rows):
    """Clean the yearly head-count series of every polygon in ``rows``.

    For each row and each animal, the columns whose name contains ``'<animal>_2'``
    are read (in sorted order) as one float32 series. Outliers are blanked with
    ``drop_outlier`` and the values before the last breakpoint are blanked with
    ``drop_breakpoints``. The cleaned values are written back, and three summary
    fields are added per animal: ``<animal>_noutlier``, ``<animal>_nbreaks`` and
    ``<animal>_nyears`` (number of non-NaN values left).

    Parameters
    ----------
    rows : pandas.DataFrame
        One record per polygon, with the yearly animal columns.

    Returns
    -------
    pandas.DataFrame
        A new frame built from the cleaned rows; ``rows`` itself is not modified.
    """
    animals = ['cattle','goat','horse','sheep', 'buffalo']

    # The yearly columns of an animal depend only on the frame's columns,
    # so resolve them once instead of once per row.
    cols_by_animal = {
        c: sorted(rows.columns[rows.columns.str.contains(f'{c}_2')])
        for c in animals
    }

    result = []
    for _, row in rows.iterrows():
        # iterrows() does not guarantee a copy; take one before writing to the row.
        row = row.copy()

        for c, animal_cols in cols_by_animal.items():
            vals = row[animal_cols].to_numpy().flatten().astype('float32')

            vals, n_outlier = drop_outlier(vals)
            vals, n_breaks = drop_breakpoints(vals)

            row[animal_cols] = vals
            row[f'{c}_noutlier'] = n_outlier
            row[f'{c}_nbreaks'] = n_breaks
            row[f'{c}_nyears'] = np.sum(np.logical_not(np.isnan(vals)))

        result.append(row)

    return pd.DataFrame(result)

In [None]:
import math

# Split the polygons into contiguous batches sized so that at most 96 batches result.
n_elems = livestock_polygons.shape[0]
n_batch = math.ceil(n_elems/96)

# Each entry is the one-element argument tuple of a single normalize() call.
args = []
for i0 in range(0, n_elems, n_batch):
    i1 = min(i0 + n_batch, n_elems)
    args.append((livestock_polygons.iloc[i0:i1],))

print(len(args))

In [None]:
from skmap import parallel

# Run normalize() over the batches via skmap's parallel.job and concatenate the results.
# n_jobs=96 repeats the divisor used to size the batches in the previous cell.
# NOTE(review): presumably parallel.job yields one normalize() result per entry of `args` — confirm.
livestock_polygons_norm = pd.concat([ row for row in parallel.job(normalize, args, n_jobs=96) ])
livestock_polygons_norm

## Outlier removal assessment

In [None]:
# Inspect one polygon: the idx-th row (by position) among those with at least one
# outlier removed for `animal`.
idx = 105
animal = 'horse'

# Identification columns plus every column whose name contains '<animal>_2'.
cols = ['gazID', 'gazName', 'source'] + list(livestock_polygons_norm.columns[livestock_polygons_norm.columns.str.contains(animal+'_2')])
gazID = livestock_polygons_norm[livestock_polygons_norm[f'{animal}_noutlier'] >= 1].iloc[idx]['gazID']

# Raw values (first) next to the cleaned values (second), one line per column in `cols`.
pd.concat([
    livestock_polygons[livestock_polygons['gazID'] == gazID][cols].T,
    livestock_polygons_norm[livestock_polygons_norm['gazID'] == gazID][cols].T
],axis=1).to_numpy()

In [None]:
# Relative change of the column totals (cleaned vs. raw) for every column whose name contains 'nyear'.
# NOTE(review): the columns are selected from the raw table, so this assumes livestock_polygons
# already carries them — confirm.
cols = livestock_polygons.columns[livestock_polygons.columns.str.contains('nyear')]
(livestock_polygons_norm[cols].sum() - livestock_polygons[cols].sum()) / livestock_polygons[cols].sum()

In [None]:
# Exclusion rules. Each dict is one rule: a row matches the rule when, for every key,
# the row's value in that column is one of the listed values. Rows matching any rule
# are removed in the next cell.
# NOTE(review): 'level' is given as integers for GPW rules and as strings for FAO rules;
# isin() is type-sensitive, so confirm this matches how the 'level' column is stored.
TO_DROP = [
  {
    'country': ['Iceland','Australia','Canada','Austria','Bulgaria','Croatia','Czechia','Denmark','Estonia','Finland','France','Germany','Greece','Hungary','Ireland','Italy','Latvia','Lithuania','Macedonia','Montenegro','Peru','Poland','Portugal','Romania','Serbia','Slovakia','Slovenia','Spain','Sweden','Turkey','Ukraine'],
    'level': [2],
    'source': ['GPW']
  },
  {
    'country': ['Bulgaria','Croatia','Estonia','Greece','Hungary','Latvia','Lithuania','Macedonia','Romania','Serbia','Slovakia','Turkey'],
    'level': [3],
    'source': ['GPW']
  },
  {
    'gazName': ['Australia.Northern Territory.Northern Territory - Outback','Australia.Queensland.Ipswich','Australia.Queensland.Townsville','Australia.South Australia.South Australia - Outback','Australia.Western Australia.Western Australia - Outback (North)','Australia.Western Australia.Western Australia - Outback (South)','Australia.Western Australia.Western Australia - Wheat Belt','United States of America.Aleutians East','United States of America.Anchorage','United States of America.Fairbanks North Star','United States of America.Kenai Peninsula','United States of America.Juneau','United States of America.Unorganized Borough'],
    'source': ['GPW']
  },
  {
    'country': ['Argentina', 'Austria', 'Belgium', 'Brazil', 'Chile', 'China', 'Colombia', 'Czechia', 'Denmark', 'Finland', 'France', 'India', 'Italy', 'Netherlands', 'Poland', 'Portugal', 'Spain', 'Sweden', 'Switzerland', 'United states'],
    'level': ['1','2'],
    'source': ['FAO']
  },
  {
    'country': ['Paraguay', 'Australia', 'American Samoa', 'Kenya', 'Swaziland', 'United kingdom of great britain and northern ireland'],
    'level': ['1'],
    'source': ['FAO']
  },
  {
    'country': ['Ireland'],
    'level': ['0'],
    'source': ['FAO']
  }
] 

In [None]:
# One boolean mask per TO_DROP rule: a row matches when it satisfies every
# column condition of that rule.
or_list = []
for rule in TO_DROP:
    rule_mask = np.logical_and.reduce([
        livestock_polygons_norm[col].isin(allowed) for col, allowed in rule.items()
    ])
    print(livestock_polygons_norm[rule_mask].shape)
    or_list.append(rule_mask)

# Keep the rows that match none of the rules.
KEEP_MASK = np.logical_not(np.logical_or.reduce(or_list))

#Removing 13685 samples
print(f"Removing {livestock_polygons_norm.shape[0] - livestock_polygons_norm[KEEP_MASK].shape[0]} samples")

livestock_polygons_norm = livestock_polygons_norm[KEEP_MASK]

In [None]:
import geopandas as gpd

# PROJ definition of the target projection (proj=igh, units in metres).
igh = '+proj=igh +lon_0=0 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs +type=crs'

# Wrap the table as a GeoDataFrame declaring its coordinates as EPSG:4326, then reproject.
# From here on livestock_polygons_norm holds geometries in the `igh` CRS, not EPSG:4326.
livestock_polygons_norm = gpd.GeoDataFrame(livestock_polygons_norm, geometry=livestock_polygons_norm['geometry'], crs='EPSG:4326').to_crs(igh)

In [None]:
# Polygon area in km2: area in the metre-based `igh` CRS divided by 1e6.
livestock_polygons_norm['area_km2'] = livestock_polygons_norm.to_crs(igh).area / 1000000

In [None]:
from skmap.parallel import TilingProcessing
import rasterio

# Bounding box of all polygons in the `igh` CRS.
igh_bounds = livestock_polygons_norm.to_crs(igh).total_bounds
# Tile size: the truncated median polygon area (km2) multiplied by 1000.
# NOTE(review): this scales an area rather than a length (no square root) — confirm the
# intended tile size and the unit TilingProcessing.generate_tiles expects.
cv_tile_size = int(livestock_polygons_norm['area_km2'].median()) * 1000

cv_tiles = TilingProcessing.generate_tiles(cv_tile_size, igh_bounds, igh)
# Keep only the tiles that spatially join with at least one polygon.
cv_tiles_id = cv_tiles.sjoin(livestock_polygons_norm[['geometry','gazID']].to_crs(igh), how='inner')['tile_id'].unique()
cv_tiles = cv_tiles[cv_tiles['tile_id'].isin(cv_tiles_id)]
cv_tiles

In [None]:
# Persist the cross-validation tiles.
# NOTE(review): the tiles were generated with the `igh` CRS while the file name says epsg.4326 — confirm.
cv_tiles.to_file(f'{wd}livestock_census_ard/gpw_livestock.animals_gpw.cv.tiles_20000101_20231231_go_epsg.4326_v1.gpkg')

In [None]:
import pandas as pd
# Zonal table; later cells use its 'polygon_idx' and 'year' columns as keys.
# The path is relative to the kernel's working directory, unlike the `wd`-based paths above.
livestock_zonal = pd.read_parquet('livestock_zonal_20250923.pq')
livestock_zonal

In [None]:
# Additional ("adhoc") zonal table, also read relative to the kernel's working directory.
livestock_zonal_ah = pd.read_parquet('livestock_zonal_20250923_adhoc.pq')
livestock_zonal_ah

In [None]:
# Bring two covariates from the ad-hoc table into the main zonal table,
# matching rows on polygon and year (default inner join).
adhoc_cols = [
    'bsf_glad.swa.ard2_m_30m_s_year0101_year1231_go_epsg.4326_v1',
    'gpw_short.veg.height_egbt_m_30m_s_year0101_year1231_go_epsg.4326_v1',
    'polygon_idx','year'
]
livestock_zonal = livestock_zonal.merge(livestock_zonal_ah[adhoc_cols], on=['polygon_idx','year'])

In [None]:
# Attach the cleaned polygon attributes to every zonal record; the zonal
# 'polygon_idx' is matched against the index of livestock_polygons_norm.
livestock_samples = livestock_zonal.set_index('polygon_idx', drop=True).merge(
    livestock_polygons_norm,
    left_index = True,
    right_index = True
)

# livestock_polygons_norm was reprojected to the `igh` CRS, so its geometries are no
# longer EPSG:4326. Declare the CRS the geometries actually carry instead of a
# hard-coded (and contradictory) 'EPSG:4326'.
livestock_samples = gpd.GeoDataFrame(
    livestock_samples,
    geometry=livestock_samples['geometry'],
    crs=livestock_polygons_norm.crs
)

livestock_samples

In [None]:
# Sanity check: compare the mask area with the polygon area for one known polygon.
livestock_samples[livestock_samples['gazName'] == 'Australia.Queensland.Moreton Bay'][['gazName', 'mask_km2', 'area_km2']]

In [None]:
import numpy as np
import math
# Reshape the samples to one record per polygon and year, with per-animal
# '<animal>_density' and '<animal>_heads' columns taken from the '<animal>_<year>' columns.
livestock_samples_row = []

animals = ['cattle', 'horse', 'goat', 'sheep', 'buffalo']
igh = '+proj=igh +lon_0=0 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs +type=crs'
sample_cols = list(livestock_zonal.columns.drop(['polygon_idx'])) + ['gazID', 'gazName', 'source', 'level', 'method', 'country', 'area_km2']

for year, rows in livestock_samples.groupby('year'):

    # Work on a private copy of the group so the column assignments below do not
    # write into a slice of livestock_samples.
    rows = rows.copy()

    print(f"Preparing {rows.shape} rows for {year}")
    row_cols = list(sample_cols) + ['geometry', 'x', 'y', '1_radius']

    # Share of the polygon covered by the mask, in whole percent. It does not depend
    # on the animal, so it is computed once per year; it is not part of the exported columns.
    rows['mask_prop'] = ((rows['mask_km2'] / rows['area_km2']) * 100).astype('int')

    for animal in animals:
        density_col = f'{animal}_density'
        rows[density_col] = rows[f'{animal}_{year}'] / rows['mask_km2']
        row_cols.append(density_col)

        heads_col = f'{animal}_heads'
        rows[heads_col] = rows[f'{animal}_{year}']
        row_cols.append(heads_col)

        # Infinite densities (division by a zero mask_km2) and zero densities are treated as missing.
        rows.loc[np.isinf(rows[density_col]), density_col] = np.nan
        rows.loc[rows[density_col] == 0, density_col] = np.nan

    # Handling multi-part polygons: 1 / radius of the minimum bounding circle of each
    # part (measured in the metre-based `igh` CRS), then the largest value per polygon.
    rows_wei = gpd.GeoDataFrame(rows['geometry']).to_crs(igh).explode()
    rows_wei['1_radius'] = rows_wei.minimum_bounding_circle().area
    rows_wei['1_radius'] = rows_wei['1_radius'].apply(lambda f: 1/math.sqrt(f / math.pi))
    rows['1_radius'] = rows_wei['1_radius'].groupby(level='polygon_idx').max()

    # Representative point of every polygon, computed once. 'y' is later used as a
    # latitude in degrees (add_geo_temp), so the points are expressed in EPSG:4326
    # whatever CRS the polygons are stored in.
    geoms = gpd.GeoSeries(rows['geometry'])
    rep_points = geoms.representative_point()
    if geoms.crs is not None:
        rep_points = rep_points.to_crs('EPSG:4326')
    rows['x'] = rep_points.x
    rows['y'] = rep_points.y
    rows['geometry'] = geoms
    livestock_samples_row.append(rows[row_cols])

livestock_samples_row = pd.concat(livestock_samples_row).reset_index(drop=True)
livestock_samples_row

In [None]:
import numexpr as ne
from datetime import datetime
import math

def geo_temp(fi, day, a=37.03043, b=-15.43029):
    """Evaluate ``a * cos(lat) + b * (1 - cos(theta)) * |sin(lat)|`` with numexpr.

    ``lat`` is ``fi`` converted from degrees to radians and
    ``theta = (day - 18) * pi / 182.5 + phase``, where ``phase`` is ``pi`` for
    ``fi >= 0`` and ``4 * pi`` for ``fi < 0``.

    Parameters
    ----------
    fi : latitude in degrees (scalar or array).
    day : day of the year.
    a, b : coefficients of the two terms.
    """
    # numexpr looks up `fi`, `day`, `a`, `b` and `pi` by name in this function's
    # local scope, so these names must not be changed.
    pi = math.pi

    # +1 for fi >= 0, -1 otherwise
    hemisphere = 'where(abs(fi) - fi == 0, 1, -1)'
    cos_theta = f"cos((day - 18) * pi / 182.5 + 2**(1 - {hemisphere}) * pi)"

    lat_term = "cos(fi * pi / 180)"
    season_term = f"(1 - {cos_theta}) * abs(sin(fi * pi / 180) )"

    return ne.evaluate(f"(a * {lat_term} + b * {season_term})")

def add_geo_temp(pts):
    """Add twelve monthly max and min geometric-temperature columns to ``pts`` (in place).

    For each month the day of year of the 15th (in the year 2000) is passed to
    ``geo_temp`` together with the 'y' column. An elevation correction
    (0.006 * DTM column * 0.1) is subtracted and the result is multiplied by 100
    and rounded. Columns are named ``clm_lst_max.geom.temp_m_30m_s_m<month>`` and
    ``clm_lst_min.geom.temp_m_30m_s_m<month>``.

    Returns the same ``pts`` object.
    """
    dtm = pts['filtered.dtm_edtm_m_960m_s_20000101_20221231_go_epsg.4326_v20240528'].to_numpy()
    elev_corr = 0.006 * dtm * 0.1
    lat = pts['y'].to_numpy()

    for m in range(1, 13):
        doy = datetime(2000, m, 15).timetuple().tm_yday
        max_temp_name = f'clm_lst_max.geom.temp_m_30m_s_m{m}'
        min_temp_name = f'clm_lst_min.geom.temp_m_30m_s_m{m}'
        print(f"Adding {max_temp_name} & {min_temp_name}")

        max_temp = geo_temp(lat, day=doy, a=37.03043, b=-15.43029) - elev_corr
        pts[max_temp_name] = (max_temp * 100).round()

        min_temp = geo_temp(lat, day=doy, a=24.16453, b=-15.71751) - elev_corr
        pts[min_temp_name] = (min_temp * 100).round()

    return pts

# Append the monthly geometric-temperature columns (add_geo_temp modifies and returns its argument).
livestock_samples_row = add_geo_temp(livestock_samples_row)
livestock_samples_row

In [None]:
# In-memory backup of the prepared samples, taken before the filtering steps below.
livestock_samples_row_bkp = livestock_samples_row.copy()
livestock_samples_row_bkp

# Uncomment to restore the samples from the backup.
#livestock_samples = livestock_samples_row_bkp.copy()

In [None]:
# From here on `livestock_samples` refers to the per-year table (same object, not a copy).
livestock_samples = livestock_samples_row

In [None]:
# Non-covariate columns: identifiers, geometry, sample metadata and the target values.
meta_cols = ['gazID', 'gazName', 'source', 'level', 'country', 'geometry', 'method', 'x', 'y', '1_radius', 'year', 'area_km2', 'mask_km2', 
             'cattle_density', 'horse_density', 'goat_density', 'sheep_density', 'buffalo_density', 
             'cattle_heads', 'horse_heads', 'goat_heads', 'sheep_heads', 'buffalo_heads']

# Every remaining column is treated as a covariate (sorted by name).
covs = sorted(list(livestock_samples.columns.drop(meta_cols)))

In [None]:
# Encode a missing 'level' as -1 so the column can be cast to integer.
livestock_samples.loc[livestock_samples['level'].isnull(),'level'] = -1
livestock_samples['level'] = livestock_samples['level'].astype('int')

In [None]:
# Animal names; used as the prefix of the '<animal>_heads' / '<animal>_density' columns.
animals = ['cattle', 'horse', 'goat', 'sheep', 'buffalo']

In [None]:
import numpy as np

# Per-animal threshold: median head count among samples with a zero mask area
# (mask_km2 == 0) and no NaN in any covariate.
n_heads_zero_th = livestock_samples[np.logical_and.reduce([
    livestock_samples['mask_km2'] == 0,
    np.any(np.isnan(livestock_samples[covs]),1) == False
])][[ f'{a}_heads' for a in animals ]].median().to_dict()

# For each '<animal>_heads' column, handle the zero-mask samples. Both index sets are
# computed on the current table before anything is dropped in this iteration.
for k in n_heads_zero_th.keys():
    print(k)
    # Zero-mask samples with more heads than the threshold: the whole sample is removed.
    to_drop = livestock_samples[np.logical_and.reduce([
        livestock_samples['mask_km2'] == 0,
        livestock_samples[k] > n_heads_zero_th[k],
    ])].index
    
    # Zero-mask samples at or below the threshold: the animal's density is set to 0.
    to_zero = livestock_samples[np.logical_and.reduce([
        livestock_samples['mask_km2'] == 0,
        livestock_samples[k] <= n_heads_zero_th[k],
    ])].index
    
    livestock_samples = livestock_samples.drop(index=to_drop)
    print(f"Removing {to_drop.shape[0]} samples")
    
    livestock_samples.loc[to_zero, k.replace('_heads','_density')] = 0
    print(f"Inputing zeros {to_zero.shape[0]} samples")

In [None]:
import numpy as np
from skmap.misc import ttprint

# Number of NaN values in each covariate column.
any_nan_samples = np.sum(np.isnan(livestock_samples[covs].to_numpy()).astype('int'), axis=0)
cols_to_remove = []

# Report the columns whose NaN count exceeds each fraction of the sample count.
# Every reported column is collected, so the smallest fraction (0.02) decides what is removed,
# and a column can be listed under several thresholds.
for th in [1, 0.5, 0.2, 0.1, 0.05, 0.02]:
    ttprint(f"Columns with {th*100}% of nan values")
    for c,s in zip(covs, any_nan_samples):
        if s > (livestock_samples.shape[0] * th):
            ttprint(f' - {s} => {c}')
            cols_to_remove += [c]

# De-duplicate the collected column names.
cols_to_remove = set(cols_to_remove)
ttprint(cols_to_remove)
# `th` still holds the last loop value (0.02) here; note the test above is a strict '>'.
ttprint(f"Removing {len(cols_to_remove)} columns (>= {th*100}% of nan values)")

In [None]:
# Drop the covariate columns collected in the previous cell.
livestock_samples = livestock_samples.drop(columns=cols_to_remove)

In [None]:
# Recompute the covariate list after the column removal.
covs = sorted(list(livestock_samples.columns.drop(meta_cols)))

In [None]:
# Remove the samples that still have a NaN in any covariate.
has_nan_cov = np.isnan(livestock_samples[covs].to_numpy()).any(axis=1)
nan_samples = livestock_samples.index[has_nan_cov]
ttprint(f"Removing {len(nan_samples)} samples with at least one column with nan")
livestock_samples = livestock_samples.drop(nan_samples)

# Remove the samples that have no density value for any animal.
density_cols = livestock_samples.columns[livestock_samples.columns.str.contains('_density')]
all_density_nan = np.isnan(livestock_samples[density_cols].to_numpy()).all(axis=1)
dnan_samples = livestock_samples.index[all_density_nan]
ttprint(f"Removing {len(dnan_samples)} samples with all densities nan")
livestock_samples = livestock_samples.drop(dnan_samples)

livestock_samples

In [None]:
def test_calib_sampling_strata(samples, cv_tiles, test_pct, calib_pct, seed=1989):
    """Pick testing and calibration tiles, stratified by the number of samples per tile.

    The tiles are spatially joined with ``samples``, the join records are counted
    per tile and the counts are binned into deciles (``qgroup``). Within every
    decile ``int(n * test_pct)`` tiles are drawn for testing; from the remaining
    tiles of each decile ``int(n * calib_pct)`` are drawn for calibration.

    Parameters
    ----------
    samples : GeoDataFrame with at least 'geometry' and 'gazID'.
    cv_tiles : GeoDataFrame with a 'tile_id' column.
    test_pct, calib_pct : fraction of tiles (per decile) to draw for each set.
    seed : random state used for both draws.

    Returns
    -------
    tuple ``(test_tiles, calib_tiles)``, both subsets of ``cv_tiles``.
    """
    sample_geoms = samples[['geometry','gazID']]

    # A spatial join compares raw coordinates, so bring the samples into the
    # tiles' CRS first when the two are known to differ.
    sample_crs = getattr(sample_geoms, 'crs', None)
    if sample_crs is not None and cv_tiles.crs is not None and sample_crs != cv_tiles.crs:
        sample_geoms = sample_geoms.to_crs(cv_tiles.crs)

    tile_counts = cv_tiles.sjoin(sample_geoms, how='inner')['tile_id'].value_counts()
    # Name both columns explicitly: value_counts().reset_index() only produces a
    # 'count' column on pandas >= 2.0.
    cv_tiles_animal = tile_counts.rename_axis('tile_id').reset_index(name='count')
    cv_tiles_animal['qgroup'] = pd.qcut(cv_tiles_animal['count'], q=10, labels=False)
    cv_tiles_animal = cv_tiles_animal.set_index('tile_id')

    test_tiles_animal = cv_tiles_animal.groupby('qgroup', group_keys=False).apply(lambda x: x.sample(int(x.shape[0]*test_pct), random_state=seed))
    # Calibration tiles are drawn only from the tiles not already used for testing.
    calib_tiles_animal = cv_tiles_animal.drop(index=test_tiles_animal.index).groupby('qgroup', group_keys=False).apply(lambda x: x.sample(int(x.shape[0]*calib_pct), random_state=seed))

    return (
        cv_tiles[cv_tiles['tile_id'].isin(test_tiles_animal.index)],
        cv_tiles[cv_tiles['tile_id'].isin(calib_tiles_animal.index)],
    )

In [None]:
# Label every sample, per animal, as training / calibration / testing according to
# the cross-validation tiles it intersects.
animals = ['cattle', 'horse', 'goat', 'sheep', 'buffalo']
calib_pct = 0.1
test_pct = 0.1

for animal in animals:

    # Samples that have a density value for this animal.
    mask = np.logical_not(np.isnan(livestock_samples[f'{animal}_density']))
    animal_samples = livestock_samples[mask]

    # Pass the fractions by keyword: the function expects test_pct before calib_pct,
    # and the positional call had them swapped.
    # NOTE(review): the tiles are stratified on all samples, not on `animal_samples` —
    # confirm whether a per-animal stratification was intended.
    cv_tiles_test, cv_tiles_calib = test_calib_sampling_strata(
        livestock_samples, cv_tiles, test_pct=test_pct, calib_pct=calib_pct
    )

    # Spatial joins compare raw coordinates, so align the samples with the tiles' CRS
    # when the two are known to differ.
    animal_geoms = animal_samples[['geometry']]
    if animal_geoms.crs is not None and cv_tiles.crs is not None and animal_geoms.crs != cv_tiles.crs:
        animal_geoms = animal_geoms.to_crs(cv_tiles.crs)

    test_idx = animal_geoms.sjoin(cv_tiles_test, how='inner').index.unique()
    calib_idx = animal_geoms.sjoin(cv_tiles_calib, how='inner').index.unique()

    test_mask = livestock_samples.index.isin(test_idx)
    calib_mask = livestock_samples.index.isin(calib_idx)

    ttprint(f"{animal} shape: {np.sum(mask.astype('int'))}")
    ttprint(f"Testing shape: {np.sum(test_mask.astype('int'))}")
    ttprint(f"Calibration shape: {np.sum(calib_mask.astype('int'))}")

    # Assignment order matters: a sample touching both a calibration and a testing
    # tile ends up as 'testing'.
    livestock_samples.loc[mask, f'ind_{animal}'] = 1
    livestock_samples.loc[mask, f'{animal}_ml_type'] = 'training'
    livestock_samples.loc[calib_mask, f'{animal}_ml_type'] = 'calibration'
    livestock_samples.loc[test_mask, f'{animal}_ml_type'] = 'testing'

livestock_samples

In [None]:
# Sample weight derived from the '1_radius' column, rescaled per animal.
livestock_samples['weight'] = np.nan
# This extends meta_cols in place, so re-running the cell appends 'weight' a second time.
meta_cols += ['weight']

for animal in animals:
    mask = (livestock_samples[f'ind_{animal}'] == 1)
    
    # Rescale against 10% of the animal's minimum 1_radius and its maximum.
    # A sample flagged for several animals keeps the weight written by the last animal in the loop.
    rmin, rmax = livestock_samples[mask]['1_radius'].min(), livestock_samples[mask]['1_radius'].max()
    livestock_samples.loc[mask, 'weight'] = (livestock_samples[mask]['1_radius'] - rmin*0.1) / (rmax - rmin*0.1)
    
    print(mask.value_counts(), livestock_samples[mask]['weight'].min(), livestock_samples[mask]['weight'].max())

In [None]:
# Terrain covariates listed for exclusion. They are only referenced by the commented-out
# line below, so they currently stay in `covs`. (`to_drop` reuses a name from an earlier cell.)
to_drop = ['ls.factor_edtm_m_960m_s_20000101_20221231_go_epsg.4326_v20241230','dfme_edtm_m_960m_s_20000101_20221231_go_epsg.4326_v20241230','geomorphon_edtm_m_960m_s_20000101_20221231_go_epsg.4326_v20241230','maxic_edtm_m_960m_s_20000101_20221231_go_epsg.4326_v20241230','minic_edtm_m_960m_s_20000101_20221231_go_epsg.4326_v20241230','neg.openness_edtm_m_960m_s_20000101_20221231_go_epsg.4326_v20241230','nodepress.dtm_edtm_m_960m_s_20000101_20221231_go_epsg.4326_v20241230','pos.openness_edtm_m_960m_s_20000101_20221231_go_epsg.4326_v20241230','pro.curv_edtm_m_960m_s_20000101_20221231_go_epsg.4326_v20241230','ring.curv_edtm_m_960m_s_20000101_20221231_go_epsg.4326_v20241230','shpindx_edtm_m_960m_s_20000101_20221231_go_epsg.4326_v20241230','spec.catch_edtm_m_960m_s_20000101_20221231_go_epsg.4326_v20241230','ssdon_edtm_m_960m_s_20000101_20221231_go_epsg.4326_v20241230','tan.curv_edtm_m_960m_s_20000101_20221231_go_epsg.4326_v20241230']

# Add the per-animal split label and indicator columns to the non-covariate columns.
meta_cols = meta_cols + [ f'{animal}_ml_type' for animal in animals ]  + [ f'ind_{animal}' for animal in animals ]

#covs = sorted(list(livestock_samples.columns.drop(meta_cols + to_drop)))
covs = sorted(list(livestock_samples.columns.drop(meta_cols)))

print(len(covs))
covs

In [None]:
# Reorder the columns (metadata first, then sorted covariates) and renumber the rows.
livestock_samples = livestock_samples[meta_cols + covs].reset_index(drop=True)
livestock_samples

In [None]:
# Round every animal density to one decimal place.
animal_density_cols = [ f'{a}_density' for a in animals ]
livestock_samples[animal_density_cols] = livestock_samples[animal_density_cols].round(1)

In [None]:
# Densities at or below the 2% quantile of the positive densities of an animal
# are replaced by a small constant.
zero_val = 0.001

for animal in animals:
    col_density = f'{animal}_density'
    col_ind = f'ind_{animal}'

    has_animal = livestock_samples[col_ind] == 1
    n_samples = np.sum(has_animal)

    # The quantile is taken over the strictly positive densities only.
    positive = np.logical_and.reduce([
        has_animal,
        livestock_samples[col_density] > 0
    ])
    q02 = livestock_samples[positive][col_density].quantile(0.02)

    mask = np.logical_and.reduce([
        has_animal,
        livestock_samples[col_density] <= q02
    ])
    print(animal, n_samples, (np.sum(mask) / n_samples) * 100, q02)
    livestock_samples.loc[mask,col_density] = zero_val

In [None]:
# Position of the first column after the last column whose name contains 'ind_';
# the columns from there on are displayed.
cov_idx = livestock_samples.columns.get_loc(list(livestock_samples.columns[livestock_samples.columns.str.contains('ind_')])[-1]) + 1
livestock_samples.columns[cov_idx:]

In [None]:
# Columns describing a polygon and its split labels: the first nine columns
# (without geometry, x and y) plus every 'ind_*' and '*ml_type*' column.
geom_cv_cols = list(livestock_samples.columns[0:9].drop(['geometry','x','y'])) + \
list(livestock_samples.columns[livestock_samples.columns.str.contains('ind_')]) + \
list(livestock_samples.columns[livestock_samples.columns.str.contains('ml_type')])

# Write one record per distinct combination of those columns, with its geometry.
livestock_samples[geom_cv_cols + ['geometry']].drop_duplicates(subset=geom_cv_cols).to_file(f'{wd}/livestock_census_ard/gpw_livestock.animals_gpw.fao.malek.2024_zonal.samples.ml.type_20000101_20231231_go_epsg.4326_v1.gpkg')

In [None]:
# Write the distinct polygon records (identification columns plus geometry) to parquet.
cols = ['gazID','gazName','source','level','country','method','geometry']
livestock_samples[cols].drop_duplicates(subset=cols).to_parquet(f'{wd}/livestock_census_ard/gpw_livestock.animals_gpw.fao.malek.2024_zonal.samples.ml.type.multi.scale_20000101_20231231_go_epsg.4326_v1.pq')

In [None]:
# Per-animal density caps. Not referenced by any other cell visible in this part of the notebook.
# NOTE(review): origin and unit of these values are not shown here — confirm.
max_density = {
    'cattle': 1511,
    'sheep': 713,
    'goat': 832,
    'horse': 83,
    'buffalo': 490
}

In [None]:
import seaborn as sns
# Plot styling for the histograms below.
sns.set_theme(style="whitegrid", palette="pastel")
sns.set_context("notebook")

animal_types = ['cattle', 'sheep', 'goat', 'horse', 'buffalo']

for animal in animal_types:
    col_density = f'{animal}_density'
    # Quantiles of the densities above 0.001 for the samples flagged for this animal.
    print(animal, livestock_samples[np.logical_and.reduce([
        livestock_samples[f'ind_{animal}'] == 1,
        livestock_samples[col_density] > 0.001
    ])][[col_density]].quantile(q=[0.0,0.01,0.02,0.03,0.04,0.05,0.5,0.95,0.975,0.98,0.99,1.0]).round())
    # Log-scaled histogram of the densities below 10000 for the same animal.
    livestock_samples[np.logical_and(livestock_samples[f'ind_{animal}'] == 1,livestock_samples[col_density] < 10000)][[col_density]].plot(kind='hist', bins=64,  histtype='step', linewidth=1.5, log=True, legend=True)