In [None]:
# Put the local ./scikit-map checkout first on the module search path so that
# `import skmap` resolves to it before any other location on sys.path.
import sys
sys.path.insert(0,'./scikit-map')
import skmap
# Show which file the skmap package was actually imported from.
print(skmap.__file__)

In [None]:
# Root directory used to build the input/output paths in the cells below.
# NOTE(review): the value already ends with '/', and several cells build
# f'{wd}/...', which yields '//' in the path — harmless on POSIX, but worth
# normalizing.
wd = '/mnt/tupi/WRI/livestock_global_modeling/'

## Polygon samples

In [None]:
import geopandas as gpd
import joblib
# Commented-out alternative: read the polygon samples from the GeoPackage.
#polygons_fn = f'{wd}livestock_census_ard/gpw_livestock.animals_gpw.fao.glw3_polygon.samples_20000101_20231231_go_epsg.4326_v1.gpkg'
#polygon_samples = gpd.read_file(polygons_fn)
# Load the samples from a joblib file in the current working directory
# (presumably a cached copy of the GeoPackage above — confirm).
# NOTE(review): joblib.load unpickles its input; only load trusted files.
polygon_samples = joblib.load('polygon_samples.lz4')
polygon_samples

## Features

In [None]:
import pandas as pd
from pathlib import Path

# Covariate catalogue; the cells below read its 'type' and 'layername' columns.
livestock_cov_csv = f'{wd}/livestock_census_ard/livestock_cov.csv'
livestock_covs = pd.read_csv(livestock_cov_csv)
livestock_covs

In [None]:
import numpy as np

# Substrings that select covariate layers: _covs keeps a layer name when it
# contains at least one of these. They are matched with str.contains, so a hit
# anywhere in the name counts, not only at the start.
prefixes = [
    'bare', 'crop', 'clm_accum.precipitation', 'clm_lst_mod11a2.nighttime_p50', 'clm_lst_mod11a2.nighttime.trend_p50',
    'clm_lst_mod11a2.daytime_p50', 'clm_lst_mod11a2.daytime.trend_p50', 'clm_lst_max.geom', 'clm_lst_min.geom', 
    'veg_blue_mod13q1.v061_p50_', 'veg_mir_mod13q1.v061_p50_', 'veg_red_mod13q1.v061_p50_', 'veg_nir_mod13q1.v061_p50_',  'veg_ndvi_mod13q1.v061.trend_p50_', 
    'easterness_','filtered.dtm_','flow.accum_','geomorphon_', 'hillshade_','lcv_', 'neg.openness_', 'northerness_', 'nosink_', 
    'pop.count_','pos.openness_', 'slope_','spec.catch.area.factor_','wilderness_','wv_','surface.water','wetlands_',
    'lcv_wetlands','forest.cover', 'gdp.per.capita', 'hdi',
]
# Substrings that veto a layer: a name containing any of these is excluded
# even when it also matches an entry of `prefixes`.
n_prefixes = ['clm_lst_mod11a2.nighttime_sd', 'clm_lst_mod11a2.daytime_sd', 
              'lcv_water.distance_glad.interanual.dynamic.classes', 'lcv_bare.surface_landsat.', 'bsf']

def _covs(layernames, prefixes, n_prefixes):
    return layernames[
        np.logical_and(
            np.logical_or.reduce(
                [ layernames.str.contains(p) for p in prefixes ]
            ),
            np.logical_not(np.logical_or.reduce(
                [ layernames.str.contains(p) for p in n_prefixes ]
        ))
    )]

# Split the catalogue by its 'type' column and filter each subset of layer names.
_cov_type = livestock_covs['type']
static_covs = _covs(livestock_covs.loc[_cov_type == 'static', 'layername'], prefixes, n_prefixes)
temporal_covs = _covs(livestock_covs.loc[_cov_type == 'temporal', 'layername'], prefixes, n_prefixes)

## Modeling

In [None]:
import geopandas as gpd
import joblib

# Load the polygon samples from a joblib-serialized file under `wd`.
# NOTE(review): joblib.load unpickles its input; only load trusted files.
livestock_polygons = joblib.load(f'{wd}/livestock_census_raw/gpw_livestock.animals_gpw.fao.glw3_polygon.samples_20000101_20231231_go_epsg.4326_v1.lz4')
# Commented-out alternative input (FAOSTAT file):
#livestock_polygons = joblib.load('/mnt/tupi/WRI/internal-develop/gpw/livestock-modeling/faostat_livestock_all.lz4')
livestock_polygons

In [None]:
import pandas as pd
# Load the zonal table from a Parquet file in the current working directory.
livestock_zonal = pd.read_parquet('livestock_zonal_ultimate.pq')
# Commented-out alternative input (FAOSTAT variant):
#livestock_zonal = pd.read_parquet('livestock_zonal_ultimate_faostat.pq')
livestock_zonal

In [None]:
# Columns carried into the samples: every zonal column except the join key,
# plus the polygon attributes listed here.
polygon_attr_cols = ['gazID', 'gazName', 'source', 'level', 'country']
sample_cols = livestock_zonal.columns.drop('polygon_idx').tolist() + polygon_attr_cols
#sample_cols = list(livestock_zonal.columns.drop('polygon_idx')) + ['gazID', 'gazName', 'source']

# Index the zonal rows by polygon and inner-join them to the polygons by index.
zonal_by_polygon = livestock_zonal.set_index('polygon_idx', drop=True)
livestock_zonal = zonal_by_polygon.merge(livestock_polygons, left_index=True, right_index=True)

livestock_zonal

In [None]:
import numpy as np
import math
# Build one sample table per year and concatenate them.
# For every year and animal this derives `<animal>_heads` (the head count from
# the `<animal>_<year>` column) and `<animal>_density` (heads divided by
# `livestock_area_km`), plus per-polygon x / y / 1_radius columns.
livestock_samples = []

animals = ['cattle', 'horse', 'goat', 'sheep', 'buffalo']
igh = '+proj=igh +lon_0=0 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs +type=crs'
# NOTE(review): not used by the loop below (points are recomputed per group);
# kept only in case a later cell reads it — remove if nothing does.
representative_point = gpd.GeoSeries(livestock_zonal['geometry']).representative_point()

for year, rows in livestock_zonal.groupby('year'):

    # Work on an explicit copy so the column assignments below never write
    # through to (or warn about) a slice of livestock_zonal.
    rows = rows.copy()

    print(f"Preparing {rows.shape} rows for {year}")
    row_cols = list(sample_cols) + ['x', 'y', '1_radius']

    for animal in animals:
        density_col = f'{animal}_density'
        heads_col = f'{animal}_heads'

        rows[heads_col] = rows[f'{animal}_{year}']
        rows[density_col] = rows[heads_col] / rows['livestock_area_km']
        row_cols += [density_col, heads_col]

        # Infinite densities (division by a zero area) and zero densities are
        # both treated as missing at this stage.
        invalid_density = np.isinf(rows[density_col]) | (rows[density_col] == 0)
        rows.loc[invalid_density, density_col] = np.nan

    # Handling multi-part polygons: split every geometry into its parts in the
    # IGH projection, take 1 / radius of each part's minimum bounding circle,
    # and keep the largest such value per polygon.
    rows_wei = gpd.GeoDataFrame(rows['geometry']).to_crs(igh).explode()
    rows_wei['1_radius'] = rows_wei.minimum_bounding_circle().area
    rows_wei['1_radius'] = rows_wei['1_radius'].apply(lambda f: 1/math.sqrt(f / math.pi))
    rows['1_radius'] = rows_wei['1_radius'].groupby(level='polygon_idx').max()

    # Compute the representative points once and reuse them for both coordinates.
    rep_points = gpd.GeoSeries(rows['geometry']).representative_point()
    rows['x'] = rep_points.x
    rows['y'] = rep_points.y
    livestock_samples.append(rows[row_cols])

livestock_samples = pd.concat(livestock_samples).reset_index(drop=True)
livestock_samples

In [None]:
# Derive the NDVI column from the red and NIR columns: (nir - red) / (nir + red).
red = 'veg_red_mod13q1.v061_p50_250m_s0..0cm_year.05.01..year.05.31_v1'
nir = 'veg_nir_mod13q1.v061_p50_250m_s0..0cm_year.05.01..year.05.31_v1'
ndvi = 'veg_ndvi_mod13q1.v061_p50_250m_s0..0cm_year.05.01..year.05.31_v1'

nir_vals = livestock_samples[nir]
red_vals = livestock_samples[red]
livestock_samples[ndvi] = (nir_vals - red_vals) / (nir_vals + red_vals)

In [None]:
import numexpr as ne
from datetime import datetime
import math

def geo_temp(fi, day, a=37.03043, b=-15.43029):
    """Evaluate the geometric temperature expression with numexpr.

    Computes ``a * cos(fi) + b * (1 - cos(theta)) * |sin(fi)|`` with ``fi``
    converted from degrees to radians, where
    ``theta = (day - 18) * pi / 182.5 + phase`` and ``phase`` is ``pi`` for
    ``fi >= 0`` and ``4 * pi`` for ``fi < 0``.

    Parameters
    ----------
    fi : array-like
        Angle in degrees (the caller in this file passes the 'y' coordinate).
    day : int
        Day-of-year value used in the seasonal term.
    a, b : float
        Coefficients of the two terms.

    Returns
    -------
    The result of ``ne.evaluate`` for the expression above.
    """
    # numexpr resolves `fi`, `day`, `a`, `b` and `pi` by name from this
    # function's local variables, so `pi` must exist here even though Python
    # code never reads it.
    pi = math.pi

    # +1 where fi >= 0, -1 elsewhere.
    hemisphere = 'where(abs(fi) - fi == 0, 1, -1)'
    seasonal_cos = f"cos((day - 18) * pi / 182.5 + 2**(1 - {hemisphere}) * pi)"

    lat_term = "cos(fi * pi / 180)"
    season_term = f"(1 - {seasonal_cos}) * abs(sin(fi * pi / 180) )"

    return ne.evaluate(f"(a * {lat_term} + b * {season_term})")

def add_geo_temp(pts):
    """Add monthly geometric max/min temperature columns to ``pts``.

    For each month 1..12, ``geo_temp`` is evaluated on the 'y' column for the
    day-of-year of the 15th of that month (year 2000), once with the "max"
    coefficients and once with the "min" coefficients. An elevation term
    (0.006 * DTM column * 0.1) is subtracted, and the result is multiplied by
    100 and rounded.

    The frame is modified in place and also returned.
    """
    dtm_col = 'filtered.dtm_edtm_m_240m_s_20000101_20221231_go_epsg.4326_v20240528'
    elev_corr = 0.006 * pts[dtm_col].to_numpy() * 0.1

    # (a, b) coefficient pairs passed to geo_temp for each output column.
    max_coefs = (37.03043, -15.43029)
    min_coefs = (24.16453, -15.71751)

    for m in range(1, 13):
        doy = datetime.strptime(f'2000-{m}-15', '%Y-%m-%d').timetuple().tm_yday
        max_temp_name = f'clm_lst_max.geom.temp_m_30m_s_m{m}'
        min_temp_name = f'clm_lst_min.geom.temp_m_30m_s_m{m}'
        print(f"Adding {max_temp_name} & {min_temp_name}")

        for col_name, (a, b) in ((max_temp_name, max_coefs), (min_temp_name, min_coefs)):
            temp = geo_temp(pts['y'].to_numpy(), day=doy, a=a, b=b)
            pts[col_name] = ((temp - elev_corr) * 100).round()

    return pts

# Add the monthly geometric temperature columns (max and min, months 1-12).
livestock_samples = add_geo_temp(livestock_samples)
livestock_samples

In [None]:
# Keep an in-memory snapshot of the samples at this stage.
livestock_samples_bkp = livestock_samples.copy()
livestock_samples_bkp

# Uncomment to restore the snapshot instead of re-running the cells above.
#livestock_samples = livestock_samples_bkp.copy()
#livestock_samples

In [None]:
# Non-covariate columns: identifiers, geometry-derived values and the targets.
meta_cols = (
    ['gazID', 'gazName', 'source', 'level', 'country', 'x', 'y', '1_radius', 'year', 'mask_layer', 'livestock_area_km']
    + [f'{name}_density' for name in animals]
    + [f'{name}_heads' for name in animals]
)
# Everything else is a covariate; keep the names sorted.
covs = sorted(livestock_samples.columns.drop(meta_cols))

In [None]:
# Per-animal threshold: the median head count among samples that have a zero
# livestock area and no NaN in any covariate. Result is a dict keyed by
# '<animal>_heads'.
n_heads_zero_th = livestock_samples[np.logical_and.reduce([
    livestock_samples['livestock_area_km'] == 0,
    np.any(np.isnan(livestock_samples[covs]),1) == False
])][[ f'{a}_heads' for a in animals ]].median().to_dict()

# For each animal, split the zero-area samples by that threshold:
#  - head count above the threshold  -> the whole sample row is dropped;
#  - head count at or below it       -> the animal's density is set to 0.
# Both index sets are computed before the drop and are disjoint, so the labels
# in `to_zero` still exist when the density is assigned.
for k in n_heads_zero_th.keys():
    print(k)
    to_drop = livestock_samples[np.logical_and.reduce([
        livestock_samples['livestock_area_km'] == 0,
        livestock_samples[k] > n_heads_zero_th[k],
    ])].index
    
    to_zero = livestock_samples[np.logical_and.reduce([
        livestock_samples['livestock_area_km'] == 0,
        livestock_samples[k] <= n_heads_zero_th[k],
    ])].index
    
    livestock_samples = livestock_samples.drop(index=to_drop)
    print(f"Removing {to_drop.shape[0]} samples")
    
    # '<animal>_heads' -> '<animal>_density'
    livestock_samples.loc[to_zero, k.replace('_heads','_density')] = 0
    # NOTE(review): "Inputing" in the message below presumably means "Imputing".
    print(f"Inputing zeros {to_zero.shape[0]} samples")

In [None]:
# Show the gazName of every sample whose snow-duration covariate is NaN.
livestock_samples[np.isnan(livestock_samples['lcv_snow.duration_global.snowpack_m_500m_0..0cm_year_v04042022'])]['gazName']

In [None]:
import numpy as np
from eumap.misc import ttprint

# Number of NaN values per covariate column, in the order of `covs`.
any_nan_samples = np.isnan(livestock_samples[covs].to_numpy()).astype('int').sum(axis=0)
cols_to_remove = set()

# Report the offending columns at decreasing NaN-share thresholds. A column is
# flagged as soon as it exceeds any threshold, so the final set is governed by
# the last (smallest) one.
for th in [1, 0.5, 0.2, 0.1, 0.05, 0.02]:
    ttprint(f"Columns with {th*100}% of nan values")
    max_nan = livestock_samples.shape[0] * th
    for c, s in zip(covs, any_nan_samples):
        if s > max_nan:
            ttprint(f' - {s} => {c}')
            cols_to_remove.add(c)

ttprint(cols_to_remove)
# `th` still holds the last threshold of the loop here.
ttprint(f"Removing {len(cols_to_remove)} columns (>= {th*100}% of nan values)")

In [None]:
# Drop the covariate columns collected in `cols_to_remove`.
livestock_samples = livestock_samples.drop(columns=cols_to_remove)

In [None]:
# Recompute the covariate list: every column that is not in meta_cols, sorted by name.
covs = sorted(list(livestock_samples.columns.drop(meta_cols)))

In [None]:
# 1) Remove samples that have a NaN in at least one covariate.
has_nan_cov = np.isnan(livestock_samples[covs].to_numpy()).any(axis=1)
nan_samples = livestock_samples[has_nan_cov].index
ttprint(f"Removing {len(nan_samples)} samples with at least one column with nan")
livestock_samples = livestock_samples.drop(nan_samples)

# 2) Remove samples whose density is NaN for every animal.
density_cols = livestock_samples.columns[livestock_samples.columns.str.contains('_density')]
all_density_nan = np.isnan(livestock_samples[density_cols].to_numpy()).all(axis=1)
dnan_samples = livestock_samples[all_density_nan].index
ttprint(f"Removing {len(dnan_samples)} samples with all densities nan")
livestock_samples = livestock_samples.drop(dnan_samples)

livestock_samples

In [None]:
# Display the full list of column names at this stage.
list(livestock_samples.columns)

In [None]:
# Log-scale step histogram (64 bins) of sheep density for the samples that have no gazName.
livestock_samples[livestock_samples['gazName'].isnull()]['sheep_density'].plot(kind='hist', bins=64,  histtype='step', linewidth=1.5, log=True, legend=True)

In [None]:
# The train / test / calibration split below groups samples by gazName, so a
# missing gazName would compromise it. Fall back to gazID for those samples.
nan_gazname = livestock_samples['gazName'].isnull()
# Report the number of samples actually affected (the mask count), not the
# size of an index left over from an earlier cell.
ttprint(f"Fixing {int(nan_gazname.sum())} samples without gazName")

livestock_samples.loc[nan_gazname, 'gazName'] = livestock_samples.loc[nan_gazname, 'gazID']

In [None]:
# Assign every sample to training / testing / calibration, per animal.
# The split is done on unique gazName values (not on rows), so all samples
# sharing a gazName land in the same subset.
animals = ['cattle', 'horse', 'goat', 'sheep', 'buffalo']
calib_pct = 0.1
test_pct = 0.1
# Fixed seed so that Restart & Run All reproduces the same split.
split_rng = np.random.RandomState(42)

for animal in animals:

    # Samples that have a density value for this animal.
    mask = livestock_samples[f'{animal}_density'].notna()
    animal_samples = livestock_samples[mask]

    gaz_names = pd.Series(animal_samples['gazName'].unique())
    n_names = gaz_names.shape[0]

    test_names = gaz_names.sample(int(n_names * test_pct), random_state=split_rng)
    # Calibration names are drawn from what is left after the test draw, and
    # sized by calib_pct (not test_pct).
    calib_names = gaz_names.drop(test_names.index).sample(int(n_names * calib_pct), random_state=split_rng)

    test_mask = np.logical_and.reduce([
        mask,
        livestock_samples['gazName'].isin(test_names)
    ])
    calib_mask = np.logical_and.reduce([
        mask,
        livestock_samples['gazName'].isin(calib_names)
    ])

    ttprint(f"{animal} shape: {np.sum(mask.astype('int'))}")
    ttprint(f"Testing shape: {np.sum(test_mask.astype('int'))}")
    ttprint(f"Calibration shape: {np.sum(calib_mask.astype('int'))}")

    # ind_<animal> flags the samples usable for this animal; <animal>_ml_type
    # defaults to 'training' and is overridden for the two held-out subsets.
    livestock_samples.loc[mask, f'ind_{animal}'] = 1
    livestock_samples.loc[mask, f'{animal}_ml_type'] = 'training'
    livestock_samples.loc[test_mask, f'{animal}_ml_type'] = 'testing'
    livestock_samples.loc[calib_mask, f'{animal}_ml_type'] = 'calibration'

livestock_samples

In [None]:
# Sample weight derived from '1_radius', rescaled per animal to
# (value - 0.1 * min) / (max - 0.1 * min) over that animal's samples.
livestock_samples['weight'] = np.nan
# Register the column as metadata only once, so re-running this cell does not
# append a duplicate entry.
if 'weight' not in meta_cols:
    meta_cols += ['weight']

# NOTE(review): there is a single 'weight' column, so a sample flagged for
# several animals keeps the value written by the last animal in `animals` —
# confirm this is intended.
for animal in animals:
    mask = (livestock_samples[f'ind_{animal}'] == 1)

    inv_radius = livestock_samples.loc[mask, '1_radius']
    rmin, rmax = inv_radius.min(), inv_radius.max()
    livestock_samples.loc[mask, 'weight'] = (inv_radius - rmin*0.1) / (rmax - rmin*0.1)

    print(mask.value_counts(), livestock_samples[mask]['weight'].min(), livestock_samples[mask]['weight'].max())

In [None]:
# Inspect the cattle samples whose weight is exactly 0.
livestock_samples[np.logical_and.reduce([
    livestock_samples['weight'] == 0,
    livestock_samples['ind_cattle'] == 1,
])][['livestock_area_km', 'gazName', '1_radius','weight']]

In [None]:
# Register the per-animal split and indicator columns as metadata, then
# recompute the covariate list. Only names not already present are appended,
# so re-running this cell does not create duplicate entries (which would
# duplicate columns when the frame is reordered by meta_cols + covs).
split_cols = [ f'{animal}_ml_type' for animal in animals ]  + [ f'ind_{animal}' for animal in animals ]
meta_cols = meta_cols + [ c for c in split_cols if c not in meta_cols ]
covs = sorted(livestock_samples.columns.drop(meta_cols))
covs

In [None]:
# Reorder the columns to metadata first, then covariates, and renumber the rows from 0.
livestock_samples = livestock_samples[meta_cols + covs].reset_index(drop=True)
livestock_samples

In [None]:
# Round every density column to one decimal place.
density_names = [f'{name}_density' for name in animals]
livestock_samples[density_names] = livestock_samples[density_names].round(1)

In [None]:
# Value written in place of the lowest densities.
zero_val = 0.001

for animal in animals:
    col_density = f'{animal}_density'
    col_ind = f'ind_{animal}'

    has_animal = livestock_samples[col_ind] == 1
    n_samples = np.sum(has_animal)

    # 2nd percentile of the strictly positive densities of this animal.
    positive = (has_animal & (livestock_samples[col_density] > 0)).to_numpy()
    q02 = livestock_samples[positive][col_density].quantile(0.02)

    # Everything at or below that percentile (zeros included) becomes zero_val.
    mask = (has_animal & (livestock_samples[col_density] <= q02)).to_numpy()
    print(animal, (np.sum(mask) / n_samples) * 100, q02)
    livestock_samples.loc[mask, col_density] = zero_val

In [None]:
# Position of the first column after the last 'ind_<animal>' indicator column.
# startswith (rather than a substring test) keeps columns that merely contain
# 'ind_' somewhere in their name from being mistaken for an indicator.
indicator_cols = livestock_samples.columns[livestock_samples.columns.str.startswith('ind_')]
cov_idx = livestock_samples.columns.get_loc(indicator_cols[-1]) + 1
livestock_samples.columns[cov_idx:]

In [None]:
# Remove the OSM distance layers and the population count layers.
# regex=False: 'pop.count' is a literal substring; as a regular expression its
# dot would match any character.
drop_cols = livestock_samples.columns[np.logical_or.reduce([
    livestock_samples.columns.str.contains('distance_osm', regex=False),
    livestock_samples.columns.str.contains('pop.count', regex=False)
])]

livestock_samples = livestock_samples.drop(columns=drop_cols)

In [None]:
# Write the prepared samples to a Parquet file under `wd`.
# Commented-out alternative output path (FAOSTAT variant):
#livestock_samples.to_parquet(f'{wd}/livestock_census_ard/gpw_livestock.animals_faostat_zonal.samples_20000101_20211231_go_epsg.4326_v1.pq')
livestock_samples.to_parquet(f'{wd}/livestock_census_ard/gpw_livestock.animals_gpw.fao.glw3_zonal.samples_20000101_20211231_go_epsg.4326_v1.pq')

In [None]:
# One constant per animal, keyed by animal name.
# NOTE(review): not referenced in the cells visible here; presumably an upper
# bound on density used downstream — confirm meaning and units.
max_density = {
    'cattle': 1428,
    'sheep': 534,
    'goat': 311,
    'horse': 52,
    'buffalo': 338,
}

In [None]:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="pastel")
sns.set_context("notebook")

animal_types = ['cattle', 'sheep', 'goat', 'horse', 'buffalo']
quantile_levels = [0.0,0.01,0.02,0.03,0.04,0.05,0.5,0.95,0.975,0.98,0.99,1.0]

# For each animal: print rounded quantiles of the densities above 0.001, then
# draw a log-scale step histogram of the densities below 10000.
for animal in animal_types:
    col_density = f'{animal}_density'
    has_animal = livestock_samples[f'ind_{animal}'] == 1
    density = livestock_samples[col_density]

    above_floor = livestock_samples.loc[has_animal & (density > 0.001), [col_density]]
    print(animal, above_floor.quantile(q=quantile_levels).round())

    plotted = livestock_samples.loc[has_animal & (density < 10000), [col_density]]
    plotted.plot(kind='hist', bins=64,  histtype='step', linewidth=1.5, log=True, legend=True)