In [None]:
!pip install -e 'git+https://gitlab.com/geoharmonizer_inea/eumap.git#egg=eumap[full]'

## Imports and configuration

In [None]:
from eumap.misc import find_files
from eumap import parallel
import datatable as dt
import geopandas as gpd
import pandas as pd
from pathlib import Path
import seaborn as sns
sns.set_theme()

# Root directory of the raw census inputs; the export cells of this notebook also write under it.
wd = '/mnt/tupi/WRI/livestock_global_modeling/livestock_census_raw'
# NOTE(review): not referenced anywhere else in the visible notebook — confirm it is still needed.
ard_census_dir='./livestock_census_ard'

## GPW data

In [None]:
# Root of the GPW census database (tables, geometries and _meta inventories live below it).
CENSUS_DB=f'{wd}/ArealDB_GPW'

# Directory with one '<country>.csv' table per country.
adb_tables_path=f'{CENSUS_DB}/tables'
# Directory with one '<country>.gpkg' per country, read layer by layer.
adb_geometries_path=f'{CENSUS_DB}/geometries'

### Reading number of animals

In [None]:
# Peek at one raw country table, reusing the configured tables directory instead of
# repeating the absolute path.
pd.read_csv(f'{adb_tables_path}/Brazil.csv')

In [None]:
import numpy as np

def read_csv(basedir, country, min_year = 2000, min_level = 2,
              animals = ['cattle','goat','horse','sheep','buffalo'],
              ontoName_dict = {}, ontoMatch_dict = {}):
    """Read one GPW country table and reshape it to one row per administrative unit.

    Parameters
    ----------
    basedir : str
        Directory containing ``<country>.csv``.
    country : str
        File stem of the table to read.
    min_year, min_level : int
        Records with a smaller year, or with fewer dot-separated parts in
        ``gazName``, are discarded.
    animals : list of str
        ``ontoName`` values to keep (after ``ontoName_dict`` has been applied).
        Never mutated here.
    ontoName_dict : dict
        Replacement table applied to ``ontoName``.
    ontoMatch_dict : dict
        Lower-cased ``ontoMatch`` label -> animal, used to fill empty ``animal`` cells.

    Returns
    -------
    pandas.DataFrame
        When at least one record passes the filters: a wide frame indexed by
        gazID/gazName/method/country/level/source with one ``<ontoName>_<year>``
        column per animal and year (mean of ``number_heads``), plus ``min_year``,
        ``max_year`` and ``animals`` summary columns. Otherwise the (empty)
        filtered long-format frame.
    """
    csv_file = f'{basedir}/{country}.csv'
    df = dt.fread(csv_file).to_pandas()
    df = df.rename(columns={'animal': 'ontoName'})

    # Some tables carry the method column under the misspelled header 'methdod'.
    if 'methdod' in df.columns:
        df.loc[:, 'method'] = df['methdod']

    df['ontoMatch'] = df['ontoMatch'].str.lower()
    df['animal'] = df['ontoName'].str.lower()
    df['level'] = df['gazName'].str.split('.', expand=False).str.len()
    df['country'] = df['gazName'].str.split('.', expand=True)[0]
    # Only the first four characters are parsed; anything unparsable becomes missing.
    df['year'] = pd.to_numeric(df['year'].astype('string').str.slice(0,4), errors='coerce')

    df['ontoName'] = df['ontoName'].replace(ontoName_dict)

    # NOTE(review): 'animal' is filled from ontoMatch here, but the filter and the
    # column labels below use 'ontoName' — confirm which column is meant to drive them.
    mask = (df['animal'] == '')
    df.loc[mask,'animal'] = df.loc[mask, 'ontoMatch'].map(ontoMatch_dict).fillna('')
    # notna() covers both NaN and pd.NA, whatever dtype to_numeric produced.
    df = df[df['year'].notna()]

    mask = np.logical_and.reduce([
        df['year'] >= min_year,
        df['level'] >= min_level,
        df['ontoName'].isin(animals),
        df['number_heads'] >= 0,
        df['gazName'] != 'NA.NA.NA',
        np.logical_not(df['gazID'].isnull())
    ])

    # Explicit copy: the columns added below must not write into a slice of df.
    gpw_df = df[mask].copy()

    if gpw_df.shape[0] > 0:

        # Cast to integer first so a floating year column yields 'cattle_2010',
        # not 'cattle_2010.0'.
        gpw_df['column'] = gpw_df['ontoName'] + '_' + gpw_df['year'].astype('int64').astype('str')
        gpw_df['source'] = 'GPW'

        gpw_df = gpw_df[['gazID','gazName','ontoName','country','level','source','column','method','number_heads']].pivot_table(
            index=['gazID','gazName','method','country','level','source'],
            columns='column',
            values='number_heads',
            aggfunc=np.nanmean
        ).reset_index()

        # NOTE(review): these summaries are taken from the table before the
        # min_year/min_level/animals filters — confirm that is intended.
        gpw_df.loc[:,'min_year'] = df['year'].min()
        gpw_df.loc[:,'max_year'] = df['year'].max()
        gpw_df.loc[:,'animals'] = ','.join(df['ontoName'].unique())

    return gpw_df

def read_geom(geom_dir, country, level):
    """Load the polygons of one administrative level of ``<geom_dir>/<country>.gpkg``.

    The layer is looked up as ``al<level>`` first and as ``ADM<level - 1>`` second.
    Rows repeating an earlier (gazID, gazName) pair are dropped. If neither layer
    can be read, an empty GeoDataFrame is returned (best effort: a missing
    geometry must not abort a multi-country run).
    """
    cols = ['gazID', 'gazName','geometry']
    gpkg_file = f'{geom_dir}/{country}.gpkg'

    def _load(layer):
        # Read one layer and keep the first occurrence of each (gazID, gazName).
        gdf = gpd.read_file(gpkg_file, layer=layer)[cols]
        is_dup = gdf.duplicated(subset=['gazID','gazName'], keep='first')
        return gdf[np.logical_not(is_dup)].reset_index(drop=True)

    # `except Exception` instead of a bare `except`, so KeyboardInterrupt and
    # SystemExit are no longer swallowed.
    try:
        return _load(f'al{level}')
    except Exception:
        try:
            return _load(f'ADM{int(level) - 1}')
        except Exception:
            return gpd.GeoDataFrame([])
        
def read_geom_levels(geom_dir, country, levels):
    """Load and stack the geometries of several administrative levels of one country.

    ``geom_dir`` is now honoured; previously the global ``adb_geometries_path`` was
    read regardless of the argument. Returns an empty GeoDataFrame when ``levels``
    is empty.
    """
    gpw_gdf = [read_geom(geom_dir, country, level) for level in levels]
    if len(gpw_gdf) > 0:
        return pd.concat(gpw_gdf)
    return gpd.GeoDataFrame([])

def read_data(basedir, geom_dir, country, ontoName_dict, ontoMatch_dict):
    """Return the census table of ``country`` joined to its geometries.

    The table comes from ``read_csv``; when it has rows, the geometries of every
    level present in it are attached with an inner join on (gazID, gazName).
    An empty table is returned as is.
    """
    census = read_csv(basedir, country, ontoName_dict=ontoName_dict, ontoMatch_dict=ontoMatch_dict)
    if census.shape[0] <= 0:
        return census

    geometries = read_geom_levels(geom_dir, country, census['level'].unique())
    return census.merge(geometries, on=['gazID', 'gazName'])

# Replacement table applied to the ontoName column by read_csv.
ontoName_dict = {
    'buffalo|bison': 'buffalo'
}
                                                       
# Lower-cased ontoMatch label -> animal; read_csv uses it to fill rows whose
# animal name is empty.
ontoMatch_dict = {
   'овцы и козы': 'sheep and goats',
   'livestock - meat cattle - cows and heifers 1 year and over (no)': 'cattle',
   'live bovine animals': 'cattle',
   'bovine animals, less than 1 year old': 'cattle',
   'livestock - dairy cattle - cows in milk and dry (no)': 'cattle',
   'livestock - meat cattle - calves less than 1 year (no)': 'cattle',
   'bovine animals, less than 1 year old, for slaughter': 'cattle',
   'livestock - dairy cattle - heifers 1 to 2 years (no)': 'cattle',
   'livestock - dairy cattle - heifers over 2 years (no)': 'cattle',
   'bovine animals, less than 1 year old, not for slaughter': 'cattle',
   'bovine animals, 1 to less than 2 years old': 'cattle',
   'bovine animals, 2 years old or over': 'cattle',
   'livestock - meat cattle - all other (no)': 'cattle',
   'livestock - sheep and lambs - breeding ewes 1 year and over - other breeding ewes nec (no)': 'sheep',
   'livestock - sheep and lambs - all other (no)': 'sheep',
   'livestock - sheep and lambs - breeding ewes 1 year and over - merinos (no)': 'sheep'
}

# Smoke test on a single country before looping over all of them.
#country = 'United States of America'
country = 'China'

read_data(adb_tables_path, adb_geometries_path, country, ontoName_dict, ontoMatch_dict)
#read_csv(adb_tables_path, country, ontoName_dict=ontoName_dict, ontoMatch_dict=ontoMatch_dict)

In [None]:
from pathlib import Path
from skmap.misc import ttprint

animals = ['cattle','goat','horse','sheep','buffalo']
gpw_frames = []

# One table per country; countries without usable records are left out.
for f in find_files(adb_tables_path, '*.csv'):
    country = Path(f).stem
    ttprint(f"Reading {country}")
    f_data = read_data(adb_tables_path, adb_geometries_path, country, ontoName_dict=ontoName_dict, ontoMatch_dict=ontoMatch_dict)
    ttprint(f" Shape: {f_data.shape}")
    if f_data.shape[0] > 0:
        gpw_frames.append(f_data)

gpw_data = pd.concat(gpw_frames)

# Descriptive columns first, then the per-animal/year columns sorted by name.
animal_cols = sorted(
    col for a in animals for col in gpw_data.columns[gpw_data.columns.str.contains(a)]
)
info_cols = list(gpw_data.columns[~gpw_data.columns.isin(animal_cols)])

gpw_data = gpw_data[info_cols + animal_cols]

#### Brazil double check

In [None]:
# Spot-check one Brazilian unit below level 2.
is_jatai = (
    (gpw_data['country'] == 'Brazil')
    & (gpw_data['level'] > 2)
    & (gpw_data['gazName'] == 'Brazil.Goiás.Jataí')
)
df_br = gpw_data[is_jatai]
df_br

### Data sources

In [None]:
# Metadata inventories stored next to the census tables.
inv_tables = pd.read_csv(f'{CENSUS_DB}/_meta/inv_tables.csv')
inv_dataseries = pd.read_csv(f'{CENSUS_DB}/_meta/inv_dataseries.csv')
# The pivot in read_csv keeps only gazID/gazName/method/country/level/source as
# identifiers, so gpw_data has no 'tabID' column; guard the lookup instead of
# raising KeyError.
gpw_tabids = gpw_data['tabID'].unique() if 'tabID' in gpw_data.columns else np.array([])

In [None]:
# FIXME(review): `gpw_merged` is not assigned in any cell above this one, so this
# raises NameError when the notebook is run top to bottom.
inv_tables[inv_tables['tabID'].isin(gpw_merged[gpw_merged['country'] == 'Brazil']['tabID'].unique())]

In [None]:
# FIXME(review): `gpw_merged` is not assigned in any cell above this one, so this
# raises NameError when the notebook is run top to bottom.
gpw_merged[gpw_merged['country'] == 'Brazil']['tabID'].unique()

In [None]:
#gpw_merged = gpw_data.merge(inv_tables[['tabID','datID','start_period','end_period','stage1_url','download_date','update_frequency']], on='tabID')
# Polygon area in km², measured after reprojecting from EPSG:4326 to the
# Interrupted Goode Homolosine projection (metres).
IGH_CRS = '+proj=igh +lon_0=0 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs +type=crs'
gpw_data['area_km2'] = (
    gpd.GeoDataFrame(gpw_data, geometry='geometry')
    .set_crs('EPSG:4326')
    .to_crs(IGH_CRS)
    .geometry.area / 1000000
)
# The metadata merge above is disabled, which left `gpw_merged` undefined for the
# summary cells that follow; expose gpw_data under that name.
gpw_merged = gpw_data

In [None]:
#gpw_merged.loc[:,'date_freq'] = gpw_merged['start_period'].astype(str) + '—' + gpw_merged['end_period'].astype(str) + " (" + gpw_merged['update_frequency'].astype(str) + ")"

In [None]:
gpw_merged

In [None]:
def concat(x):
    """Join the distinct values of ``x`` (first-seen order) into one comma-separated string."""
    distinct_as_text = x.unique().astype(str)
    separator = ','
    return separator.join(distinct_as_text)

# Per country / year range / animal list: levels present, number of units, and mean and std of unit area.
gpw_merged.groupby(['country','min_year', 'max_year', 'animals']).agg({'level': concat, 'gazID': 'count', 'area_km2': ['mean', 'std'], })#.to_csv('summary_table.csv')

In [None]:
gpw_merged['area_km2'].mean(), gpw_merged['area_km2'].std()

In [None]:
# Drop bookkeeping columns before export. errors='ignore' keeps the cell
# re-runnable and tolerates 'tabID' being absent from gpw_data.
gpw_data = gpw_data.drop(columns=['tabID','min_year', 'max_year'], errors='ignore')

In [None]:
from datetime import datetime
# The output file name carries today's date (YYYYMMDD).
version = datetime.today().strftime('%Y%m%d')
gpw_out_file = f'{wd}/gpw_livestock_{version}.gpkg'
gpw_out = gpd.GeoDataFrame(gpw_data, geometry='geometry', crs='epsg:4326')
gpw_out.to_file(gpw_out_file)

In [None]:
# Read the export back to check that the file is readable and see its dimensions.
cur_gpw = gpd.read_file(f'{wd}/gpw_livestock_{version}.gpkg')
cur_gpw.shape

## GPW Ad-hoc level 2+

In [None]:
from skmap.misc import find_files
# Directory searched for the ad-hoc '*.gpkg' inputs.
ADHOC_DIR = f'{wd}/AdHoc_GPW'

In [None]:
# Stack every GeoPackage found under ADHOC_DIR into a single frame.
adhoc_frames = [gpd.read_file(f) for f in find_files(ADHOC_DIR, '*.gpkg')]
adhoc_df = pd.concat(adhoc_frames)
adhoc_df

In [None]:
def _gazName(row):
    cols = ['COUNTRY', 'NAME_1', 'NAME_12', 'NAME_2', 'NAME_23', 'Region' ]
    vals =[ str(row[c]).replace(' - ','_').replace(' ','_').capitalize() for c in cols if (~pd.isnull(row[c]) and str(row[c]) != 'nan') ]
    return '.'.join(list(dict.fromkeys(vals)))

# Row-wise: one dotted gazetteer name per polygon.
adhoc_df['gazName'] = adhoc_df.apply(_gazName, axis=1)

In [None]:
import hashlib

cols = ['Bovins05','Bovins06','Bovins07','Bovins08','Bovins09','Bovins10','Bovins11','Bovins12','Bovins13','Bovins14','Bovins15','Bovins16','Bovins17','Bovins18','Bovins19','Bovins20','Bovins21','Bovins22','Cattle09','Cattle10','Cattle11','Cattle12','Cattle13','Cattle14']
# Every column name ends with a two-digit year. Deriving the mapping from the name
# replaces the hand-typed table, which mapped 'Bovins09' to 2008 and so merged the
# 2008 and 2009 counts into a single 'cattle_2008' column.
dyear = {c: 2000 + int(c[-2:]) for c in cols}

adhoc_rwise_df = []

# `_` instead of `id`: the loop no longer shadows the builtin in the notebook namespace.
for _, row in adhoc_df.iterrows():
    geometry = row['geometry']
    # The identifier depends on the geometry only, so hash it once per polygon.
    gaz_id = hashlib.md5(str(geometry).encode('utf-8')).hexdigest()
    for c in cols:
        # NaN fails `>= 0`; 9999 is skipped as well (presumably a no-data code — confirm).
        if (row[c] >= 0 and int(row[c]) != 9999):
            adhoc_rwise_df.append({
                'gazName': row['gazName'],
                'gazID': gaz_id,
                'year': dyear[c],
                'animal': 'cattle',
                'number_heads': row[c],
                'geometry': geometry
            })

adhoc_rwise_df = pd.DataFrame(adhoc_rwise_df)
adhoc_rwise_df['level'] = adhoc_rwise_df['gazName'].str.split('.', expand=False).str.len()
adhoc_rwise_df['country'] = adhoc_rwise_df['gazName'].str.split('.', expand=True)[0]
adhoc_rwise_df

In [None]:
adhoc_rwise_df['column'] = adhoc_rwise_df['animal'] + '_' + adhoc_rwise_df['year'].astype('str')
adhoc_rwise_df['source'] = 'GPW'

# Long -> wide: one row per unit, one '<animal>_<year>' column per observation.
adhoc_id_cols = ['gazID','gazName','country','level','source']
adhoc_data = (
    adhoc_rwise_df[['gazID','gazName','animal','country','level','source','column','number_heads']]
    .pivot_table(index=adhoc_id_cols, columns='column', values='number_heads', aggfunc=np.nanmean)
    .reset_index()
)

print(f"Before merge: {adhoc_data.shape}")

# Re-attach one geometry per gazID (the pivot only carried the identifiers).
adhoc_geoms = adhoc_rwise_df[['gazID','geometry']].drop_duplicates(subset=['gazID'], keep='last')
adhoc_data = adhoc_data.merge(adhoc_geoms, on='gazID', how='left')

print(f"After merge: {adhoc_data.shape}")

In [None]:
# Descriptive columns first, then the per-animal/year columns sorted by name.
animal_cols = sorted(
    col for a in animals for col in adhoc_data.columns[adhoc_data.columns.str.contains(a)]
)
info_cols = list(adhoc_data.columns[~adhoc_data.columns.isin(animal_cols)])

adhoc_data = adhoc_data[info_cols + animal_cols]

In [None]:
from datetime import datetime
# The output file name carries today's date (YYYYMMDD); the CRS is taken from the input frame.
version = datetime.today().strftime('%Y%m%d')
adhoc_out_file = f'{wd}/gpw_livestock.adhoc_{version}.gpkg'
adhoc_out = gpd.GeoDataFrame(adhoc_data, geometry='geometry', crs=adhoc_df.crs)
adhoc_out.to_file(adhoc_out_file)

## FAO

In [None]:
from skmap.misc import find_files
# FAO database root; the readers below expect '<FAO_DIR>/<country>/<country>.csv' and '.gpkg'.
FAO_DIR = f'{wd}/DB_FAO'

In [None]:
import fiona
from pathlib import Path
import numpy as np

import re 

def norm_gazid(t0):
    """Normalise an administrative code to '<letters><number>'.

    The first run of ASCII letters and the first run of digits found in
    ``str(t0)`` are kept; the digits go through ``int`` (dropping leading zeros)
    and the result is joined letters-first. When ``str(t0)`` holds no digit,
    ``t0`` itself is returned unchanged.
    """
    text = str(t0)

    digits = re.search(r'\d+', text)
    if digits is None:
        return t0

    letters = re.search(r'[a-zA-Z]+', text)
    prefix = letters.group(0) if letters is not None else ''
    return prefix + str(int(digits.group(0)))

def read_geom(basedir, country):
    """Read every layer of ``<basedir>/<country>/<country>.gpkg`` into one frame.

    Each row is tagged with the file stem ('file') and the layer it came from ('layer').
    """
    geom_fn = f'{basedir}/{country}/{country}.gpkg'
    fn = Path(geom_fn).stem

    def _read_layer(layer):
        # One layer, tagged with its origin.
        layer_gdf = gpd.read_file(geom_fn, layer=layer)
        layer_gdf[['file', 'layer']] = (fn, layer)
        return layer_gdf

    return pd.concat([_read_layer(layer) for layer in fiona.listlayers(geom_fn)])

#df = df[df['animal'].isin(['cattle','goat','sheep','horse','buffalo'])]
def read_csv(basedir, country, min_year = 2000,
              animals = ['cattle','goat','horse','sheep','buffalo'],
              species_dict = {}):
    """Read one FAO country table and reshape it to one row per (gazID, layer).

    Parameters
    ----------
    basedir : str
        Directory containing ``<country>/<country>.csv``.
    country : str
        Sub-directory and file stem of the table.
    min_year : int
        Records from earlier years are discarded.
    animals : list of str
        Animal labels (after ``species_dict`` mapping) to keep. Never mutated here.
    species_dict : dict
        SPECIES code -> animal label. Codes missing from it map to NaN and are
        filtered out, so the empty default keeps nothing.

    Returns
    -------
    pandas.DataFrame
        When at least one record passes the filters: gazID, layer, source and one
        ``<animal>_<year>`` column per animal and year (mean of ``number_heads``).
        Otherwise the (empty) filtered long-format frame.
    """
    csv_file = f'{basedir}/{country}/{country}.csv'
    df = dt.fread(csv_file).to_pandas()
    df = df[['YEAR','ADM_CODE','SPECIES','N0','FILE']].rename(columns={
        'YEAR': 'year',
        'ADM_CODE': 'gazID',
        'FILE': 'layer',
        'SPECIES': 'animal',
        'N0': 'number_heads'
    })
    df['animal'] = df['animal'].map(species_dict)

    # Only the first four characters are parsed; anything unparsable becomes missing.
    df['year'] = pd.to_numeric(df['year'].astype('string').str.slice(0,4), errors='coerce')
    # The layer key is the first two '_'-separated tokens of FILE; split once.
    layer_parts = df['layer'].str.split('_', expand=True)
    df['layer'] = layer_parts[0] + '_' + layer_parts[1]
    df['number_heads'] = pd.to_numeric(df['number_heads'], errors='coerce')

    # notna() covers both NaN and pd.NA, whatever dtype to_numeric produced.
    df = df[df['year'].notna()]

    mask = np.logical_and.reduce([
        df['year'] >= min_year,
        df['animal'].isin(animals),
        df['number_heads'] >= 0,
        np.logical_not(df['gazID'].isnull())
    ])

    # Explicit copy: the columns added below must not write into a slice of df.
    fao_df = df[mask].copy()

    if fao_df.shape[0] > 0:

        # Cast to integer first so a floating year column yields 'cattle_2010',
        # not 'cattle_2010.0'.
        fao_df['column'] = fao_df['animal'] + '_' + fao_df['year'].astype('int64').astype('str')
        fao_df['source'] = 'FAO'

        fao_df = fao_df[['gazID','layer','source','column','number_heads']].pivot_table(
            index=['gazID','layer','source',],
            columns='column',
            values='number_heads',
            aggfunc='mean'
        ).reset_index()

    return fao_df

def read_data(basedir, country, min_year, animals, species_dict):
    """Join the FAO census table of ``country`` to its geometries.

    Both sides get their gazID normalised with ``norm_gazid`` and are inner-joined
    on (gazID, layer). ``gazName`` of the result is '<country>.<unit name>'.
    """
    fao_df = read_csv(basedir, country, min_year=min_year, animals=animals, species_dict=species_dict)

    geom_renames = {
        'ADM_CODE': 'gazID',
        'ADM0_NAME': 'country',
        'NAME': 'gazName',
        'LEVEL': 'level',
    }
    fao_gdf = read_geom(basedir, country).rename(columns=geom_renames)

    # Same key normalisation on both sides of the join.
    for frame in (fao_df, fao_gdf):
        frame['gazID'] = frame['gazID'].apply(norm_gazid).astype('object')

    geom_cols = ['country','gazName','level','gazID','layer','geometry']
    fao_data = fao_df.merge(fao_gdf[geom_cols], on=['gazID','layer'])

    fao_data['gazName'] = fao_data['country'] + '.' + fao_data['gazName']
    return fao_data
    
# FAO SPECIES code -> label. Only the plain labels 'cattle', 'buffalo', 'sheep',
# 'goat' and 'horse' match the `animals` filter used below; the '<animal>_<CODE>'
# labels keep sub-categories recognisable without letting them through that filter.
species_dict = {
    'LVAL': 'All livestock',
    'ARAL': 'All ruminants',
    'BOVN': 'cattle_BOVN',
    'CTAL': 'cattle',
    'CTME': 'cattle_CTME',
    'CTMI': 'cattle_CTMI',
    'YKAL': 'Yaks',
    'CTDR': 'cattle_CTDR',
    'BFAL': 'buffalo',
    'BFME': 'buffalo_BFME',
    'BFMI': 'buffalo_BFMI',
    'BFDR': 'buffalo_BFDR',  # was 'buffalo_BFDF', which did not match its own code
    'CAAL': 'Camelids',
    'CMAL': 'Camels',
    'ALAL': 'Alpacas',
    'LLAL': 'Llamas',
    'SRAL': 'Small ruminants',
    'SHAL': 'sheep',
    'SHMI': 'sheep_SHMI',
    'GTAL': 'goat',
    'GTME': 'goat_GTME',
    'GTMI': 'goat_GTMI',
    'AMAL': 'All monogastrics',
    'POAL': 'Poultry',
    'CHAL': 'Chickens',
    'CHLA': 'Layer chickens',
    'CHBO': 'Broiler chickens',
    'CHBR': 'Breeder chickens',
    'CHBY': 'Backyard chickens',
    'TUAL': 'Turkeys',
    'WAFL': 'Waterfowl',
    'DKAL': 'Ducks',
    'GSAL': 'Geese',
    'SUAL': 'Suines',
    'PGAL': 'Pigs',
    'ALEQ': 'horse_ALEQ', #All equines
    'EQAL': 'horse_EQAL', #Equines
    'HOAL': 'horse',
    'ASAL': 'horse_ASAL', #Asses
    'MUAL': 'horse_MUAL', #Mules
    'UNSP': 'Unspecified',
    'WIAN': 'Wild animals',
    'ALSU': 'All susceptible'
}

# Filters shared by the smoke test below.
min_year = 2000
animals = ['cattle', 'goat', 'horse', 'sheep', 'buffalo']

# Smoke test on a single country before looping over all of them.
country = 'AFG'
read_data(FAO_DIR, country, min_year, animals, species_dict)
#read_csv(FAO_DIR, country, min_year=min_year, animals=animals, species_dict=species_dict)

In [None]:
from pathlib import Path
from skmap.misc import ttprint

animals = ['cattle','goat','horse','sheep','buffalo']
min_year = 2000
fao_frames = []

# One table per country; countries without usable records are left out.
for f in find_files(FAO_DIR, '*.csv'):
    country = Path(f).stem
    ttprint(f"Reading {country}")
    f_data = read_data(FAO_DIR, country, min_year, animals, species_dict)

    ttprint(f" Shape: {f_data.shape}")
    if f_data.shape[0] > 0:
        fao_frames.append(f_data)

fao_data = pd.concat(fao_frames)

# Descriptive columns first, then the per-animal/year columns sorted by name.
animal_cols = sorted(
    col for a in animals for col in fao_data.columns[fao_data.columns.str.contains(a)]
)
info_cols = list(fao_data.columns[~fao_data.columns.isin(animal_cols)])

fao_data = fao_data[info_cols + animal_cols]

In [None]:
#df = fao_data[['country','layer']].value_counts().reset_index()
#dup_countries = df['country'].value_counts()[df['country'].value_counts() > 1].index
#dup_countries

In [None]:
# Collapse units present in several layers: keep the geometry of the last layer
# (after sorting by country/layer) and average each animal/year column, ignoring NaN.
agg_cols = {'geometry': 'last'}
agg_cols.update({a: np.nanmean for a in animal_cols})

fao_group_keys = ['gazID','source','country','gazName','level']
fao_data_agg = (
    fao_data.sort_values(['country','layer'])
    .groupby(fao_group_keys)
    .agg(agg_cols)
    .reset_index()
)

In [None]:
#df = fao_data_agg[['country']].value_counts().reset_index()
#dup_countries = df['country'].value_counts()[df['country'].value_counts() > 1].index
#dup_countries

In [None]:
print(f'Before aggregation {fao_data.shape}')
print(f'After aggregation {fao_data_agg.shape}')

In [None]:
#fao_data_agg[fao_data_agg['gazName'] == 'Argentina.Buenos Aires'].to_numpy()

In [None]:
from datetime import datetime

# The output file name carries today's date (YYYYMMDD).
version = datetime.today().strftime('%Y%m%d')
fao_out_file = f'{wd}/gpw_livestock.fao_{version}.gpkg'
fao_out = gpd.GeoDataFrame(fao_data_agg, geometry='geometry', crs='epsg:4326')
fao_out.to_file(fao_out_file)

## Malek et al., 2024

In [None]:
from skmap.misc import find_files
# Directory searched for the Malek et al. (2024) '*.gpkg' inputs.
MALEK_DIR = f'{wd}/Malek_2024/EU_2020'

In [None]:
import fiona
from pathlib import Path

def read_geom(geom_fn):
    """Read every layer of the GeoPackage ``geom_fn``, reprojected to EPSG:4326.

    Each row is tagged with the file stem ('file') and the layer it came from ('layer').
    """
    fn = Path(geom_fn).stem

    layers = []
    for layer in fiona.listlayers(geom_fn):
        layer_gdf = gpd.read_file(geom_fn, layer=layer).to_crs(4326)
        layer_gdf[['file', 'layer']] = (fn, layer)
        layers.append(layer_gdf)

    return pd.concat(layers)

# One job per GeoPackage, read in parallel and stacked into a single frame.
malek_files = find_files(MALEK_DIR, '*.gpkg')
args = [ (f,) for f in malek_files ]
gdf = pd.concat(list(parallel.job(read_geom, args, n_jobs=16)))
gdf

In [None]:
import hashlib

# Source column -> reference year and animal. The inputs do not share a schema,
# hence the spelling variants.
cols = ['buffalo','catt_tot','cattle tot','cattle','cattle2000','cattle2010','cattle2020','cattle_tot','Cattle_tot','goat','goat_t','goats','sheep','sheep2000','sheep2010','sheep2020','sheep_t','sheep_tot','Sheep_tota']
dyear = { 'buffalo': 2020, 'catt_tot': 2020, 'cattle tot': 2020, 'cattle': 2020, 'cattle2000': 2000, 'cattle2010': 2010, 'cattle2020': 2020, 'cattle_tot': 2020, 'Cattle_tot': 2020, 'goat': 2020, 'goat_t': 2020, 'goats': 2020, 'sheep': 2020, 'sheep2000': 2000, 'sheep2010': 2010, 'sheep2020': 2020, 'sheep_t': 2020, 'sheep_tot': 2020, 'Sheep_tota': 2020 }
danimal = {'buffalo': 'buffalo','catt_tot': 'cattle','cattle tot': 'cattle','cattle': 'cattle','cattle2000': 'cattle','cattle2010': 'cattle','cattle2020': 'cattle','cattle_tot': 'cattle','Cattle_tot': 'cattle','goat': 'goat','goat_t': 'goat','goats': 'goat','sheep': 'sheep','sheep2000': 'sheep','sheep2010': 'sheep','sheep2020': 'sheep','sheep_t': 'sheep','sheep_tot': 'sheep','Sheep_tota': 'sheep'}

malek_gdf = []

# `_` instead of `id`: the loop no longer shadows the builtin in the notebook namespace.
for _, row in gdf.iterrows():
    # Per-polygon values, computed once instead of once per column.
    geometry = row['geometry']
    gaz_id = hashlib.md5(str(geometry).encode('utf-8')).hexdigest()
    country_name = str(row['country'])
    gaz_name = country_name.capitalize() + '.' + str(row['local']).capitalize()

    for c in cols:
        # NaN fails `>= 0`; 9999 is skipped as well (presumably a no-data code — confirm).
        if (row[c] >= 0 and int(row[c]) != 9999):
            malek_gdf.append({
                'gazName': gaz_name,
                'gazID': gaz_id,
                'year': dyear[c],
                'animal': danimal[c],
                'number_heads': row[c],
                'country': country_name,
                'geometry': geometry
            })

malek_gdf = pd.DataFrame(malek_gdf)
# Fixed level for every record of this source (not derived from gazName).
malek_gdf['level'] = 3
#malek_gdf.drop_duplicates(subset=['gazID','year'], keep='last', inplace=True)
malek_gdf

In [None]:
#malek_gdf[malek_gdf['number_heads'] == 0]['country'].value_counts()
#malek_gdf[malek_gdf['gazName'].str.contains('.Nan$')]['country'].value_counts()

In [None]:
# Keep strictly positive counts. The explicit copy makes the result an independent
# frame, so the columns added in the next cell do not write into a slice.
malek_gdf = malek_gdf[malek_gdf['number_heads'] > 0].copy()
malek_gdf

In [None]:
malek_gdf['column'] = malek_gdf['animal'] + '_' + malek_gdf['year'].astype('str')
malek_gdf['source'] = 'Malek et al., 2024'

# Long -> wide: one row per unit, one '<animal>_<year>' column per observation.
malek_id_cols = ['gazID','gazName','country','level','source']
malek_data = (
    malek_gdf[['gazID','gazName','animal','country','level','source','column','number_heads']]
    .pivot_table(index=malek_id_cols, columns='column', values='number_heads', aggfunc=np.nanmean)
    .reset_index()
)

print(f"Before merge: {malek_data.shape}")

# Re-attach one geometry per gazID (the pivot only carried the identifiers).
malek_geoms = malek_gdf[['gazID','geometry']].drop_duplicates(subset=['gazID'], keep='last')
malek_data = malek_data.merge(malek_geoms, on='gazID', how='left')

print(f"After merge: {malek_data.shape}")

In [None]:
#gpd.GeoDataFrame(malek_data[malek_data['country'] == 'nan'], geometry='geometry', crs=gdf.crs).plot()

In [None]:
# Descriptive columns first, then the per-animal/year columns sorted by name.
animal_cols = sorted(
    col for a in animals for col in malek_data.columns[malek_data.columns.str.contains(a)]
)
info_cols = list(malek_data.columns[~malek_data.columns.isin(animal_cols)])

malek_data = malek_data[info_cols + animal_cols]

In [None]:
from datetime import datetime

# The output file name carries today's date (YYYYMMDD); the CRS is taken from the input frame.
version = datetime.today().strftime('%Y%m%d')
malek_out_file = f'{wd}/gpw_livestock.malek.2024_{version}.gpkg'
malek_out = gpd.GeoDataFrame(malek_data, geometry='geometry', crs=gdf.crs)
malek_out.to_file(malek_out_file)

## Malek et al., 2024 UK

In [None]:
from skmap.misc import find_files
# UK inputs are rasters ('*.tif') rather than polygons.
MALEK_UK_DIR = f'{wd}/Malek_2024/UK'
raster_files = find_files(MALEK_UK_DIR, '*.tif')

In [None]:
adb_geometries_path

In [None]:
import rasterio
# Zones for the raster extraction: the ADM3 layer of the GPW geometries, filtered
# on geoID == 4.0 and reprojected to the CRS of the first raster found.
in_uk = f'{adb_geometries_path}/United Kingdom of Great Britain and Northern Ireland.gpkg'
out_uk = f'{MALEK_UK_DIR}/uk_vector.gpkg'
gdf = gpd.read_file(in_uk, layer='ADM3')

# Open the raster only to read its CRS and close the handle right away.
with rasterio.open(raster_files[0]) as src:
    raster_crs = src.crs

gdf[gdf['geoID'] == 4.0].to_crs(raster_crs).to_file(out_uk)

In [None]:
from rasterstats import zonal_stats
from pathlib import Path

# Minimum share (percent) of a polygon that must be covered by counted pixels for
# its total to be kept.
area_th = 50

stats = []
# Raster file stem -> (year, animal, pixel edge length).
# Assumes the edge length is in the raster CRS units (metres) — TODO confirm.
cols = {
    'en-2016-c36-5km-total-cattle-and-calves': (2016, 'cattle', 5000),
    'en-2016-c44-5km-total-sheep-and-lambs': (2016, 'sheep', 5000),
    'en-2016-c49-5km-horses': (2016, 'horse', 5000),
    'sc-2019-c103-2km-total-cattle': (2019, 'cattle', 2000),
    'sc-2019-c117-2km-total-sheep': (2019, 'sheep', 2000),
    'sc-2019-c145-2km-total-horses': (2019, 'horse', 2000),
    'sc-2019-c146-2km-total-goats': (2019, 'goat', 2000),
    'wa-2018-goats-2km': (2018, 'goat', 2000),
    'wa-2018-horses-2km': (2018, 'horse', 2000),
    'wa-2018-total-cattle-2km': (2018, 'cattle', 2000),
    'wa-2018-total-sheep-2km': (2018, 'sheep', 2000)
}

# The zones are the same for every raster: read their areas once, not once per raster.
pol_area = gpd.read_file(out_uk)['geometry'].area

for r in raster_files:
    print(f"Extracting values from {r}")
    key = str(Path(r).stem)
    year, animal, pixel_size = cols[key]

    heads_col = f'{animal}_{year}'
    area_col = f'{animal}_{year}_area'
    result = pd.DataFrame(
        zonal_stats(out_uk, r, stats=['sum','count'])
    ).rename(columns={
        'sum': heads_col,
        'count': area_col
    })

    # Pixel count -> percentage of the polygon area covered by the counted pixels.
    result[area_col] = (result[area_col] * pixel_size * pixel_size / pol_area) * 100
    # Totals from poorly covered polygons are discarded.
    result.loc[result[area_col] < area_th, heads_col] = np.nan

    stats.append(result)

stats = pd.concat(stats, axis=1)
stats['source'] = 'Malek et al., 2024'

In [None]:
# Column-wise concat: row i of `stats` is matched to row i of the zones file by index.
df_uk = pd.concat([gpd.read_file(out_uk), stats],axis=1)

In [None]:
# Identifier columns first, then the extracted columns in alphabetical order.
# NOTE(review): no 'country' or 'level' column is kept for the UK rows — confirm
# the integration step does not need them.
ignore_cols = ['gazID','gazName','source','geometry']
stat_cols = sorted(name for name in stats.columns if name not in ignore_cols)
cols = ignore_cols + stat_cols
df_uk = df_uk[cols]
df_uk = gpd.GeoDataFrame(df_uk, geometry=gpd.GeoSeries(df_uk['geometry']))
df_uk

In [None]:
# Keep polygons with at least one raster reaching the coverage threshold, then
# drop the helper coverage columns.
area_cols = df_uk.columns[df_uk.columns.str.contains('area')]
has_enough_cover = np.any(df_uk[area_cols].to_numpy() >= area_th, axis=1)
df_uk = df_uk[has_enough_cover].drop(columns=area_cols)

In [None]:
from datetime import datetime

# The output file name carries today's date (YYYYMMDD).
version = datetime.today().strftime('%Y%m%d')
df_uk.to_file(f'{wd}/gpw_livestock.uk.malek.2024_{version}.gpkg')

## Census data integration

In [None]:
import pandas as pd
import geopandas as gpd

In [None]:
# Date stamp (YYYYMMDD) of the exports to integrate. The export cells name their
# files after the day they ran, so update this single value after re-exporting.
census_version = '20250908'

gpw_df = gpd.read_file(f'{wd}/gpw_livestock_{census_version}.gpkg')
gpw_ah_df = gpd.read_file(f'{wd}/gpw_livestock.adhoc_{census_version}.gpkg')
fao_df = gpd.read_file(f'{wd}/gpw_livestock.fao_{census_version}.gpkg')
malek_df = gpd.read_file(f'{wd}/gpw_livestock.malek.2024_{census_version}.gpkg')
malek_uk_df = gpd.read_file(f'{wd}/gpw_livestock.uk.malek.2024_{census_version}.gpkg')

In [None]:
# Stack all sources; every frame except gpw_df is reprojected to EPSG:4326 first.
reprojected = [src.to_crs(4326) for src in (gpw_ah_df, fao_df, malek_df, malek_uk_df)]
gpw_df_agg = pd.concat([gpw_df] + reprojected).reset_index(drop=True)

In [None]:
list(gpw_df_agg['country'].unique())

In [None]:
#level_mask = gpw_df_agg['level'].isnull()
#gpw_df_agg.loc[level_mask, 'level'] = gpw_df_agg[level_mask]['gazName'].str.split('.', expand=False).str.len()

In [None]:
# Alternative spellings -> the single spelling used in the merged table. Keys are
# written as they look after `.str.capitalize()` (first letter upper, rest lower).
country_remap = {
    "Lao people's democratic republic": 'Laos',
    'Burkina_faso': 'Burkina Faso',
    'Czech republic': 'Czechia',
    'Luxembourg': 'Luxemburg',
    'N_ireland': 'Northern ireland',
    'Netherland': 'Netherlands',
    'North macedonia': 'Macedonia',
    'Republic of moldova': 'Moldova',
    'Republic of korea': 'South korea',
    'Russian federation': 'Russian',
    'Syrian arab republic': 'Syrian',
    'The former yugoslav republic of macedonia': 'Macedonia',
    'Türkiye': 'Turkey',
    'United republic of tanzania': 'Tanzania',
    'United states of america': 'United states',
    'U.k. of great britain and northern ireland': 'United kingdom of great britain and northern ireland',
    'Switzerlan': 'Switzerland',
    'Iran (islamic republic of)': 'Iran',
    'Bolivia (plurinational state of)': 'Bolivia',
}

capitalized_country = gpw_df_agg['country'].str.capitalize()
gpw_df_agg['country'] = capitalized_country
gpw_df_agg['country'] = capitalized_country.replace(country_remap)

In [None]:
gpw_df_agg[gpw_df_agg['country'] == 'The former yugoslav republic of macedonia']

In [None]:
list(gpw_df_agg['country'].unique())

In [None]:
# Number of records per (country, source) ...
country_source_summary = gpw_df_agg[['country', 'source']].value_counts().sort_index().reset_index()
# ... and number of distinct sources per country.
country_source_sum2 = country_source_summary['country'].value_counts()

# Countries covered by more than one source. The names are taken from the index:
# what `reset_index()['country']` returns (names or counts) depends on the pandas
# version, while the index always holds the country names.
multi_source_countries = country_source_sum2[country_source_sum2 > 1].index

country_source_summary[
    country_source_summary['country'].isin(multi_source_countries)
].to_numpy()

In [None]:
# Independent copy: columns added to gpw_final later must not appear in gpw_df_agg.
gpw_final = gpw_df_agg.copy()

In [None]:
import numpy as np
animals = ['buffalo','cattle','goat','horse','sheep']
for c in animals:
    # Per-year columns of this animal. The '<animal>_nyears' counter itself is
    # excluded, otherwise re-running the cell would count it as an extra year.
    animal_cols = [
        col for col in gpw_final.columns
        if c in col and not col.endswith('_nyears')
    ]
    # Number of years with a value for this animal in each row.
    gpw_final[f'{c}_nyears'] = gpw_final[animal_cols].notna().sum(axis=1)
gpw_final = gpw_final.reset_index(drop=True)
gpw_final

In [None]:
import joblib
# Same base name for both outputs: a compressed joblib dump and a GeoPackage.
final_base = f'{wd}/gpw_livestock.animals_gpw.fao.faostat.malek.2024_polygon.samples_20000101_20231231_go_epsg.4326_v1'
joblib.dump(gpw_final, f'{final_base}.lz4', compress='lz4')
gpw_final.to_file(f'{final_base}.gpkg')

In [None]:
gpw_final.shape