In [12]:
import rioxarray
import numpy as np
from pathlib import Path
import os
import sys
project_path = os.path.abspath(os.path.join('..', '..'))
if project_path not in sys.path:
    sys.path.insert(0, project_path)

from source.config import DATA_SRC, POP_DATA_SRC, WEATHER_SRC

# Download WorldPop rasters for the age/sex categories and for the total population

In [13]:
import urllib
import rioxarray
import numpy as np
from pathlib import Path

import urllib.request  # explicit: a bare `import urllib` does not guarantee the `request` submodule is loaded

# Download the WorldPop 1 km global mosaics for 2000-2020.
# Files already present in DATA_SRC / "worldpop" are not downloaded again.
# NOTE: an earlier version of this cell fetched the total-population files
# ("ppp_*") from the URL of the last age/sex raster because of a misspelled
# variable name. Any "ppp_*_1km_Aggregated.tif" obtained that way holds the
# wrong raster and must be deleted by hand, since existing files are skipped.
for year in np.arange(2000, 2021):
    # Age- and sex-structured rasters: sexes m/f, age bands starting at 0, 65, 70, 75 and 80.
    for s in ["m", "f"]:
        for age in [0, 65, 70, 75, 80]:
            download_url = f"https://data.worldpop.org/GIS/AgeSex_structures/Global_2000_2020/{year}/0_Mosaicked/global_mosaic_1km/global_{s}_{age}_{year}_1km.tif"
            filepath = DATA_SRC / f"worldpop/global_{s}_{age}_{year}_1km.tif"
            if not Path(filepath).is_file():
                urllib.request.urlretrieve(download_url, filepath)

    # Total population raster for the same year.
    download_url = f"https://data.worldpop.org/GIS/Population/Global_2000_2020/{year}/0_Mosaicked/ppp_{year}_1km_Aggregated.tif"
    filepath = DATA_SRC / f"worldpop/ppp_{year}_1km_Aggregated.tif"
    if not Path(filepath).is_file():
        urllib.request.urlretrieve(download_url, filepath)

In [14]:
def clean_and_coarsen(data_array):
    """
    Zero out invalid cells and aggregate the grid by a factor of two per axis.

    Cells that are not strictly positive, not below 3.1e8, infinite or NaN are
    replaced by 0. The 'y'/'x' dimensions are then renamed to
    'latitude'/'longitude' and neighbouring cells are summed pairwise, first
    along latitude and then along longitude.

    Parameters:
    data_array (xarray.DataArray): The data array to be cleaned; it must have 'y' and 'x' dimensions.

    Returns:
    xarray.DataArray: The cleaned and coarsened data array.
    """
    # Each check is evaluated on the result of the previous one, in this order.
    validity_checks = (
        lambda arr: arr > 0,
        lambda arr: arr < 3.1e8,
        lambda arr: arr != -np.inf,
        lambda arr: arr != np.inf,
    )
    cleaned = data_array
    for is_valid in validity_checks:
        cleaned = cleaned.where(is_valid(cleaned), 0)
    cleaned = cleaned.fillna(0)

    coarse = cleaned.rename({'y': 'latitude', 'x': 'longitude'})
    # Sum pairs of cells one axis at a time (window of 2 along each axis).
    for axis in ('latitude', 'longitude'):
        coarse = coarse.coarsen({axis: 2}).sum()

    return coarse.fillna(0)

import os
import rioxarray
import numpy as np

def sum_files(files, directory=""):
    """
    Open every raster in `files` and add them together cell by cell.

    This is used to build the over-65 population, which is spread over
    several age-band files.

    Parameters:
    files (list): File names (or paths) of the rasters to add up.
    directory (str, optional): Directory that is joined in front of each entry of `files`.

    Returns:
    xarray.DataArray: The sum of all opened rasters, or None when `files` is empty.
    """
    running_total = None
    for raster_path in (os.path.join(directory, name) for name in files):
        raster = rioxarray.open_rasterio(raster_path)

        if running_total is not None:
            # In-place addition onto the first raster that was opened.
            running_total += raster
            continue
        running_total = raster

    return running_total

In [15]:
import xarray as xr
# Open one year of ERA5 data; it is passed to remap_pop_to_era5 as the
# reference grid onto which the population data are remapped.
era5_data = xr.open_dataset(WEATHER_SRC / "era5_0.25deg/daily_temperature_summary/1980_temperature_summary.nc")



In [16]:
# source: https://automating-gis-processes.github.io/site/notebooks/L3/nearest-neighbor-faster.html
from sklearn.neighbors import BallTree
import numpy as np

def get_nearest(src_points, candidates, k_neighbors=1):
    """
    Look up, for every source point, the closest candidate point.

    A BallTree with the haversine metric is built on `candidates` and queried
    with `src_points`. scikit-learn's haversine metric reads the first column
    of each point as latitude and the second as longitude, both in radians.

    Returns a tuple `(closest, closest_dist)`: for each source point the index
    of its nearest candidate and the haversine distance to it (in radians).
    Only the nearest neighbour is returned, even when `k_neighbors` > 1.
    """
    ball_tree = BallTree(candidates, leaf_size=15, metric='haversine')

    # Both results have one row per source point and one column per neighbour.
    dist_matrix, idx_matrix = ball_tree.query(src_points, k=k_neighbors)

    # Column 0 holds the nearest neighbour; column 1 would be the second nearest, etc.
    return (idx_matrix[:, 0], dist_matrix[:, 0])


def nearest_neighbor(left_gdf, right_gdf, return_dist=False):
    """
    For each point in left_gdf, find the closest point in right_gdf and return it.

    NOTICE: Assumes that the input Points are in WGS84 projection (lat/lon),
    i.e. `geom.x` is the longitude and `geom.y` the latitude, in degrees.

    Parameters:
    - left_gdf: GeoDataFrame of Point geometries to find neighbours for.
    - right_gdf: GeoDataFrame of Point geometries to search in.
    - return_dist: if True, add a 'distance' column with the great-circle
      distance in meters (Earth radius taken as 6371000 m).

    Returns:
    - A copy of the rows of right_gdf that are nearest to each row of left_gdf,
      in the order of left_gdf and with a fresh 0..n-1 index.
    """

    def _to_lat_lon_radians(geometries):
        # scikit-learn's haversine metric expects (latitude, longitude) in
        # radians, so y (latitude) must come first. Passing (x, y) makes the
        # metric treat longitude as latitude and returns wrong neighbours.
        return np.array([(geom.y * np.pi / 180, geom.x * np.pi / 180) for geom in geometries])

    left_geom_col = left_gdf.geometry.name
    right_geom_col = right_gdf.geometry.name

    # Ensure that index in right gdf is formed of sequential numbers, so the
    # positional indices returned by the tree can be used as labels with .loc
    right = right_gdf.copy().reset_index(drop=True)

    # Parse coordinates from points and insert them into a numpy array as RADIANS
    left_radians = _to_lat_lon_radians(left_gdf[left_geom_col])
    right_radians = _to_lat_lon_radians(right[right_geom_col])

    # Find the nearest points
    # -----------------------
    # closest ==> index in right_gdf that corresponds to the closest point
    # dist ==> haversine distance between the nearest neighbors (in radians)
    closest, dist = get_nearest(src_points=left_radians, candidates=right_radians)

    # Return points from right GeoDataFrame that are closest to points in left GeoDataFrame
    closest_points = right.loc[closest]

    # Ensure that the index corresponds the one in left_gdf
    closest_points = closest_points.reset_index(drop=True)

    # Add distance if requested
    if return_dist:
        # Convert to meters from radians
        earth_radius = 6371000  # meters
        closest_points['distance'] = dist * earth_radius

    return closest_points

In [17]:
import geopandas as gpd
from shapely.geometry import Point
import pandas as pd
import numpy as np
from pathlib import Path
import copy

def remap_pop_to_era5(pop_data, era5_data, out_path, year, era5_grid_mapping=None):
    """
    Aggregate gridded population onto the ERA5 grid and save the result as a NetCDF file.

    Every population cell is assigned to its nearest ERA5 grid point (via
    `nearest_neighbor`), the population of all cells assigned to the same
    ERA5 point is summed, and ERA5 points without any population get 0. The
    result is written to `out_path` and returned together with the
    cell-to-grid-point assignment so that the assignment can be reused.

    Parameters:
    - pop_data: xarray.DataArray of population with 'latitude' and 'longitude' coordinates
      (it is flattened with `to_dataframe('pop')`). Negative longitudes are shifted by +360.
    - era5_data: xarray Dataset on the target grid. It must have a 'time' coordinate and a
      't_min' variable; only the first time step is used, and only for its grid.
    - out_path: pathlib.Path or str, path of the output NetCDF file.
    - year: int, value used for the single entry of the 'time' dimension of the output.
    - era5_grid_mapping: pandas DataFrame returned by a previous call, or None (default).
      If it is a DataFrame, its 'latitude'/'longitude' columns are used directly as the ERA5
      coordinates of each population cell and the nearest-neighbor search is skipped, which
      saves a lot of computation when looping over data on the same grid. Any other value
      triggers the nearest-neighbor search.
      NOTE(review): the cached columns are assigned by index, so this presumably requires the
      population grid (row count and order) to be identical to the one that produced the
      mapping -- confirm before reusing a mapping across different inputs.

    Returns:
    - pop_resampled: xarray Dataset with variable 'pop' on dims (time, latitude, longitude).
    - era5_grid_mapping: DataFrame with the ERA5 'latitude'/'longitude' assigned to each
      population cell.
    """
    
    # Flatten the raster to one row per cell and move longitudes below 0 up by 360.
    pop_df = pop_data.to_dataframe('pop').reset_index()
    pop_df['longitude'] = pop_df['longitude'].apply(lambda x: x + 360 if x < 0 else x)

    # One row per (latitude, longitude); pivot_table's default aggregation (mean) collapses
    # any rows sharing the same pair and drops the other columns.
    pop_df = pop_df.pivot_table(index=['latitude', 'longitude'], values='pop').reset_index()
    
    # Table of ERA5 grid points, built from the first time step of 't_min'.
    era5_df = era5_data.sel(time=era5_data.time[0]).to_dataframe().reset_index()
    era5_df = era5_df.pivot_table(index=['latitude', 'longitude'], columns='time', values='t_min').reset_index()
        
    if isinstance(era5_grid_mapping, pd.DataFrame):
        # Reuse a previously computed assignment (aligned on the DataFrame index).
        pop_df['latitude'] = era5_grid_mapping['latitude']
        pop_df['longitude'] = era5_grid_mapping['longitude']
    else:

        # Convert pop_df and era5_df to GeoDataFrames
        pop_gdf = gpd.GeoDataFrame(pop_df, geometry=[Point(xy) for xy in zip(pop_df.longitude, pop_df.latitude)])
        era5_gdf = gpd.GeoDataFrame(era5_df, geometry=[Point(xy) for xy in zip(era5_df.longitude, era5_df.latitude)])

        # Set the CRS for both GeoDataFrames to geographic CRS (EPSG:4326)
        pop_gdf.set_crs(epsg=4326, inplace=True)
        era5_gdf.set_crs(epsg=4326, inplace=True)

        # Replace each cell's own coordinates by those of its nearest ERA5 grid point.
        nearest_neighbor_df = nearest_neighbor(pop_gdf, era5_gdf, return_dist=False)
        pop_df['latitude'] = nearest_neighbor_df['latitude']
        pop_df['longitude'] = nearest_neighbor_df['longitude']

        
    # Keep an independent copy of the assignment so the caller can pass it back in.
    era5_grid_mapping = copy.deepcopy(pop_df[['latitude','longitude']])
    # Sum the population of all cells that were assigned to the same ERA5 grid point.
    remapped_df = pop_df.groupby(["longitude", "latitude"]).sum().reset_index()
    # Left merge keeps every ERA5 grid point; points without population become 0 below.
    merged_df = era5_df.merge(remapped_df, on=['latitude','longitude'], how='left')
    merged_df['time'] = year 
    merged_df.fillna(0, inplace=True)

    # Back to a 2-D latitude x longitude table of population.
    pivoted_df = merged_df.pivot(index='latitude', columns='longitude', values='pop')

    # Convert the pivoted DataFrame to an xarray DataArray
    da = xr.DataArray(pivoted_df, dims=['latitude', 'longitude'])

    # Add a length-one 'time' dimension whose only value is `year`
    da = da.expand_dims(time=[year])

    # Wrap in a Dataset with a single variable named 'pop'
    pop_resampled = da.to_dataset(name='pop')

    pop_resampled.to_netcdf(out_path)

    return pop_resampled, era5_grid_mapping


In [None]:
def process_and_combine_ages(ages, sex, start_year, directory, era5_data, era5_grid_mapping):
    """
    Sum the WorldPop rasters of several age bands for one sex and year and remap them to the ERA5 grid.

    Parameters:
    - ages: list of age-band lower bounds, e.g. [65, 70, 75, 80].
    - sex: "m" or "f", as used in the WorldPop file names.
    - start_year: year of the rasters to process.
    - directory: pathlib.Path of the folder holding the '.tif' files; the result is written to
      its 'era5_compatible' sub-folder as '{sex}_{ages joined by "_"}_{start_year}_era5_compatible.nc'.
    - era5_data: ERA5 dataset providing the target grid.
    - era5_grid_mapping: mapping returned by a previous call, or None to compute it.

    Returns:
    - (remapped_data, era5_grid_mapping) as returned by remap_pop_to_era5.

    Raises:
    - FileNotFoundError: if any requested age band has no matching raster in `directory`.
      Without this check a missing band would silently be left out of the sum.
    """
    available = os.listdir(directory)
    combined_files = []
    missing_ages = []
    for age in ages:
        age_files = [f for f in available if f'_{sex}_{age}_{start_year}' in f and f.endswith('.tif')]
        if not age_files:
            missing_ages.append(age)
        combined_files.extend(age_files)
    if missing_ages or not combined_files:
        raise FileNotFoundError(
            f"No WorldPop raster found in {directory} for sex '{sex}', year {start_year}, age band(s) {missing_ages}"
        )

    # Use sum_files to sum the data from the combined files
    summed_data = sum_files(combined_files, directory)

    # Clean and coarsen the summed data
    cleaned_and_coarsened_data = clean_and_coarsen(summed_data)
    print(cleaned_and_coarsened_data)

    out_path = directory / 'era5_compatible' / f'{sex}_{"_".join(map(str, ages))}_{start_year}_era5_compatible.nc'
    # The output folder is not created anywhere else in this notebook.
    out_path.parent.mkdir(parents=True, exist_ok=True)

    remapped_data, era5_grid_mapping = remap_pop_to_era5(cleaned_and_coarsened_data, era5_data, out_path, start_year, era5_grid_mapping)

    return remapped_data, era5_grid_mapping

# Remap the elderly (65+) and infant (age band 0) rasters of both sexes for every year.
# The range covers 2000-2020 because the later cells load the
# '{sex}_{ages}_{year}_era5_compatible.nc' files for all of these years.
# The grid mapping computed by the first call is passed on to all later calls.
era5_grid_mapping = None
worldpop_dir = DATA_SRC / "worldpop/"
for start_year in np.arange(2000, 2021):
    for sex in ["m", "f"]:
        remapped_data, era5_grid_mapping = process_and_combine_ages([65, 70, 75, 80], sex, start_year, worldpop_dir, era5_data, era5_grid_mapping)
        remapped_data, era5_grid_mapping = process_and_combine_ages([0], sex, start_year, worldpop_dir, era5_data, era5_grid_mapping)


<xarray.DataArray (band: 1, latitude: 9360, longitude: 21600)>
array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]], dtype=float32)
Coordinates:
  * band         (band) int64 1
  * longitude    (longitude) float64 -180.0 -180.0 -180.0 ... 180.0 180.0 180.0
  * latitude     (latitude) float64 83.99 83.97 83.96 ... -71.96 -71.98 -71.99
    spatial_ref  int64 0
Attributes:
    AREA_OR_POINT:       Area
    DataType:            Generic
    RepresentationType:  ATHEMATIC
    _FillValue:          -3.4028235e+38
    scale_factor:        1.0
    add_offset:          0.0


In [None]:
import os
import numpy as np
from pathlib import Path

def process_population_data(start_year, directory, era5_data, era5_grid_mapping):
    """
    Remap the total-population WorldPop raster of one year to the ERA5 grid.

    Parameters:
    - start_year: year of the 'ppp_{year}...tif' raster to process.
    - directory: pathlib.Path of the folder holding the '.tif' files; the result is written to
      'era5_compatible/all_{start_year}_era5_compatible.nc' inside it, which is the location
      the merging cell further down reads from.
    - era5_data: ERA5 dataset providing the target grid.
    - era5_grid_mapping: mapping returned by a previous call, or None to compute it.

    Returns:
    - (pop_resampled, era5_grid_mapping) as returned by remap_pop_to_era5.

    Raises:
    - FileNotFoundError: if no matching raster is found in `directory`.
    """
    # Collect all relevant population data files for the given start year
    pop_files = [f for f in os.listdir(directory) if f'ppp_{start_year}' in f and f.endswith('.tif')]
    if not pop_files:
        raise FileNotFoundError(
            f"No WorldPop total-population raster found in {directory} for year {start_year}"
        )

    # Sum the data from the collected files
    summed_data = sum_files(pop_files, directory)

    # Clean and coarsen the summed data
    cleaned_and_coarsened_data = clean_and_coarsen(summed_data)

    # Write next to the age-group files; a 'resampled/' sub-folder would not be
    # found by the cell that later loads 'era5_compatible/all_{year}_era5_compatible.nc'.
    out_path = directory / 'era5_compatible' / f'all_{start_year}_era5_compatible.nc'
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Remap the population data to ERA5 data
    pop_resampled, era5_grid_mapping = remap_pop_to_era5(cleaned_and_coarsened_data, era5_data, out_path, start_year, era5_grid_mapping)

    return pop_resampled, era5_grid_mapping

# Remap the total population of every year 2000-2020 to the ERA5 grid.
# process_population_data returns (resampled population, grid mapping); the mapping is
# fed back into the next call so the nearest-neighbour search runs only once.
worldpop_dir = DATA_SRC / "worldpop/"
pop_data = {}
era5_grid_mapping = None
for start_year in np.arange(2000, 2021):
    pop_data[start_year], era5_grid_mapping = process_population_data(start_year, worldpop_dir, era5_data, era5_grid_mapping)


# Put worldpop together with ISIMIP data for 1950-2000 and extrapolate to 2023
The next cells process and merge population data spanning 1950 to 2023 into a single dataset. First, they load demographic data for 1950 to 2020 and keep the years 1950 to 1999, as processed for the 2023 edition of the Lancet Countdown and available at https://doi.org/10.5281/zenodo.6011021. Then they incorporate the WorldPop data for 2000 to 2020, standardizing the format and merging it with the earlier data. For the years 2021 to 2023, population figures are extrapolated from the existing data. The final dataset, covering 1950 to 2023, is reordered and saved as a NetCDF file, making it compatible with ERA5 climate data for further analysis.

In [10]:
POP_DATA_SRC

PosixPath('/nfs/n2o/wcr/szelie/lancet/population')

In [11]:
import xarray as xr
import numpy as np
from pathlib import Path
# Function to load population data for a specific demographic group (infants or elderly)
def load_population_data(group, gender, years, suffix):
    """
    Open the remapped WorldPop file of one demographic group and sex for each year.

    Files are read from DATA_SRC / 'worldpop/era5_compatible' and are named
    '{gender}_{group}_{year}_{suffix}'.

    Returns a dict mapping each year to its opened xarray Dataset.
    """
    datasets_by_year = {}
    for year in years:
        file_name = f"{gender}_{group}_{str(year)}_{suffix}"
        datasets_by_year[year] = xr.open_dataset(DATA_SRC / 'worldpop/era5_compatible' / file_name)
    return datasets_by_year

# Function to sum male and female datasets and rename 'time' dimension to 'year'
def combine_and_rename(data_m, data_f):
    """
    Add the female and male datasets year by year and stack them along 'year'.

    For every year key of `data_m`, the matching entries of `data_f` and
    `data_m` are summed and the 'time' dimension of the result is renamed to
    'year'. The yearly results are concatenated along 'year' in the key order
    of `data_m`.
    """
    yearly_totals = []
    for year in data_m:
        both_sexes = data_f[year] + data_m[year]
        yearly_totals.append(both_sexes.rename({'time':'year'}))
    return xr.concat(yearly_totals, dim='year')

def concatenate_and_extrapolate(old_data, new_data):
    """
    Join two datasets along 'year' and append extrapolated years.

    The years to append are read from the module-level variable
    `extrapolated_years`, which must be defined before this function is
    called. Values for those years are obtained with `interp` using
    fill_value="extrapolate". The combined result is loaded into memory.
    """
    observed = xr.concat([old_data, new_data], dim='year')
    projected = observed.interp(year=extrapolated_years, kwargs={"fill_value": "extrapolate"})
    full_series = xr.concat([observed, projected], 'year')
    return full_series.load()


# Load the remapped WorldPop files of infants (age band 0) and elderly
# (age bands 65, 70, 75, 80 summed) for 2000-2020, separately for each sex.
years_range = np.arange(2000, 2021)
infants_m = load_population_data("0", "m", years_range, "era5_compatible.nc")
infants_f = load_population_data("0", "f", years_range, "era5_compatible.nc")
elderly_m = load_population_data("65_70_75_80", "m", years_range, "era5_compatible.nc")
elderly_f = load_population_data("65_70_75_80", "f", years_range, "era5_compatible.nc")

# Add male and female counts and stack the years along a 'year' dimension
infants_worldpop_2000_2020 = combine_and_rename(infants_m, infants_f)
elderly_worldpop_2000_2020 = combine_and_rename(elderly_m, elderly_f)

# Load the earlier population data and keep only 1950-1999
infants_totals_file = POP_DATA_SRC / 'hybrid_2023' / 'infants_1950_2020_hybrid_15_min_era_compat.nc' # files generated for lancet report 2023
population_infants_1950_1999 = xr.open_dataarray(infants_totals_file).sel(year=slice(1950, 1999))

demographics_totals_file = POP_DATA_SRC / 'hybrid_2023' / 'demographics_hybrid_1950_2020_15_min_era_compat.nc'
demographics_totals = xr.open_dataarray(demographics_totals_file)
# NOTE(review): assumes the band with lower bound 65 is open-ended (65 and over) -- confirm against the file
elderly_1950_1999 = demographics_totals.sel(age_band_lower_bound=65).sel(year=slice(1950,1999))

# Combine data for all years (1950-2020) and extrapolate to 2023
MAX_YEAR = 2023
# `extrapolated_years` is read as a global inside concatenate_and_extrapolate,
# so it must be set before the calls below.
extrapolated_years = np.arange(2021, MAX_YEAR + 1)
# The WorldPop variable 'pop' is renamed before concatenating with the 1950-1999 data;
# NOTE(review): presumably the 1950-1999 infants array is named 'infants' -- confirm.
infants_worldpop_2000_2020 = infants_worldpop_2000_2020.rename({'pop':'infants'})
population_infants_worldpop = concatenate_and_extrapolate(population_infants_1950_1999.to_dataset(), infants_worldpop_2000_2020)
population_infants_worldpop = population_infants_worldpop.transpose("year", "latitude", "longitude")


# Give both elderly datasets the same variable name ('elderly') before concatenating
elderly_1950_1999 = elderly_1950_1999.to_dataset().rename({'demographic_totals':'elderly'})
elderly_worldpop_2000_2020 = elderly_worldpop_2000_2020.rename({'pop':'elderly'})

population_elderly_worldpop = concatenate_and_extrapolate(elderly_1950_1999, elderly_worldpop_2000_2020)
population_elderly_worldpop = population_elderly_worldpop.transpose("year", "latitude", "longitude")
# Save the results to NetCDF files in the 'hybrid_2024' folder of POP_DATA_SRC
population_infants_worldpop.to_netcdf(POP_DATA_SRC / 'hybrid_2024' / f'worldpop_infants_1950_{MAX_YEAR}_era5_compatible.nc')
population_elderly_worldpop.to_netcdf(POP_DATA_SRC / 'hybrid_2024' / f'worldpop_elderly_1950_{MAX_YEAR}_era5_compatible.nc')


In [None]:
# Build the total-population series 1950-2023: 1950-1999 from the 2023 hybrid file,
# 2000-2019 from the remapped WorldPop files, 2020-2023 by extrapolation.
DEMOGRAPHICS_TOTALS_FILE = POP_DATA_SRC / 'hybrid_2023' / 'demographics_hybrid_1950_2020_15_min_era_compat.nc'

demographics_totals = xr.open_dataarray(DEMOGRAPHICS_TOTALS_FILE)

all_1950_1999 = demographics_totals.sel(year=slice(1950,1999))

path_worldpop = DATA_SRC / 'worldpop' / 'era5_compatible'
# NOTE(review): this overwrites the global `extrapolated_years` (2021-2023 in the
# infants/elderly cell) with 2020-2023, so concatenate_and_extrapolate would behave
# differently if called after this cell.
extrapolated_years = np.arange(2019+1, 2023+1)

# NOTE(review): only 2000-2019 are loaded here although 2000-2020 are processed above,
# so 2020 is extrapolated rather than read -- confirm this is intended.
worldpop_all = {year: xr.open_dataset(path_worldpop / f"all_{str(year)}_era5_compatible.nc") for year in np.arange(2000,2020)}
# worldpop_all[2000] = worldpop_all[2000].squeeze()

# Replace the 'time' coordinate of each file by a coordinate named 'time' that lives on a
# new 'year' dimension and holds the year, then drop length-one dimensions.
# Files for which this raises ValueError are left untouched (including the squeeze).
# NOTE(review): newer xarray versions deprecate Dataset.drop in favour of drop_vars -- confirm.
for year in np.arange(2000,2020): # for some reason some files seem to have a time dimensions, and others not, this should be checked
    try:
        worldpop_all[year] = worldpop_all[year].drop('time')
        worldpop_all[year] = worldpop_all[year].assign_coords(time=('year', [year]))
    except ValueError:
        continue
    worldpop_all[year] = worldpop_all[year].squeeze()

# Stack the yearly datasets along 'year' and rename the 'time' variable to 'year'
worldpop_all_2000_2020 = xr.concat(worldpop_all.values(), dim='year')
worldpop_all_2000_2020 = worldpop_all_2000_2020.rename_vars({"time":"year"})
# Sum over all age bands and name the variable 'pop' like the WorldPop data;
# NOTE(review): assumes the age bands together cover the whole population -- confirm.
all_1950_1999 = all_1950_1999.sum(dim='age_band_lower_bound').to_dataset().rename({'demographic_totals':'pop'})
worldpop_all_1950_2020 = xr.concat([all_1950_1999, worldpop_all_2000_2020], dim='year')

# Append the extrapolated years and load everything into memory.
# From here on `worldpop_all` is the combined Dataset, no longer the per-year dict.
worldpop_all = (
    xr.concat([worldpop_all_1950_2020, 
               worldpop_all_1950_2020.interp(year=extrapolated_years, kwargs=dict(fill_value="extrapolate"))
              ], 'year').load())
# Overwrite the 'year' coordinate with 1950..2023 (50 + 20 + 4 = 74 entries)
worldpop_all['year'] = np.arange(1950,2024)
worldpop_all = worldpop_all.transpose("year", "latitude", "longitude")
worldpop_all.to_netcdf(POP_DATA_SRC / 'hybrid_2024' / "worldpop_hybrid_all_1950_2023_era5_compatible.nc")

In [24]:
POP_DATA_SRC / 'hybrid_2024' / "worldpop_hybrid_all_1950_2023_era5_compatible.nc"

PosixPath('/nfs/n2o/wcr/szelie/lancet/population/hybrid_2024/worldpop_hybrid_all_1950_2023_era5_compatible.nc')

In [20]:
worldpop_all.transpose('year','latitude','longitude')