In [None]:
# this script was ran on the cluster as the original data is on a 1km grid and takes a lot of space

In [4]:
import rioxarray
import numpy as np
from pathlib import Path
import os
import sys


## Download data

In [5]:
import urllib
import rioxarray
import numpy as np
from pathlib import Path

DATA_SRC = Path('/nfs/n2o/wcr/szelie/').expanduser()

for year in np.arange(2000,2021):
    for s in ["m","f"]:
        for age in [1,0,65,70,75,80]:
            downoald_url =  f"https://data.worldpop.org/GIS/AgeSex_structures/Global_2000_2020/{year}/0_Mosaicked/global_mosaic_1km/global_{s}_{age}_{year}_1km.tif"
            filepath = DATA_SRC / f"worldpop/global_{s}_{age}_{year}_1km.tif"
            if not Path(filepath).is_file():      
                urllib.request.urlretrieve(downoald_url, filepath)
    download_url = f"https://data.worldpop.org/GIS/Population/Global_2000_2020/{year}/0_Mosaicked/ppp_{year}_1km_Aggregated.tif"
    filepath = DATA_SRC / f"worldpop/ppp_{year}_1km_Aggregated.tif"
    if not Path(filepath).is_file():      
        urllib.request.urlretrieve(downoald_url, filepath)

## Process era5 data grid

In [7]:
import xarray as xr
from shapely.geometry import Point
import geopandas as gpd
ERA5_SRC = Path("/nfs/n2o/wcr/szelie/era5/")

# open on year of era5 to put population data on the same grid
era5_data = xr.open_dataset(ERA5_SRC / "era5_0.25deg/daily_temperature_summary/1980_temperature_summary.nc")
era5_data = era5_data.assign_coords(longitude=(((era5_data.longitude + 180) % 360) - 180))
era5_grid = era5_data.isel(time=0).to_dataframe().reset_index()
era5_grid['geometry'] = era5_grid.apply(lambda row: Point(row.longitude, row.latitude), axis=1)
era5_grid = gpd.GeoDataFrame(era5_grid, geometry='geometry')
era5_grid.set_crs('EPSG:4326', inplace=True)
era5_grid = era5_grid[['longitude','latitude', 'geometry']]
era5_grid_3395 = era5_grid.to_crs('EPSG:3395')


In [8]:
from pathlib import Path

def clean_and_coarsen(data_array):
    """
    Cleans the data to get read of negative, infinite and nan values, as well as coarsens by suming to reduce the size of the files

    Parameters:
    data_array (xarray.DataArray): The data array to be cleaned.

    Returns:
    xarray.DataArray: The cleaned data array.
    """
    data_array = data_array.where(data_array > 0, 0)
    data_array = data_array.where(data_array < 3.1e8, 0)
    data_array = data_array.where(data_array != -np.inf, 0)
    data_array = data_array.where(data_array != np.inf, 0)
    data_array = data_array.fillna(0)
    data_array = data_array.rename({'y': 'latitude', 'x': 'longitude'})
    print(data_array.latitude)
    data_array = data_array.coarsen(latitude=10).sum()
    data_array = data_array.coarsen(longitude=20).sum()
    print(data_array.latitude)

    data_array = data_array.fillna(0)

    return data_array

import os
import rioxarray
import numpy as np

def sum_files(files, directory=""): 
    """
    Sums data across a list of files. The summing is useful for the age over 65 and under 5, 
    made out of several age categories

    Parameters:
    files (list): A list of file paths to process.
    directory (str, optional): The directory path if it's not included in the file paths.

    Returns:
    xarray.DataArray: An array containing the summed data across the specified files.
    """
    total_sum = None
    for file in files:
        data_array = rioxarray.open_rasterio(os.path.join(directory, file))

        if total_sum is None:
            total_sum = data_array
        else:
            total_sum += data_array

    return total_sum

def process_and_combine_ages(ages, sex, start_year, directory, era5_grid, mapping=None):
    combined_files = []
    for age in ages:
        age_files = [f for f in os.listdir(directory) if f'_{sex}_{age}_{start_year}' in f and f.endswith('.tif')]
        combined_files.extend(age_files)
    # Use sum_files to sum the data from the combined files
    summed_data = sum_files(combined_files, directory)

    # Clean and coarsen the summed data
    cleaned_and_coarsened_data = clean_and_coarsen(summed_data)
    pop = cleaned_and_coarsened_data.to_dataframe('pop').reset_index()

    pop['geometry'] = pop.apply(lambda row: Point(row.longitude, row.latitude), axis=1)
    pop = gpd.GeoDataFrame(pop, geometry='geometry')
    pop.set_crs('EPSG:4326', inplace=True)
    pop = pop.to_crs('EPSG:3395')

    pop = pop[pop['pop']>0]
    pop_era5_grid = gpd.sjoin_nearest(pop, era5_grid, how='inner')

    pop_regrided = pop_era5_grid.groupby(['longitude_right','latitude_right']).sum().reset_index()[['longitude_right','latitude_right','pop']]


    return pop_regrided

worldpop_dir = DATA_SRC / "worldpop/"
ages = ['0','1'] # ages to sum
for start_year in np.arange(2005, 2006):
    for sex in ["f", "m"]: # sum female and males
        pop_regrided = process_and_combine_ages([0, 1], sex, start_year, worldpop_dir, era5_grid_3395)
        pop_regrided = pop_regrided.rename(columns={"latitude_right":"latitude","longitude_right":"longitude"})
        pop_regrided = era5_grid.merge(pop_regrided[['longitude','latitude', 'pop']], how='left')
        pivoted_df = pop_regrided.pivot(index='latitude', columns='longitude', values='pop')

        # Convert the pivoted DataFrame to an xarray DataArray
        da = xr.DataArray(pivoted_df, dims=['latitude', 'longitude'])

        # Optionally, add the time coordinate (if you have multiple time points, this step will differ)
        da = da.expand_dims(time=[start_year])

        # Convert to Dataset if you want to add more variables or simply prefer a Dataset structure
        pop_resampled = da.to_dataset(name='pop')
        pop_resampled['longitude'] = xr.where(pop_resampled['longitude'] < 0, pop_resampled['longitude'] + 360, pop_resampled['longitude'])
        pop_resampled = pop_resampled.sortby('longitude')
        out_path = Path('/nfs/n2o/wcr/szelie/worldpop/') / 'era5_compatible' / f'{sex}_{"_".join(map(str, ages))}_{start_year}_era5_compatible.nc'

        pop_resampled.to_netcdf(out_path)

<xarray.DataArray 'latitude' (latitude: 18720)>
array([ 83.995417,  83.987083,  83.97875 , ..., -71.979583, -71.987916,
       -71.99625 ])
Coordinates:
  * latitude     (latitude) float64 84.0 83.99 83.98 ... -71.98 -71.99 -72.0
    spatial_ref  int64 0
<xarray.DataArray 'latitude' (latitude: 1872)>
array([ 83.957917,  83.874583,  83.79125 , ..., -71.792083, -71.875416,
       -71.95875 ])
Coordinates:
  * latitude     (latitude) float64 83.96 83.87 83.79 ... -71.79 -71.88 -71.96
    spatial_ref  int64 0
290454050.0
<xarray.DataArray 'latitude' (latitude: 18720)>
array([ 83.995417,  83.987083,  83.97875 , ..., -71.979583, -71.987916,
       -71.99625 ])
Coordinates:
  * latitude     (latitude) float64 84.0 83.99 83.98 ... -71.98 -71.99 -72.0
    spatial_ref  int64 0
<xarray.DataArray 'latitude' (latitude: 1872)>
array([ 83.957917,  83.874583,  83.79125 , ..., -71.792083, -71.875416,
       -71.95875 ])
Coordinates:
  * latitude     (latitude) float64 83.96 83.87 83.79 ... -71.79 -71.8

In [3]:
# same for over 65s

In [11]:
from pathlib import Path

worldpop_dir = DATA_SRC / "worldpop/"
ages = [65, 70, 75, 80]
for start_year in np.arange(2014, 2020):
    for sex in ["f","m"]:
        #remapped_data, era5_grid_mapping = process_and_combine_ages([65, 70, 75, 80], sex, start_year, worldpop_dir, era5_data, era5_grid_mapping)
        pop_regrided = process_and_combine_ages(ages, sex, start_year, worldpop_dir, era5_grid_3395)
        pop_regrided = pop_regrided.rename(columns={"latitude_right":"latitude","longitude_right":"longitude"})
        pop_regrided = era5_grid.merge(pop_regrided[['longitude','latitude', 'pop']], how='left')
        pivoted_df = pop_regrided.pivot(index='latitude', columns='longitude', values='pop')

        # Convert the pivoted DataFrame to an xarray DataArray
        da = xr.DataArray(pivoted_df, dims=['latitude', 'longitude'])

        # Optionally, add the time coordinate (if you have multiple time points, this step will differ)
        da = da.expand_dims(time=[start_year])

        # Convert to Dataset if you want to add more variables or simply prefer a Dataset structure
        pop_resampled = da.to_dataset(name='pop')
        pop_resampled['longitude'] = xr.where(pop_resampled['longitude'] < 0, pop_resampled['longitude'] + 360, pop_resampled['longitude'])
        pop_resampled = pop_resampled.sortby('longitude')
        out_path = worldpop_dir / 'era5_compatible' / f'{sex}_{"_".join(map(str, ages))}_{start_year}_era5_compatible.nc'
        

<xarray.DataArray 'latitude' (latitude: 18720)>
array([ 83.995417,  83.987083,  83.97875 , ..., -71.979583, -71.987916,
       -71.99625 ])
Coordinates:
  * latitude     (latitude) float64 84.0 83.99 83.98 ... -71.98 -71.99 -72.0
    spatial_ref  int64 0
<xarray.DataArray 'latitude' (latitude: 1872)>
array([ 83.957917,  83.874583,  83.79125 , ..., -71.792083, -71.875416,
       -71.95875 ])
Coordinates:
  * latitude     (latitude) float64 83.96 83.87 83.79 ... -71.79 -71.88 -71.96
    spatial_ref  int64 0
326521020.0
326521000.0
<xarray.DataArray 'latitude' (latitude: 18720)>
array([ 83.995417,  83.987083,  83.97875 , ..., -71.979583, -71.987916,
       -71.99625 ])
Coordinates:
  * latitude     (latitude) float64 84.0 83.99 83.98 ... -71.98 -71.99 -72.0
    spatial_ref  int64 0
<xarray.DataArray 'latitude' (latitude: 1872)>
array([ 83.957917,  83.874583,  83.79125 , ..., -71.792083, -71.875416,
       -71.95875 ])
Coordinates:
  * latitude     (latitude) float64 83.96 83.87 83.79 ... 

In [4]:
# For all ages

In [12]:
from pathlib import Path

def process_and_combine(start_year, directory, era5_grid):
    # Use sum_files to sum the data from the combined files
    summed_data =  rioxarray.open_rasterio(directory / f"ppp_{start_year}_1km_Aggregated.tif")

    # Clean and coarsen the summed data
    cleaned_and_coarsened_data = clean_and_coarsen(summed_data)
    pop = cleaned_and_coarsened_data.to_dataframe('pop').reset_index()
    print(pop['pop'].sum())

    pop['geometry'] = pop.apply(lambda row: Point(row.longitude, row.latitude), axis=1)
    pop = gpd.GeoDataFrame(pop, geometry='geometry')
    pop.set_crs('EPSG:4326', inplace=True)
    pop = pop.to_crs('EPSG:3395')

    pop = pop[pop['pop']>0]
    pop_era5_grid = gpd.sjoin_nearest(pop, era5_grid, how='left')
    print(pop_era5_grid['pop'].sum())

    pop_regrided = pop_era5_grid.groupby(['longitude_right','latitude_right']).sum().reset_index()[['longitude_right','latitude_right','pop']]
    #pop_regrided['geometry'] = pop_regrided.apply(lambda row: Point(row.longitude_right, row.latitude_right), axis=1)
    #pop_regrided = gpd.GeoDataFrame(pop_regrided, geometry='geometry')

    return pop_regrided

area = era5_grid_3395.geometry.area

worldpop_dir = DATA_SRC / "worldpop/"
for start_year in np.arange(2000, 2020):
    #remapped_data, era5_grid_mapping = process_and_combine_ages([65, 70, 75, 80], sex, start_year, worldpop_dir, era5_data, era5_grid_mapping)
    pop_regrided = process_and_combine(start_year, worldpop_dir, era5_grid_3395)
    
    pop_regrided = pop_regrided.rename(columns={"latitude_right":"latitude","longitude_right":"longitude"})
    pop_regrided = era5_grid.merge(pop_regrided[['longitude','latitude', 'pop']], how='left')
    pivoted_df = pop_regrided.pivot(index='latitude', columns='longitude', values='pop')

    # Convert the pivoted DataFrame to an xarray DataArray
    da = xr.DataArray(pivoted_df, dims=['latitude', 'longitude'])

    # Optionally, add the time coordinate (if you have multiple time points, this step will differ)
    da = da.expand_dims(time=[start_year])

    # Convert to Dataset if you want to add more variables or simply prefer a Dataset structure
    pop_resampled = da.to_dataset(name='pop')
    pop_resampled['longitude'] = xr.where(pop_resampled['longitude'] < 0, pop_resampled['longitude'] + 360, pop_resampled['longitude'])
    pop_resampled = pop_resampled.sortby('longitude')
    out_path = worldpop_dir / 'era5_compatible' / f'all_{start_year}_era5_compatible.nc'

   
    pop_resampled.to_netcdf(out_path)

<xarray.DataArray 'latitude' (latitude: 18720)>
array([ 84.00375 ,  83.995417,  83.987083, ..., -71.97125 , -71.979583,
       -71.987916])
Coordinates:
  * latitude     (latitude) float64 84.0 84.0 83.99 ... -71.97 -71.98 -71.99
    spatial_ref  int64 0
<xarray.DataArray 'latitude' (latitude: 1872)>
array([ 83.96625 ,  83.882917,  83.799583, ..., -71.78375 , -71.867083,
       -71.950416])
Coordinates:
  * latitude     (latitude) float64 83.97 83.88 83.8 ... -71.78 -71.87 -71.95
    spatial_ref  int64 0
6026136600.0
6026137600.0
<xarray.DataArray 'latitude' (latitude: 18720)>
array([ 83.995417,  83.987083,  83.97875 , ..., -71.979583, -71.987916,
       -71.99625 ])
Coordinates:
  * latitude     (latitude) float64 84.0 83.99 83.98 ... -71.98 -71.99 -72.0
    spatial_ref  int64 0
<xarray.DataArray 'latitude' (latitude: 1872)>
array([ 83.957917,  83.874583,  83.79125 , ..., -71.792083, -71.875416,
       -71.95875 ])
Coordinates:
  * latitude     (latitude) float64 83.96 83.87 83.79 ...