# Export population raster points with their administrative population weights, and administrative units (using spatial joins)

This notebook exports each season-mode-service raster's pixel values at their corresponding populated points, and calculates their population-weighted access times for aggregation in future steps.

In [None]:
import os
os.environ['USE_PYGEOS'] = '0'

import dask
import coiled
from dask.distributed import Client, LocalCluster, Lock
from dask.utils import SerializableLock
import dask.dataframe as dd

import pandas as pd
import geopandas as gpd
import spatialpandas as sp
import dask_geopandas as dg

import rioxarray as rx
import xarray as xr

import re

from dask_control import *
from raster_vals_to_pts import *

import numpy as np
dask.config.set({"temporary-directory": "C:/Users/andri"})

import json

In [None]:
# Root folder that holds the project setup file.
data_root = 'D:\\github_test\\'

# Project setup file holding the parameters and file locations used below.
data_file = data_root + 'project_data.json'

# The `with` block closes the file on exit, so no explicit close() is needed.
with open(data_file, 'rb') as f:
    data_loaded = json.load(f)

# Settings from the project setup file that are relevant to this notebook.
local_population_folder = data_loaded['local_population_folder']
local_lc_folder = data_loaded['local_lc_folder']
access_dir = data_loaded['access_dir']
dest_crs = data_loaded['dest_crs']
dest_crs_id = data_loaded['dest_crs_id']

local_boundaries_folder = data_loaded['local_boundaries_folder']
level = data_loaded['level']
max_level = data_loaded['max_level']
# These two settings are only read (and therefore only defined) for
# non-custom levels.
if level != 'custom':
    shapefile_adm_field = data_loaded['shapefile_adm_field']
    adm_name = data_loaded['adm_name']

# Season names are the ".tif" file names in the land-cover folder with the
# extension removed. They are taken from the bare file names (instead of
# joining the folder and stripping it again with str.replace) so the result
# does not depend on whether `local_lc_folder` ends with a path separator.
# Sorting happens on the full file names to keep the original ordering.
seasons = [
    os.path.splitext(file)[0]
    for file in sorted(os.listdir(local_lc_folder))
    if file.endswith(".tif")
]

In [None]:
# Build one "<season>_<mode>" label per season/mode pair; seasons vary
# slowest, so all modes of a season stay next to each other.
mode_list = ['walk', 'multi']
ssn_mode_list = [f"{ssn}_{mode}" for ssn in seasons for mode in mode_list]

Instantiate Dask

In [None]:
# Ask the project's Dask helper for a client with the settings below, then
# echo it so the notebook displays the client.
client = get_dask_client(
    cluster_type='local',
    n_workers=4,
    processes=True,
    threads_per_worker=8,
)
client

Read in CSV

In [None]:
# Display the full path of the tabular population file.
f"{local_population_folder}population_tabular_final.csv"

In [None]:
# Load Points: describe which CSV columns to read.
# col_range holds the column positions, col_names the names given to them,
# and col_types the dtype for each name. Start with what both cases share.
col_range = [0, 3, 4, 6]
col_names = ["POP", "x", "y", "ADM2_EN"]
col_types = {"POP": float, "x": float, "y": float, "ADM2_EN": str, "Adm2_Code": str}

if max_level == 'adm2':
    col_range += [8]
    col_names += ["Adm2_Code"]
else:
    # Any other max_level also reads the adm3 name and code columns.
    col_range += [7, 9, 10]
    col_names += ["ADM3_EN", "Adm2_Code", "Adm3_Code"]
    col_types.update({"ADM3_EN": str, "Adm3_Code": str})

In [None]:
# Read the header-less population table as a Dask dataframe, keeping only
# the selected columns and treating the listed tokens as missing values.
population_csv = local_population_folder + 'population_tabular_final.csv'
read_options = dict(
    skip_blank_lines=True,
    usecols=col_range,
    header=None,
    names=col_names,
    dtype=col_types,
    na_values=['', ' ', 'N/A', '#N/A', 'NA', '#NA'],
)
points = dd.read_csv(population_csv, **read_options)


Process rasters

In [None]:
# First we prepare the 'base' dataframe with all points' coordinates and
# their corresponding admin data.
points_xr = xr.Dataset.from_dataframe(points[["x", "y"]])

# Anything other than 'adm2' also keeps the adm3 name and code columns.
if max_level == 'adm2':
    base_columns = ['POP', 'x', 'y', 'ADM2_EN', 'Adm2_Code']
else:
    base_columns = ['POP', 'x', 'y', 'ADM3_EN', 'Adm3_Code', 'ADM2_EN', 'Adm2_Code']

df_pixels_source = (
    points[base_columns]
    .copy()
    .repartition(npartitions=1)
    .reset_index(drop=True)
)

# One (code column, population column, weight column) entry per admin level
# that gets a weight; adm3 is only handled when max_level is exactly 'adm3'.
adm_levels = [('Adm2_Code', 'adm2_pop', 'wt_adm_2')]
if max_level == 'adm3':
    adm_levels.append(('Adm3_Code', 'adm3_pop', 'wt_adm_3'))

# Total population per admin unit, for every handled level.
adm_pops = {
    pop_col: df_pixels_source.groupby(code_col)['POP'].sum().to_frame(pop_col)
    for code_col, pop_col, _ in adm_levels
}

# Attach each unit's total population to the pixels that belong to it.
for code_col, pop_col, _ in adm_levels:
    df_pixels_source = dd.merge(
        df_pixels_source,
        adm_pops[pop_col],
        how='left',
        left_on=code_col,
        right_index=True,
    )

df_pixels_source = df_pixels_source.persist()

# Population weight of each pixel within its enclosing admin area,
# e.g. a pixel with 10 people in an admin unit of 100 people gets weight 0.1.
for _, pop_col, weight_col in adm_levels:
    df_pixels_source[weight_col] = df_pixels_source['POP'] / df_pixels_source[pop_col]

# The raw points and the per-unit totals are not needed past this point.
del points, adm_pops

# The helper population columns were only needed to compute the weights.
for _, pop_col, _ in adm_levels:
    df_pixels_source = df_pixels_source.drop([pop_col], axis=1)

In [None]:
# Iterate through all the season-mode combinations and export pixel values.
# For each season-mode, pixel file(s) are created containing each pixel's
# service access time and its pop-weighted service access time, for all
# services found for that season-mode.

for ssn_mode in ssn_mode_list:
    print(ssn_mode)

    # Map raster name (file name without its extension) -> raster path for
    # every ".tif" in access_dir that belongs to this season-mode.
    # os.path.splitext is used for the name so that a "tif" substring inside
    # the file name cannot truncate it.
    rasters = {
        os.path.splitext(file)[0]: os.path.join(access_dir, file)
        for file in os.listdir(access_dir)
        if file.endswith(".tif") and file.startswith(f'{ssn_mode}_')
    }

    # Without rasters there is nothing to sample or export; say so and move
    # on instead of failing further down on undefined names.
    if not rasters:
        print(f"No access rasters found for {ssn_mode} in {access_dir}; skipping")
        continue

    # NOTE(review): xr.open_rasterio is deprecated in newer xarray releases in
    # favour of rioxarray.open_rasterio -- confirm against the pinned xarray
    # version before upgrading (the nodata handling below relies on the
    # `nodatavals` attribute).
    loaded_rasters = {}
    for key, raster_path in rasters.items():
        print(f"Persist raster: {key} at {raster_path}")
        loaded_rasters[key] = xr.open_rasterio(raster_path,
                                               chunks=("auto", "auto", "auto"),
                                               parse_coordinates=True)

    # Keep band 1 of every raster and mask cells equal to the raster's first
    # nodata value.
    rasters_ds = (
        xr.Dataset(loaded_rasters)
        .sel(band=1)
        .map(lambda arr: arr.where(arr != arr.nodatavals[0]))
    )

    # For each access time raster, sample the raster at the nearest cell to
    # every point and collect the values as one column per raster.
    pixel_values = None
    for rkey in rasters:
        print(rkey)
        sampled = rasters_ds[rkey].sel(x=points_xr.x, y=points_xr.y, method="nearest")
        sampled = sampled.reset_coords(drop=True).to_dataframe(name=rkey).reset_index()
        sampled = dd.from_pandas(sampled, npartitions=1).copy().reset_index()

        if pixel_values is None:
            pixel_values = sampled
        else:
            pixel_values = dd.merge(pixel_values, sampled)

    # Start this season-mode's dataframe from the shared base dataframe.
    # NOTE(review): this merge passes no key columns; it presumably pairs the
    # rows of both frames by their index -- confirm against the dask merge
    # documentation. The two helper columns produced by the reset_index calls
    # above are removed afterwards.
    df_pixels = dd.merge(df_pixels_source, pixel_values)
    df_pixels = df_pixels.drop(['level_0', 'index'], axis=1)
    # The former `df_pixels.compute()` here was removed: its result was
    # discarded, so it only materialised the whole frame once for nothing.

    # Add one pop-weighted access time column per raster and admin level:
    # access time multiplied by the pixel's population weight.
    for rkey in rasters:
        df_pixels[f"{rkey}_avg_adm2"] = df_pixels[rkey] * df_pixels['wt_adm_2']
        if max_level == 'adm3':
            df_pixels[f"{rkey}_avg_adm3"] = df_pixels[rkey] * df_pixels['wt_adm_3']

    # Downcast every float64 column to float32.
    float64_cols = df_pixels.select_dtypes(include='float64').columns
    df_pixels = df_pixels.map_partitions(
        lambda part: part.astype({col: 'float32' for col in float64_cols})
    )

    # Clean some memory.
    del loaded_rasters, pixel_values, rasters_ds

    # Prepare the dataframe for export: rows with any missing value are
    # dropped, then the data is split into partitions of about 100MB.
    df_pixels = df_pixels.dropna()
    df_pixels = df_pixels.repartition(partition_size="100MB")

    # Export the dataframe to CSV, one file per partition. Keep the * in the
    # file name: dask substitutes the partition number for it.
    df_pixels_out = os.path.join(access_dir, 'pixels', f"{ssn_mode}_df_pixels_final-*.csv")
    df_pixels.to_csv(df_pixels_out, single_file=False)

    del df_pixels

client.close()