# Administrative Descriptive Stats

After appending travel time information to each populated pixel within our AOI, we can prepare a number of descriptive stats within administrative units. Given the quantity of data in question, these are best prepared with Dask DataFrames. Due to the large number of services that can be investigated and the large number of pixels involved, we export each season-mode-service combination separately; these exports are merged together in the next and final notebook.

In [None]:
import os
os.environ['USE_PYGEOS'] = '0'

import dask
import coiled
from dask.distributed import Client, LocalCluster, Lock
from dask.utils import SerializableLock
import dask.dataframe as dd

import pandas as pd
import geopandas as gpd
import spatialpandas as sp
import dask_geopandas as dg

import rioxarray as rx
import xarray as xr

import re

from dask_control import *

from spatialpandas.geometry import (
    PointArray, MultiPointArray, LineArray,
    MultiLineArray, PolygonArray, MultiPolygonArray
)

import numpy as np
from datetime import date

import json

## Setup

In [None]:
data_root = 'D:\\github_test\\'

# Project input parameters (which will eventually be passed from the UI) are
# stored in a single JSON file under the data root.
data_file = data_root + 'project_data.json'

# Load the parameters and file locations. The `with` block closes the file on
# exit, so no explicit close() is needed.
with open(data_file, 'rb') as f:
    data_loaded = json.load(f)

# Settings from the project setup file that are relevant to this notebook.
local_population_folder = data_loaded['local_population_folder']
local_lc_folder = data_loaded['local_lc_folder']
access_dir = data_loaded['access_dir']
dest_crs = data_loaded['dest_crs']
dest_crs_id = data_loaded['dest_crs_id']

local_boundaries_folder = data_loaded['local_boundaries_folder']
level = data_loaded['level']
max_level = data_loaded['max_level']
# These two keys are only read when the level is not 'custom'.
if level != 'custom':
    shapefile_adm_field = data_loaded['shapefile_adm_field']
    adm_name = data_loaded['adm_name']

# One season per ".tif" file in local_lc_folder: the season name is the file
# name without its extension. Names are taken from the bare file names (rather
# than by string-replacing the folder out of a joined path) so the result does
# not depend on whether local_lc_folder ends with a path separator, and only
# the trailing extension is removed. Sorting happens on the full file names,
# which keeps the same order as sorting the joined paths.
seasons = [
    os.path.splitext(file_name)[0]
    for file_name in sorted(os.listdir(local_lc_folder))
    if file_name.endswith(".tif")
]

**Initiate Dask Client**

In [None]:
# Obtain a Dask client through the project's `get_dask_client` helper,
# requesting a 'local' cluster with 4 workers (as processes) and 8 threads each.
client = get_dask_client(
    cluster_type='local',
    n_workers=4,
    processes=True,
    threads_per_worker=8,
)
# Last expression of the cell, so the notebook displays the client.
client

## Loading data

Pixel level data

In [None]:
# Travel modes analysed for every season.
mode_list = ['walk', 'multi']

# Every "<season>_<mode>" label, with seasons as the outer loop and modes as
# the inner loop (so all modes of one season are adjacent).
ssn_mode_list = [ssn + "_" + mode for ssn in seasons for mode in mode_list]

In [None]:
def long_per_indicator(df,indicator,adm_col):
    """Summarise population per category of `indicator` within each admin unit.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a 'POP' column plus the `adm_col` and `indicator` columns.
    indicator : str
        Name of the column whose distinct values become the categories; the
        same string is also written into the output's 'indicator' column.
    adm_col : str
        Name of the column identifying the administrative unit.

    Returns
    -------
    pandas.DataFrame
        Long-format table with one row per (admin unit, category) and the
        columns `adm_col`, 'indicator', 'travel_time_range', 'pop_pct'
        (share of the unit's summed 'POP') and 'pop_total' (summed 'POP').
    """
    # Wide table: one row per admin unit, one column per category, cells = summed POP.
    totals_wide = df.pivot_table(values='POP', index=adm_col, columns=indicator,
                                 aggfunc='sum', fill_value=0)

    # Share of each admin unit's population per category (row / row sum).
    # Computed before the text label column is attached to the totals.
    unit_sums = np.nansum(totals_wide, axis=1)
    shares_wide = totals_wide.div(unit_sums, axis=0)

    def _as_long(wide, value_label):
        # Attach the indicator label, flatten the index, then stack the
        # category columns into rows and give the columns their final names.
        wide['indicator'] = indicator
        flat = wide.reset_index()
        stacked = flat.melt(id_vars=[adm_col, 'indicator'])
        return stacked.rename(columns={indicator: 'travel_time_range', 'value': value_label})

    shares_long = _as_long(shares_wide, 'pop_pct')
    totals_long = _as_long(totals_wide, 'pop_total')

    # Both long tables share the same row order, so the totals column can be
    # placed side by side with the shares.
    return pd.concat([shares_long, totals_long[['pop_total']]], axis=1, ignore_index=False)

In [None]:
# Iterate through all the season-mode combinations and, for each service's
# pop-weighted access-time column, write two tables per column:
#   1. the access time summed per admin unit, with a min-max index, and
#   2. a long table of population totals/shares per travel-time bin.

# Travel-time bin edges and their labels (the labels imply the edges are in
# hours). These do not change between iterations, so they are defined once.
tt_bins = [0, 0.5, 1, 2, 4, 8, 16, 10000]
tt_bin_labels = ["0 - 30 minutes", "31 - 60 minutes", "1 - 2 hours", "2 - 4 hours", "4 - 8 hours", "8 - 16 hours", "16+ hours"]

# np.digitize numbers the bins between consecutive edges from 1; map those
# numbers to the labels. Numbers outside 1..7 are left as they are.
tt_rename_dct = dict(enumerate(tt_bin_labels, start=1))

# Suffixes identifying the aggregated access-time columns.
avg_suffixes = ('_avg_adm2', '_avg_adm3')

# Identifier and weight columns carried alongside each access-time column.
if max_level == 'adm2':
    id_cols = ["ADM2_EN","Adm2_Code","wt_adm_2"]
else:
    id_cols = ["ADM3_EN","ADM2_EN","Adm3_Code","Adm2_Code","wt_adm_3","wt_adm_2"]

for ssn_mode in ssn_mode_list:
    print(ssn_mode)
    POINTS_URL = access_dir + 'pixels' + '\\' + f"{ssn_mode}_df_pixels_final-*.csv"

    df_pixels_source = dd.read_csv(POINTS_URL,header=0,
                                            na_values = ' ',
                                            blocksize=100e6)

    # Select the access-time columns by name. Iterating over the matching
    # names directly (instead of over every position between the first and
    # last match) means any unrelated column sitting between them is not
    # processed by mistake, and a file without matches is skipped cleanly.
    access_cols = [col for col in df_pixels_source.columns if col.endswith(avg_suffixes)]
    if not access_cols:
        print(f"No '_avg_adm2'/'_avg_adm3' columns found for {ssn_mode}; skipping")
        continue

    for access_col in access_cols:

        # Keep only the identifier/weight columns plus the current access column.
        df_pixels = df_pixels_source[id_cols + [access_col]]
        df_pixels = df_pixels.persist()

        # Downcast float64 columns to float32.
        float64_cols = list(df_pixels.select_dtypes(include='float64'))
        df_pixels[float64_cols] = df_pixels[float64_cols].astype('float32')

        # The last character of the column name ('2' or '3') gives the admin
        # level the column refers to.
        adm_level = access_col[-1]

        if adm_level == '2':
            adm_code = 'Adm2_Code'
            adm_name_col = 'ADM2_EN'
            adm_frame_cols = ['ADM2_EN', 'Adm2_Code']
        else:
            adm_code = 'Adm3_Code'
            adm_name_col = 'ADM3_EN'
            adm_frame_cols = ['ADM3_EN','ADM2_EN','Adm3_Code','Adm2_Code']

        # Lookup table of admin units (one row per unit name, sorted by name).
        adm_frame = df_pixels[adm_frame_cols]
        adm_frame = adm_frame.drop_duplicates(subset=adm_name_col)
        adm_frame = adm_frame.sort_values(adm_name_col)

        #######################################################################
        #AGGREGATION

        adm_aggr = df_pixels.groupby([adm_code])[access_col].sum().reset_index()

        #INDEXING
        # Min-max index across admin units, inverted so that the unit with the
        # smallest summed value scores 1 and the largest scores 0.
        # NOTE(review): if every unit has the same value the denominator is 0 — confirm this cannot occur.

        min_acc = np.array(np.min(adm_aggr[access_col],axis=0)) # column-wise min
        max_acc = np.array(np.max(adm_aggr[access_col],axis=0)) # column-wise max

        adm_aggr[access_col+'_idx'] = (max_acc - adm_aggr[access_col]) / (max_acc - min_acc)

        #Finalise

        adm_aggr_final = adm_frame.merge(adm_aggr,on=adm_code)

        adm_aggr_final.to_csv(access_dir + 'tables' + '\\separate\\' + f"{access_col}_final.csv", single_file=True, index=False)

        #######################################################################
        #BINNING
        # Here we determine some more in-depth stats: the % of each admin
        # unit's population within travel-time bins to the specific service
        # for the season-mode in question.

        # Drop the 9-character suffix ('_avg_adm2' / '_avg_adm3') to get the
        # name of the per-pixel travel-time column.
        access_col_trim = access_col[:-9]

        # The same three columns are needed whatever the maximum admin level.
        dft = df_pixels_source[["POP", adm_code, access_col_trim]].compute()

        # Replace each travel time by its bin number, then by the bin label.
        dft[access_col_trim] = np.digitize(dft[access_col_trim], bins=tt_bins)
        dft[access_col_trim] = dft[access_col_trim].replace(tt_rename_dct)

        # Pivot by administrative unit, calculate the pct of total population
        # per travel-time bin, and reshape the data into a long format.
        long_acc_indicators_adm = long_per_indicator(dft,access_col_trim,adm_code)

        # convert tt ranges to categorical and order appropriately
        long_acc_indicators_adm['travel_time_range'] = long_acc_indicators_adm.travel_time_range.astype('category').cat.set_categories(tt_bin_labels)

        # order as desired
        long_acc_indicators_adm = long_acc_indicators_adm.sort_values([adm_code,'indicator','travel_time_range']).reset_index(drop=True)

        # Cumulative sums per admin unit and indicator, following the bin
        # order established by the sort above.
        long_acc_indicators_adm['pop_pct_csum'] = long_acc_indicators_adm.groupby([adm_code,'indicator'])['pop_pct'].cumsum()
        long_acc_indicators_adm['pop_total_csum'] = long_acc_indicators_adm.groupby([adm_code,'indicator'])['pop_total'].cumsum()

        float64_cols = list(long_acc_indicators_adm.select_dtypes(include='float64'))
        long_acc_indicators_adm[float64_cols] = long_acc_indicators_adm[float64_cols].astype('float32')

        # Export final long data, with the admin-unit names attached.
        adm_long_final = adm_frame.merge(long_acc_indicators_adm,on=adm_code)

        adm_long_final.to_csv(access_dir + 'tables' + '\\separate\\' + f"{access_col}_acc_indicators_long.csv", single_file=True, index = False)

client.close()