## Generate Results

Step 3 gives us raw OD matrices but these are not immediately useful for standard visualization tools in R, QGIS, Python, etc. In this notebook, we will import and manipulate the OD matrix - which should have been calculated separately from Steps 1 - 3.  The manipulated results are chart and map friendly, aggregated at various levels.


This notebook focuses on Cox's Bazar, Bangladesh as a case study. CXB is relatively small with only 8 Adm2 and 75 Adm3 units -- the analysis here could be slower for much larger areas.

In [1]:
import pandas as pd
import os, sys

import GOSTnets as gn
import importlib
importlib.reload(gn)
import geopandas as gpd
import rasterio
from rasterio import features
from shapely.wkt import loads
from shapely import wkt
import numpy as np
import pandas as pd

import re

import palettable
from functools import reduce
from pandas.api.types import CategoricalDtype
from plotnine import *

from pprint import pprint

from mizani.formatters import percent_format


Log performance / CPU usage

In [2]:

#!/usr/bin/env python
import psutil
# gives a single float value
psutil.cpu_percent()
# gives an object with many fields
psutil.virtual_memory()
# you can convert that object to a dictionary 
dict(psutil.virtual_memory()._asdict())


{'total': 34342551552,
 'available': 27818823680,
 'percent': 19.0,
 'used': 6523727872,
 'free': 27818823680}

Set path locations

In [2]:
basepth = r'..'

input_pth = r'inputs'
interm_pth = r'intermediate'
fin_pth = 'final'
net_pth = 'results/200521' # change folder name to date range of last output

chart_pth = r'charts'
table_pth = r'tables'

adm_pth = r'../../../GEO/Boundaries'
geo_pth = r'../../../GEO'


Load in tabular data

In [3]:
hrsl_adm = pd.read_csv(os.path.join(geo_pth,'Population/CXB/hrsl_pts_admins.csv'))
demog_upz = pd.read_csv(os.path.join(input_pth,'demog2011_upz.csv'))
wt_demog_upz = pd.read_csv(os.path.join(input_pth,'wt_demog2011_upz.csv'))
econ_union = pd.read_csv(os.path.join(input_pth,'econ_union_clean.csv'))
demog_union = pd.read_excel(os.path.join(input_pth,'demog2011_union.xls'))

Load in admin data

In [4]:
# load admin spatial data

adm2 = gpd.read_file(os.path.join(geo_pth,'Boundaries/adm2/bgd_admbnda_adm2_bbs_20180410.shp'))
adm2.crs = 'epsg:4326'

# adm3 = gpd.read_file(os.path.join(geo_pth,'Boundaries/bangladesh_upazila_boundary/bangladesh_upazila_boundary.shp')) # 8 in CXB

adm3 = gpd.read_file(os.path.join(geo_pth,'Boundaries/adm3/bgd_admbnda_adm3_bbs_20180410.shp'))
adm3.crs = 'epsg:4326'

adm4 = gpd.read_file(os.path.join(geo_pth,'Boundaries/adm4/bgd_admbnda_adm4_bbs_20180410.shp')) # 75 in CXB
adm4.crs = 'epsg:4326'

In [5]:
# Fix types for later joining
adm3['ADM3_PCODE'] = adm3['ADM3_PCODE'].astype(str)
adm4['ADM4_PCODE'] = adm4['ADM4_PCODE'].astype(str)
wt_demog_upz['adm3_code'] = wt_demog_upz['adm3_code'].astype(str)
econ_union['union_code'] = econ_union['union_code'].astype(str)
demog_union.union_code_alt = demog_union.union_code_alt.astype('str')
# demog_union.ADM4_PCODE = demog_union.ADM4_PCODE.astype('str')

In [6]:
# Merge admin data with econ data

adm3 = pd.merge(adm3,wt_demog_upz,how='left',left_on=['ADM3_PCODE'],right_on=['adm3_code'])
adm4 = pd.merge(adm4,econ_union,how='left',left_on=['ADM4_PCODE'],right_on=['union_code'])
adm4 = pd.merge(adm4,demog_union,how='left',left_on=['ADM4_PCODE'],right_on=['union_code_alt'])

In [7]:
adm3 = adm3[adm3['Zila_Name'].notna()]
adm4 = adm4[adm4['totwor'].notna()]

In [8]:
adm3['ADM3_EN'] = adm3['ADM3_EN'].str.title()

Classify Teknaf and Ukhia separately for comparative analysis at adm3 level

In [9]:
tu_pcodes = ['202290','202294']

adm3['TU'] = adm3['ADM3_PCODE'].apply(lambda x: 'Teknaf & Ukhia' if x in tu_pcodes else 'Other')

### Prepare OD data

Load in all the OD matrices for four scenarios: current, all upgraded roads, all upgrades but none to the south, and upgraded roads but no upgraded ferry

In [10]:
current_cxb = r'final_cxb_current_od_grid_CXB.csv'
current_chitt = r'final_cxb_current_od_grid_chittagong.csv'
current_martar = r'final_cxb_current_od_grid_martarbari.csv'
current_health = r'final_cxb_current_od_grid_health.csv'
current_primary_ed = r'final_cxb_current_od_grid_Primary_education.csv'
current_secondary_ed = r'final_cxb_current_od_grid_Secondary_education.csv'
current_tertiary_ed = r'final_cxb_current_od_grid_Tertiary_education.csv'
current_allmkts = r'final_cxb_current_od_grid_All_markets.csv'
current_growthcenters = r'final_cxb_current_od_grid_Growth_centers.csv'

ua_cxb = r'final_cxb_upgrade_all_od_grid_CXB.csv'
ua_chitt = r'final_cxb_upgrade_all_od_grid_chittagong.csv'
ua_martar = r'final_cxb_upgrade_all_od_grid_martarbari.csv'
ua_health = r'final_cxb_upgrade_all_od_grid_health.csv'
ua_primary_ed = r'final_cxb_upgrade_all_od_grid_Primary_education.csv'
ua_secondary_ed = r'final_cxb_upgrade_all_od_grid_Secondary_education.csv'
ua_tertiary_ed = r'final_cxb_upgrade_all_od_grid_Tertiary_education.csv'
ua_allmkts = r'final_cxb_upgrade_all_od_grid_All_markets.csv'
ua_growthcenters = r'final_cxb_upgrade_all_od_grid_Growth_centers.csv'

uns_cxb = r'final_cxb_upgrade_nosouth_od_grid_CXB.csv'
uns_chitt = r'final_cxb_upgrade_nosouth_od_grid_chittagong.csv'
uns_martar = r'final_cxb_upgrade_nosouth_od_grid_martarbari.csv'
uns_health = r'final_cxb_upgrade_nosouth_od_grid_health.csv'
uns_primary_ed = r'final_cxb_upgrade_nosouth_od_grid_Primary_education.csv'
uns_secondary_ed = r'final_cxb_upgrade_nosouth_od_grid_Secondary_education.csv'
uns_tertiary_ed = r'final_cxb_upgrade_nosouth_od_grid_Tertiary_education.csv'
uns_allmkts = r'final_cxb_upgrade_nosouth_od_grid_All_markets.csv'
uns_growthcenters = r'final_cxb_upgrade_nosouth_od_grid_Growth_centers.csv'

unf_cxb = r'final_cxb_upgrade_noferry_od_grid_CXB.csv'
unf_chitt = r'final_cxb_upgrade_noferry_od_grid_chittagong.csv'
unf_martar = r'final_cxb_upgrade_noferry_od_grid_martarbari.csv'
unf_health = r'final_cxb_upgrade_noferry_od_grid_health.csv'
unf_primary_ed = r'final_cxb_upgrade_noferry_od_grid_Primary_education.csv'
unf_secondary_ed = r'final_cxb_upgrade_noferry_od_grid_Secondary_education.csv'
unf_tertiary_ed = r'final_cxb_upgrade_noferry_od_grid_Tertiary_education.csv'
unf_allmkts = r'final_cxb_upgrade_noferry_od_grid_All_markets.csv'
unf_growthcenters = r'final_cxb_upgrade_noferry_od_grid_Growth_centers.csv'

In [11]:
# A dict combining files from all scenarios

all_scenarios = { 'current_cxb' : current_cxb, 'current_chitt' : current_chitt, 'current_martar' : current_martar, \
    'current_health' : current_health, 'current_primary_ed' : current_primary_ed, 'current_secondary_ed' : current_secondary_ed, \
    'current_tertiary_ed' : current_tertiary_ed, 'current_allmkts' : current_allmkts, 'current_growthcenters' :  current_growthcenters, \
    'ua_cxb' : ua_cxb, 'ua_chitt' : ua_chitt, 'ua_martar' : ua_martar, 'ua_health' : ua_health, 'ua_primary_ed' : ua_primary_ed, \
    'ua_secondary_ed' : ua_secondary_ed, 'ua_tertiary_ed' : ua_tertiary_ed, 'ua_allmkts' : ua_allmkts, 'ua_growthcenters' :  ua_growthcenters,\
    'uns_cxb' : uns_cxb, 'uns_chitt' : uns_chitt, 'uns_martar' : uns_martar, 'uns_health' : uns_health, 'uns_primary_ed' : uns_primary_ed, \
    'uns_secondary_ed' : uns_secondary_ed, 'uns_tertiary_ed' : uns_tertiary_ed, 'uns_allmkts' : uns_allmkts, 'uns_growthcenters' :  uns_growthcenters, \
    'unf_cxb' : unf_cxb, 'unf_chitt' : unf_chitt, 'unf_martar' : unf_martar, 'unf_health' : unf_health, 'unf_primary_ed' : unf_primary_ed, \
    'unf_secondary_ed' : unf_secondary_ed, 'unf_tertiary_ed' : unf_tertiary_ed, 'unf_allmkts' : unf_allmkts, 'unf_growthcenters' :  unf_growthcenters }

# old scenario specific dicts -- discontinued 

# current = { 'current_cxb' : current_cxb, 'current_chitt' : current_chitt, 'current_martar' : current_martar, 'current_health' : current_health, 'current_primary_ed' : current_primary_ed, 'current_secondary_ed' : current_secondary_ed, 'current_tertiary_ed' : current_tertiary_ed, 'current_allmkts' : current_allmkts, 'current_growthcenters' :  current_growthcenters }
# upgrade_all = { 'ua_cxb' : ua_cxb, 'ua_chitt' : ua_chitt, 'ua_martar' : ua_martar, 'ua_health' : ua_health, 'ua_primary_ed' : ua_primary_ed, 'ua_secondary_ed' : ua_secondary_ed, 'ua_tertiary_ed' : ua_tertiary_ed, 'ua_allmkts' : ua_allmkts, 'ua_growthcenters' :  ua_growthcenters }
# upgrade_noferry = { 'unf_cxb' : unf_cxb, 'unf_chitt' : unf_chitt, 'unf_martar' : unf_martar, 'unf_health' : unf_health, 'unf_primary_ed' : unf_primary_ed, 'unf_secondary_ed' : unf_secondary_ed, 'unf_tertiary_ed' : unf_tertiary_ed, 'unf_allmkts' : unf_allmkts, 'unf_growthcenters' :  unf_growthcenters }


### Grid level analysis

##### Basic data loading and manipuation

Population weighting origin grid data by admin unit. This creates aggregate statistics suitable for choropleth maps

Admin codes and population columns will need to be changed as per your context

In [12]:
# iterate through the dict

for key, layer in all_scenarios.items():
    
    # read in the CSVs, convert to geometry
    
    od = pd.read_csv(os.path.join(net_pth,layer))
    od['geometry'] = od['geometry'].apply(wkt.loads)
    od = gpd.GeoDataFrame(od,geometry='geometry')
    od.crs = 'epsg:4326'
    
    # spatial join admin information
    
    od = gpd.sjoin(od,adm3[['geometry','ADM3_EN','ADM3_PCODE','TU']],op="within")
#     od.to_file(os.path.join(net_pth,'test.shp'))
    od = od.drop('index_right',axis=1)
    od = gpd.sjoin(od,adm4[['geometry','ADM4_EN','ADM4_PCODE']],op="within")
    
    # weight accessibility info by population
    
    od['adm3_pop'] = od[od['PLOT_TIME_MINS'] < 150000].groupby('ADM3_PCODE')['VALUE'].transform(np.sum)
    od['adm4_pop'] = od[od['PLOT_TIME_MINS'] < 150000].groupby('ADM4_PCODE')['VALUE'].transform(np.sum)
    od['PLOT_TIME_MINS_WT_adm3'] = (od['PLOT_TIME_MINS'] * (od['VALUE'] / od['adm3_pop']))
    od['PLOT_TIME_MINS_WT_adm4'] = (od['PLOT_TIME_MINS'] * (od['VALUE'] / od['adm4_pop']))

    all_scenarios[key] = od



Summarizing weighted populations per admin. This provides input data for adm2/3 aggregation operations

In [13]:
for key, layer in all_scenarios.items():
    
    at = (key + '_avg_time')
    
    # adm3
    
    upz = layer[layer['PLOT_TIME_MINS'] < 150000].groupby(['ADM3_EN','ADM3_PCODE']).agg(
        at = ('PLOT_TIME_MINS_WT_adm3',np.sum),    ) 
    upz = upz.rename(columns={'at' : at})
    
    upz = upz.reset_index()
    upz['ADM3_PCODE'] = upz['ADM3_PCODE'].astype(str)
    
    upz = upz[['ADM3_PCODE',at]]
    
    adm3 = pd.merge(adm3,upz,how='left',on=['ADM3_PCODE'])

    # adm4
    
    union = layer[layer['PLOT_TIME_MINS'] < 150000].groupby(['ADM4_EN','ADM4_PCODE']).agg(
        at = ('PLOT_TIME_MINS_WT_adm4',np.sum),    ) 
    union = union.rename(columns={'at' : at})
    
    union = union.reset_index()
    union['ADM4_PCODE'] = union['ADM4_PCODE'].astype(str)
    
    union = union[['ADM4_PCODE',at]]
    
    adm4 = pd.merge(adm4,union,how='left',on=['ADM4_PCODE'])  

Now join in HRPD population, so population figures are consistently HRPD throughout the analysis (rather than from the Pop Census or other sources which lack origin-level population data)

In [14]:
# upazila

upz = layer[layer['PLOT_TIME_MINS'] < 150000].groupby(['ADM3_EN','ADM3_PCODE']).agg(
    pop=('VALUE',sum),    ) 

upz = upz.reset_index()
upz['ADM3_PCODE'] = upz['ADM3_PCODE'].astype(str)

upz = upz[['ADM3_PCODE','pop']]

adm3 = pd.merge(adm3,upz,how='left',on=['ADM3_PCODE'])

# union

union = layer[layer['PLOT_TIME_MINS'] < 150000].groupby(['ADM4_EN','ADM4_PCODE']).agg(
    pop=('VALUE',sum),    ) 

union = union.reset_index()
union['ADM4_PCODE'] = union['ADM4_PCODE'].astype(str)

union = union[['ADM4_PCODE','pop']]

adm4 = pd.merge(adm4,union,how='left',on=['ADM4_PCODE'])  

Export

In [15]:
adm3.loc[:, adm3.columns != 'geometry'].to_csv(os.path.join('results/tables/adm3.csv'))
adm4.loc[:, adm4.columns != 'geometry'].to_csv(os.path.join('results/tables/adm4.csv'))

In [16]:
adm3.to_file('results/spatial/adm3.shp')
adm4.to_file('results/spatial/adm4.shp')

## Calculating population weighted statistics

Here we relate accessibility to various secondary data points from the population and economic censuses in Bangladesh, aggregated at adm2, adm3, and adm4 levels. Note that some data points were repeated between the two, although calculated slightly differently, and we extracted both for comparative purposes.

You will need to adapt the code below to your own use cases as the format and content of secondary input data can vary substantially, of course.

### Adm2 - District-level summary statistics

##### By employment category - 2013 economic census

In [17]:
econC_empl_data = {'Employment type': ['Industrial','Service']}
econC_empl_data = pd.DataFrame(econC_empl_data)

In [18]:
for key, layer in all_scenarios.items():
    
    at = (key + '_avg_time')
#     iec = (key + '_avg_ind_empl_econC')
#     sec = (key + '_avg_ser_empl_econC')

    iec = sum(adm4[at] * (adm4['ind_wor'] / sum(adm4['ind_wor'])))
    sec = sum(adm4[at] * (adm4['ser_wor'] / sum(adm4['ser_wor'])))
    
    econC_empl_data[at] = [iec,sec]


In [19]:
econC_empl_data

Unnamed: 0,Employment type,current_cxb_avg_time,current_chitt_avg_time,current_martar_avg_time,current_health_avg_time,current_primary_ed_avg_time,current_secondary_ed_avg_time,current_tertiary_ed_avg_time,current_allmkts_avg_time,current_growthcenters_avg_time,...,uns_growthcenters_avg_time,unf_cxb_avg_time,unf_chitt_avg_time,unf_martar_avg_time,unf_health_avg_time,unf_primary_ed_avg_time,unf_secondary_ed_avg_time,unf_tertiary_ed_avg_time,unf_allmkts_avg_time,unf_growthcenters_avg_time
0,Industrial,69.721833,144.236295,100.714838,20.500267,7.606833,13.696061,78.08301,11.359396,18.245675,...,17.940878,68.19448,142.77644,77.436775,20.107418,7.597471,13.648367,76.519802,11.316069,17.941299
1,Service,66.705941,159.301499,115.251938,15.674706,7.97643,13.276603,80.511624,9.745408,15.176162,...,14.953994,65.317155,157.743778,91.81032,15.389925,7.960752,13.216936,79.102828,9.690076,14.95464


In [20]:
demog_union.columns

Index(['union_code_alt',
       'Number of working individuals in Agriculture 15 years plus',
       'Number of working individuals in Industry 15 years plus',
       'Number of working individuals in Services 15 years plus',
       'Education level Share of people No education 15 years plus',
       'Education level Share of people Primary 15 years plus',
       'Education level Share of people Lower Secondary 15 years plus',
       'Education level Share of people Secondary 15 years plus',
       'Education level Share of people Higher Secondary 15 years plus',
       'Education level Share of people Graduate & Above 15 years plus',
       'Marital status share of people Never married 15 years plus',
       'Marital status share of people Married 15 years plus',
       'Marital status share of people Widower/widowed 15 years plus',
       'Marital status share of people Divorced 15 years plus',
       'Working status share of Working 15 years plus',
       'Working status share of Se

##### By employment category - 2011 main census

In [21]:
Census_empl_data = {'Employment type': ['Agricultural','Industrial','Service']}

Census_empl_data = pd.DataFrame(Census_empl_data)

In [22]:
for key, layer in all_scenarios.items():
    
    at = (key + '_avg_time')
#     iec = (key + '_avg_ind_empl_econC')
#     sec = (key + '_avg_ser_empl_econC')
    
    aec = sum(adm4[at] * (adm4['Number of working individuals in Agriculture 15 years plus'] / sum(adm4['Number of working individuals in Agriculture 15 years plus'])))
    iec = sum(adm4[at] * (adm4['Number of working individuals in Industry 15 years plus'] / sum(adm4['Number of working individuals in Industry 15 years plus'])))
    sec = sum(adm4[at] * (adm4['Number of working individuals in Services 15 years plus'] / sum(adm4['Number of working individuals in Services 15 years plus'])))

    Census_empl_data[at] = [aec,iec,sec]


In [23]:
Census_empl_data

Unnamed: 0,Employment type,current_cxb_avg_time,current_chitt_avg_time,current_martar_avg_time,current_health_avg_time,current_primary_ed_avg_time,current_secondary_ed_avg_time,current_tertiary_ed_avg_time,current_allmkts_avg_time,current_growthcenters_avg_time,...,uns_growthcenters_avg_time,unf_cxb_avg_time,unf_chitt_avg_time,unf_martar_avg_time,unf_health_avg_time,unf_primary_ed_avg_time,unf_secondary_ed_avg_time,unf_tertiary_ed_avg_time,unf_allmkts_avg_time,unf_growthcenters_avg_time
0,Agricultural,82.25106,156.881943,106.007687,22.432238,8.685197,15.040111,92.773621,12.723294,20.557604,...,20.095527,79.678146,153.744618,83.582775,21.921646,8.665628,14.950061,90.144212,12.643821,20.095968
1,Industrial,66.739518,158.811115,115.729587,16.781632,8.20136,14.321774,78.680341,11.046491,16.431536,...,16.268702,65.912632,157.167901,91.573172,16.578395,8.189666,14.273519,77.839397,11.004644,16.269337
2,Service,57.4136,153.669769,111.257671,14.831436,6.759978,12.222028,72.530404,9.403624,13.788009,...,13.617041,56.430862,152.348516,87.504248,14.629257,6.748042,12.177349,71.538165,9.364192,13.618939


In [24]:
econC_empl_data

Unnamed: 0,Employment type,current_cxb_avg_time,current_chitt_avg_time,current_martar_avg_time,current_health_avg_time,current_primary_ed_avg_time,current_secondary_ed_avg_time,current_tertiary_ed_avg_time,current_allmkts_avg_time,current_growthcenters_avg_time,...,uns_growthcenters_avg_time,unf_cxb_avg_time,unf_chitt_avg_time,unf_martar_avg_time,unf_health_avg_time,unf_primary_ed_avg_time,unf_secondary_ed_avg_time,unf_tertiary_ed_avg_time,unf_allmkts_avg_time,unf_growthcenters_avg_time
0,Industrial,69.721833,144.236295,100.714838,20.500267,7.606833,13.696061,78.08301,11.359396,18.245675,...,17.940878,68.19448,142.77644,77.436775,20.107418,7.597471,13.648367,76.519802,11.316069,17.941299
1,Service,66.705941,159.301499,115.251938,15.674706,7.97643,13.276603,80.511624,9.745408,15.176162,...,14.953994,65.317155,157.743778,91.81032,15.389925,7.960752,13.216936,79.102828,9.690076,14.95464


##### By educational attainment level

In [25]:
# convert education level population shares to population totals

adm4['educ_non_n'] = adm4['Education level Share of people No education 15 years plus'] * adm4['hrsl18sum']
adm4['educ_prim_n'] = adm4['Education level Share of people Primary 15 years plus'] * adm4['hrsl18sum']
adm4['educ_lowsec_n'] = adm4['Education level Share of people Lower Secondary 15 years plus'] * adm4['hrsl18sum']
adm4['educ_sec_n'] = adm4['Education level Share of people Secondary 15 years plus'] * adm4['hrsl18sum']
adm4['educ_highsec_n'] = adm4['Education level Share of people Higher Secondary 15 years plus'] * adm4['hrsl18sum']
adm4['educ_tert_n'] = adm4['Education level Share of people Graduate & Above 15 years plus'] * adm4['hrsl18sum']

# Generate adm3 populations for weighting

adm4['adm3_educ_non_n'] = adm4.groupby('ADM3_PCODE')['educ_non_n'] .transform(np.sum)
adm4['adm3_educ_prim_n'] = adm4.groupby('ADM3_PCODE')['educ_prim_n'].transform(np.sum)
adm4['adm3_educ_lowsec_n'] = adm4.groupby('ADM3_PCODE')['educ_lowsec_n'].transform(np.sum)
adm4['adm3_educ_sec_n'] = adm4.groupby('ADM3_PCODE')['educ_sec_n'].transform(np.sum)
adm4['adm3_educ_highsec_n'] = adm4.groupby('ADM3_PCODE')['educ_highsec_n'].transform(np.sum)
adm4['adm3_educ_tert_n'] = adm4.groupby('ADM3_PCODE')['educ_tert_n'].transform(np.sum)

Create a new long dataset to populate with weighted accessibility figures per education level

In [26]:
educ_data = {'Education level': ['No education','Primary','Lower Secondary','Secondary', 'Higher Secondary','University']}
educ_data = pd.DataFrame(educ_data)

for key, layer in all_scenarios.items():
    
    at = (key + '_avg_time')
    
    n = ( adm4[at] * (adm4['educ_non_n'] / sum(adm4['educ_non_n'])) ).sum()
    p = ( adm4[at] * (adm4['educ_prim_n'] / sum(adm4['educ_prim_n'])) ).sum()
    ls = ( adm4[at] * (adm4['educ_lowsec_n'] / sum(adm4['educ_lowsec_n'])) ).sum()
    s = ( adm4[at] * (adm4['educ_sec_n'] / sum(adm4['educ_sec_n'])) ).sum()
    hs = ( adm4[at] * (adm4['educ_highsec_n'] / sum(adm4['educ_highsec_n'])) ).sum()
    u = ( adm4[at] * (adm4['educ_tert_n'] / sum(adm4['educ_tert_n'])) ).sum()
    
    educ_data[at] = [n,p,ls,s,hs,u]


In [27]:
educ_data

Unnamed: 0,Education level,current_cxb_avg_time,current_chitt_avg_time,current_martar_avg_time,current_health_avg_time,current_primary_ed_avg_time,current_secondary_ed_avg_time,current_tertiary_ed_avg_time,current_allmkts_avg_time,current_growthcenters_avg_time,...,uns_growthcenters_avg_time,unf_cxb_avg_time,unf_chitt_avg_time,unf_martar_avg_time,unf_health_avg_time,unf_primary_ed_avg_time,unf_secondary_ed_avg_time,unf_tertiary_ed_avg_time,unf_allmkts_avg_time,unf_growthcenters_avg_time
0,No education,81.584066,163.614367,110.853835,21.622942,9.263438,15.047438,92.150188,12.632496,20.513898,...,19.996841,78.110177,160.765616,89.84289,20.976082,9.231906,14.901543,88.66329,12.512195,19.997997
1,Primary,77.748027,156.435781,109.134112,21.084234,8.86636,15.197866,87.282769,12.615369,19.777505,...,19.357196,75.131508,154.454972,87.192253,20.508744,8.846047,15.084462,84.659582,12.530021,19.358306
2,Lower Secondary,74.178851,151.494891,101.919094,21.390018,8.500309,14.717662,83.557875,12.408179,19.494851,...,19.015635,71.209681,149.140222,80.196266,20.748852,8.473996,14.58533,80.581106,12.306027,19.017117
3,Secondary,70.089948,151.797883,104.201249,19.499466,8.101776,13.942996,80.812615,11.683841,17.968166,...,17.566606,67.547055,149.796725,82.082442,18.96087,8.078648,13.826344,78.26515,11.594775,17.567819
4,Higher Secondary,65.975348,151.826891,105.959405,18.21209,7.788637,13.352116,77.789903,11.136767,16.882603,...,16.540733,63.846601,150.127671,83.409088,17.761803,7.771125,13.259139,75.655636,11.066613,16.541969
5,University,65.12771,151.93028,105.889508,17.867245,7.50717,12.98266,77.38165,10.829348,16.266599,...,15.937595,62.986627,150.180405,83.331562,17.409494,7.490611,12.897397,75.234765,10.764994,15.93866


### Categorize raw origin access data to show distribution

Reset the all_scenarios dict, as the OD files must be read in as raw CSVs and were transformed into geodataframes above

In [28]:
# A dict combining files from all scenarios

all_scenarios = { 'current_cxb' : current_cxb, 'current_chitt' : current_chitt, 'current_martar' : current_martar, \
    'current_health' : current_health, 'current_primary_ed' : current_primary_ed, 'current_secondary_ed' : current_secondary_ed, \
    'current_tertiary_ed' : current_tertiary_ed, 'current_allmkts' : current_allmkts, 'current_growthcenters' :  current_growthcenters, \
    'ua_cxb' : ua_cxb, 'ua_chitt' : ua_chitt, 'ua_martar' : ua_martar, 'ua_health' : ua_health, 'ua_primary_ed' : ua_primary_ed, \
    'ua_secondary_ed' : ua_secondary_ed, 'ua_tertiary_ed' : ua_tertiary_ed, 'ua_allmkts' : ua_allmkts, 'ua_growthcenters' :  ua_growthcenters,\
    'uns_cxb' : uns_cxb, 'uns_chitt' : uns_chitt, 'uns_martar' : uns_martar, 'uns_health' : uns_health, 'uns_primary_ed' : uns_primary_ed, \
    'uns_secondary_ed' : uns_secondary_ed, 'uns_tertiary_ed' : uns_tertiary_ed, 'uns_allmkts' : uns_allmkts, 'uns_growthcenters' :  uns_growthcenters, \
    'unf_cxb' : unf_cxb, 'unf_chitt' : unf_chitt, 'unf_martar' : unf_martar, 'unf_health' : unf_health, 'unf_primary_ed' : unf_primary_ed, \
    'unf_secondary_ed' : unf_secondary_ed, 'unf_tertiary_ed' : unf_tertiary_ed, 'unf_allmkts' : unf_allmkts, 'unf_growthcenters' :  unf_growthcenters }

# old, scenario specific dicts

# current = { 'current_cxb' : current_cxb, 'current_chitt' : current_chitt, 'current_martar' : current_martar, 'current_health' : current_health, 'current_primary_ed' : current_primary_ed, 'current_secondary_ed' : current_secondary_ed, 'current_tertiary_ed' : current_tertiary_ed, 'current_allmkts' : current_allmkts, 'current_growthcenters' :  current_growthcenters }
# upgrade_all = { 'ua_cxb' : ua_cxb, 'ua_chitt' : ua_chitt, 'ua_martar' : ua_martar, 'ua_health' : ua_health, 'ua_primary_ed' : ua_primary_ed, 'ua_secondary_ed' : ua_secondary_ed, 'ua_tertiary_ed' : ua_tertiary_ed, 'ua_allmkts' : ua_allmkts, 'ua_growthcenters' :  ua_growthcenters }
# upgrade_noferry = { 'unf_cxb' : unf_cxb, 'unf_chitt' : unf_chitt, 'unf_martar' : unf_martar, 'unf_health' : unf_health, 'unf_primary_ed' : unf_primary_ed, 'unf_secondary_ed' : unf_secondary_ed, 'unf_tertiary_ed' : unf_tertiary_ed, 'unf_allmkts' : unf_allmkts, 'unf_growthcenters' :  unf_growthcenters }


In [29]:
# # backup for reference, in case time ranges need to be adjusted

# odcxb['time_cat'] = pd.cut((odcxb['PLOT_TIME_MINS']),[0,15,30,45,60,75,90,120,180,240,300])
# odchitt['time_cat'] = pd.cut((odchitt['PLOT_TIME_MINS']),[60,120,180,240,300,360,420,480,540])
# odmartar['time_cat'] = pd.cut((odmartar['PLOT_TIME_MINS']),[0,15,30,45,60,75,90,120,180,240,300])
# odec['time_cat'] = pd.cut((odec['PLOT_TIME_MINS']),[0,15,30,45,60,75,90,120,180,240,300])
# odhealth['time_cat'] = pd.cut((odhealth['PLOT_TIME_MINS']),[0,15,30,45,60,75,90,120,180,240,300])

Aggregate OD results per destination (across Cox's Bazar) by time travel ranges ("categories")

In [30]:
timecats = []

for key, layer in all_scenarios.items():
    
#     print(key) # for debuggindng
#     print(layer)

    kpn = key + '_popn'
    at = key + '_avg_time'
    ppct = key + '_pop_pct'
    
    od = pd.read_csv(os.path.join(net_pth,layer)) # won't work if dict isn't reset
    od = pd.DataFrame(od)
    od['time_cat'] = pd.cut((od['PLOT_TIME_MINS']),[0,15,30,45,60,75,90,120,180,240,300])
    
    time_cat = od[od['PLOT_TIME_MINS'] < 15000].groupby(['time_cat']).agg(
        kpn=('VALUE',sum),
        at=('PLOT_TIME_MINS',np.mean),
    )

    time_cat[ppct] = (time_cat['kpn'] / sum(time_cat['kpn']))
    time_cat.kpn = time_cat.kpn.astype('int64')
    
    time_cat = time_cat.rename(columns={'kpn': kpn ,'at': at})
    time_cat = pd.DataFrame(time_cat)
    
    timecats.append(time_cat)

In [31]:
timecats_merged = reduce(lambda left,right: pd.merge(left,right,on='time_cat',how='outer'),timecats)
timecats_merged = timecats_merged.reset_index()
timecats_merged['time_cat'] = timecats_merged['time_cat'].astype(str).str.replace(', ',' - ', regex=True).str.replace('(','', regex=True).str.replace(']','', regex=True)
# timecats_merged = timecats_merged.set_index('time_cat') # need to keep the index off for use in graphing

Create similar analysis but with Teknaf and Ukhia vs. Other Upazilas

In [32]:
timecats = []

for key, layer in all_scenarios.items():
    
#     print(key) # for debugging

    kpn = key + '_popn'
    at = key + '_avg_time'
    ppct = key + '_pop_pct'
    
    od = pd.read_csv(os.path.join(net_pth,layer)) # won't work if dict isn't reset
    
    od['geometry'] = od['geometry'].apply(wkt.loads)
    od = gpd.GeoDataFrame(od,geometry='geometry')
    od.crs = 'epsg:4326'
    
    # spatial join admin information
    
    od = gpd.sjoin(od,adm3[['geometry','ADM3_EN','ADM3_PCODE','TU']],op="within")
#     od.to_file(os.path.join(interm_pth,'{}_test.shp'.format(at))) # for debugging
    od = od.drop('index_right',axis=1)

    # back to tabular data for time category processing
    
    od = pd.DataFrame(od)
    
    od['tu_pop'] = od[od['PLOT_TIME_MINS'] < 15000].groupby('TU')['VALUE'].transform(np.sum) # prepare the total population ahead of time for creating accurate % breakdowns
    
    od['time_cat'] = pd.cut((od['PLOT_TIME_MINS']),[0,15,30,45,60,75,90,120,180,240,300]) # ,360,420 if we want to include Chittagong and Martar

    time_cat = od[od['PLOT_TIME_MINS'] < 15000].groupby(['TU','time_cat']).agg(
        kpn=('VALUE',sum),
        at=('PLOT_TIME_MINS',np.mean),
        tu_pop = ('tu_pop',np.mean)
    )
    
    # perform some basic calculations to replace nas with 0 and create percentage-wise data
#     print(time_cat.info) # for debugging

    time_cat.kpn = time_cat.kpn.fillna(0).astype('int64')
    
    time_cat[ppct] = (time_cat['kpn'] / time_cat['tu_pop'])
    
    time_cat = time_cat.rename(columns={'kpn': kpn ,'at': at})
    time_cat = pd.DataFrame(time_cat)
    
    # append to the master timecats list, which will then be reformatted into a data frame below
    timecats.append(time_cat)

In [33]:
timecats_merged_tu = reduce(lambda left,right: pd.merge(left,right,on=['TU','time_cat'],how='outer'),timecats)
timecats_merged_tu = timecats_merged_tu.reset_index()
timecats_merged_tu['time_cat'] = timecats_merged_tu['time_cat'].astype(str).str.replace(', ',' - ', regex=True).str.replace('(','', regex=True).str.replace(']','', regex=True)
# timecats_merged_tu = timecats_merged_tu.set_index('time_cat') # need to keep the index off for use in graphing

Create ordered labels for charts

In [34]:
time_cat_order_small = ['0 - 15','15 - 30','30 - 45','45 - 60','60 - 75', '75 - 90','90 - 120','120 - 180','180 - 240','240 - 300']
time_cat_order_small_tu = ['0 - 15','15 - 30','30 - 45','45 - 60','60 - 75', '75 - 90','90 - 120','120 - 180','180 - 240','240 - 300'] # ,'300 - 360','360 - 420' if we want to look at CXB and Martarbari
# time_cat_order_large = ['60 - 120','120 - 180','180 - 240','240 - 300','300 - 360', '360 - 420','420 - 480','480 - 540']

# create an ordered categorical column based on these time ranges

# maintimecat = pd.Categorical(timecats_merged['time_cat'], categories=time_cat_order_small)
maintimecat_tu = pd.Categorical(timecats_merged_tu['time_cat'], categories=time_cat_order_small_tu)

# assign to a new column in the DataFrames

# timecats_merged = timecats_merged.assign(time_cat_label = maintimecat)
timecats_merged_tu = timecats_merged_tu.assign(time_cat_label = maintimecat_tu)

In [35]:
timecats_merged

Unnamed: 0,time_cat,current_cxb_popn,current_cxb_avg_time,current_cxb_pop_pct,current_chitt_popn,current_chitt_avg_time,current_chitt_pop_pct,current_martar_popn,current_martar_avg_time,current_martar_pop_pct,...,unf_secondary_ed_pop_pct,unf_tertiary_ed_popn,unf_tertiary_ed_avg_time,unf_tertiary_ed_pop_pct,unf_allmkts_popn,unf_allmkts_avg_time,unf_allmkts_pop_pct,unf_growthcenters_popn,unf_growthcenters_avg_time,unf_growthcenters_pop_pct
0,0 - 15,260816,7.786524,0.100017,0,,0.0,30021,10.616797,0.011555,...,0.64469,7464,11.429357,0.002862,1865057,8.433703,0.715209,1314344,8.764055,0.504023
1,15 - 30,195486,23.066656,0.074965,0,,0.0,89712,22.92984,0.034531,...,0.282816,70817,24.825084,0.027157,630587,20.121587,0.241817,825297,21.425728,0.316484
2,30 - 45,308582,37.952456,0.118335,0,,0.0,129946,38.147459,0.050018,...,0.051603,448352,37.94009,0.171934,90275,35.585654,0.034619,296230,36.618078,0.113598
3,45 - 60,268589,52.174418,0.102998,0,,0.0,182809,53.009253,0.070366,...,0.013756,302326,52.331791,0.115936,19465,50.813722,0.007465,105009,50.867473,0.040269
4,60 - 75,210667,67.565939,0.080786,0,,0.0,210119,67.920415,0.080878,...,0.002889,303409,67.263231,0.116351,2322,65.510529,0.000891,33875,65.956087,0.012991
5,75 - 90,291274,83.0603,0.111698,28669,85.965448,0.011035,345738,82.760336,0.133079,...,0.000522,300394,83.150173,0.115195,0,,0.0,16776,81.575417,0.006433
6,90 - 120,560579,105.940941,0.21497,373822,107.004267,0.143889,713344,105.346847,0.274575,...,0.0,695291,105.235574,0.266629,0,,0.0,6461,96.988016,0.002478
7,120 - 180,475773,140.846751,0.182449,1565215,147.10082,0.60247,643984,142.172702,0.247877,...,0.003578,461422,141.786044,0.176945,0,,0.0,9559,169.128178,0.003666
8,180 - 240,26226,192.180977,0.010057,511136,203.501339,0.196743,252281,202.228016,0.097106,...,0.000146,8519,188.101365,0.003267,0,,0.0,152,180.214066,5.8e-05
9,240 - 300,9711,255.471617,0.003724,119152,255.279121,0.045863,38,246.855887,1.5e-05,...,0.0,9711,274.471688,0.003724,0,,0.0,0,,0.0


In [36]:
timecats_merged_tu

Unnamed: 0,TU,time_cat,current_cxb_popn,current_cxb_avg_time,tu_pop_x,current_cxb_pop_pct,current_chitt_popn,current_chitt_avg_time,tu_pop_y,current_chitt_pop_pct,...,unf_tertiary_ed_pop_pct,unf_allmkts_popn,unf_allmkts_avg_time,tu_pop_x.1,unf_allmkts_pop_pct,unf_growthcenters_popn,unf_growthcenters_avg_time,tu_pop_y.1,unf_growthcenters_pop_pct,time_cat_label
0,Other,0 - 15,260816,7.786524,2069835.0,0.126008,0,,,,...,0.003606,1411074,8.415461,2069835.0,0.681733,925304,8.606717,2069835.0,0.447042,0 - 15
1,Other,15 - 30,195051,23.051096,2069835.0,0.094235,0,,,,...,0.034214,551625,20.180486,2069835.0,0.266507,693973,21.682361,2069835.0,0.335279,15 - 30
2,Other,30 - 45,213427,36.988654,2069835.0,0.103113,0,,,,...,0.216612,86723,35.606116,2069835.0,0.041899,290102,36.651782,2069835.0,0.140157,30 - 45
3,Other,45 - 60,164056,52.719915,2069835.0,0.07926,0,,,,...,0.12497,18354,50.787181,2069835.0,0.008867,103643,50.862695,2069835.0,0.050073,45 - 60
4,Other,60 - 75,147386,67.472177,2069835.0,0.071207,0,,,,...,0.080359,2057,65.805838,2069835.0,0.000994,33572,65.970063,2069835.0,0.01622,60 - 75
5,Other,75 - 90,209060,83.016109,2069835.0,0.101003,28669,85.965448,2069835.0,0.013851,...,0.121887,0,,,,16776,81.575417,2069835.0,0.008105,75 - 90
6,Other,90 - 120,422327,106.224785,2069835.0,0.204039,373822,107.004267,2069835.0,0.180605,...,0.249458,0,,,,6461,96.988016,2069835.0,0.003122,90 - 120
7,Other,120 - 180,431482,141.482233,2069835.0,0.208462,1494638,145.827479,2069835.0,0.722105,...,0.164776,0,,,,0,,,,120 - 180
8,Other,180 - 240,26226,192.180977,2069835.0,0.012671,168361,200.801515,2069835.0,0.08134,...,0.004116,0,,,,0,,,,180 - 240
9,Other,240 - 300,0,,,,4341,243.450019,2069835.0,0.002097,...,,0,,,,0,,,,240 - 300


In [37]:
# temp export for charting
timecats_merged_tu.to_csv(r'results/tables/adm2_time_categories_tu.csv')

## Adm3 - Upazila employment access averages

### Upazila-level summary statistics

##### per employment category

In [38]:
# Generate total workers per UPZ within the Union file

adm4['adm3_totwor'] = adm4.groupby('ADM3_PCODE')['totwor'].transform(np.sum)
adm4['adm3_indwor'] = adm4.groupby('ADM3_PCODE')['ind_wor'].transform(np.sum)
adm4['adm3_serwor'] = adm4.groupby('ADM3_PCODE')['ser_wor'].transform(np.sum)

In [39]:
adm4['totwor'].dtypes

dtype('float64')

In [40]:
(adm4['totwor'] / adm4['adm3_totwor'])

0     0.117988
1     0.089345
2     0.117494
3     0.007631
4     0.188199
        ...   
70    0.121797
71    0.208799
72    0.033493
73    0.085161
74    0.169361
Length: 75, dtype: float64

In [41]:
for key, layer in all_scenarios.items():
    
    at = (key + '_avg_time')
    tw = (key + '_avg_tot_wt')
    iw = (key + '_avg_ind_wt')
    sw = (key + '_avg_ser_wt')
    
    adm4[tw] = ( adm4[at] * (adm4['totwor'] / adm4['adm3_totwor']))
    adm4[iw] = ( adm4[at] * (adm4['ind_wor'] / adm4['adm3_indwor']))
    adm4[sw] = ( adm4[at] * (adm4['ser_wor'] / adm4['adm3_serwor']))


In [42]:
adm4.keys()

Index(['Shape_Leng', 'Shape_Area', 'ADM4_EN', 'ADM4_PCODE', 'ADM4_REF',
       'ADM4ALT1EN', 'ADM4ALT2EN', 'ADM3_EN', 'ADM3_PCODE', 'ADM2_EN',
       ...
       'unf_secondary_ed_avg_ser_wt', 'unf_tertiary_ed_avg_tot_wt',
       'unf_tertiary_ed_avg_ind_wt', 'unf_tertiary_ed_avg_ser_wt',
       'unf_allmkts_avg_tot_wt', 'unf_allmkts_avg_ind_wt',
       'unf_allmkts_avg_ser_wt', 'unf_growthcenters_avg_tot_wt',
       'unf_growthcenters_avg_ind_wt', 'unf_growthcenters_avg_ser_wt'],
      dtype='object', length=237)

In [43]:
empl_access_upz = adm4.groupby(['ADM3_EN','ADM3_PCODE']).size().reset_index() # reset_index seems to spring it back into a DataFrame, which we need for the join
empl_access_upz = empl_access_upz[['ADM3_EN','ADM3_PCODE']] # get rid of 0 column

for key, layer in all_scenarios.items():
    
    at = (key + '_avg_time')
    tw = (key + '_avg_tot_wt')
    iw = (key + '_avg_ind_wt')
    sw = (key + '_avg_ser_wt')
    
    te = (key + '_avg_tot_empl')
    ie = (key + '_avg_ind_empl')
    se = (key + '_avg_ser_empl')
    
    temp = adm4.groupby(['ADM3_EN','ADM3_PCODE']).agg(
            te = (tw,sum),
            ie = (iw,sum),
            se = (sw,sum)
            )

    temp = temp.rename(columns={'te' : te, 'ie' : ie, 'se' : se})

    empl_access_upz = pd.merge(empl_access_upz,temp,on=['ADM3_EN','ADM3_PCODE'])

Now create long data for later charting

In [44]:
# empl_access_upz = empl_access_upz.drop(columns=['Unnamed: 0','0'])

empl_access_upz_long = pd.melt(empl_access_upz,id_vars=['ADM3_EN','ADM3_PCODE'],
                        var_name='Employment category',value_name='access_time')


In [45]:
empl_access_upz_long.head()

# empl_access_upz_long['Employment category'].unique()

Unnamed: 0,ADM3_EN,ADM3_PCODE,Employment category,access_time
0,Chakaria,202216,current_cxb_avg_tot_empl,75.241927
1,Cox's Bazar Sadar,202224,current_cxb_avg_tot_empl,18.50898
2,Kutubdia,202245,current_cxb_avg_tot_empl,160.494717
3,Maheshkhali,202249,current_cxb_avg_tot_empl,92.345461
4,Pekua,202256,current_cxb_avg_tot_empl,102.367468


In [46]:
empl_rename = {"avg_tot_empl":"Total","avg_ind_empl":"Industry","avg_ser_empl":"Service"}

In [47]:
for idx, row in empl_access_upz_long.iterrows():
#     print(row)
#     print(row['Employment category'])
    ele = re.split('_',row['Employment category'])

    if len(ele) >= 6:

        empl_access_upz_long.loc[idx,'scen_dest'] = ele[0] + '_' + ele[1] + '_' + ele[2]
        empl_access_upz_long.loc[idx,'Employment category'] = ele[3] + '_' + ele[4] + '_' + ele[5]

    else:
        empl_access_upz_long.loc[idx,'scen_dest'] = ele[0] + '_' + ele[1]
        empl_access_upz_long.loc[idx,'Employment category'] = ele[2] + '_' + ele[3] + '_' + ele[4]
    
empl_access_upz_long['Employment category'] = empl_access_upz_long['Employment category'].map(empl_rename)

In [48]:
empl_access_upz_long.head(5)

Unnamed: 0,ADM3_EN,ADM3_PCODE,Employment category,access_time,scen_dest
0,Chakaria,202216,Total,75.241927,current_cxb
1,Cox's Bazar Sadar,202224,Total,18.50898,current_cxb
2,Kutubdia,202245,Total,160.494717,current_cxb
3,Maheshkhali,202249,Total,92.345461,current_cxb
4,Pekua,202256,Total,102.367468,current_cxb


#### Re-doing with population census employment data

In [49]:
# Generate total workers per sector per UPZ (according to the 2011 demographic census) within the Union file

adm4['adm3_pc_aggwor'] = adm4.groupby('ADM3_PCODE')['Number of working individuals in Agriculture 15 years plus'].transform(np.sum)
adm4['adm3_pc_indwor'] = adm4.groupby('ADM3_PCODE')['Number of working individuals in Industry 15 years plus'].transform(np.sum)
adm4['adm3_pc_serwor'] = adm4.groupby('ADM3_PCODE')['Number of working individuals in Services 15 years plus'].transform(np.sum)

In [50]:
(adm4['Number of working individuals in Agriculture 15 years plus'] / adm4['adm3_pc_aggwor'])

0     0.185264
1     0.092256
2     0.131716
3     0.023964
4     0.095245
        ...   
70    0.159609
71    0.018247
72    0.102283
73    0.232163
74    0.251496
Length: 75, dtype: float64

In [51]:
for key, layer in all_scenarios.items():
    
    at = (key + '_avg_time')
    tw = (key + '_avg_pc_agg_wt')
    iw = (key + '_avg_pc_ind_wt')
    sw = (key + '_avg_pc_ser_wt')
    
    adm4[tw] = ( adm4[at] * (adm4['Number of working individuals in Agriculture 15 years plus'] / adm4['adm3_pc_aggwor']))
    adm4[iw] = ( adm4[at] * (adm4['Number of working individuals in Industry 15 years plus'] / adm4['adm3_pc_indwor']))
    adm4[sw] = ( adm4[at] * (adm4['Number of working individuals in Services 15 years plus'] / adm4['adm3_pc_serwor']))


In [52]:
empl_pc_access_upz = adm4.groupby(['ADM3_EN','ADM3_PCODE']).size().reset_index() # reset_index seems to spring it back into a DataFrame, which we need for the join
empl_pc_access_upz = empl_pc_access_upz[['ADM3_EN','ADM3_PCODE']] # get rid of 0 column

for key, layer in all_scenarios.items():
    
    at = (key + '_avg_time')
    tw = (key + '_avg_pc_agg_wt')
    iw = (key + '_avg_pc_ind_wt')
    sw = (key + '_avg_pc_ser_wt')
    
    te = (key + '_avg_pc_agg_empl')
    ie = (key + '_avg_pc_ind_empl')
    se = (key + '_avg_pc_ser_empl')
    
    temp = adm4.groupby(['ADM3_EN','ADM3_PCODE']).agg(
            te = (tw,sum),
            ie = (iw,sum),
            se = (sw,sum)
            )

    temp = temp.rename(columns={'te' : te, 'ie' : ie, 'se' : se})

    empl_pc_access_upz = pd.merge(empl_pc_access_upz,temp,on=['ADM3_EN','ADM3_PCODE'])

Now transform to long

In [53]:
# empl_pc_access_upz = empl_access_upz.drop(columns=['Unnamed: 0','0'])

empl_pc_access_upz_long = pd.melt(empl_pc_access_upz,id_vars=['ADM3_EN','ADM3_PCODE'],
                        var_name='Employment category',value_name='access_time')


In [54]:
empl_pc_access_upz_long.head()

# empl_access_upz_long['Employment category'].unique()

Unnamed: 0,ADM3_EN,ADM3_PCODE,Employment category,access_time
0,Chakaria,202216,current_cxb_avg_pc_agg_empl,81.269461
1,Cox's Bazar Sadar,202224,current_cxb_avg_pc_agg_empl,32.610011
2,Kutubdia,202245,current_cxb_avg_pc_agg_empl,159.841509
3,Maheshkhali,202249,current_cxb_avg_pc_agg_empl,95.59398
4,Pekua,202256,current_cxb_avg_pc_agg_empl,106.443999


In [55]:
empl_pc_rename = {"avg_pc_agg_empl":"Agricultural","avg_pc_ind_empl":"Industry","avg_pc_ser_empl":"Service"}

In [56]:
for idx, row in empl_pc_access_upz_long.iterrows():
#     print(row)
#     print(row['Employment category'])
    ele = re.split('_',row['Employment category'])
    
#     print(ele)

    if len(ele) >= 7:

        empl_pc_access_upz_long.loc[idx,'scen_dest'] = ele[0] + '_' + ele[1] + '_' + ele[2]
        empl_pc_access_upz_long.loc[idx,'Employment category'] = ele[3] + '_' + ele[4] + '_' + ele[5] + '_' + ele[6]

    else:
        empl_pc_access_upz_long.loc[idx,'scen_dest'] = ele[0] + '_' + ele[1]
        empl_pc_access_upz_long.loc[idx,'Employment category'] = ele[2] + '_' + ele[3] + '_' + ele[4] + '_' + ele[5]
    
empl_pc_access_upz_long['Employment category'] = empl_pc_access_upz_long['Employment category'].map(empl_pc_rename)

In [57]:
empl_pc_access_upz_long.head(-10)

Unnamed: 0,ADM3_EN,ADM3_PCODE,Employment category,access_time,scen_dest
0,Chakaria,202216,Agricultural,81.269461,current_cxb
1,Cox's Bazar Sadar,202224,Agricultural,32.610011,current_cxb
2,Kutubdia,202245,Agricultural,159.841509,current_cxb
3,Maheshkhali,202249,Agricultural,95.593980,current_cxb
4,Pekua,202256,Agricultural,106.443999,current_cxb
...,...,...,...,...,...
849,Cox's Bazar Sadar,202224,Industry,10.943171,unf_growthcenters
850,Kutubdia,202245,Industry,19.482468,unf_growthcenters
851,Maheshkhali,202249,Industry,15.769390,unf_growthcenters
852,Pekua,202256,Industry,23.238038,unf_growthcenters


#### Per education level

Check out the adm4 data

In [3]:
# pd.set_option('display.max_columns', None) # for debuggging 
# adm4.head(2)

Create population totals per education level and adm3 aggregates for weighting

In [59]:
# convert education level population shares to population totals

adm4['educ_non_n'] = adm4['Education level Share of people No education 15 years plus'] * adm4['hrsl18sum']
adm4['educ_prim_n'] = adm4['Education level Share of people Primary 15 years plus'] * adm4['hrsl18sum']
adm4['educ_lowsec_n'] = adm4['Education level Share of people Lower Secondary 15 years plus'] * adm4['hrsl18sum']
adm4['educ_sec_n'] = adm4['Education level Share of people Secondary 15 years plus'] * adm4['hrsl18sum']
adm4['educ_highsec_n'] = adm4['Education level Share of people Higher Secondary 15 years plus'] * adm4['hrsl18sum']
adm4['educ_tert_n'] = adm4['Education level Share of people Graduate & Above 15 years plus'] * adm4['hrsl18sum']

In [60]:
# Generate adm3 populations for weighting

adm4['adm3_educ_non_n'] = adm4.groupby('ADM3_PCODE')['educ_non_n'] .transform(np.sum)
adm4['adm3_educ_prim_n'] = adm4.groupby('ADM3_PCODE')['educ_prim_n'].transform(np.sum)
adm4['adm3_educ_lowsec_n'] = adm4.groupby('ADM3_PCODE')['educ_lowsec_n'].transform(np.sum)
adm4['adm3_educ_sec_n'] = adm4.groupby('ADM3_PCODE')['educ_sec_n'].transform(np.sum)
adm4['adm3_educ_highsec_n'] = adm4.groupby('ADM3_PCODE')['educ_highsec_n'].transform(np.sum)
adm4['adm3_educ_tert_n'] = adm4.groupby('ADM3_PCODE')['educ_tert_n'].transform(np.sum)

Create a new long dataset to populate with weighted accessibility figures per education level

In [61]:
# educ_data_union = {'Education level': ['No education','Primary','Low Secondary','Secondary', 'Higher Secondary','University']}
# educ_data_union = pd.DataFrame(educ_data_union)

educ_data_union_list = ['No education','Primary','Lower Secondary','Secondary', 'Higher Secondary','University']
adm3_names = adm3.ADM3_EN.unique().tolist()

In [62]:
all_scen_educ = {k: adm3_names for k, v in all_scenarios.items()}

all_scen_educ_df = pd.DataFrame(
    [(k,val) for k, vals in all_scen_educ.items() for val in vals], 
    columns=['scenario','ADM3_EN']
)


Group by (split), apply DF of list, then combine below

Good split-apply-combine walkthrough : https://towardsdatascience.com/how-to-use-the-split-apply-combine-strategy-in-pandas-groupby-29e0eb44b62e

In [63]:
t = all_scen_educ_df.groupby(['scenario','ADM3_EN'],axis=0).apply(lambda x: pd.DataFrame(educ_data_union_list)) 
all_scen_educ_df = pd.DataFrame(t.reset_index()).drop(columns='level_2',axis=1).rename(columns = { 0 : 'educ_level'})

In [64]:
all_scen_educ_df

Unnamed: 0,scenario,ADM3_EN,educ_level
0,current_allmkts,Chakaria,No education
1,current_allmkts,Chakaria,Primary
2,current_allmkts,Chakaria,Lower Secondary
3,current_allmkts,Chakaria,Secondary
4,current_allmkts,Chakaria,Higher Secondary
...,...,...,...
1723,uns_tertiary_ed,Ukhia,Primary
1724,uns_tertiary_ed,Ukhia,Lower Secondary
1725,uns_tertiary_ed,Ukhia,Secondary
1726,uns_tertiary_ed,Ukhia,Higher Secondary


Long dataset method proved difficult to assign weighted values per admin-scenario-education level, therefore I went with the tried-and-true method below

In [65]:
for key, layer in all_scenarios.items():
    
    at = (key + '_avg_time')

    nw = (key + '_avg_pc_none_wt')
    pw = (key + '_avg_pc_prim_wt')
    lsw = (key + '_avg_pc_low_secon_wt')
    sw = (key + '_avg_pc_secon_wt')
    hsw = (key + '_avg_pc_high_secon_wt')
    uw = (key + '_avg_pc_uni_wt')
    
    adm4[nw] = ( adm4[at] * (adm4['educ_non_n'] / adm4['adm3_educ_non_n']) )
    adm4[pw] = ( adm4[at] * (adm4['educ_prim_n'] / adm4['adm3_educ_prim_n']) )
    adm4[lsw] = ( adm4[at] * (adm4['educ_lowsec_n'] / adm4['adm3_educ_lowsec_n']) )
    adm4[sw] = ( adm4[at] * (adm4['educ_sec_n'] / adm4['adm3_educ_sec_n']) )
    adm4[hsw] = ( adm4[at] * (adm4['educ_highsec_n'] / adm4['adm3_educ_highsec_n']) )
    adm4[uw] = ( adm4[at] * (adm4['educ_tert_n'] / adm4['adm3_educ_tert_n']) )


In [66]:
educ_access_upz = adm4.groupby(['ADM3_EN','ADM3_PCODE']).size().reset_index() # reset_index seems to spring it back into a DataFrame, which we need for the join
educ_access_upz = educ_access_upz[['ADM3_EN','ADM3_PCODE']] # get rid of 0 column

for key, layer in all_scenarios.items():

    at = key + '_avg_time'
    
    nw = (key + '_avg_pc_none_wt')
    pw = (key + '_avg_pc_prim_wt')
    lsw = (key + '_avg_pc_low_secon_wt')
    sw = (key + '_avg_pc_secon_wt')
    hsw = (key + '_avg_pc_high_secon_wt')
    uw = (key + '_avg_pc_uni_wt')
    
    none = (key + '_No education')
    prim = (key + '_Primary')
    low_secon = (key + '_Lower Secondary')
    secon = (key + '_Secondary')
    high_secon = (key + '_Higher Secondary')
    uni = (key + '_University')
    
#     print(at)
    
    temp = adm4.groupby(['ADM3_EN','ADM3_PCODE']).agg(
        
        none = (nw,sum),
        prim = (pw,sum),
        low_secon = (lsw,sum),
        secon = (sw,sum),
        high_secon = (hsw,sum),
        uni = (uw,sum)
        
            )

# #     educ_map = {'No education' : none, 'Primary' : prim, 'Lower Secondary' : low_secon, 'Secondary' : secon, 'Higher Secondary' : high_secon, 'University' : uni}
    
    temp = temp.rename(columns={'none' : none, 'prim' : prim, 'low_secon' : low_secon, 'secon' : secon, 'high_secon' : high_secon, 'uni' : uni})
    
    educ_access_upz = pd.merge(educ_access_upz,temp,how='left',on=['ADM3_EN','ADM3_PCODE'])

In [67]:
educ_access_upz_long = pd.melt(educ_access_upz,id_vars=['ADM3_EN','ADM3_PCODE'],
                        var_name='Education level',value_name='access_time')

for idx, row in educ_access_upz_long.iterrows():

    ele = re.split('_',row['Education level'])
    
#     print(ele)

    if len(ele) >= 4:

        educ_access_upz_long.loc[idx,'scen_dest'] = ele[0] + '_' + ele[1] + '_' + ele[2]
        educ_access_upz_long.loc[idx,'Education level'] = ele[3]

    else:
        educ_access_upz_long.loc[idx,'scen_dest'] = ele[0] + '_' + ele[1]
        educ_access_upz_long.loc[idx,'Education level'] = ele[2]
    

In [68]:
educ_access_upz_long.head(-10)

Unnamed: 0,ADM3_EN,ADM3_PCODE,Education level,access_time,scen_dest
0,Chakaria,202216,No education,80.277132,current_cxb
1,Cox's Bazar Sadar,202224,No education,24.392905,current_cxb
2,Kutubdia,202245,No education,160.246913,current_cxb
3,Maheshkhali,202249,No education,119.457076,current_cxb
4,Pekua,202256,No education,116.660360,current_cxb
...,...,...,...,...,...
1713,Cox's Bazar Sadar,202224,Higher Secondary,8.856306,unf_growthcenters
1714,Kutubdia,202245,Higher Secondary,16.524617,unf_growthcenters
1715,Maheshkhali,202249,Higher Secondary,28.455364,unf_growthcenters
1716,Pekua,202256,Higher Secondary,29.678127,unf_growthcenters


## Exports

In [69]:
# shp export


# cxb_adm3.to_file(r'results/spatial/cxb_upz_summary.gpkg',driver="GPKG")
# cxb_adm4.to_file(r'results/spatial/cxb_union_summary.gpkg',driver="GPKG")

adm3.to_file(r'results/spatial/adm3_summary.gpkg',driver="GPKG")
adm4.to_file(r'results/spatial/adm4_summary.gpkg',driver="GPKG")

CPLE_AppDefinedError: b'sqlite3_exec(CREATE VIRTUAL TABLE "rtree_adm3_summary_geom" USING rtree(id, minx, maxx, miny, maxy)) failed: no such module: rtree'

Exception ignored in: 'fiona._shim.gdal_flush_cache'
Traceback (most recent call last):
  File "fiona/_err.pyx", line 201, in fiona._err.GDALErrCtxManager.__exit__
fiona._err.CPLE_AppDefinedError: b'sqlite3_exec(CREATE VIRTUAL TABLE "rtree_adm3_summary_geom" USING rtree(id, minx, maxx, miny, maxy)) failed: no such module: rtree'


CPLE_AppDefinedError: b'sqlite3_exec(CREATE VIRTUAL TABLE "rtree_adm4_summary_geom" USING rtree(id, minx, maxx, miny, maxy)) failed: no such module: rtree'

Exception ignored in: 'fiona._shim.gdal_flush_cache'
Traceback (most recent call last):
  File "fiona/_err.pyx", line 201, in fiona._err.GDALErrCtxManager.__exit__
fiona._err.CPLE_AppDefinedError: b'sqlite3_exec(CREATE VIRTUAL TABLE "rtree_adm4_summary_geom" USING rtree(id, minx, maxx, miny, maxy)) failed: no such module: rtree'


In [70]:
# csv export

## to results folder
### geometries enlarge tabular exports so I'm dropping them

adm3.loc[:, adm3.columns != 'geometry'].to_csv(r'results/tables/cxb_adm3.csv')
adm4.loc[:, adm4.columns != 'geometry'].to_csv(r'results/tables/cxb_adm4.csv')

timecats_merged.to_csv(r'results/tables/adm2_time_categories.csv')
timecats_merged_tu.to_csv(r'results/tables/adm2_time_categories_tu.csv')
# chitt_time_cat.to_csv(r'results/tables/adm2_time_categories_chittagong.csv')

educ_data.to_csv(r'results/tables/adm2_access_by_educ_level.csv')
econC_empl_data.to_csv(r'results/tables/adm2_access_by_empl_type_EconCensus.csv')
Census_empl_data.to_csv(r'results/tables/adm2_access_by_empl_type_2011Census.csv')

empl_access_upz.to_csv(r'results/tables/adm3_pop_weighted_access_by_empl_type_upz_EconCensus.csv')
empl_access_upz_long.to_csv(r'results/tables/adm3_pop_weighted_access_by_empl_type_upz_EconCensus_LONG.csv')
empl_pc_access_upz_long.to_csv(r'results/tables/adm3_pop_weighted_access_by_empl_type_upz_PopCensus_LONG.csv')
educ_access_upz_long.to_csv(r'results/tables/adm3_pop_weighted_access_by_educ_level_upz_PopCensus_LONG.csv')
