In [1]:
from google.colab import drive
drive.mount('/content/drive')

program_location = '/content/drive/MyDrive/Colab Notebooks/PTI'

Mounted at /content/drive


# **0. PREPARATION**

### 0.1 IMPORTANT: Check kernel and environment prior to run
The conda environment for these scripts is available on GitHub at https://github.com/wbPTI/sahelPTIdata/tree/main/vignettes/pti_env.yml.
<br><br>Once you've activated the PTI environment and loaded this Jupyter Notebook script, make sure it is using the "pti" kernel (see top right, under Logout button). If it is not, you must navigate to the top ribbon and go to: **Kernel -> Change Kernel -> pti** prior to running.

### 0.2 Expected inputs

This script uses admin areas, settlements, and population rasters.

##### ADMIN AREAS: Source file type: geopackage.
The script as currently written is NOT flexible with admin area inputs. It must take a geopackage named AdminBoundaries.gpkg. This geopackage must contain 3 layers named __adm3__, __adm2__, and __adm1__. The __adm3__ layer must have a unique ID named _ADM3_CODE_, as well as codes for the two other admin sets (_ADM2_CODE_, _ADM1_CODE_).

##### SETTLEMENTS: Source file type: 2 or more shapefiles.
This script is designed for multiple GRID3 settlement extents. To work with a single GRID3 source file, User must comment out the code blocks in 2.1 up to the pd.concat() code block, then uncomment the code block following.
<br>A field named _pop_un_adj_ is required to successfully create outputs. The field _type_ is not required, but the script will fail unless User manually removes references to it in Section 2.1.

##### POPULATION: Source file type: Between 2-7 rasters in .tif.
Rasters are expected to have different areas of interest, such as one for each country in the region, and have the same specifications (resolution, CRS, etc.).
<br>Script was designed to use WorldPop 2020 UN-Adjusted Constrained datasets. In theory, however, the code should work for any population source.

### 0.3 Outputs

This script produces degrees of urbanization (UrbanIndicators_ADM.csv) and population (Population_ADM.csv) indicators per admin area. It also creates useful intermediate outputs: i) variations of the GRID3 settlement data as layers in a geopackage, and ii) a mosaic of the input WorldPop rasters.

### 0.4 Pseudocode (outline of the following script)

##### GRID3
Merge: GRID3_BFA, GRID3_MLI, GRID3_NER, GRID3_TCD.
<br> Calculate population density of settlements.
<br> Spatial join ADM3 (with ADM2 and 1 codes) onto merged GRID3. Save to file as G3_ply.
<br> Convert GRID3 geometries (WGS84) to centroids. Save to file as G3_pt.
<br> Export GRID3 (polygons) without Built-Up Area class = G3_rural_ply
<br> Convert G3_rural_ply geometries (WGS84) to centroids. Save to file as G3_rural_pt.
<br> Summary stats: population density of inhabited areas by ADM
<br> Summary stats: population density of HD urban by ADM
<br> Summary stats: population density of SD urban by ADM

##### WorldPop
Mosaic: WP_BFA, WP_MLI, WP_NER, WP_TCD. Save to file.
<br> Zonal statistics (sum) population in ADM3 zones.
<br> Remove geometries. Group-by sum population by ADM2 and ADM1. Save to file.

---
# **1. SET-UP**

##### These packages are used across the range of Sahel PTI indicator preparations. Not all packages may be used in present script.

In [2]:
!pip install geopandas rioxarray richdem geemap rasterio import_ipynb pyshp pycrs pyogrio rasterstats

Collecting rioxarray
  Downloading rioxarray-0.15.0-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.7/53.7 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting richdem
  Downloading richdem-0.3.4.tar.gz (329 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.4/329.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rasterio
  Downloading rasterio-1.3.9-cp310-cp310-manylinux2014_x86_64.whl (20.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.6/20.6 MB[0m [31m67.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting import_ipynb
  Downloading import_ipynb-0.1.4-py3-none-any.whl (4.1 kB)
Collecting pycrs
  Downloading PyCRS-1.0.2.tar.gz (36 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyogrio
  Downloading pyogrio-0.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (22.0 MB)
[2K     [90m━━━━━━

In [3]:
# Colab default packages
import io, os, sys, glob, re, time, subprocess, string, types, json, fiona
from os.path import exists

import pandas as pd
import importlib
import matplotlib.pyplot as plt
import numpy as np

import ee
from osgeo import gdal, osr, ogr, gdal_array, gdalconst

from sklearn.neighbors import KDTree

from shapely import geometry
from shapely.geometry import Point, LineString, Polygon, shape, MultiPoint, mapping
from shapely.ops import cascaded_union
from shapely.validation import make_valid
import shapely.wkt

from functools import reduce


# Temporarily installed packages
import geopandas as gpd
import rioxarray

import rasterio
from rasterio.plot import show
from rasterio import features # features.rasterize()
from rasterio.features import shapes
from rasterio import mask # rasterio.mask.mask()
from rasterio.enums import Resampling # rasterio.enums.Resampling()

import richdem as rd
import geemap
import pyogrio
import pycrs
import import_ipynb

from rasterstats import zonal_stats


# Import external files
os.chdir(program_location)
!pwd
import config

sys.path.append(program_location)

import tools
importlib.reload(tools)

/content/drive/MyDrive/Colab Notebooks/PTI
importing Jupyter notebook from config.ipynb


<module 'tools' from '/content/drive/MyDrive/Colab Notebooks/PTI/tools.py'>

In [4]:
data_loc = os.path.join(os.getcwd(), 'data', config.ISO)
print(data_loc)

# Auxilliary sources
G3_Fd = os.path.join(data_loc, 'GRID3')
ADM_Fd = os.path.join(data_loc, 'ADM')
WP_Fd = os.path.join(data_loc, 'WorldPop')
Intermediate_Fd = os.path.join(G3_Fd, 'Intermediate')
src_GRID3_results = os.path.join(G3_Fd, 'Results')
src_wPop_results = os.path.join(WP_Fd, 'Results')

# Generate folders if not exist:
if not os.path.exists(Intermediate_Fd):
    os.mkdir(Intermediate_Fd)

if not os.path.exists(src_GRID3_results):
    os.mkdir(src_GRID3_results)

if not os.path.exists(src_wPop_results):
    os.mkdir(src_wPop_results)

/content/drive/MyDrive/Colab Notebooks/PTI/data/CPV


---
# **2. GRID3**

## **2.1 pre-process GRID3 layers**

In [5]:
# Read target GRID3 file
G3_file = os.path.join(G3_Fd, 'Source', config.original_GRID3_fil)
G3 = tools.vec_import(config.RW_engine, G3_file)

print(G3.info(), '\n\n', G3['type'].unique(), '\n\n', G3.head(10))


# Organize necessary cols
G3['G3_ID'] = G3.index
G3 = G3[['G3_ID','type', 'pop_un_adj', 'geometry']]


# Make sure unmeasured populations don't give us trouble.
G3[['pop_un_adj']] = G3[['pop_un_adj']].fillna(0)


G3.info()

Vector import complete.
GDF size:5711
EPSG:4326
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 5711 entries, 0 to 5710
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   mgrs_code   5711 non-null   object  
 1   country     5711 non-null   object  
 2   iso         5711 non-null   object  
 3   type        5711 non-null   object  
 4   population  5711 non-null   float64 
 5   pop_un_adj  5711 non-null   float64 
 6   adm0_pcode  5711 non-null   object  
 7   adm1_name   5711 non-null   object  
 8   adm1_pcode  5711 non-null   object  
 9   adm2_name   5711 non-null   object  
 10  adm2_pcode  5711 non-null   object  
 11  settl_pcod  5711 non-null   object  
 12  Shape_Leng  5711 non-null   float64 
 13  Shape_Area  5711 non-null   float64 
 14  geometry    5711 non-null   geometry
dtypes: float64(4), geometry(1), object(10)
memory usage: 669.4+ KB
None 

 ['Built-up Area' 'Small Settlement Area' 'Hamle

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


## **2.2. Calculate attributes**

In [6]:
# Population density
G3['areaKM'] = G3.to_crs('ESRI:102022')['geometry'].area / 10**6 # Calculate area in square kilometers
G3['popDens'] = G3['pop_un_adj'] / G3['areaKM']


# Degrees of urbanization
G3['UrbanClass'] = 'Low Density'
G3.loc[(G3['pop_un_adj'] >= 5000) & (G3['popDens'] >= 300), 'UrbanClass'] = 'Semi-Dense Urban'
G3.loc[(G3['pop_un_adj'] >= 50000) & (G3['popDens'] >= 1500), 'UrbanClass'] = 'High-Density Urban'


# Check
G3.sample(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Unnamed: 0,G3_ID,type,pop_un_adj,geometry,areaKM,popDens,UrbanClass
4026,4026,Hamlet,9.365769,"POLYGON ((-22.79425 16.16680, -22.79468 16.166...",0.012873,727.572143,Low Density
4601,4601,Hamlet,23.282199,"POLYGON ((-24.96398 16.86835, -24.96420 16.868...",0.016671,1396.596282,Low Density
1305,1305,Hamlet,67.787712,"POLYGON ((-24.29145 14.95331, -24.29105 14.953...",0.010945,6193.6337,Low Density
3492,3492,Hamlet,4.381683,"POLYGON ((-23.65066 15.19990, -23.65057 15.200...",0.018548,236.23305,Low Density
816,816,Hamlet,1.043916,"POLYGON ((-24.36410 14.89137, -24.36370 14.891...",0.010941,95.412916,Low Density
1259,1259,Hamlet,1.796701,"POLYGON ((-24.43055 14.94795, -24.43085 14.948...",0.02921,61.509779,Low Density
2084,2084,Hamlet,20.87591,"POLYGON ((-23.52768 15.02209, -23.52742 15.022...",0.048044,434.517174,Low Density
4218,4218,Hamlet,6.627645,"POLYGON ((-24.23027 16.63023, -24.23042 16.630...",0.020141,329.069689,Low Density
3263,3263,Hamlet,1.942121,"POLYGON ((-23.63181 15.16650, -23.63141 15.166...",0.010958,177.240303,Low Density
4666,4666,Hamlet,22.771848,"POLYGON ((-24.96892 16.91139, -24.96882 16.911...",0.013404,1698.880164,Low Density


## **2.3. Merge GRID3-AMD**

### 2.3.1 Generate centroids

In [7]:
G3_pt = G3.copy()

G3_pt['centroid'] = G3_pt.to_crs('ESRI:102022').geometry.centroid
G3_pt = G3_pt.set_geometry('centroid').drop(['geometry'], axis=1)

G3_pt.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 5711 entries, 0 to 5710
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   G3_ID       5711 non-null   int64   
 1   type        5711 non-null   object  
 2   pop_un_adj  5711 non-null   float64 
 3   areaKM      5711 non-null   float64 
 4   popDens     5711 non-null   float64 
 5   UrbanClass  5711 non-null   object  
 6   centroid    5711 non-null   geometry
dtypes: float64(3), geometry(1), int64(1), object(2)
memory usage: 312.4+ KB


### 2.3.2 Merge GRID3 and ADMs

In [8]:
# Read the original ADM3 layer for the target country
ADM3 = tools.vec_import(config.RW_engine, os.path.join(ADM_Fd, 'Source', config.original_adm3_fil))


# Change the name of ADM3 code col to an appropriate one (i.e., ADM3_CODE, ADM2_CODE, and ADM1_CODE) defined by l_replace.
ADM3 = ADM3.rename(columns = config.l_replace)


# CRS check for spatial join
if ADM3.crs != G3_pt.crs:
  ADM3.to_crs(G3_pt.crs, inplace = True)

Vector import complete.
GDF size:32
EPSG:4326


##### Spatial join with the point version of GRID3 for a one-to-one result.

In [9]:
G3_pt = gpd.sjoin(G3_pt, ADM3, how='left', predicate='intersects')
G3_pt = G3_pt[~G3_pt.index.duplicated(keep='first')] # The only duplicates were ~2 dozen hamlets with very small populations.

print(G3_pt.sample(5), G3_pt.info())


try:
    G3_pt = G3_pt.drop(['index_right'], axis=1)
except:
    pass

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 5711 entries, 0 to 5710
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   G3_ID        5711 non-null   int64   
 1   type         5711 non-null   object  
 2   pop_un_adj   5711 non-null   float64 
 3   areaKM       5711 non-null   float64 
 4   popDens      5711 non-null   float64 
 5   UrbanClass   5711 non-null   object  
 6   centroid     5711 non-null   geometry
 7   index_right  5703 non-null   float64 
 8   ADM3_NAME    5703 non-null   object  
 9   ADM0_CODE    5703 non-null   object  
 10  ADM1_CODE    5703 non-null   object  
 11  ADM2_CODE    5703 non-null   object  
 12  ADM3_CODE    5703 non-null   object  
dtypes: float64(4), geometry(1), int64(1), object(7)
memory usage: 624.6+ KB
      G3_ID                   type   pop_un_adj    areaKM      popDens  \
2001   2001                 Hamlet     3.415849  0.014527   235.145218   
779     779    

##### Table join to assign ADMs onto polygon version.

In [10]:
G3 = G3.merge(G3_pt[['G3_ID','ADM1_CODE', 'ADM2_CODE', 'ADM3_CODE']], how='left', on='G3_ID')

G3.info()
G3.sample(10)

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 5711 entries, 0 to 5710
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   G3_ID       5711 non-null   int64   
 1   type        5711 non-null   object  
 2   pop_un_adj  5711 non-null   float64 
 3   geometry    5711 non-null   geometry
 4   areaKM      5711 non-null   float64 
 5   popDens     5711 non-null   float64 
 6   UrbanClass  5711 non-null   object  
 7   ADM1_CODE   5703 non-null   object  
 8   ADM2_CODE   5703 non-null   object  
 9   ADM3_CODE   5703 non-null   object  
dtypes: float64(3), geometry(1), int64(1), object(5)
memory usage: 490.8+ KB


Unnamed: 0,G3_ID,type,pop_un_adj,geometry,areaKM,popDens,UrbanClass,ADM1_CODE,ADM2_CODE,ADM3_CODE
3667,3667,Hamlet,180.85353,"POLYGON ((-23.70680 15.23822, -23.70592 15.238...",0.127642,1416.877358,Low Density,CPV006,CPV006009,CPV0060090024
3434,3434,Hamlet,4.892715,"POLYGON ((-23.11576 15.18060, -23.11536 15.180...",0.010958,446.481168,Low Density,CPV005,CPV005008,CPV0050080014
1541,1541,Hamlet,1.993888,"POLYGON ((-24.47000 14.98488, -24.46964 14.985...",0.01305,152.79217,Low Density,CPV007,CPV007019,CPV0070190017
104,104,Small Settlement Area,93.442588,"POLYGON ((-24.38678 15.00916, -24.38715 15.008...",0.721074,129.587994,Low Density,CPV007,CPV007019,CPV0070190017
1733,1733,Hamlet,4.295703,"POLYGON ((-24.31548 15.00252, -24.31557 15.002...",0.029582,145.214383,Low Density,CPV007,CPV007018,CPV0070180018
3146,3146,Hamlet,7.644001,"POLYGON ((-23.67097 15.15354, -23.67118 15.153...",0.024028,318.123754,Low Density,CPV006,CPV006010,CPV0060100026
896,896,Hamlet,1.338534,"POLYGON ((-24.47381 14.90333, -24.47387 14.903...",0.014838,90.208227,Low Density,CPV007,CPV007019,CPV0070190016
2061,2061,Hamlet,2.470562,"POLYGON ((-24.33718 15.03601, -24.33678 15.035...",0.01095,225.628479,Low Density,CPV007,CPV007018,CPV0070180018
2233,2233,Hamlet,3.280875,"POLYGON ((-23.53998 15.04364, -23.53958 15.043...",0.01095,299.620007,Low Density,CPV006,CPV006013,CPV0060130022
3329,3329,Hamlet,1.3864,"POLYGON ((-23.65001 15.17670, -23.64961 15.176...",0.010958,126.517429,Low Density,CPV006,CPV006014,CPV0060140025


### 2.6 Rural-only

In [11]:
print(G3['UrbanClass'].unique(), '\n\n', G3_pt['UrbanClass'].unique())

['Low Density' 'Semi-Dense Urban' 'High-Density Urban'] 

 ['Low Density' 'Semi-Dense Urban' 'High-Density Urban']


In [12]:
# Process GRID3
G3['UrbanClass'] = G3['UrbanClass'].astype(str)
G3_rural = G3[G3['UrbanClass'].str.startswith("L")] # Remove all cities and towns
print(G3_rural['UrbanClass'].unique())

# Process GRID3 Point
G3_pt['UrbanClass'] = G3_pt['UrbanClass'].astype(str)
G3_rural_pt = G3_pt[G3_pt['UrbanClass'].str.startswith("L")]
print(G3_rural_pt['UrbanClass'].unique())

['Low Density']
['Low Density']


In [13]:
# Save file of all settlement types.

dic_gdf_export = {'G3_pt': G3_pt, 'G3_ply' : G3, 'SSA_HA_ply' : G3_rural, 'SSA_HA_pt' : G3_rural_pt}

for name_temp, gdf_temp in dic_gdf_export.items():

  print('Exporting {}.shp'.format(name_temp))
  temp_out = os.path.join(Intermediate_Fd, name_temp + '.shp')
  print('Path = {}'.format(temp_out))

  tools.vec_export(config.RW_engine, gdf_temp, temp_out)


print('Export SHP complete.')

Exporting G3_pt.shp
Path = /content/drive/MyDrive/Colab Notebooks/PTI/data/CPV/GRID3/Intermediate/G3_pt.shp
Vector export complete.
Exporting G3_ply.shp
Path = /content/drive/MyDrive/Colab Notebooks/PTI/data/CPV/GRID3/Intermediate/G3_ply.shp
Vector export complete.
Exporting SSA_HA_ply.shp
Path = /content/drive/MyDrive/Colab Notebooks/PTI/data/CPV/GRID3/Intermediate/SSA_HA_ply.shp
Vector export complete.
Exporting SSA_HA_pt.shp
Path = /content/drive/MyDrive/Colab Notebooks/PTI/data/CPV/GRID3/Intermediate/SSA_HA_pt.shp
Vector export complete.
Export SHP complete.


### 2.7 Summary stats on ADM

##### Prep dataframes

In [14]:
UrbClass = {'LD':'Low Density', 'SD':'Semi-Dense Urban', 'HD':'High-Density Urban'}
G3_df = pd.DataFrame(G3).drop(columns = 'geometry')

G3_df.sample(10)

Unnamed: 0,G3_ID,type,pop_un_adj,areaKM,popDens,UrbanClass,ADM1_CODE,ADM2_CODE,ADM3_CODE
1884,1884,Hamlet,5.827207,0.010948,532.27747,Low Density,CPV006,CPV006013,CPV0060130022
1385,1385,Hamlet,6.008947,0.010945,549.037792,Low Density,CPV006,CPV006017,CPV0060170031
1657,1657,Hamlet,2.300083,0.015267,150.655198,Low Density,CPV007,CPV007019,CPV0070190017
905,905,Hamlet,24.079666,0.13906,173.160387,Low Density,CPV007,CPV007019,CPV0070190016
1298,1298,Hamlet,1.221387,0.013423,90.989401,Low Density,CPV007,CPV007019,CPV0070190017
1646,1646,Hamlet,2.716113,0.012641,214.866841,Low Density,CPV007,CPV007018,CPV0070180018
459,459,Small Settlement Area,1477.590638,1.172653,1260.041263,Low Density,CPV000,CPV000000,CPV0000000002
1800,1800,Hamlet,6.460883,0.032161,200.889962,Low Density,CPV007,CPV007018,CPV0070180018
2178,2178,Hamlet,3.079067,0.01095,281.200034,Low Density,CPV006,CPV006016,CPV0060160028
3062,3062,Hamlet,5.422624,0.010956,494.930229,Low Density,CPV006,CPV006010,CPV0060100026


In [15]:
AllSummaries = pd.DataFrame(ADM3)[['ADM3_CODE', 'ADM2_CODE', 'ADM1_CODE']]

AllSummaries.sample(10)

Unnamed: 0,ADM3_CODE,ADM2_CODE,ADM1_CODE
2,CPV0000000002,CPV000000,CPV000
14,CPV0050080014,CPV005008,CPV005
19,CPV0080210019,CPV008021,CPV008
20,CPV0080210020,CPV008021,CPV008
28,CPV0060160028,CPV006016,CPV006
5,CPV0000000005,CPV000000,CPV000
25,CPV0060140025,CPV006014,CPV006
21,CPV0060130021,CPV006013,CPV006
3,CPV0000000003,CPV000000,CPV000
24,CPV0060090024,CPV006009,CPV006


#### 2.7.1 Number of settlements in each urban class

In [16]:
for ADM in ['ADM1', 'ADM2', 'ADM3']:
    for key in UrbClass:

        ADM_ID = ''.join([ADM, '_CODE'])

        df = G3_df[['UrbanClass', ADM_ID]]
        df = df.loc[df['UrbanClass'] == UrbClass[key]]
        GroupedVals = df.groupby(ADM_ID, as_index=False).count()
        GroupedVals = GroupedVals.rename(columns={'UrbanClass':''.join([key, 'ct_', ADM])})

        AllSummaries = AllSummaries.merge(GroupedVals, on=ADM_ID, how='left').fillna(0)

AllSummaries

Unnamed: 0,ADM3_CODE,ADM2_CODE,ADM1_CODE,LDct_ADM1,SDct_ADM1,HDct_ADM1,LDct_ADM2,SDct_ADM2,HDct_ADM2,LDct_ADM3,SDct_ADM3,HDct_ADM3
0,CPV0000020000,CPV000002,CPV000,1116,0.0,0.0,670,0.0,0.0,474,0.0,0.0
1,CPV0000020001,CPV000002,CPV000,1116,0.0,0.0,670,0.0,0.0,196,0.0,0.0
2,CPV0000000002,CPV000000,CPV000,1116,0.0,0.0,320,0.0,0.0,139,0.0,0.0
3,CPV0000000003,CPV000000,CPV000,1116,0.0,0.0,320,0.0,0.0,87,0.0,0.0
4,CPV0000010004,CPV000001,CPV000,1116,0.0,0.0,126,0.0,0.0,126,0.0,0.0
5,CPV0000000005,CPV000000,CPV000,1116,0.0,0.0,320,0.0,0.0,83,0.0,0.0
6,CPV0000000006,CPV000000,CPV000,1116,0.0,0.0,320,0.0,0.0,11,0.0,0.0
7,CPV0030060007,CPV003006,CPV003,191,2.0,0.0,191,2.0,0.0,191,2.0,0.0
8,CPV0040070008,CPV004007,CPV004,232,1.0,0.0,232,1.0,0.0,150,1.0,0.0
9,CPV0040070009,CPV004007,CPV004,232,1.0,0.0,232,1.0,0.0,82,0.0,0.0


#### 2.7.2 Average population density of each urban class

In [17]:
for ADM in ['ADM1', 'ADM2', 'ADM3']:
    for key in UrbClass:

        ADM_ID = ''.join([ADM, '_CODE'])

        df = G3_df.loc[G3_df['UrbanClass'] == UrbClass[key]]
        df = df[['popDens', ADM_ID]]
        GroupedVals = df.groupby(ADM_ID, as_index=False).mean()
        GroupedVals = GroupedVals.rename(columns={'popDens':''.join([key, 'avgDens_', ADM])})

        AllSummaries = AllSummaries.merge(GroupedVals, on=ADM_ID, how='left')
        # NA is meaningful. (It means there are no settlements of that class). So this one won't use .fillna(0).

AllSummaries

Unnamed: 0,ADM3_CODE,ADM2_CODE,ADM1_CODE,LDct_ADM1,SDct_ADM1,HDct_ADM1,LDct_ADM2,SDct_ADM2,HDct_ADM2,LDct_ADM3,...,HDct_ADM3,LDavgDens_ADM1,SDavgDens_ADM1,HDavgDens_ADM1,LDavgDens_ADM2,SDavgDens_ADM2,HDavgDens_ADM2,LDavgDens_ADM3,SDavgDens_ADM3,HDavgDens_ADM3
0,CPV0000020000,CPV000002,CPV000,1116,0.0,0.0,670,0.0,0.0,474,...,0.0,537.084511,,,152.434999,,,157.781755,,
1,CPV0000020001,CPV000002,CPV000,1116,0.0,0.0,670,0.0,0.0,196,...,0.0,537.084511,,,152.434999,,,139.50458,,
2,CPV0000000002,CPV000000,CPV000,1116,0.0,0.0,320,0.0,0.0,139,...,0.0,537.084511,,,801.181315,,,690.069883,,
3,CPV0000000003,CPV000000,CPV000,1116,0.0,0.0,320,0.0,0.0,87,...,0.0,537.084511,,,801.181315,,,809.650411,,
4,CPV0000010004,CPV000001,CPV000,1116,0.0,0.0,126,0.0,0.0,126,...,0.0,537.084511,,,1911.720983,,,1911.720983,,
5,CPV0000000005,CPV000000,CPV000,1116,0.0,0.0,320,0.0,0.0,83,...,0.0,537.084511,,,801.181315,,,967.352304,,
6,CPV0000000006,CPV000000,CPV000,1116,0.0,0.0,320,0.0,0.0,11,...,0.0,537.084511,,,801.181315,,,884.407273,,
7,CPV0030060007,CPV003006,CPV003,191,2.0,0.0,191,2.0,0.0,191,...,0.0,765.60904,1997.796212,,765.60904,1997.796212,,765.60904,1997.796212,
8,CPV0040070008,CPV004007,CPV004,232,1.0,0.0,232,1.0,0.0,150,...,0.0,1120.186467,3554.12783,,1120.186467,3554.12783,,1264.233163,3554.12783,
9,CPV0040070009,CPV004007,CPV004,232,1.0,0.0,232,1.0,0.0,82,...,0.0,1120.186467,3554.12783,,1120.186467,3554.12783,,856.686414,,


#### 2.7.3 Urban area as percent of admin area

##### First calculate area of each ADM.

In [18]:
# Calcularate area for each admin level (i.e., ADM3, ADM2,ADM1) form the ADM3 GDF.

temp_ADM = ADM3.copy()
temp_ADM = temp_ADM[['ADM1_CODE', 'ADM2_CODE', 'ADM3_CODE', 'geometry']]

In [19]:
temp_ADM['areaKM_ADM3'] = temp_ADM.to_crs('ESRI:102022')['geometry'].area / 10**6

# Compute the area size at ADM1 and ADM2 on the fly:
ADM2_area = temp_ADM.groupby(['ADM2_CODE'], as_index=False).sum()
ADM1_area = temp_ADM.groupby(['ADM1_CODE'], as_index=False).sum()


# Rename the area size col to appropriate one
ADM2_area = ADM2_area.rename(columns = {'areaKM_ADM3' : 'areaKM_ADM2'})
ADM1_area = ADM1_area.rename(columns = {'areaKM_ADM3' : 'areaKM_ADM1'})


# Merge the computed ADM1 and ADM2 area size to the temp_ADM GDF.
temp_ADM = temp_ADM.merge(pd.DataFrame(ADM2_area[['ADM2_CODE', 'areaKM_ADM2']]), on='ADM2_CODE', how='left')
temp_ADM = temp_ADM.merge(pd.DataFrame(ADM1_area[['ADM1_CODE', 'areaKM_ADM1']]), on='ADM1_CODE', how='left')

temp_ADM.sample(10)

  ADM2_area = temp_ADM.groupby(['ADM2_CODE'], as_index=False).sum()
  ADM1_area = temp_ADM.groupby(['ADM1_CODE'], as_index=False).sum()


Unnamed: 0,ADM1_CODE,ADM2_CODE,ADM3_CODE,geometry,areaKM_ADM3,areaKM_ADM2,areaKM_ADM1
8,CPV004,CPV004007,CPV0040070008,"MULTIPOLYGON (((-5004636.627 1843894.798, -500...",412.57916,631.216281,631.216281
6,CPV000,CPV000000,CPV0000000006,"POLYGON ((-5234913.020 1949146.807, -5234864.0...",14.318307,166.460635,784.85447
5,CPV000,CPV000000,CPV0000000005,"POLYGON ((-5231597.594 1946829.660, -5231484.2...",35.900756,166.460635,784.85447
13,CPV001,CPV001003,CPV0010030013,"POLYGON ((-5220592.477 1917587.935, -5220579.3...",226.646902,226.646902,226.646902
23,CPV006,CPV006012,CPV0060120023,"POLYGON ((-5065136.987 1702176.802, -5065085.8...",102.152079,102.152079,1003.798229
19,CPV008,CPV008021,CPV0080210019,"POLYGON ((-5190479.233 1687539.643, -5190471.0...",28.857808,62.513505,62.513505
12,CPV002,CPV002004,CPV0020040012,"POLYGON ((-5155467.728 1891768.919, -5155450.8...",18.877258,224.824771,344.629404
29,CPV006,CPV006011,CPV0060110029,"POLYGON ((-5072419.345 1723079.472, -5072421.8...",112.224194,112.224194,1003.798229
7,CPV003,CPV003006,CPV0030060007,"POLYGON ((-5006992.655 1914443.805, -5006995.7...",219.832046,219.832046,219.832046
30,CPV006,CPV006017,CPV0060170030,"POLYGON ((-5077715.254 1708296.634, -5077713.7...",93.876825,137.650071,1003.798229


In [20]:
ADM3_df = pd.DataFrame(temp_ADM).drop(columns='geometry')

##### Now calculate total urban area in each admin area, divided by admin area.

In [21]:
df = G3_df.loc[G3_df['UrbanClass'] != 'Low Density']
df = df.merge(ADM3_df[['ADM3_CODE', 'areaKM_ADM3', 'areaKM_ADM2', 'areaKM_ADM1']], how='left', on='ADM3_CODE')

df.sample(10)

Unnamed: 0,G3_ID,type,pop_un_adj,areaKM,popDens,UrbanClass,ADM1_CODE,ADM2_CODE,ADM3_CODE,areaKM_ADM3,areaKM_ADM2,areaKM_ADM1
6,9,Built-up Area,58667.490608,12.785549,4588.578225,High-Density Urban,CPV001,CPV001003,CPV0010030013,226.646902,226.646902,226.646902
7,59,Small Settlement Area,6070.740377,0.584888,10379.325346,Semi-Dense Urban,CPV007,CPV007020,CPV0070200015,152.946352,152.946352,471.235202
12,338,Small Settlement Area,5982.390265,1.683223,3554.12783,Semi-Dense Urban,CPV004,CPV004007,CPV0040070008,412.57916,631.216281,631.216281
3,6,Built-up Area,8010.437103,1.827976,4382.135609,Semi-Dense Urban,CPV006,CPV006009,CPV0060090024,120.843053,120.843053,1003.798229
4,7,Built-up Area,15164.202588,3.428926,4422.43503,Semi-Dense Urban,CPV006,CPV006009,CPV0060090024,120.843053,120.843053,1003.798229
2,5,Built-up Area,9051.959527,7.322964,1236.105929,Semi-Dense Urban,CPV006,CPV006010,CPV0060100026,242.591001,242.591001,1003.798229
11,195,Small Settlement Area,7468.639926,4.336176,1722.402515,Semi-Dense Urban,CPV006,CPV006015,CPV0060150027,26.49055,26.49055,1003.798229
13,354,Small Settlement Area,5596.347472,2.554704,2190.605426,Semi-Dense Urban,CPV003,CPV003006,CPV0030060007,219.832046,219.832046,219.832046
9,189,Small Settlement Area,6580.756533,0.737087,8928.053392,Semi-Dense Urban,CPV006,CPV006016,CPV0060160028,36.94299,36.94299,1003.798229
5,8,Built-up Area,7277.724095,4.032009,1804.986998,Semi-Dense Urban,CPV003,CPV003006,CPV0030060007,219.832046,219.832046,219.832046


In [22]:
for ADM in ['ADM3', 'ADM2', 'ADM1']:

    ADM_ID = ADM + '_CODE'
    ADMarea = ''.join(['areaKM_', ADM])
    SumField = ''.join(['UrbAreaKM_', ADM])
    PCfield = ''.join(['UrbAreaPC_', ADM])

    GroupedVals = df[['areaKM', ADM_ID]].groupby(ADM_ID, as_index=False).sum()
    GroupedVals = GroupedVals.rename(columns={'areaKM':SumField})
    GroupedVals = ADM3_df.merge(GroupedVals[[SumField, ADM_ID]], how='left', on=ADM_ID)
    GroupedVals[PCfield] = GroupedVals[SumField] / GroupedVals[ADMarea]

    AllSummaries = AllSummaries.merge(GroupedVals[[PCfield, SumField, ADMarea, 'ADM3_CODE']], how='left', on='ADM3_CODE').fillna(0)

In [23]:
AllSummaries.sample(30)

Unnamed: 0,ADM3_CODE,ADM2_CODE,ADM1_CODE,LDct_ADM1,SDct_ADM1,HDct_ADM1,LDct_ADM2,SDct_ADM2,HDct_ADM2,LDct_ADM3,...,HDavgDens_ADM3,UrbAreaPC_ADM3,UrbAreaKM_ADM3,areaKM_ADM3,UrbAreaPC_ADM2,UrbAreaKM_ADM2,areaKM_ADM2,UrbAreaPC_ADM1,UrbAreaKM_ADM1,areaKM_ADM1
23,CPV0060120023,CPV006012,CPV006,2212,8.0,1.0,160,1.0,0.0,160,...,0.0,0.211435,21.598512,102.152079,0.211435,21.598512,102.152079,0.051283,51.477636,1003.798229
16,CPV0070190016,CPV007019,CPV007,1178,1.0,0.0,615,0.0,0.0,343,...,0.0,0.0,0.0,132.256158,0.0,0.0,228.834488,0.001241,0.584888,471.235202
20,CPV0080210020,CPV008021,CPV008,112,0.0,0.0,112,0.0,0.0,59,...,0.0,0.0,0.0,33.655697,0.0,0.0,62.513505,0.0,0.0,62.513505
7,CPV0030060007,CPV003006,CPV003,191,2.0,0.0,191,2.0,0.0,191,...,0.0,0.029962,6.586713,219.832046,0.029962,6.586713,219.832046,0.029962,6.586713,219.832046
4,CPV0000010004,CPV000001,CPV000,1116,0.0,0.0,126,0.0,0.0,126,...,0.0,0.0,0.0,54.246373,0.0,0.0,54.246373,0.0,0.0,784.85447
25,CPV0060140025,CPV006014,CPV006,2212,8.0,1.0,238,0.0,0.0,238,...,0.0,0.0,0.0,77.357294,0.0,0.0,77.357294,0.051283,51.477636,1003.798229
26,CPV0060100026,CPV006010,CPV006,2212,8.0,1.0,448,2.0,0.0,448,...,0.0,0.05939,14.407421,242.591001,0.05939,14.407421,242.591001,0.051283,51.477636,1003.798229
11,CPV0020040011,CPV002004,CPV002,238,0.0,0.0,167,0.0,0.0,145,...,0.0,0.0,0.0,205.947513,0.0,0.0,224.824771,0.0,0.0,344.629404
1,CPV0000020001,CPV000002,CPV000,1116,0.0,0.0,670,0.0,0.0,196,...,0.0,0.0,0.0,130.561881,0.0,0.0,564.147462,0.0,0.0,784.85447
0,CPV0000020000,CPV000002,CPV000,1116,0.0,0.0,670,0.0,0.0,474,...,0.0,0.0,0.0,433.585581,0.0,0.0,564.147462,0.0,0.0,784.85447


In [24]:
# Export 'UrbanIndicators_ADM.csv' to the 'Results' folder.
AllSummaries.to_csv(os.path.join(src_GRID3_results, 'UrbanIndicators_ADM.csv'))

---
# **3. WORLDPOP**

In [25]:
src_ras_pop = os.path.join(WP_Fd, 'Source', config.original_WP_fil)

## **3.2 Zonal statistics**

In [26]:
with rasterio.open(src_ras_pop) as src:

    profile = src.profile
    #profile.update(driver='GTiff', crs = zone_CRS)
    transform = src.transform

    array = src.read(1)

print(transform)
print(profile['crs'])
print(array)

| 0.00, 0.00,-25.36|
| 0.00,-0.00, 17.21|
| 0.00, 0.00, 1.00|
EPSG:4326
[[-99999. -99999. -99999. ... -99999. -99999. -99999.]
 [-99999. -99999. -99999. ... -99999. -99999. -99999.]
 [-99999. -99999. -99999. ... -99999. -99999. -99999.]
 ...
 [-99999. -99999. -99999. ... -99999. -99999. -99999.]
 [-99999. -99999. -99999. ... -99999. -99999. -99999.]
 [-99999. -99999. -99999. ... -99999. -99999. -99999.]]


In [27]:
# The zonal shape must have the same CRS with that of target raster.
# Otherwise, a memory handling error will occur due to an extraordinary large shape array (which is incorrect) due to the misinterpretation of the zone vectors.

ADM3.to_crs(profile['crs'], inplace = True)

In [28]:
# For the WorldPop 2020 UN-Adjusted Constrained datasets, NoData value is -99999.
# So, -99999 should be replaced to 0 to compute correct zonal sum in the next step.

np.place(array, array == -99999, 0)

In [29]:
stats = zonal_stats(ADM3, array, affine=transform, stats = ['sum', 'count'])
stats



[{'count': 52966, 'sum': 4773.8583984375},
 {'count': 15966, 'sum': 1438.726318359375},
 {'count': 6464, 'sum': 9326.5458984375},
 {'count': 7743, 'sum': 4478.02001953125},
 {'count': 6631, 'sum': 18828.357421875},
 {'count': 4391, 'sum': 8835.033203125},
 {'count': 1751, 'sum': 1579.868896484375},
 {'count': 26827, 'sum': 23434.74609375},
 {'count': 50161, 'sum': 17696.587890625},
 {'count': 26610, 'sum': 2522.182861328125},
 {'count': 14614, 'sum': 5155.779296875},
 {'count': 25118, 'sum': 6111.2412109375},
 {'count': 2304, 'sum': 1785.11181640625},
 {'count': 27655, 'sum': 87607.421875},
 {'count': 33267, 'sum': 6804.44091796875},
 {'count': 18499, 'sum': 19616.314453125},
 {'count': 15988, 'sum': 3661.544677734375},
 {'count': 11684, 'sum': 4111.42919921875},
 {'count': 10824, 'sum': 11420.9306640625},
 {'count': 3488, 'sum': 2272.878662109375},
 {'count': 4061, 'sum': 3080.36083984375},
 {'count': 9106, 'sum': 5757.396484375},
 {'count': 8746, 'sum': 7958.5048828125},
 {'count': 1

In [30]:
Pop_ADM = pd.DataFrame(ADM3.join(pd.DataFrame(stats)).rename(columns={
    'sum': 'popADM3', 'count':'ct_popADM3'})).drop(columns='geometry')

Pop_ADM.sample(3)

Unnamed: 0,ADM3_NAME,ADM0_CODE,ADM1_CODE,ADM2_CODE,ADM3_CODE,ct_popADM3,popADM3
15,Santa Catarina do Fogo,CPV,CPV007,CPV007020,CPV0070200015,18499,19616.314453
27,S. Salvador do Mundo,CPV,CPV006,CPV006015,CPV0060150027,3212,26811.570312
28,S. Lourenço dos Órgãos,CPV,CPV006,CPV006016,CPV0060160028,4472,135666.0625


In [31]:
# ADM2 level aggregation
GroupedVals = Pop_ADM.drop(columns='ct_popADM3', axis=1).groupby('ADM2_CODE', as_index=False)
Pop_ADM = Pop_ADM.merge(GroupedVals.sum().rename(columns={'popADM3': 'popADM2'}), how = 'left', on='ADM2_CODE')

# ADM1 level aggregation
GroupedVals = Pop_ADM.drop(columns=['ct_popADM3', 'popADM2'], axis=1).groupby('ADM1_CODE', as_index=False)
Pop_ADM = Pop_ADM.merge(GroupedVals.sum().rename(columns={'popADM3': 'popADM1'}), how = 'left', on='ADM1_CODE')


Pop_ADM.sample(3)

  Pop_ADM = Pop_ADM.merge(GroupedVals.sum().rename(columns={'popADM3': 'popADM2'}), how = 'left', on='ADM2_CODE')
  Pop_ADM = Pop_ADM.merge(GroupedVals.sum().rename(columns={'popADM3': 'popADM1'}), how = 'left', on='ADM1_CODE')


Unnamed: 0,ADM3_NAME,ADM0_CODE,ADM1_CODE,ADM2_CODE,ADM3_CODE,ct_popADM3,popADM3,popADM2,popADM1
31,Santíssimo Nome de Jesus,CPV,CPV006,CPV006017,CPV0060170031,5289,4907.168945,9249.42041,310210.383301
3,S. Pedro Apóstolo,CPV,CPV000,CPV000000,CPV0000000003,7743,4478.02002,24219.468018,49260.410156
9,S. João Baptista,CPV,CPV004,CPV004007,CPV0040070009,26610,2522.182861,20218.770752,20218.770752


In [32]:
# Selecting only required cols:

Pop_ADM_out = Pop_ADM[['ADM1_CODE',	'ADM2_CODE',	'ADM3_CODE',	'ct_popADM3',	'popADM1',	'popADM2', 'popADM3']]
Pop_ADM_out.sample(3)

Unnamed: 0,ADM1_CODE,ADM2_CODE,ADM3_CODE,ct_popADM3,popADM1,popADM2,popADM3
10,CPV002,CPV002005,CPV0020050010,14614,13052.132324,5155.779297,5155.779297
1,CPV000,CPV000002,CPV0000020001,15966,49260.410156,6212.584717,1438.726318
14,CPV005,CPV005008,CPV0050080014,33267,6804.440918,6804.440918,6804.440918


In [33]:
# Export 'Population_ADM.csv' to the 'Results' folder.
Pop_ADM_out.to_csv(os.path.join(src_wPop_results, 'Population_ADM.csv'))