# QA/QC for delivered administrative boundaries
The World Bank receives regular deliveries of official administrative boundaries. This script will process and evaluate these boundaries in several ways:

- Combine the two ID columns into a single, primary key  
  a. Check to ensure no duplicates in this new, primary key
- Combine ADM0 file with disputed areas shapefile 
- Create ocean mask 
- Within each higher hierarchical level, evaluate name duplication
- Perform topological checks  
  a. Ensure no overlaps  
  b. Ensure no gaps  
  c. Ensure all admin1 and admin2 shapes are fully contained within their hierarchical parents
 


In [1]:
import sys, os
import requests
import json

import geopandas as gpd
import pandas as pd

from shapely.geometry import Point, Polygon, box, shape

sys.path.append("../src")

from wb_gad_helper import *

%load_ext autoreload
%autoreload 2  

In [2]:
# Input folder holding the delivered boundary shapefiles, plus the two output
# folders used by this notebook (QA/QC results and re-formatted data).
admin_folder = r'C:\WBG\Work\data\ADMIN\NEW_WB_BOUNDS'
out_folder = r'C:\WBG\Work\data\ADMIN\QAQC'
better_formats_folder = r'C:\WBG\Work\data\ADMIN\BETTER_FORMATS'

# exist_ok=True makes folder creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
for output_dir in (better_formats_folder, out_folder):
    os.makedirs(output_dir, exist_ok=True)

# Delivered shapefiles: one per administrative level, plus the disputed-areas
# layer that is later combined with ADM0.
admin0_file = os.path.join(admin_folder, 'WB_GAD_ADM0.shp')
adm0_disputes_file = os.path.join(admin_folder, 'WB_GAD_NDLSA.shp')
admin1_file = os.path.join(admin_folder, 'WB_GAD_ADM1.shp')
admin2_file = os.path.join(admin_folder, 'WB_GAD_ADM2.shp')

In [3]:
# Fetch the World Bank classification table through the helper function
# (presumably provided by the `wb_gad_helper` star import — confirm).
# NOTE(review): this cell only loads the table; nothing is joined onto ADM0 here.
wb_classes = get_wb_classifications()

In [4]:
# Number of classification rows per group type.
wb_classes.loc[:, 'group_type'].value_counts()

group_type
REGION       1100
REGION_UN     982
OTHER         878
LENDING       675
INCOME        565
CONTINENT     206
Name: count, dtype: int64

In [7]:
# Count rows per `group` code among the REGION-type classifications.
is_region = wb_classes['group_type'] == 'REGION'
wb_classes.loc[is_region, 'group'].value_counts()

group
002    60
ECS    58
150    51
142    50
TSS    48
SSF    48
SSA    47
LCN    42
DSS    39
EAS    38
TLA    30
009    29
AFE    26
LAC    23
TEC    23
TEA    23
EAP    23
ESA    22
ARB    22
BLA    22
AFW    22
MEA    21
MCT    21
ECA    20
BEC    19
005    16
WAF    16
MPA    15
DEA    14
CMD    13
MNA    13
TMN    12
PAC    10
SER    10
CFR     9
BMN     9
SEA     9
BEA     9
BSS     9
CRB     8
MDE     8
SAM     8
SAS     8
DLA     8
TSA     8
CAM     7
SAX     7
DSA     7
CAT     6
CAS     5
CNA     5
EER     5
NAF     5
DEC     4
DMN     3
NAC     3
ESS     3
BSA     1
Name: count, dtype: int64

In [3]:
# Load the four delivered layers, in order: ADM0, disputed areas, ADM1, ADM2.
adm0, adm0_disputes, adm1, adm2 = (
    gpd.read_file(layer_path)
    for layer_path in (admin0_file, adm0_disputes_file, admin1_file, admin2_file)
)

## Combine ID columns

In [4]:
# Each pair names the two delivered variants of one ID column. The helper is
# expected to combine every pair into a single column — presumably the
# `<name>_c` columns used later in this notebook (confirm in wb_gad_helper).
adm1_id_pairs = [['P_CODE_1', 'P_CODE_1_t'], ['ADM1CD', 'ADM1CD_t']]
adm2_id_pairs = [
    ['P_CODE_1', 'P_CODE_1_t'],
    ['P_CODE_2', 'P_CODE_2_t'],
    ['ADM1CD', 'ADM1CD_t'],
    ['ADM2CD', 'ADM2CD_t'],
]

merged_adm1 = merge_id_columns(adm1, adm1_id_pairs)
merged_adm2 = merge_id_columns(adm2, adm2_id_pairs)

In [5]:
# Check for duplicates in ADM1
test_col = 'ADM1CD_c'
check_duplicates(merged_adm1, test_col, os.path.join(out_folder, f'adm1_duplicates_{test_col}.gpkg'))

# Check for duplicates in ADM2
test_col = 'ADM2CD_c'
check_duplicates(merged_adm2, test_col, os.path.join(out_folder, f'adm2_duplicates_{test_col}.gpkg'))

ADM1CD_c duplicates: 0
ADM2CD_c duplicates: 0


## Combine ADM0 with disputed territories

In [6]:
# Keep only the columns the disputed-areas layer shares with ADM0 so the two
# tables stack with a common schema. The extra columns are collected first and
# dropped in one call, instead of dropping in place while iterating `.columns`.
extra_dispute_cols = [col for col in adm0_disputes.columns if col not in adm0.columns]
adm0_disputes.drop(columns=extra_dispute_cols, inplace=True)

# Stack ADM0 and the disputed areas into one layer with a fresh 0..n-1 index.
adm0_complete = pd.concat([adm0, adm0_disputes], ignore_index=True)


In [7]:
# Generate a global ocean mask from the admin divisions
# Build a global ocean mask: start from the full longitude/latitude extent and
# cut out the dissolved land area (ADM0 plus disputed areas).
world_extent = box(-180, -90, 180, 90)
land_area = adm0_complete.union_all()
ocean_polygon = world_extent.difference(land_area)

# Single-row layer holding the mask; crs=4326 matches the degree-based extent.
# NOTE(review): assumes adm0_complete is also in EPSG:4326 — confirm.
ocean_mask = gpd.GeoDataFrame(
    data=[[1, ocean_polygon]],
    columns=['id', 'geometry'],
    crs=4326,
)


## Evaluate name duplication

In [8]:
# Evaluate duplicate names in adm1 and adm2
# Evaluate repeated unit names per admin level, passing the name column, the
# parent-level column (ISO_A3 for ADM1, ADM1CD_c for ADM2) and a log path.
for level_gdf, name_col, parent_col, log_name in (
    (merged_adm1, 'NAM_1', 'ISO_A3', "ADM1_name_duplicates.log"),
    (merged_adm2, 'NAM_2', 'ADM1CD_c', "ADM2_name_duplicates.log"),
):
    evaluate_duplicate_names(level_gdf, name_col, parent_col, os.path.join(out_folder, log_name))

In [9]:
# Use sjoin to identify overlapping polygons
# NOTE: this check is currently disabled — the code below is wrapped in a
# string literal, so none of it runs; as the last expression of the cell,
# the string itself is what gets displayed.
# NOTE(review): before re-enabling, confirm that the sjoin result really has
# 'geometry_left' / 'geometry_right' columns — TODO verify against geopandas.
'''
sj = gpd.sjoin(adm0, adm0, how="inner", predicate="overlaps", lsuffix="left", rsuffix="right")
sj = sj[sj.index != sj.index_right]

sj['intersection_geom'] = sj['geometry_left'].intersection(sj['geometry_right'])
sj['intersection_area'] = sj['intersection_geom'].area
sj
'''

'\nsj = gpd.sjoin(adm0, adm0, how="inner", predicate="overlaps", lsuffix="left", rsuffix="right")\nsj = sj[sj.index != sj.index_right]\n\nsj[\'intersection_geom\'] = sj[\'geometry_left\'].intersection(sj[\'geometry_right\'])\nsj[\'intersection_area\'] = sj[\'intersection_geom\'].area\nsj\n'

## Separate datasets into primary and secondary tables

The delivered files contain several columns that are temporary or reference external sources. This section will separate the superfluous columns into a secondary table.

In [10]:
# Folder for simplified outputs, nested under the re-formatted data folder.
# exist_ok=True creates it idempotently without a separate existence check.
simplified_out_folder = os.path.join(better_formats_folder, 'simplified_output')
os.makedirs(simplified_out_folder, exist_ok=True)

# ADM1: columns kept in the primary (simple) table.
adm1_primary = 'ADM1CD_c'
adm1_simple_cols = ['ISO_A3','ISO_A2','WB_A3','WB_REGION','WB_STATUS','NAM_0','NAM_1','ADM1CD_c','GEOM_SRCE', 'geometry']
# Secondary table: the primary key first, then every column not kept above.
adm1_bad_cols = [adm1_primary] + [col for col in merged_adm1.columns if col not in adm1_simple_cols]

# ADM2: same split, with the ADM2 name and key added to the simple columns.
adm2_primary = 'ADM2CD_c'
adm2_simple_cols = adm1_simple_cols + ['NAM_2','ADM2CD_c']
adm2_bad_cols = [adm2_primary] + [col for col in merged_adm2.columns if col not in adm2_simple_cols]

# Write out final versions in several data formats

write out:
- adm0 base
- adm0 NDLSA
- adm0 complete
- adm1 simple
- adm1 supplemental columns
- adm2 simple 
- adm2 supplemental columns

in these formats
- geopackage
- geojson
- shapefile

In [11]:
# Publication folder with one sub-folder per output format.
# NOTE(review): this lives under admin_folder (the input folder) — confirm
# that is the intended destination.
final_folder = os.path.join(admin_folder, 'FOR_PUBLICATION')

# Sub-folder name / file extension -> driver name passed to GeoDataFrame.to_file.
# A single mapping keeps the created folders and the written formats in sync.
format_drivers = {
    'gpkg': 'GPKG',
    'shp': 'ESRI Shapefile',
    'geojson': 'GeoJSON',
}
for file_format in format_drivers:
    # makedirs also creates final_folder itself; exist_ok makes re-runs safe.
    os.makedirs(os.path.join(final_folder, file_format), exist_ok=True)

# Layers to publish, each paired with its output base filename.
layers_to_publish = [
    (ocean_mask, 'WB_GAD_ocean_mask'),
    (adm0_complete, 'WB_GAD_ADM0_complete'),
    (adm0, 'WB_GAD_ADM0'),
    (adm0_disputes, 'WB_GAD_ADM0_NDLSA'),
    (merged_adm1.loc[:, adm1_simple_cols], 'WB_GAD_ADM1'),
    (merged_adm2.loc[:, adm2_simple_cols], 'WB_GAD_ADM2'),
]
for gdf, filename in layers_to_publish:
    # Write every layer in every format: geopackage, shapefile, then geojson.
    for file_format, driver in format_drivers.items():
        gdf.to_file(os.path.join(final_folder, file_format, f"{filename}.{file_format}"), driver=driver)

# The remaining (non-simple) columns go to plain CSV tables; each list starts
# with the level's primary key column.
pd.DataFrame(merged_adm1.loc[:, adm1_bad_cols]).to_csv(os.path.join(final_folder, 'WB_GAD_adm1_additional_columns.csv'), index=False)
pd.DataFrame(merged_adm2.loc[:, adm2_bad_cols]).to_csv(os.path.join(final_folder, 'WB_GAD_adm2_additional_columns.csv'), index=False)
