# Mpumalanga spatial analysis

1. Calculate zonal statistics on the South African Municipalities  
   - Nighttime lights intensity  
   - Gridded population count  
   - Urbanization (TBD)  
   - Built-up  
   - Travel-time  
     - re-calculate TT to tourist destinations  
   - Gravity-metrics (TBD)  
   
2023-11-28 Updated work 
- Re-run population numbers using higher-resolution population data 
- Re-run travel times; for all of the below, also include straight-line calculations  
  - Travel time to largest city/capital city **in their own province**
  - Nearest national port and airport
  - Nearest non-national port and airport
  



In [1]:
import sys
import os
import importlib
import rasterio

import pandas as pd
import geopandas as gpd
import skimage.graph as graph


sys.path.insert(0, "/home/wb411133/Code/gostrocks/src")

import GOSTRocks.dataMisc as dataMisc
import GOSTRocks.rasterMisc as rMisc
from GOSTRocks.misc import tPrint

sys.path.insert(0, "/home/wb411133/Code/GOSTNets_Raster/src")
import GOSTNetsRaster.market_access as ma

%load_ext autoreload
%autoreload 2



In [14]:
# Define input variables
# Every folder below lives under the single project root `in_folder`.
in_folder = "/home/wb411133/projects/URB_SURDR_ZAF"
ntl_folder = os.path.join(in_folder, "NTL_data")
ghsl_folder = os.path.join(in_folder, "GHSL_data")
urban_folder = os.path.join(in_folder, "URBAN")
ma_folder = os.path.join(in_folder, "market_access")
infra_folder = os.path.join(in_folder, "Infra")
zaf_folder = os.path.join(in_folder, "MiningCommunities")
zonal_res_folder = os.path.join(in_folder, "ZONAL_RES")
protected_areas_folder = os.path.join(in_folder, "Protected_Areas")
reference_folder = os.path.join(in_folder, "Reference")

# Make sure the working folders exist. exist_ok=True turns the call into a
# no-op for folders that are already present and avoids the separate
# "check, then create" step. urban_folder, zaf_folder and reference_folder
# are deliberately not in this list (they are only read from in this notebook).
for folder in (
    in_folder,
    ntl_folder,
    ghsl_folder,
    ma_folder,
    infra_folder,
    zonal_res_folder,
    protected_areas_folder,
):
    os.makedirs(folder, exist_ok=True)

# Define local variables
# Administrative boundary layers
admin0_file = os.path.join(in_folder, "ZAF_select_adm0.shp")
admin1_file = os.path.join(in_folder, "admin1_geoBounds.shp")
admin3_file = os.path.join(in_folder, "ADMIN", "admin3_geoBounds_FINAL.shp")
# The GHSL threshold is baked into the raster file name as a whole percent
# (0.1 -> "ghsl_combined_10.tif").
ghsl_thresh = 0.1
local_ghsl_file = os.path.join(in_folder, f"ghsl_combined_{int(ghsl_thresh*100)}.tif")
# Population raster read from the shared data drive (outside in_folder)
high_res_pop = (
    "/home/public/Data/GLOBAL/Population/RF_SSA_2015-2020/ZAF/ppp_ZAF_const_2020.tif"
)
# Urban rasters / extents
urban_raster = os.path.join(urban_folder, "zaf_cpo20_urban.tif")
urban_raster_pop = os.path.join(urban_folder, "zaf_cpo20.tif")
urban_extents_file = os.path.join(urban_folder, "cpo20_urban_extents.shp")
local_ghs_smod_file = os.path.join(in_folder, "GHS_SMOD_2020.tif")
# Travel-time inputs: friction surface and destination layers
major_urban_extents = os.path.join(in_folder, "major_cities_UCDB2019.shp")
zaf_capitals = os.path.join(in_folder, "ZAF_provincial_capitals.kml")
local_friction_file = os.path.join(ma_folder, "friction_2020.tif")
local_airports = os.path.join(reference_folder, "Major_Airports.shp")
local_ports = os.path.join(reference_folder, "Ports.shp")
tourist_locations = os.path.join(infra_folder, "Kruger_EntryGates.shp")
protected_areas = os.path.join(protected_areas_folder, "SAPAD_IR_2023_Q2_01.shp")
mines_file = os.path.join(reference_folder, "SAMines", "AllMinesFeb2024.shp")
plants_file = os.path.join(reference_folder, "SAPlants", "processing-plants.shp")

# Zones used for every summary (main places) and the column that identifies them
municipalities = os.path.join(zaf_folder, "MainPlaces", "MP_SA_2011.shp")
muni_id = "MP_CODE_st"

# Projected CRS used when areas are measured
proj_epsg = 22293  # https://epsg.io/22293

In [3]:
# Load the main-place polygons; inM is the zone layer passed to the zonal
# statistics and travel-time summaries in the cells below.
municipalities = os.path.join(zaf_folder, "MainPlaces", "MP_SA_2011.shp")
inM = gpd.read_file(municipalities)

In [None]:
# Zonal stats on nighttime lights
regional_ntl_folder = os.path.join(ntl_folder, "Neighbours")
# os.listdir returns entries in arbitrary order; sort them so the NTL_<year>
# columns are always written in the same order.
ntl_files = [
    os.path.join(regional_ntl_folder, x)
    for x in sorted(os.listdir(regional_ntl_folder))
]

inM_ntl_res = inM.copy()
for ntl_file in ntl_files:
    # The year is the second-to-last "_"-separated token of the file NAME.
    # Parse the basename only: the parent folders contain underscores too,
    # so splitting the full path could silently pick up a directory fragment.
    year = os.path.basename(ntl_file).split("_")[-2]
    tPrint(year)
    # Only the SUM column of the per-zone statistics is kept for each year.
    res = rMisc.zonalStats(inM, ntl_file, minVal=0.1)
    res = pd.DataFrame(res, columns=["SUM", "MIN", "MAX", "MEAN"])
    inM_ntl_res[f"NTL_{year}"] = res["SUM"]
# Write the attribute table (geometry dropped) with one NTL column per file.
pd.DataFrame(inM_ntl_res.drop(["geometry"], axis=1)).to_csv(
    os.path.join(zonal_res_folder, "NTL_Zonal_res.csv")
)

In [None]:
# Population summary: per-zone statistics from the high-resolution raster,
# keeping only the zone total as the POP column.
pop_res = pd.DataFrame(
    rMisc.zonalStats(inM, high_res_pop, minVal=0, reProj=True),
    columns=["SUM", "MIN", "MAX", "MEAN"],
)
inM_pop_res = inM.assign(POP=pop_res["SUM"])

# Write the attribute table without the geometry column.
pop_out_file = os.path.join(zonal_res_folder, "Pop_res.csv")
pd.DataFrame(inM_pop_res.drop(columns=["geometry"])).to_csv(pop_out_file)

In [None]:
# GHSL summary: categorical zonal statistics, one count column per epoch value
ghsl_epochs = range(1975, 2031, 5)
ghsl_res = rMisc.zonalStats(
    inM, local_ghsl_file, rastType="C", unqVals=list(ghsl_epochs), reProj=True
)
ghsl_res = pd.DataFrame(ghsl_res, columns=[f"c_{epoch}" for epoch in ghsl_epochs])
# Accumulate across epochs so each column holds everything built up to that year
ghsl_res = ghsl_res.cumsum(axis=1)
# GHSL conversion to area (km2); the factor treats each cell as 100 m x 100 m
ghsl_area = ghsl_res * (100 * 100) / 1000000
ghsl_area["AREA_KM"] = inM["ALBERS_ARE"]
# Share of each zone that is built-up in 2020
ghsl_area["per_built_2020"] = ghsl_area["c_2020"] / ghsl_area["AREA_KM"]
ghsl_area[muni_id] = inM[muni_id]
ghsl_area.to_csv(os.path.join(zonal_res_folder, "Muni_GHSL_res.csv"))

In [26]:
# Calculate travel time
# Open the population and friction rasters. The handles are left open; nothing
# in this notebook closes them.
popR = rasterio.open(urban_raster_pop)
ttr = rasterio.open(local_friction_file)
# First band of the friction surface, scaled by 1000.
# NOTE(review): presumably a per-metre -> per-kilometre unit conversion - confirm.
frictionD = ttr.read()[0, :, :] * 1000
# Minimum-cost-path graph built on the scaled friction surface
mcp = graph.MCP_Geometric(frictionD)

In [None]:
# Calculate travel time to largest city within province
in_cities = gpd.read_file(major_urban_extents)
# Represent every urban extent by its centroid so it can act as a point
# destination and be assigned to exactly one province polygon.
in_cities["geometry"] = in_cities["geometry"].apply(lambda x: x.centroid)
in_cities = in_cities.loc[:, ["ID_HDC_G0", "CTR_MN_NM", "UC_NM_MN", "P15", "geometry"]]
# Provinces come from the local admin1 file only. No remote geoBoundaries
# download is needed, because this layer is the one used in the join below.
zaf_adm1 = gpd.read_file(admin1_file)
# Attach the province attributes to each city centroid.
in_cities = gpd.sjoin(in_cities, zaf_adm1)

In [25]:
##### Ports and airports
# Split each layer into domestic and foreign subsets by its country code column.
ports = gpd.read_file(local_ports)
is_zaf_port = ports["COUNTRY"] == "ZA"
zaf_ports = ports.loc[is_zaf_port]
foreign_ports = ports.loc[~is_zaf_port]
# Maputo is singled out from the foreign ports as its own destination set.
is_maputo = foreign_ports["PORT_NAME"] == "MAPUTO"
maputo_port = foreign_ports.loc[is_maputo]

airports = gpd.read_file(local_airports)
is_zaf_airport = airports["soc"] == "ZAF"
zaf_airports = airports.loc[is_zaf_airport]
foreign_airports = airports.loc[~is_zaf_airport]

In [None]:
# Largest 5 cities: rank by P15 (descending) and keep the first five rows
cities_by_p15 = in_cities.sort_values("P15", ascending=False)
largest_5_cities = cities_by_p15[:5]


# Largest city in each province
def get_largest(x):
    """Return the row of ``x`` that has the highest ``P15`` value.

    Args:
        x: DataFrame (here: the cities of one province) with a ``P15`` column.

    Returns:
        The top-ranked row as a Series, after sorting ``x`` by ``P15`` in
        descending order.
    """
    ranked = x.sort_values(by="P15", ascending=False)
    return ranked.iloc[0]


# One row per province: the city with the highest P15 within each "shapeName"
# group (shapeName is expected to come from the admin1 layer joined onto in_cities).
provincial_largest = in_cities.groupby("shapeName").apply(get_largest)
# Re-attach the CRS of the source layer to the grouped result.
provincial_largest.crs = in_cities.crs
# Read in KML of provincial capitals
# Register the LIBKML driver as read/write in fiona's driver table before reading.
gpd.io.file.fiona.drvsupport.supported_drivers["LIBKML"] = "rw"
prov_capitals = gpd.read_file(zaf_capitals).loc[:, ["Name", "geometry"]]

In [44]:
# Plants and mines
plants = gpd.read_file(plants_file)
mines = gpd.read_file(mines_file)
# Keep only mines whose commodity is not "coal" and that have a geometry.
is_noncoal = mines["Commodity"] != "coal"
has_geometry = mines.geometry.notna()
mines = mines.loc[is_noncoal & has_geometry]

In [46]:
# Calculate travel time
# Open the population and friction rasters and build the minimum-cost-path
# graph on the friction surface (first band, scaled by 1000).
popR = rasterio.open(urban_raster_pop)
ttr = rasterio.open(local_friction_file)
frictionD = ttr.read()[0, :, :] * 1000
mcp = graph.MCP_Geometric(frictionD)

# Bring the population raster onto the friction grid (cell values summed) and
# keep the result in memory while all destination sets are processed.
inN, profile = rMisc.standardizeInputRasters(popR, ttr, resampling_type="sum")
with rMisc.create_rasterio_inmemory(profile, inN) as pop_temp:
    # Each entry is [destination layer, label]; the label drives both the
    # output file name and the column suffix. Commented entries are disabled.
    for dest_layer, label in [
        # [gpd.read_file(protected_areas), 'tt_protected_areas'],
        # [gpd.read_file(tourist_locations), 'tt_kruger'],
        # [gpd.read_file(major_urban_extents), 'tt_cities'],
        # [largest_5_cities, 'tt_largest_5_cities'],
        # [provincial_largest, 'tt_prov_largest'],
        # [prov_capitals, 'tt_prov_capital'],
        [zaf_ports, "tt_zaf_ports"],
        [foreign_ports, "tt_foreign_ports"],
        [maputo_port, "tt_maputo_ports"],
        [zaf_airports, "tt_zaf_airports"],
        [mines, "tt_mines_noncoal"],
        [plants, "tt_plants"],
        # [foreign_airports, 'tt_foreign_airports']
    ]:
        suffix = f"{label}_tt"
        out_file = os.path.join(zonal_res_folder, f"{suffix}.csv")
        tPrint(out_file)
        # Existing results are never recomputed; delete the CSV to force a re-run.
        if not os.path.exists(out_file):
            # Work on a copy: the centroid conversion below must not modify the
            # shared destination layers (several of them are slices of a larger
            # layer and are reused by other cells).
            dests = dest_layer.copy()
            # Convert to centroids unless EVERY geometry is already a point;
            # checking only the first feature would miss mixed layers.
            if not (dests.geom_type == "Point").all():
                dests["geometry"] = dests["geometry"].apply(lambda x: x.centroid)
            res = ma.summarize_travel_time_populations(
                pop_temp, ttr, dests, mcp, inM, col_suffix=suffix, calc_small=True
            )
            pd.DataFrame(res.drop(["geometry"], axis=1)).to_csv(out_file)

13:12:46	/home/wb411133/projects/URB_SURDR_ZAF/ZONAL_RES/tt_zaf_ports_tt.csv


  tt_pop = ttD * popD


13:15:26	/home/wb411133/projects/URB_SURDR_ZAF/ZONAL_RES/tt_foreign_ports_tt.csv


  tt_pop = ttD * popD


13:18:05	/home/wb411133/projects/URB_SURDR_ZAF/ZONAL_RES/tt_maputo_ports_tt.csv


  tt_pop = ttD * popD


13:20:44	/home/wb411133/projects/URB_SURDR_ZAF/ZONAL_RES/tt_zaf_airports_tt.csv


  tt_pop = ttD * popD


13:23:24	/home/wb411133/projects/URB_SURDR_ZAF/ZONAL_RES/tt_mines_noncoal_tt.csv


  tt_pop = ttD * popD


13:26:03	/home/wb411133/projects/URB_SURDR_ZAF/ZONAL_RES/tt_plants_tt.csv


  tt_pop = ttD * popD


# Combine results

In [47]:
# Base table for combining results: the main-place layer, indexed by the
# integer form of the id column so result CSVs can be joined on the index.
baseD = gpd.read_file(municipalities)
baseD.index = baseD[muni_id].astype("int64")

In [48]:
def join_data(in_file_name, baseD, id_col=None):
    """Join the new columns of one zonal-result CSV onto ``baseD``.

    The CSV is read with its first column as index, re-indexed by the integer
    form of ``id_col``, and reduced to the columns that ``baseD`` does not
    already have (ignoring ``baseD``'s geometry column). The two tables are
    then merged on their indexes.

    Args:
        in_file_name: Path of the CSV to join; it must contain ``id_col``.
        baseD: Table indexed by the integer place id, with a ``geometry``
            column.
        id_col: Name of the id column in the CSV. Defaults to the
            notebook-level ``muni_id``.

    Returns:
        ``baseD`` with the CSV's new columns appended. The merge is an inner
        join, so places missing from the CSV are dropped. New columns are
        appended in sorted order, because ``Index.difference`` sorts.

    Raises:
        KeyError: If ``id_col`` is not a column of the CSV.
    """
    if id_col is None:
        id_col = muni_id
    zonal_data = pd.read_csv(in_file_name, index_col=0)
    zonal_data.index = zonal_data[id_col].astype("int64")
    # Only bring over columns baseD lacks, so repeated attribute columns
    # (names, codes, the id itself) are not duplicated with _x/_y suffixes.
    known_cols = baseD.drop("geometry", axis=1).columns
    new_cols = zonal_data.columns.difference(known_cols)
    return pd.merge(baseD, zonal_data[new_cols], left_index=True, right_index=True)

In [49]:
# The combined table is itself written to zonal_res_folder (see the export
# cell below), so it must be skipped here; otherwise a re-run of the notebook
# would try to join the previous combined output back onto baseD.
combined_csv = "named_places_zonal.csv"
# sorted(): os.listdir order is arbitrary; sorting keeps the join order (and
# therefore the column order of the combined table) reproducible.
for in_file in sorted(os.listdir(zonal_res_folder)):
    if in_file.endswith(".csv") and in_file != combined_csv:
        print(in_file)
        baseD = join_data(os.path.join(zonal_res_folder, in_file), baseD)

tt_zaf_ports_tt.csv
tt_foreign_ports_tt.csv
tt_maputo_ports_tt.csv
tt_zaf_airports_tt.csv
tt_mines_noncoal_tt.csv
tt_plants_tt.csv


In [None]:
# Export the combined table with geometry as GeoJSON. The "MP_CODE_st" column
# is dropped first so that reset_index() can turn the index of the same name
# back into a column without a name clash.
baseD.drop("MP_CODE_st", axis=1).reset_index().to_file(
    os.path.join(zonal_res_folder, "named_places_zonal.geojson"), driver="GeoJSON"
)

In [50]:
# Export the combined attribute table (no geometry) as CSV.
combined_table = pd.DataFrame(baseD.drop(columns=["geometry"]))
combined_table.to_csv(os.path.join(zonal_res_folder, "named_places_zonal.csv"))

# Calculate gravity of all MPs in Mpumalanga to major cities

In [None]:
# Output folder for the origin-destination matrices written by the cells below.
out_varun_folder = os.path.join(in_folder, "SP_VARUN", "RESULTS")
# DataFrame.to_csv does not create missing directories, so make sure the
# folder exists before any OD matrix is written.
os.makedirs(out_varun_folder, exist_ok=True)

In [None]:
### Read in origins
# Sub-place polygons, reduced to their centroids so each acts as a point origin.
sp_varun_file = os.path.join(in_folder, "SP_VARUN", "SP_SA_2011.shp")
in_sp = gpd.read_file(sp_varun_file)
# Label the layer as EPSG:4326. This only sets the CRS, it does not re-project.
# NOTE(review): assumes the coordinates are already lon/lat - confirm.
in_sp.crs = 4326
in_sp["geometry"] = in_sp["geometry"].apply(lambda x: x.centroid)
# inM = gpd.read_file(municipalities)
# selM = inM.loc[inM['PR_NAME'] == 'Mpumalanga'].copy()
# selM['geometry'] = selM['geometry'].apply(lambda x: x.centroid)

In [None]:
### Read in destinations
# NOTE: this re-reads the raw urban extents into in_cities, replacing any
# province-joined version created by an earlier cell.
in_cities = gpd.read_file(major_urban_extents)
# Take an explicit copy of the top five rows by P15: the geometry assignment
# below then modifies an independent frame instead of a slice of the sorted
# table (which triggers pandas' SettingWithCopyWarning).
largest_5_cities = in_cities.sort_values("P15", ascending=False)[:5].copy()
# Use centroids so each city is a single point destination.
largest_5_cities["geometry"] = largest_5_cities["geometry"].apply(lambda x: x.centroid)
largest_5_cities

In [None]:
# Calculate travel time
# Re-open the rasters and rebuild the minimum-cost-path graph so the OD cells
# below can run without the earlier travel-time cells having been executed.
popR = rasterio.open(urban_raster_pop)
ttr = rasterio.open(local_friction_file)
# First band of the friction surface, scaled by 1000.
# NOTE(review): presumably a unit conversion - confirm.
frictionD = ttr.read()[0, :, :] * 1000
mcp = graph.MCP_Geometric(frictionD)

In [None]:
# Display the current sub-place selection. NOTE: sel_sp is only assigned by
# the OD-matrix cells below, so this raises NameError on a fresh top-to-bottom run.
sel_sp

In [None]:
importlib.reload(ma)
# Origin-destination matrix between all sub-places in Gauteng province
sel_sp = in_sp.loc[in_sp["PR_NAME"] == "Gauteng"].copy()
od = ma.calculate_od_matrix(ttr, mcp, sel_sp, sel_sp)
# Label both axes of the matrix with the sub-place codes before writing it out.
sp_codes = sel_sp["SP_CODE_st"]
xx = pd.DataFrame(od)
xx.index = sp_codes
xx.columns = sp_codes
xx.to_csv(os.path.join(out_varun_folder, "Gauteng_OD.csv"))

In [None]:
importlib.reload(ma)
# Origin-destination matrix between all sub-places in City of Cape Town
sel_sp = in_sp.loc[in_sp["DC_NAME"] == "City of Cape Town"].copy()
od = ma.calculate_od_matrix(ttr, mcp, sel_sp, sel_sp)
# Label both axes of the matrix with the sub-place codes before writing it out.
sp_codes = sel_sp["SP_CODE_st"]
xx = pd.DataFrame(od)
xx.index = sp_codes
xx.columns = sp_codes
xx.to_csv(os.path.join(out_varun_folder, "Cape_Town_OD.csv"))

In [None]:
importlib.reload(ma)
# Origin-destination matrix between all sub-places in eThekwini
sel_sp = in_sp.loc[in_sp["DC_NAME"] == "eThekwini"].copy()
od = ma.calculate_od_matrix(ttr, mcp, sel_sp, sel_sp)
# Label both axes of the matrix with the sub-place codes before writing it out.
sp_codes = sel_sp["SP_CODE_st"]
xx = pd.DataFrame(od)
xx.index = sp_codes
xx.columns = sp_codes
xx.to_csv(os.path.join(out_varun_folder, "eThekwini_OD.csv"))

In [None]:
importlib.reload(ma)
# Origin-destination matrix between all sub-places in Nelson Mandela Bay
sel_sp = in_sp.loc[in_sp["DC_NAME"] == "Nelson Mandela Bay"].copy()
od = ma.calculate_od_matrix(ttr, mcp, sel_sp, sel_sp)
# Label both axes of the matrix with the sub-place codes before writing it out.
sp_codes = sel_sp["SP_CODE_st"]
xx = pd.DataFrame(od)
xx.index = sp_codes
xx.columns = sp_codes
xx.to_csv(os.path.join(out_varun_folder, "Nelson_Mandela_Bay_OD.csv"))

In [None]:
importlib.reload(ma)
# Origin-destination matrix between all sub-places in Buffalo City
sel_sp = in_sp.loc[in_sp["DC_NAME"] == "Buffalo City"].copy()
od = ma.calculate_od_matrix(ttr, mcp, sel_sp, sel_sp)
# Label both axes of the matrix with the sub-place codes before writing it out.
sp_codes = sel_sp["SP_CODE_st"]
xx = pd.DataFrame(od)
xx.index = sp_codes
xx.columns = sp_codes
xx.to_csv(os.path.join(out_varun_folder, "Buffalo_City_OD.csv"))

# Debugging

In [7]:
# Running travel time summaries with small issue flag
# Calculate travel time
# Same setup as the production travel-time cell: open both rasters and build
# the minimum-cost-path graph on the friction surface (first band x 1000).
popR = rasterio.open(urban_raster_pop)
ttr = rasterio.open(local_friction_file)
frictionD = ttr.read()[0, :, :] * 1000
mcp = graph.MCP_Geometric(frictionD)

# Population raster standardized against the friction raster (cell values summed).
inN, profile = rMisc.standardizeInputRasters(popR, ttr, resampling_type="sum")

In [11]:
# Summarize travel time to the South African airports for every main place,
# with calc_small=True (the flag under investigation in this section). The
# standardized population raster is held in memory only for this call.
with rMisc.create_rasterio_inmemory(profile, inN) as pop_temp:
    res = ma.summarize_travel_time_populations(
        pop_temp, ttr, zaf_airports, mcp, inM, calc_small=True
    )

  tt_pop = ttD * popD


In [12]:
# Show the main places whose summarized total population is exactly zero.
res.loc[res["total_pop"] == 0.0]

Unnamed: 0,MP_CODE,MP_CODE_st,MP_NAME,MN_MDB_C,MN_CODE,MN_CODE_st,MN_NAME,DC_MDB_C,DC_MN_C,DC_MN_C_st,...,Shape_Leng,Shape_Area,geometry,total_pop,pop_30,pop_60,pop_120,pop_180,pop_240,tt_pop_w_
41,163009.0,163009,Marcus Island,WC014,163.0,163,Saldanha Bay,DC1,101.0,101,...,0.028065,0.000017,"POLYGON ((17.96996 -33.04141, 17.97002 -33.041...",0.0,0.0,0.0,0.0,0.0,0.0,0.0
43,163011.0,163011,Malgaseiland,WC014,163.0,163,Saldanha Bay,DC1,101.0,101,...,0.027367,0.000020,"POLYGON ((17.92525 -33.05061, 17.92527 -33.050...",0.0,0.0,0.0,0.0,0.0,0.0,0.0
45,163013.0,163013,Jutteneiland,WC014,163.0,163,Saldanha Bay,DC1,101.0,101,...,0.051615,0.000043,"POLYGON ((17.95346 -33.08014, 17.95348 -33.080...",0.0,0.0,0.0,0.0,0.0,0.0,0.0
46,163014.0,163014,Meeueiland,WC014,163.0,163,Saldanha Bay,DC1,101.0,101,...,0.022670,0.000012,"POLYGON ((18.00717 -33.08387, 18.00723 -33.083...",0.0,0.0,0.0,0.0,0.0,0.0,0.0
47,163015.0,163015,Skaapeliland,WC014,163.0,163,Saldanha Bay,DC1,101.0,101,...,0.043599,0.000039,"POLYGON ((18.02114 -33.08799, 18.02121 -33.088...",0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12575,965056.0,965056,Tshamutore,LIM342,965.0,965,Mutale,DC34,934.0,934,...,0.027434,0.000018,"MULTIPOLYGON (((30.64121 -22.63606, 30.64114 -...",0.0,0.0,0.0,0.0,0.0,0.0,0.0
12581,965062.0,965062,Lamvi,LIM342,965.0,965,Mutale,DC34,934.0,934,...,0.131442,0.000247,"POLYGON ((30.78545 -22.63740, 30.78573 -22.637...",0.0,0.0,0.0,0.0,0.0,0.0,0.0
12911,969023.0,969023,Bahamanoa,LIM351,969.0,969,Blouberg,DC35,935.0,935,...,0.014728,0.000008,"POLYGON ((28.90656 -22.91205, 28.90675 -22.912...",0.0,0.0,0.0,0.0,0.0,0.0,0.0
13622,984043.0,984043,Masoyeng,LIM472,984.0,984,Elias Motsoaledi,DC47,947.0,947,...,0.120465,0.000173,"POLYGON ((29.69436 -25.14214, 29.69446 -25.142...",0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Investigating 1s in traveltime results
# NOTE: tt_cities_tt.csv is only produced when the 'tt_cities' entry in the
# travel-time loop is enabled (it is commented out there at the moment).
tt_file = os.path.join(zonal_res_folder, "tt_cities_tt.csv")
inT = pd.read_csv(tt_file, index_col=0)
inT.head()

In [None]:
# Rows whose weighted travel-time column for the cities run is exactly 1.0;
# these are the suspicious values being investigated.
badT = inT.loc[inT["tt_pop_w_tt_cities_tt"] == 1.0]
badT

In [None]:
# Re-project the main places to proj_epsg and store each polygon's area,
# divided by 1,000,000 (m2 -> km2 for a metre-based CRS).
inM_proj = inM.to_crs(proj_epsg)
inM_proj["area_km2"] = inM_proj["geometry"].area / 1000000

In [None]:
# List the suspicious main places from smallest to largest area.
inM_proj.loc[badT.index].sort_values(["area_km2"])

In [None]:
rMisc.zonalStats?