# Extract data for urban calculations

Test input for Tanzania

0. Select focal ADM, buffer by 1km, rasterize as [0, 1]
1. Download DEM data from ASTER, mosaick
2. Calculate slope of DEM
3. Extract water layer from Globcover
4. Rasterize building footprints
5. Select population layer
6. Standardize all rasters to population layer  
   a. Set area outside of focal admin to NoData  
   b. Set everything to 16bit  
   
   


In [2]:
import sys
import os
import importlib
import shutil
import pathlib
import datetime
import math
import rasterio
import rasterio.warp

import pandas as pd
import geopandas as gpd
import numpy as np

from shapely.geometry import Point

# Import raster helpers
sys.path.insert(0, "/home/wb411133/Code/gostrocks/src")

import GOSTRocks.metadataMisc as meta

# Import GOST urban functions
sys.path.append("../../../src")

# Import local functions
import novelUrbanization as nu
from novelUrbanization import *

%load_ext autoreload
%autoreload 2



METADATA Library: Could not import arcgis libraries


In [3]:
global_bounds = "/home/public/Data/GLOBAL/ADMIN/Admin0_Polys.shp"
global_bounds_adm2 = "/home/public/Data/GLOBAL/ADMIN/Admin2_Polys.shp"

inG = gpd.read_file(global_bounds)
inG2 = gpd.read_file(global_bounds_adm2)

runSmall = True
runLarge = True

# Convert EA csv files to geometry

In [None]:
in_folder = "/home/wb411133/data/Projects/MR_Novel_Urbanization/Data/EA_Files/"
ea_files = []
for root, dirs, files in os.walk(in_folder):
    for x in files:
        if (x.endswith(".csv")) and ("URBAN" not in x):
            ea_files.append(os.path.join(root, x))

ea_files

In [None]:
pd.read_csv(ea_files[-1]).head()

In [None]:
def try_float(x):
    try:
        return float(x)
    except:
        return None


def read_geog(file, lat_column, lon_column, crs="epsg:4326", write_out=True):
    print(os.path.basename(file))
    out_file = file.replace(".csv", ".geojson")
    inD = pd.read_csv(file)

    print(inD.shape)
    inD[lat_column] = inD[lat_column].apply(try_float)
    inD[lon_column] = inD[lon_column].apply(try_float)
    inD = inD.loc[~(inD[lat_column].isna() | inD[lon_column].isna())]
    print(inD.shape)

    inD_geom = inD.apply(
        lambda x: Point(float(x[lon_column]), float(x[lat_column])), axis=1
    )
    inD = gpd.GeoDataFrame(inD, geometry=inD_geom, crs=crs)

    if write_out:
        inD.to_file(out_file, driver="GeoJSON")
    return inD


# res = read_geog(ea_files[0], "latdd_corrected", "londd_corrected")
# res = read_geog(ea_files[1], "lat", "lon")
# res = read_geog(ea_files[2], "latitude", "longitude")
# res = read_geog(ea_files[3], "latitude", "longitude")
# res = read_geog(ea_files[4], "lat_mean", "long_mean")
# res = read_geog(ea_files[5], "latdd_corrected", "londd_corrected")
# res = read_geog(ea_files[6], "latdd_corrected", "londd_corrected")
# res = read_geog(ea_files[7], "lat_modified","lon_modified")
# res = read_geog(ea_files[8], "lat_corrected", "lon_corrected")
# res = read_geog(ea_files[9], "lat_corrected", "lon_corrected")
res = read_geog(ea_files[-1], "latDD_corrected", "lonDD_corrected")

# Run individual counties

In [8]:
# Process Individual countries
iso3 = "COG"
local_path = "/home/public/Data/COUNTRY/{country}/WORLDPOP/".format(country=iso3)
constrained_WP_folder = "/home/public/Data/GLOBAL/Population/RF_SSA_2015-2020"
worldPop_2015 = (
    "/home/public/Data/GLOBAL/Population/WorldPop_PPP_2015/worldPop_2015.vrt"
)
global_ghspop = "/home/public/Data/GLOBAL/Population/GHS/250/GHS_POP_E2015_GLOBE_R2019A_54009_250_V1_0.tif"
c_WP_15 = f"{constrained_WP_folder}/{iso3}/ppp_{iso3}_const_2015.tif"
c_WP_20 = f"{constrained_WP_folder}/{iso3}/ppp_{iso3}_const_2020.tif"
custom_pop = "/home/public/Data/COUNTRY/COG/Population/COG_population_202309271640.tif"

pop_files = [[worldPop_2015, f"{iso3.lower()}_upo15.tif"]]
pop_files.append([global_ghspop, f"{iso3.lower()}_gpo.tif"])
pop_files.append([c_WP_15, f"{iso3.lower()}_cpo15.tif"])
pop_files.append([c_WP_20, f"{iso3.lower()}_cpo20.tif"])
pop_files.append([custom_pop, f"{iso3.lower()}_cpo20_WB.tif"])

output_folder = (
    "/home/wb411133/data/Projects/MR_Novel_Urbanization/Data/%s_URBAN_DATA_new_naming"
    % iso3
)
ea_file = "/home/public/Data/COUNTRY/COG/Population/ZD_CONGO_CLIP_FIXED.shp"
db_folder = os.path.join(output_folder, "DB_Results", "SentWB", "delineations")

In [10]:
importlib.reload(nu)
# Calculate urban definitions
nu.calculate_urban(
    iso3, inG, inG2, pop_files, ea_file, output_folder, small=runSmall, km=runLarge
)
pp_urban = nu.calc_pp_urban(
    db_folder, "%s_gpo.tif" % iso3.lower(), ea_file, output_folder
)
pd.DataFrame(pp_urban.drop(["geometry"], axis=1)).to_csv(
    os.path.join(output_folder, f"{iso3}_DB_UrbanPopulation_admin3.csv")
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


11:50:52	COG ***1k Extracting Global Layers
11:50:52	COG ***1k Downloading and processing elevation
11:50:52	COG ***1k Standardizing rasters
11:50:52	COG ***1k Calculating Urban
/home/wb411133/data/Projects/MR_Novel_Urbanization/Data/COG_URBAN_DATA_new_naming/FINAL_STANDARD_1KM/cog1k_cpo20.tif
/home/wb411133/data/Projects/MR_Novel_Urbanization/Data/COG_URBAN_DATA_new_naming/FINAL_STANDARD_1KM/cog1k_cpo20_WB.tif
/home/wb411133/data/Projects/MR_Novel_Urbanization/Data/COG_URBAN_DATA_new_naming/FINAL_STANDARD_1KM/cog1k_upo15.tif
/home/wb411133/data/Projects/MR_Novel_Urbanization/Data/COG_URBAN_DATA_new_naming/FINAL_STANDARD_1KM/cog1k_gpo.tif
/home/wb411133/data/Projects/MR_Novel_Urbanization/Data/COG_URBAN_DATA_new_naming/FINAL_STANDARD_1KM/cog1k_cpo15.tif
11:50:52	COG ***1k Calculating Zonal admin2
11:50:55	COG ***1k Calculating Zonal communes
11:53:03	COG ***** Extracting Global Layers COG
11:53:03	COG ***** Downloading and processing elevation COG
11:53:03	COG ***** Standardizing raste

In [None]:
# Calculate Point-based statistics
input_file = os.path.join(output_folder, "HBS_GPS.csv")
pop_tiffs = ["eth_gpo.tif", "eth_upo15.tif", "eth_upo16.tif"]
all_tiffs = []
base_folder = os.path.join(output_folder, "FINAL_STANDARD")
base_folder_1km = os.path.join(output_folder, "FINAL_STANDARD_1KM")
for pFile in pop_tiffs:
    all_tiffs.append(os.path.join(base_folder, pFile))
    all_tiffs.append(os.path.join(base_folder_1km, pFile.replace("eth", "eth1k")))

# Read in ETH HH locations, clean
inD = pd.read_csv(input_file)
inD = inD.loc[~inD["latDD_corrected"].isnull()]
inD = inD.loc[~inD["lonDD_corrected"].isnull()]
geoms = [
    Point(row["lonDD_corrected"], row["latDD_corrected"]) for idx, row in inD.iterrows()
]
inD = gpd.GeoDataFrame(inD, geometry=geoms, crs={"init": "epsg:4326"})
# Calculate point urbanization for degree of urbanization
out_file = os.path.join(output_folder, f"{iso3}_DoU_Urban.csv")
nu.point_urban_summaries(inD, all_tiffs, out_file)
# Calculate point urbanization for PP urban
out_file = os.path.join(output_folder, f"{iso3}_DB_Urban.csv")
in_folder = os.path.join(output_folder, "ethiopia")
pop_tiffs = [os.path.join(in_folder, x) for x in os.listdir(in_folder)]
nu.pp_point_urban_summaries(inD, pop_tiffs, out_file)

In [None]:
# Run zonal stats
constrained_WP_folder = "/home/public/Data/GLOBAL/Population/RF_SSA_2015-2020"
worldPop_2015 = (
    "/home/public/Data/GLOBAL/Population/WorldPop_PPP_2015/worldPop_2015.vrt"
)
global_ghspop = "/home/public/Data/GLOBAL/Population/GHS/250/GHS_POP_E2015_GLOBE_R2019A_54009_250_V1_0.tif"
c_WP_15 = f"{constrained_WP_folder}/{iso3}/ppp_{iso3}_const_2015.tif"
c_WP_20 = f"{constrained_WP_folder}/{iso3}/ppp_{iso3}_const_2020.tif"

pop_files = [[worldPop_2015, f"{iso3.lower()}_upo15.tif"]]
pop_files.append([global_ghspop, f"{iso3.lower()}_gpo.tif"])
pop_files.append([c_WP_15, f"{iso3.lower()}_cpo15.tif"])
pop_files.append([c_WP_20, f"{iso3.lower()}_cpo20.tif"])

nu.run_zonal(iso3, output_folder, inG, pop_files, ea_file, "")

# Compile and copy mapping data

In [None]:
countries = {
    "AGO": "angola",
    "BGD": "bangladesh",
    "EGY": "egypt",
    "ETH": "ethiopia",
    "GHA": "ghana",
    "TZA": "tanzania",
    "VNM": "vietnam",
}
for iso3 in countries.keys():
    out_folder = "/home/wb411133/data/Projects/MR_Novel_Urbanization/Mapping/URBAN_Data"
    data_folder = (
        "/home/wb411133/data/Projects/MR_Novel_Urbanization/Data/%s_URBAN_DATA_new_naming/"
        % iso3
    )
    dou_folder = os.path.join(data_folder, "FINAL_STANDARD")
    db_folder = os.path.join(data_folder, countries[iso3])

    dou_urban = os.path.join(dou_folder, f"{iso3.lower()}_upo15_urban.tif")
    dou_urban_hd = os.path.join(dou_folder, f"{iso3.lower()}_upo15_urban_hd.tif")

    db_urban_cc = os.path.join(db_folder, f"{iso3.lower()}_upo15d20b2000_cc.tif")
    db_urban_co = os.path.join(db_folder, f"{iso3.lower()}_upo15d20b2000_co.tif")
    db_urban_ur = os.path.join(db_folder, f"{iso3.lower()}_upo15d20b2000_ur.tif")

    for uFile in [dou_urban, dou_urban_hd, db_urban_cc, db_urban_co, db_urban_ur]:
        print(f"{iso3}: {os.path.exists(uFile)}")
        out_file = os.path.join(out_folder, os.path.basename(uFile))
        shutil.copy(uFile, out_file)

# Compile zonal results

In [None]:
# copy only the zonal stats with ea defs
cur_countries = list(nu.EA_DEFS.keys())

in_folder = "/home/wb411133/data/Projects/MR_Novel_Urbanization/Data/"
out_folder = os.path.join(in_folder, "URBAN_ZONAL_RESULTS_EAs")
if not os.path.exists(out_folder):
    os.makedirs(out_folder)

for root, dirs, files in os.walk(in_folder):
    if "URBAN_DATA_new_naming" in root:
        country = os.path.basename(root).split("_")[0]
        if country in nu.EA_DEFS.keys():
            for f in files:
                if (
                    ("EA_PP_URBAN_Updated" in f)
                    | ("EA_WB_URBAN_" in f)
                    | ("HH_GPS" in f)
                ):
                    fName = pathlib.Path(os.path.join(root, f))
                    date = datetime.fromtimestamp(fName.stat().st_mtime)
                    if datetime(2021, 6, 1) < date:
                        print(f"{country}: {f} - {date}")
                    else:
                        print(f"***OLD: {country}: {f} - {date}")
                    shutil.copy(os.path.join(root, f), os.path.join(out_folder, f))

In [None]:
in_folder = "/home/wb411133/data/Projects/MR_Novel_Urbanization/Data/"
out_folder = os.path.join(in_folder, "URBAN_ZONAL_RESULTS")
if not os.path.exists(out_folder):
    os.makedirs(out_folder)

for root, dirs, files in os.walk(in_folder):
    if "URBAN_DATA_new_naming" in root:
        country = os.path.basename(root).split("_")[0]
        if country in nu.EA_DEFS.keys():
            for f in files:
                if (
                    ("EA_PP_URBAN_Updated" in f)
                    | ("EA_WB_URBAN_" in f)
                    | ("HH_GPS" in f)
                ):
                    fName = pathlib.Path(os.path.join(root, f))
                    date = datetime.fromtimestamp(fName.stat().st_mtime)
                    if datetime(2021, 6, 1) < date:
                        print(f"{country}: {f} - {date}")
                    else:
                        print(f"***OLD: {country}: {f} - {date}")
                    shutil.copy(os.path.join(root, f), os.path.join(out_folder, f))

In [None]:
datetime(2021, 6, 1)

In [None]:
# Delete all zonal stats
for root, dirs, files in os.walk(in_folder):
    if "URBAN_DATA_new_naming" in root:
        country = os.path.basename(root).split("_")[0]
        if country in nu.EA_DEFS.keys():
            for f in files:
                if ("URBAN_COMMUNE_STATS" in f) | ("URBAN_ADMIN2" in f):
                    print(f"{country}: {f}")
                    os.remove(os.path.join(root, f))

# Generate Metadata

In [None]:
template_metadata = "/home/wb411133/data/Projects/MR_Novel_Urbanization/Data/LSO_URBAN_DATA_new_naming/METADATA/metadata.xlsx"
dataset_info = pd.read_excel(template_metadata, sheet_name=0)
layer_info = pd.read_excel(template_metadata, sheet_name=1, index_col=0)

In [None]:
base_folder = "/home/wb411133/data/Projects/MR_Novel_Urbanization/Data/{ISO3}_URBAN_DATA_new_naming"
country_name = "Angola"
iso3 = "AGO"
in_folder = base_folder.format(ISO3=iso3)
out_dir = os.path.join(in_folder, "metadata")

make_meta = meta.metadata_gost(in_folder, out_dir)
layers = make_meta.get_layers()
metadata = make_meta.generate_metadata()

In [None]:
layer_info["layer_name"] = [
    p.replace("lso", iso3.lower()) for p in layer_info["layer_name"]
]

In [None]:
sel_info = layer_info.loc[
    :,
    [
        "layer_name",
        "layer_label",
        "description",
        "source_name",
        "source_url",
        "data_process_summary",
    ],
]
sel_info

In [None]:
final_meta = metadata["metadata"]
final_meta = final_meta.loc[
    :,
    ~final_meta.columns.isin(
        [
            "layer_label",
            "description",
            "source_name",
            "source_url",
            "data_process_summary",
        ]
    ),
]
final_meta.merge(sel_info, on="layer_name")

In [None]:
make_meta.write_metadata(
    os.path.join(out_dir, f"{iso3}_novel_urbanization_metadata.xlsx"),
    layer_metadata=final_meta,
    field_metadata=metadata["fields"],
    dataset_id=dataset_info.Definition[0].format(ISO3=iso3, Country=country_name),
    dataset_title=dataset_info.Definition[1].format(ISO3=iso3, Country=country_name),
    country=dataset_info.Definition[2].format(ISO3=iso3, Country=country_name),
    abstract=dataset_info.Definition[3].format(ISO3=iso3, Country=country_name),
    purpose=dataset_info.Definition[4].format(ISO3=iso3, Country=country_name),
    creation_date=datetime.today().strftime("%Y-%m-%d"),
    release_date=datetime.today().strftime("%Y-%m-%d"),
    owner=dataset_info.Definition[7].format(ISO3=iso3, Country=country_name),
    email=dataset_info.Definition[8].format(ISO3=iso3, Country=country_name),
)

In [None]:
out_dir

# Generating zip commands

In [None]:
# Delete existing files
in_folder = "/home/wb411133/temp"
for root, dirs, files in os.walk(in_folder):
    for d in dirs:
        if (d == "FINAL_STANDARD") or (d == "FINAL_STANDARD_1KM"):
            cur_dir = os.path.join(root, d)
            print(
                "zip -r {out_file} {infolder}".format(
                    out_file="%s_%s.zip"
                    % (cur_dir.split("/")[-2].split("_")[0], cur_dir.split("_")[-1]),
                    infolder=os.path.join(
                        os.path.basename(os.path.dirname(cur_dir)),
                        os.path.basename(cur_dir),
                    ),
                )
            )

# Debugging

In [52]:
# there is an error in scaling a new population dataset; testing out why
pop_raster = "/home/wb411133/data/Projects/MR_Novel_Urbanization/Data/COG_URBAN_DATA_new_naming/cog_cpo20_WB.tif"
template_raster = "/home/wb411133/data/Projects/MR_Novel_Urbanization/Data/COG_URBAN_DATA_new_naming/FINAL_STANDARD/cog_gpo.tif"

in_raster = rasterio.open(pop_raster)
in_r = in_raster.read()
in_r[in_r == in_raster.meta["nodata"]] = 0

ghs_R = rasterio.open(template_raster)
out_array = np.zeros(ghs_R.shape)

In [61]:
in_r[0, 0, 0] == in_raster.meta["nodata"]

False

In [65]:
in_r[0, 0, 0].__class__

numpy.float32

In [74]:
temp_nodata = type(in_r[0, 0, 0])(in_raster.meta["nodata"])
in_r == temp_nodata

array([[[ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        ...,
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True]]])

In [55]:
in_r == in_raster.meta["nodata"]

array([[[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]]])

In [50]:
# in_r[in_r < 0] = 0
rSample = rasterio.warp.Resampling.bilinear
rasterio.warp.reproject(
    in_r,
    out_array,
    src_transform=in_raster.meta["transform"],
    dst_transform=ghs_R.meta["transform"],
    src_crs=in_raster.crs,
    dst_crs=ghs_R.crs,
    src_nodata=in_raster.meta["nodata"],
    dst_nodata=ghs_R.meta["nodata"],
    resampling=rSample,
)
out_array[out_array == ghs_R.meta["nodata"]] = 0.0

In [51]:
out_array_sum = out_array.sum()
original_sum = in_r.sum()

In [47]:
if math.isinf(original_sum):
    in_r[in_r < 0] = 0
    original_sum = in_r.sum()
total_ratio = original_sum / out_array_sum

out_array = out_array * total_ratio
out_array[out_array < 0] = ghs_R.meta["nodata"]

In [48]:
out_array_sum

749608.9979193077

In [49]:
total_ratio

7.280417544544301

In [20]:
in_raster.meta

{'driver': 'GTiff',
 'dtype': 'float32',
 'nodata': -3.4e+38,
 'width': 8939,
 'height': 10449,
 'count': 1,
 'crs': CRS.from_epsg(4326),
 'transform': Affine(0.0008333333300145432, 0.0, 11.200416637,
        0.0, -0.0008333333299618321, 3.702916853)}

In [23]:
in_r == float(in_raster.meta["nodata"])

array([[[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]]])

In [26]:
in_raster.meta["nodata"].__class__

float