In [None]:
import sys, os
import boto3

import rasterio

import pandas as pd
import numpy as np

from GOSTrocks.misc import tPrint

In [None]:
# Where the urban rasters live on S3, and which file suffix to collect
bucket = "wbg-geography01"
prefix = "URBANIZATION/MR_Novel_Poverty/"
file_ends_with = 'urban_hd.tif'

# NOTE(review): verify=False disables TLS certificate verification for every
# S3 request made through this client; confirm it is still required.
s3client = boto3.client("s3", verify=False)

# Page through every object under `prefix` and keep the keys that end with
# `file_ends_with`. The request arguments live in one dict so the
# continuation token can simply be added for the follow-up pages; this also
# makes the cell safe to re-run without any leftover token from an earlier run.
list_kwargs = {"Bucket": bucket, "Prefix": prefix}
more_results = True
loops = 0

all_res = []
while more_results:
    print(f"Completed loop: {loops}")
    objects = s3client.list_objects_v2(**list_kwargs)
    loops += 1
    # "Contents" is missing from the response when no object matches, so fall
    # back to an empty list instead of raising KeyError.
    for res in objects.get("Contents", []):
        if res['Key'].endswith(file_ends_with):
            all_res.append(res['Key'])
    more_results = objects["IsTruncated"]
    if more_results:
        list_kwargs["ContinuationToken"] = objects["NextContinuationToken"]

In [None]:
# Turn the list of S3 keys into a table with one row per raster and pull the
# descriptive fields out of each file name.
all_res = pd.DataFrame(all_res, columns=['Key'])

# Last '/'-separated component of every key
key_file_names = all_res['Key'].map(lambda key: key.split('/')[-1])

# First three characters of the file name
all_res['ISO3'] = key_file_names.map(lambda name: name[:3])
# Second '_'-separated token of the file name
all_res['pop_layer'] = key_file_names.map(lambda name: name.split('_')[1])
all_res['file'] = all_res['Key'].map(os.path.basename)

# Resolution label: '1000' when the file name contains '1k', '250' otherwise
is_1k_file = all_res['file'].str.contains('1k')
all_res['res'] = '250'
all_res.loc[is_1k_file, 'res'] = '1000'

In [None]:
def calculate_urban_pop(hd_key, verbose=False):
    """Sum total, urban and high-density urban population for one raster set.

    The keys of the urban and population rasters are derived from ``hd_key``
    by string replacement of its ``urban_hd.tif`` / ``_urban_hd.tif`` suffix.
    All three rasters are read from the module-level ``bucket`` on S3.

    Parameters
    ----------
    hd_key : str
        S3 key of the high-density urban raster (expected to end with
        ``_urban_hd.tif``).
    verbose : bool, default False
        When True, print each S3 path before it is read.

    Returns
    -------
    list
        ``[total_pop, urban_pop, hd_pop]``: the sum of all population cells
        greater than zero, and that same sum restricted to cells where the
        urban / high-density raster is greater than zero.
    """
    urban_key = hd_key.replace('urban_hd.tif', 'urban.tif')
    pop_key = hd_key.replace('_urban_hd.tif', '.tif')

    def _read_first_band(key):
        # Read band 1 of a raster in the bucket; the context manager closes
        # the dataset as soon as the array is in memory.
        path = f's3://{bucket}/{key}'
        if verbose:
            print(f"Reading {path}")
        with rasterio.open(path) as src:
            return src.read(1)

    with rasterio.Env(GDAL_HTTP_UNSAFESSL="YES"):
        # Summarize total population; non-positive cells are counted as zero
        popD = _read_first_band(pop_key)
        popD = np.where(popD > 0, popD, 0)
        total_pop = popD.sum()

        # Population in cells flagged (> 0) by the urban raster
        urbanD = np.where(_read_first_band(urban_key) > 0, 1, 0)
        urban_pop = (urbanD * popD).sum()

        # Population in cells flagged (> 0) by the high-density raster
        hdD = np.where(_read_first_band(hd_key) > 0, 1, 0)
        hd_pop = (hdD * popD).sum()
    return [total_pop, urban_pop, hd_pop]

#xx = calculate_urban_pop(all_res['Key'].iloc[0], verbose=True)

In [None]:
def process_pop_summaries(in_res, sel_res):
    """Run ``calculate_urban_pop`` for every row of ``in_res`` at one resolution.

    Parameters
    ----------
    in_res : pandas.DataFrame
        Table with at least the columns ``Key``, ``ISO3`` and ``res``.
    sel_res : str
        Value of the ``res`` column to process.

    Returns
    -------
    pandas.DataFrame
        Copy of the matching rows with ``total_pop``, ``urban_pop`` and
        ``hd_pop`` columns. Rows whose rasters could not be processed are
        reported on stdout and keep their existing value (NaN when the
        column was not present in ``in_res``).
    """
    pop_columns = ['total_pop', 'urban_pop', 'hd_pop']
    focal_set = in_res.loc[(in_res['res'] == sel_res)].copy()
    # Create the result columns up front so the returned frame has the same
    # columns even when every key fails (or no row matches sel_res).
    for col in pop_columns:
        if col not in focal_set.columns:
            focal_set[col] = np.nan
    for idx, rows in focal_set.iterrows():
        tPrint('Processing: %s' % rows['ISO3'])
        try:
            total_pop, urban_pop, hd_pop = calculate_urban_pop(rows['Key'])
        except Exception as e:
            # Best effort: report the failing key and carry on with the rest
            print(f"Error with {rows['Key']}")
            print(e)
            continue
        for col, value in zip(pop_columns, (total_pop, urban_pop, hd_pop)):
            focal_set.loc[idx, col] = value
    return focal_set

In [None]:
# Summarise a single country: pick its rows by ISO3 code, then process the
# '1000' resolution rasters only
sel_iso3 = 'cog'
is_sel_country = all_res['ISO3'] == sel_iso3
sel_country_res = all_res.loc[is_sel_country].copy()

country_summary = process_pop_summaries(sel_country_res, '1000')
country_summary

In [None]:
# Summarise every country at one resolution and write the result to CSV
sel_res = '1000'
out_folder = 'C:/Temp'

# Create the output folder before the long-running loop so the results are
# not lost at the very end because the folder does not exist.
os.makedirs(out_folder, exist_ok=True)

# Placeholder values; overwritten below for every row that is processed.
# NOTE(review): rows whose rasters fail to read keep 0.0 and are therefore
# indistinguishable from rasters that genuinely sum to zero.
all_res['total_pop'] = 0.
all_res['urban_pop'] = 0.
all_res['hd_pop'] = 0.


focal_set = all_res.loc[(all_res['res'] == sel_res)].copy()
for idx, rows in focal_set.iterrows():
    tPrint('Processing: %s' % rows['ISO3'])
    try:
        total_pop, urban_pop, hd_pop = calculate_urban_pop(rows['Key'])
        all_res.loc[idx, 'total_pop'] = total_pop
        all_res.loc[idx, 'urban_pop'] = urban_pop
        all_res.loc[idx, 'hd_pop'] = hd_pop
    except Exception as e:
        # Best effort: report the failing key and carry on with the rest
        print(f"Error with {rows['Key']}")
        print(e)

# Only the rows at the selected resolution are exported
all_res.loc[focal_set.index].to_csv(f'{out_folder}/urban_pop_{sel_res}.csv', index=False)


# Exploring population values

Each country has a text file evaluating several attributes (listed below); however, I don't see any comparison between the source data and the re-scaled values.

**Evaluated stats, currently**
1. Total population between datasets
2. Total urbanization
3. 8-class SMOD breakdown
4. Intersection of Water and GHSL
5. Intersection of population and water

In [None]:
# For a handful of countries, sum the population (cells > 0) in the source
# raster and in its FINAL_STANDARD and FINAL_STANDARD_1KM versions, one row
# per country and population layer: [iso3, pop_layer, orig, 250, 1000].
pop_res = []
for iso3, data in all_res.groupby('ISO3'):
    if iso3 in ['som','ken','ssd']:
        pop_layers = list(data['pop_layer'].unique())
        # Two levels up from the first key of the group
        folder_base = os.path.dirname(os.path.dirname(data['Key'].iloc[0]))
        iso3_res = []
        tPrint(f"Processing: {iso3}")
        for pop_layer in pop_layers:
            orig_res = f's3://{bucket}/{folder_base}/{iso3}_{pop_layer}.tif'
            res_250 = f's3://{bucket}/{folder_base}/FINAL_STANDARD/{iso3}_{pop_layer}.tif'
            res_1000 = f's3://{bucket}/{folder_base}/FINAL_STANDARD_1KM/{iso3}1k_{pop_layer}.tif'
            res = []
            with rasterio.Env(GDAL_HTTP_UNSAFESSL="YES"):
                for c_res in [orig_res, res_250, res_1000]:
                    try:
                        # Context manager closes the dataset once band 1 is read
                        with rasterio.open(c_res) as origR:
                            origD = origR.read(1)
                        origD = np.where(origD > 0, origD, 0)
                        res.append(origD.sum())
                    except Exception as e:
                        # A raster that cannot be read is reported and
                        # recorded as 0 so the row keeps its three values
                        print(f"Error with {c_res}")
                        print(e)
                        res.append(0)
            pop_res.append([iso3, pop_layer, res[0], res[1], res[2]])

In [None]:
# Name the columns after the row layout built in the loop:
# [iso3, pop_layer, source raster sum, FINAL_STANDARD sum, FINAL_STANDARD_1KM sum]
pd.DataFrame(pop_res, columns=['ISO3', 'pop_layer', 'orig_pop', 'pop_250', 'pop_1000'])

In [None]:
# Inspect the rows of the last ISO3 group left in `data` by the
# `all_res.groupby('ISO3')` loop above
data