In [1]:
import sys, os, importlib, json
import geohash, rasterio

import pandas as pd
import geopandas as gpd
import numpy as np

from shapely.geometry import box

sys.path.append('../../')

import infrasap.rasterMisc as rMisc

# Summarize hotspots in features

In [None]:
# Record the bounding box of every city's hotspot raster and save them as one shapefile.
hotspot_folder = "/home/wb411133/data/Projects/HeightComparison"
hotspot_files = ['pop_floor.tif'] #,'water_points_hotspots.tif','toilets_hotspots.tif','shops_hotspots.tif']
all_res = []
extent_crs = None  # CRS of the most recently read raster; applied to every extent below

for root, dirs, files in os.walk(hotspot_folder):
    for f in files:
        if f in hotspot_files:
            print(os.path.basename(root))
            # Document the extent of the analysis. The context manager releases the
            # raster handle immediately instead of leaving one open per city.
            with rasterio.open(os.path.join(root, f)) as curR:
                shp = box(*curR.bounds)
                extent_crs = curR.crs
            all_res.append([os.path.basename(root), shp])

if not all_res:
    # Previously this surfaced as a NameError on the leaked loop variable.
    raise FileNotFoundError("No %s found below %s" % (hotspot_files, hotspot_folder))

all_extents = pd.DataFrame(all_res, columns=["city","geometry"])
# NOTE(review): one CRS is applied to all extents - assumes every raster shares it; confirm.
all_extents = gpd.GeoDataFrame(all_extents, geometry="geometry", crs=extent_crs)
all_extents.to_file("%s_extents.shp" % hotspot_folder)

In [None]:
# Dissolve all city extents into one geometry; the hotspot loop below uses it to test
# whether a fishnet touches any hotspot raster.
hotspot_extents = all_extents.unary_union

In [None]:
# Flag, for every URBAN fishnet cell, whether each hotspot raster exceeds the threshold,
# and write one CSV per country folder.
# NOTE: `all_folders` is created in the "Check on processing status" section further down;
# that cell has to be run before this one.
hotspot_files = ['pop_floor.tif','water_points_hotspots.tif','toilets_hotspots.tif','shops_hotspots.tif']
hotspot_folder = "/home/wb411133/data/Projects/HeightComparison"
out_folder = "/home/wb411133/data/Projects/CoVID_hotspots"
HOTSPOT_THRESHOLD = 0.29  # a cell is flagged when its zonal MAX is above this value

for cov_folder in all_folders:
    final_file = os.path.join(out_folder, f"{os.path.basename(cov_folder)}_hotspots.csv")
    if os.path.exists(final_file):
        # Already summarized on a previous run.
        continue
    print(os.path.basename(cov_folder))
    fishnets = []
    for root, dirs, files in os.walk(cov_folder):
        for f in files:
            if "URBAN" in f and f.endswith(".shp"):
                fishnets.append(os.path.join(root, f))

    # Collect the per-fishnet tables and concatenate once. The former
    # `try: final.append(...) except: final = cur_f` silently kept only the last
    # fishnet whenever `append` failed (DataFrame.append was removed in pandas 2.0).
    flagged = []
    for f in fishnets:
        cur_f = gpd.read_file(f)
        fishnet_union = cur_f.unary_union  # computed once, used for both intersection tests
        if not fishnet_union.intersects(hotspot_extents):
            continue
        sel_h = all_extents.loc[all_extents.intersects(fishnet_union)]
        city = sel_h["city"].iloc[0]  # first matching city only
        print(f'***** {os.path.basename(f)} {cur_f.shape[0]}: {city}')
        cur_hotspot_folder = os.path.join(hotspot_folder, city)
        for h in hotspot_files:
            cur_hot = os.path.join(cur_hotspot_folder, h)
            field_name = h.split("_")[0]
            if os.path.exists(cur_hot):
                res = rMisc.zonalStats(cur_f, cur_hot)
                res = pd.DataFrame(res, columns=['SUM', 'MIN', 'MAX', 'MEAN'])
                cur_f[field_name] = (res['MAX'] > HOTSPOT_THRESHOLD).astype(int)
            else:
                # No raster of this kind for the city: nothing is flagged.
                cur_f[field_name] = 0
        flagged.append(cur_f)

    if flagged:
        # `final` stays available for the inspection cells that follow.
        final = pd.concat(flagged)
        pd.DataFrame(final).drop(['geometry'], axis=1).to_csv(final_file)
    else:
        print("DOES NOT INTERSECT HOTSPOTS")

In [None]:
# Preview `final`, the table left in the namespace by the hotspot loop above.
final.head()

In [None]:
# Manual re-run of the CSV export; `final` and `final_file` are whatever the hotspot
# loop above left in the namespace.
pd.DataFrame(final).drop(['geometry'], axis=1).to_csv(final_file)

In [None]:
# Rename the hotspot flag columns of every CSV in `out_folder` in place.
rename_defs = {'pop':'P10','water':'P11','toilets':'P12','shops':'P13'}
for f in os.listdir(out_folder):
    if not f.endswith(".csv"):
        # os.listdir returns every entry; skip sub-folders and non-CSV files
        # instead of crashing on them or rewriting them as CSV.
        continue
    cur_f = os.path.join(out_folder, f)
    curD = pd.read_csv(cur_f, index_col=0)
    curD = curD.rename(columns=rename_defs)
    curD.to_csv(cur_f)

# Check on processing status

In [2]:
# Split the entries of the CoVID project folder into sub-folders and zip archives.
covid_folder = "/home/wb411133/data/Projects/CoVID"
all_folders, all_zips = [], []
for entry in os.listdir(covid_folder):
    entry_path = os.path.join(covid_folder, entry)
    if os.path.isdir(entry_path):
        all_folders.append(entry_path)
    elif entry.endswith(".zip"):
        all_zips.append(entry_path)

In [3]:
# Per folder: names of all shapefiles found; across folders: paths of files starting with "ADM".
outputs, bad_files = {}, []
for country_dir in all_folders:
    shp_names = []
    for walk_root, _subdirs, names in os.walk(country_dir):
        shp_names.extend(name for name in names if name.endswith(".shp"))
        bad_files.extend(os.path.join(walk_root, name) for name in names if name.startswith("ADM"))
    outputs[os.path.basename(country_dir)] = shp_names

# Prepare data for DevSeed

The delivered data must follow a strict structure:

1. All files delivered as GeoJSON
2. Ensure all files have a `geohash` column
3. Deliver preliminary zonal results as well

In [5]:
import zipfile
from infrasap import covid_data_extraction as cov
importlib.reload(cov)
    
# Map each raw DHS "<Column_name>_SUM/_MEAN" column to its "<Tight_Name>_SUM/_MEAN" name.
dhs_definitions = '/home/public/Data/PROJECTS/CoVID/DHS/column_definitions.csv'
dhs = pd.read_csv(dhs_definitions)
column_defs = {}
for raw_name, tight_name in zip(dhs['Column_name'], dhs['Tight_Name']):
    column_defs['%s_SUM' % raw_name] = "%s_SUM" % tight_name
    column_defs['%s_MEAN' % raw_name] = "%s_MEAN" % tight_name

class covid_result(object):
    """Prepare one country's geometries, zonal statistics and rasters for delivery.

    Geometries are written as GeoJSON into ``<cur_folder>/FINAL_GEOMS`` and the
    matching zonal-statistics CSVs into ``<cur_folder>/FINAL_STATS``; both folders
    are created on construction.
    """

    def __init__(self, country, files, cur_folder, dhs_column_defs):
        """
        :param country: ISO3 code of the country, stored as ``self.iso3``
        :param files: paths of the geometry files read by ``write_geom_data``
        :param cur_folder: folder holding the country's inputs and outputs
        :param dhs_column_defs: dict used to rename the columns of ``_DHS`` zonal CSVs
        """
        self.dhs_column_defs = dhs_column_defs
        self.iso3 = country
        self.files = files
        self.cur_folder = cur_folder
        # Names given to the trailing columns of "BASE" zonal results
        # (presumably land-cover class codes - confirm against the zonal script).
        self.LC_cols = ['LC_%s' % x for x in [11,14,20,30,40,50,60,70,90,100,110,120,130,140,150,160,170,180,190,200,210,220,230]]
        self.final_folder = os.path.join(cur_folder, "FINAL_GEOMS")
        self.final_stats_folder = os.path.join(cur_folder, "FINAL_STATS")
        # Columns removed from every geometry file before export.
        self.bad_cols = ['R10_SUM',"P1_SUM","P2_SUM","Shape_Leng","Shape_Area"]
        self.ready = True

        os.makedirs(self.final_folder, exist_ok=True)
        os.makedirs(self.final_stats_folder, exist_ok=True)

    def check_zonal(self):
        """Copy every zonal CSV that has an exported geometry into FINAL_STATS.

        Each copied table gets normalized column names and a ``geom_key`` column
        taken from the geometry file's primary-key column.

        :return: list of all CSV paths found below ``cur_folder`` (FINAL_STATS excluded)
        """
        csv_files = []
        for root, dirs, files in os.walk(self.cur_folder):
            # Results already copied into FINAL_STATS must not be processed again.
            if "FINAL_STATS" in root:
                continue
            csv_files.extend(os.path.join(root, f) for f in files if f.endswith(".csv"))
        self.csv_files = csv_files

        for csv_file in csv_files:
            csv_name = os.path.basename(csv_file)
            geom_name = csv_name.replace(".csv","").replace("_zonal","").replace("_BASE","").replace("_DHS","") + ".geojson"
            geom_file = os.path.join(self.final_folder, geom_name)
            if not os.path.exists(geom_file):
                # No exported geometry for this table; skip it without reading the CSV.
                continue
            cur_res = pd.read_csv(csv_file, index_col=0)
            raw_data = gpd.read_file(geom_file)
            # Files named adm<N>... are keyed on WB_ADM<N>_CO, everything else on the geohash.
            pk_column = "geohash"
            if csv_name[:3] == "adm":
                pk_column = f"WB_ADM{csv_name[3:4]}_CO"
            if "BASE" in csv_file:
                # The trailing columns of BASE results are renamed to the LC_* names.
                orig_columns = list(cur_res.columns)
                orig_columns[-len(self.LC_cols):] = self.LC_cols
                cur_res.columns = orig_columns
            if "_DHS" in csv_file:
                cur_res = cur_res.rename(columns=self.dhs_column_defs)
            # NOTE(review): this assignment aligns on the index; it assumes the CSV index
            # matches the row order of the GeoJSON - confirm.
            cur_res['geom_key'] = raw_data[pk_column]
            cur_res.to_csv(os.path.join(self.final_stats_folder, csv_name))
        return(self.csv_files)

    def check_for_fishnets(self):
        """Return (and store in ``self.fishnets``) whether any input path contains "URBAN_"."""
        self.fishnets = any("URBAN_" in f for f in self.files)
        return(self.fishnets)

    def write_tiff_zip(self, zip_file):
        """Archive the country's four standard rasters into ``zip_file``.

        Raises whatever ``ZipFile.write`` raises when a raster is missing from ``cur_folder``.
        """
        tiff_files = ["LC.tif", "WP_2020_1km.tif", "WP_2020_1km_urban_pop.tif", "WP2020_vulnerability_map.tif"]
        # The context manager finalizes the archive even when a raster is missing,
        # instead of relying on garbage collection to close the handle.
        with zipfile.ZipFile(zip_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for t in tiff_files:
                zipf.write(os.path.join(self.cur_folder, t))

    def write_geom_data(self):
        """Export every input file as WGS84 GeoJSON with a ``geohash`` column into FINAL_GEOMS.

        :raises ValueError: carrying the file path, when the geohash cannot be computed
        """
        for f in self.files:
            inD = gpd.read_file(f)
            # epsg=4326 replaces the deprecated {'init': 'epsg:4326'} form.
            inD = inD.to_crs(epsg=4326)
            if 'geohash' not in inD.columns:
                try:
                    # geohash.encode is called as (y, x) of each feature's centroid.
                    inD['geohash'] = inD['geometry'].apply(lambda x: geohash.encode(x.centroid.y, x.centroid.x))
                except Exception as err:
                    # Keep the offending path in the message, chained to the original cause.
                    raise ValueError(f) from err
            inD = inD.drop(columns=[col for col in self.bad_cols if col in inD.columns])
            inD.to_file(os.path.join(self.final_folder, os.path.basename(f).replace(".shp",".geojson")), driver="GeoJSON")

    def zip_folder(self, folder, out_file):
        """Write every file found below ``folder`` into the zip archive ``out_file``."""
        with zipfile.ZipFile(out_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk(folder):
                for file in files:
                    zipf.write(os.path.join(root, file))

    def check_for_zip(self):
        """Return (and store in ``self.ready``) whether every file in FINAL_GEOMS has a
        ``*_zonal_BASE.csv`` counterpart in FINAL_STATS."""
        stats_files = set(os.listdir(self.final_stats_folder))
        # Only the BASE results are required; the DHS results are not checked.
        self.ready = all(
            g_file.replace(".geojson", "_zonal_BASE.csv") in stats_files
            for g_file in os.listdir(self.final_folder)
        )
        return(self.ready)

def process_all(iso3, files, out_z_folder):
    """Build the delivery archives for one country.

    Writes ``<iso3>_tiffs.zip``, ``<iso3>_GEOMETRY.zip`` and, once every geometry has
    a BASE zonal result, ``<iso3>_STATS.zip`` into ``out_z_folder``. Archives that
    already exist are not rebuilt.

    :param iso3: ISO3 code; also the name of the country folder inside ``covid_folder``
    :param files: paths of the country's geometry files
    :param out_z_folder: folder receiving all three zip archives
    :return: the ``covid_result`` helper; its ``ready`` flag is False when the stats
        were not complete enough to be zipped
    """
    print(iso3)
    final_geometry_zip = os.path.join(out_z_folder, "%s_GEOMETRY.zip" % iso3)
    final_stats_zip = os.path.join(out_z_folder, "%s_STATS.zip" % iso3)
    # Fixed: this path used the notebook-global `out_zip_folder` instead of the
    # `out_z_folder` argument, so the tiff archive ignored the requested folder.
    final_tiff_zip = os.path.join(out_z_folder, "%s_tiffs.zip" % iso3)
    xx = covid_result(iso3, files, os.path.join(covid_folder, iso3), column_defs)
    if not os.path.exists(final_tiff_zip):
        xx.write_tiff_zip(final_tiff_zip)
    if not os.path.exists(final_geometry_zip) or not os.path.exists(final_stats_zip):
        if not xx.check_for_fishnets():
            print("Creating fishnet for %s" % iso3)
            # Prefer the high-density urban extents and fall back to the plain urban extents.
            extent_file = os.path.join(covid_folder, iso3, "urban_areas_hd.shp")
            prefix = "HD_URBAN"
            out_folder = os.path.join(covid_folder, iso3, "hd_urban_fishnets")
            if not os.path.exists(extent_file):
                extent_file = os.path.join(covid_folder, iso3, "urban_areas.shp")
                prefix = "URBAN"
                out_folder = os.path.join(covid_folder, iso3, "urban_fishnets")
            if os.path.exists(extent_file):
                os.makedirs(out_folder, exist_ok=True)
                cov.create_fishnet(extent_file, out_folder, prefix)
            else:
                # Neither extent file exists; processing continues without fishnets.
                print("ERROR with %s" % iso3)
        xx.write_geom_data()
        xx.zip_folder(xx.final_folder, final_geometry_zip)
        xx.check_zonal()
        if xx.check_for_zip():
            xx.zip_folder(xx.final_stats_folder, final_stats_zip)
        else:
            print("%s is not ready" % iso3)
    return(xx)

In [6]:
# Per country folder: full paths of all shapefiles; across folders: paths of files starting with "ADM".
out_zip_folder = "/home/wb411133/data/Projects/CoVID_FINAL"
covid_folder = "/home/wb411133/data/Projects/CoVID"
outputs, bad_files = {}, []
for country_dir in all_folders:
    shp_paths = []
    for walk_root, _subdirs, names in os.walk(country_dir):
        for name in names:
            full_path = os.path.join(walk_root, name)
            if name.endswith(".shp"):
                shp_paths.append(full_path)
            if name.startswith("ADM"):
                bad_files.append(full_path)
    outputs[os.path.basename(country_dir)] = shp_paths

In [8]:
# Trial run for a single country before looping over all of them.
iso3 = 'EGY'
files = outputs[iso3]
res = process_all(iso3, files, out_zip_folder)

EGY


In [9]:
# Package every country and keep the results whose `ready` flag ended up False.
all_bad = []
for iso3, files in outputs.items():
    res = process_all(iso3, files, out_zip_folder)
    if not res.ready:
        all_bad.append(res)

VNM
ARG
PAK
ZAF
COL
ZWE
MNG
SLE
CPV
KEN
GHA
AFG
YEM
ECU
PRY
MRT
MDV
Creating fishnet for MDV
ERROR with MDV
KGZ
HTI
DJI
KHM
TJK
GMB
LKA
SEN
STP
SLV
VEN
MLI
RWA
BOL
TZA
MAR
IND
IDN
SDN
AGO
BEN
BWA
BFA
BDI
CMR
CAF
TCD
COM
COG
CIV
COD
SSD
ERI
ETH
GAB
GNB
GIN
LSO
LBR
MDG
MWI
MUS
MOZ
NAM
NER
NGA
SYC
SOM
SWZ
TGO
UGA
ZMB
LCA
PHL
GTM
BGD
BRA
MEX
EGY
UKR
PER
LAO
PSE
NPL
PNG
Creating fishnet for PNG
ERROR with PNG
DZA
BLR
BTN
Creating fishnet for BTN
ERROR with BTN
BIH
NIC
FJI
GEO
HND
JOR
MHL
Creating fishnet for MHL
ERROR with MHL
MDA
MMR
MKD
PAN
WSM
Creating fishnet for WSM
ERROR with WSM
SLB
TUN
TUR
URY
UZB
ALB
HRV
IRN
SRB
TTO
ATG
Creating fishnet for ATG
ERROR with ATG
CHN
IRQ


In [None]:
# List the quoted ISO3 code of every country collected in `all_bad`.
for bad_result in all_bad:
    print("'%s'" % bad_result.iso3)

In [None]:
# NOTE: `uploaded` is built in the next cell; that cell has to be run before this one.
uploaded[0]

In [None]:
# Print an `aws s3 cp` command for every archive in `out_zip_folder` that is not yet
# named in the AWS listing file.
in_aws = "/home/wb411133/data/Projects/CoVID_FINAL/AWS_FILES.txt"
with open(in_aws, 'r') as in_file:
    # The last space-separated token of each "zip" line is taken as the uploaded file name.
    uploaded = [line.split(" ")[-1].replace("\n","") for line in in_file if "zip" in line]

pending = [name for name in os.listdir(out_zip_folder) if name not in uploaded and "zip" in name]
for f in pending:
    print(f"aws s3 cp {f} s3://covid-wb/data/{f} --profile covid")
    

In [None]:
# Manual single-country run. It repeats most of process_all() inline (without the tiff
# archive) and uses a different readiness rule: at least two zonal CSVs per input file.
iso3 = 'IDN'
files = outputs[iso3]

final_geometry_zip = os.path.join(out_zip_folder, "%s_GEOMETRY.zip" % iso3)
final_stats_zip = os.path.join(out_zip_folder, "%s_STATS.zip" % iso3)
if not os.path.exists(final_geometry_zip) or not os.path.exists(final_stats_zip):
    xx = covid_result(iso3, files, os.path.join(covid_folder, iso3), column_defs)
    fishnets_exist = xx.check_for_fishnets()
    if not fishnets_exist:
        print("Creating fishnet for %s" % iso3)
        # Prefer the high-density urban extents and fall back to the plain urban extents.
        # `fishnet_folder` (formerly `out_folder`) no longer overwrites the notebook-global
        # `out_folder` that the hotspot rename cell reads.
        extent_file = os.path.join(covid_folder, iso3, "urban_areas_hd.shp")
        prefix = "HD_URBAN"
        fishnet_folder = os.path.join(covid_folder, iso3, "hd_urban_fishnets")
        if not os.path.exists(extent_file):
            extent_file = os.path.join(covid_folder, iso3, "urban_areas.shp")
            prefix = "URBAN"
            fishnet_folder = os.path.join(covid_folder, iso3, "urban_fishnets")
        if os.path.exists(extent_file):
            os.makedirs(fishnet_folder, exist_ok=True)
            cov.create_fishnet(extent_file, fishnet_folder, prefix)
        else:
            print("ERROR with %s" % iso3)
    xx.write_geom_data()
    res = xx.check_zonal()
    # An empty file list is reported as "not ready" instead of raising ZeroDivisionError.
    if xx.files and len(res)/len(xx.files) >= 2:
        xx.zip_folder(xx.final_folder, final_geometry_zip)
        xx.zip_folder(xx.final_stats_folder, final_stats_zip)
    else:
        print("%s is not ready" % iso3)

In [None]:
# Number of zonal CSVs returned by check_zonal() in the manual run above.
len(res)

In [None]:
# Number of input geometry files; the manual run above compares len(res) against this.
len(xx.files)

# Update HNP JSON

In [None]:
# Reload the DHS column definitions and rebuild the raw-name -> tight-name mapping
# (same construction as in the "Prepare data for DevSeed" section).
dhs_definitions = '/home/public/Data/PROJECTS/CoVID/DHS/column_definitions.csv'
dhs = pd.read_csv(dhs_definitions)
column_defs = {}
for raw_name, tight_name in zip(dhs['Column_name'], dhs['Tight_Name']):
    column_defs.update({
        '%s_SUM' % raw_name: "%s_SUM" % tight_name,
        '%s_MEAN' % raw_name: "%s_MEAN" % tight_name,
    })


In [None]:
# Load the risk schema and show its current HNP indicator entries.
with open("RiskSchema.json") as schema_file:
    inJ = json.load(schema_file)

inJ['hnp_indicators']

In [None]:
# Preview the DHS column-definition table loaded above.
dhs.head()

In [None]:
# Add (or overwrite) one HNP indicator entry per DHS definition, keyed by its tight name.
dhs_description = 'DHS variables are collected at their delivered administrative divisions and multiplied through the WorldPop gridded population data to get pixel level DHS numbers, which are then aggregated through zonal stats'
for definition, tight_name in zip(dhs['Definition'], dhs['Tight_Name']):
    # Every entry gets its own list objects; only the definition text differs.
    inJ['hnp_indicators'][tight_name] = {
        'Name': '%s' % definition,
        'file_link': 'csv',
        'source': ['DHS'],
        'scale': ['fishnet', 'urban_hd', 'urban', 'adm2', 'adm1', 'adm0'],
        'cite': ['_FORMAL_CITATION_HARVARD_'],
        'vars': ['_PROXY_VARIABLE_'],
        'access': 'Public',
        'description': dhs_description,
    }

In [None]:
# Inspect the indicator entries after the DHS definitions were added.
inJ['hnp_indicators']

In [None]:
# Overwrite RiskSchema.json with the updated schema.
schema_text = json.dumps(inJ)
with open("RiskSchema.json", 'w') as outJ:
    outJ.write(schema_text)