In [1]:
import sys, os, importlib
import geohash

import pandas as pd
import geopandas as gpd
import numpy as np

sys.path.append('../../')

#

# Check on processing status

In [4]:
# Split the direct children of the CoVID project folder into sub-folders
# and .zip archives (anything else is ignored).
covid_folder = "/home/wb411133/data/Projects/CoVID"
all_folders = []
all_zips = []
for entry_name in os.listdir(covid_folder):
    entry_path = os.path.join(covid_folder, entry_name)
    if os.path.isdir(entry_path):
        all_folders.append(entry_path)
        continue
    if entry_name.endswith(".zip"):
        all_zips.append(entry_path)

In [5]:
# For every country folder, collect the shapefiles found anywhere beneath it
# (keyed by the folder's base name) and flag files whose name starts with "ADM".
outputs = {}
bad_files = []
for folder in all_folders:
    shps = []
    for root, dirs, files in os.walk(folder):
        for f in files:
            # os.walk yields bare file names; join with the directory so the
            # stored path can actually be opened later.
            full_path = os.path.join(root, f)
            if f.endswith(".shp"):
                shps.append(full_path)
            if f.startswith("ADM"):
                bad_files.append(full_path)
    outputs[os.path.basename(folder)] = shps

In [6]:
# Show the files whose names start with "ADM" that the scan above collected.
bad_files

[]

# Prepare data for DevSeed

The delivered data must follow a strict structure:

1. All files are delivered as GeoJSON
2. Every file has a geohash column
3. Preliminary zonal results are delivered as well

In [14]:
import zipfile
from infrasap import covid_data_extraction as cov
importlib.reload(cov)
    
# Build a rename map for the DHS zonal columns: "<Column_name>_SUM" and
# "<Column_name>_MEAN" are mapped to labels built from the matching Definition.
dhs_definitions = '/home/public/Data/PROJECTS/CoVID/DHS/column_definitions.csv'
dhs = pd.read_csv(dhs_definitions)
column_defs = {}
for _, definition_row in dhs.iterrows():
    column_name = definition_row['Column_name']
    label = definition_row['Definition']
    column_defs.update({
        '%s_SUM' % column_name: 'Total %s' % label,
        '%s_MEAN' % column_name: 'Mean density %s' % label,
    })

class covid_result(object):
    """Collect one country's geometry files and zonal statistics for delivery.

    Geometry files are re-written as GeoJSON into ``<cur_folder>/FINAL_GEOMS``
    and zonal CSV results are copied (with a geometry key column added) into
    ``<cur_folder>/FINAL_STATS``. Both folders are created on construction.
    """

    def __init__(self, country, files, cur_folder, dhs_column_defs):
        """
        :param country: country identifier, stored as ``self.iso3``
        :param files: list of geometry file paths to process for this country
        :param cur_folder: the country's working folder; searched for zonal CSVs
        :param dhs_column_defs: dict used to rename columns of ``_DHS`` CSV files
        """
        self.dhs_column_defs = dhs_column_defs
        self.iso3 = country
        self.files = files
        self.cur_folder = cur_folder
        # Column names applied to the trailing columns of "BASE" zonal CSVs
        self.LC_cols = ['LC_%s' % x for x in [11,14,20,30,40,50,60,70,90,100,110,120,130,140,150,160,170,180,190,200,210,220,230]]
        self.final_folder = os.path.join(cur_folder, "FINAL_GEOMS")
        self.final_stats_folder = os.path.join(cur_folder, "FINAL_STATS")
        # Columns removed from the geometry files before they are written out
        self.bad_cols = ['R10_SUM',"P1_SUM","P2_SUM","Shape_Leng","Shape_Area"]
        self.ready = True

        if not os.path.exists(self.final_folder):
            os.makedirs(self.final_folder)
        if not os.path.exists(self.final_stats_folder):
            os.makedirs(self.final_stats_folder)

    def check_zonal(self):
        """Copy zonal CSV results into FINAL_STATS, adding a ``geom_key`` column.

        Every ``.csv`` under ``cur_folder`` (skipping paths that contain
        "FINAL_STATS") is matched to a GeoJSON in ``final_folder`` by stripping
        ``_zonal``, ``_BASE`` and ``_DHS`` from its name. CSVs without a matching
        GeoJSON are left untouched.

        :return: list of all CSV paths found (also stored as ``self.csv_files``)
        """
        csv_files = []
        for root, dirs, files in os.walk(self.cur_folder):
            for f in files:
                if f.endswith(".csv") and "FINAL_STATS" not in root:
                    csv_files.append(os.path.join(root, f))
        self.csv_files = csv_files

        for csv_file in csv_files:
            cur_res = pd.read_csv(csv_file)
            csv_name = os.path.basename(csv_file)
            geom_file = csv_name.replace(".csv","").replace("_zonal","").replace("_BASE","").replace("_DHS","") + ".geojson"
            geom_file = os.path.join(self.final_folder, geom_file)
            if os.path.exists(geom_file):
                raw_data = gpd.read_file(geom_file)
                # Files whose name starts with "adm" are keyed on the WB admin
                # code column for their level; everything else on the geohash.
                pk_column = "geohash"
                if csv_name[:3] == "adm":
                    pk_column = f"WB_ADM{csv_name[3:4]}_CO"
                if "BASE" in csv_file:
                    # Rename the trailing columns to the LC_* names
                    orig_columns = list(cur_res.columns)
                    orig_columns[-len(self.LC_cols):] = self.LC_cols
                    cur_res.columns = orig_columns
                if "_DHS" in csv_file:
                    cur_res.rename(columns = self.dhs_column_defs, inplace=True)
                # NOTE(review): assigned by index alignment, so this assumes the CSV
                # rows are in the same order as the geometry rows - confirm.
                cur_res['geom_key'] = raw_data[pk_column]
                cur_res.to_csv(os.path.join(self.final_stats_folder, csv_name))
        return self.csv_files

    def check_for_fishnets(self):
        """Return (and store in ``self.fishnets``) whether any input path contains "URBAN_"."""
        self.fishnets = any("URBAN_" in f for f in self.files)
        return self.fishnets

    def write_geom_data(self):
        """Write every input geometry file to ``final_folder`` as GeoJSON.

        Each file is reprojected to EPSG:4326, given a ``geohash`` column (from the
        geometry centroid) when it lacks one, and stripped of ``bad_cols``.

        :raises ValueError: with the file path as its message when the geohash
            cannot be computed; the underlying error is chained as the cause.
        """
        for f in self.files:
            inD = gpd.read_file(f)
            # epsg= replaces the deprecated {'init': 'epsg:4326'} CRS syntax
            inD = inD.to_crs(epsg=4326)
            if 'geohash' not in inD.columns:
                try:
                    inD['geohash'] = inD['geometry'].apply(lambda x: geohash.encode(x.centroid.y, x.centroid.x))
                except Exception as err:
                    # Only real errors are wrapped; KeyboardInterrupt/SystemExit
                    # propagate unchanged instead of being turned into ValueError.
                    raise ValueError(f) from err
            present_bad_cols = [col for col in self.bad_cols if col in inD.columns]
            if present_bad_cols:
                inD = inD.drop(present_bad_cols, axis=1)
            inD.to_file(os.path.join(self.final_folder, os.path.basename(f).replace(".shp",".geojson")), driver="GeoJSON")

    def zip_folder(self, folder, out_file):
        """Write every file under ``folder`` (recursively) into the zip ``out_file``.

        The archive is opened in a ``with`` block so it is always finalised and
        closed, even if adding a file fails part-way through.
        """
        with zipfile.ZipFile(out_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk(folder):
                for file in files:
                    # NOTE(review): no arcname is given, so entries keep the full
                    # on-disk path (minus the leading separator) inside the archive.
                    zipf.write(os.path.join(root, file))

    def check_for_zip(self):
        """Check that every file in ``final_folder`` has both zonal CSVs in FINAL_STATS.

        For ``<name>.geojson`` the expected files are ``<name>_zonal_BASE.csv`` and
        ``<name>_zonal_DHS.csv``. The result is stored in ``self.ready``.

        :return: True when nothing is missing, otherwise False
        """
        stats_files = os.listdir(self.final_stats_folder)
        self.ready = True
        for g_file in os.listdir(self.final_folder):
            stats_file_base = g_file.replace(".geojson", "_zonal_BASE.csv")
            stats_file_dhs = g_file.replace(".geojson", "_zonal_DHS.csv")
            if stats_file_base not in stats_files or stats_file_dhs not in stats_files:
                self.ready = False
        return self.ready

In [15]:
# Spot-check one country: copy its zonal results into FINAL_STATS, then
# report whether every geometry file has both of its zonal CSVs.
iso3 = 'VNM'
files = outputs[iso3]
country_folder = os.path.join(covid_folder, iso3)
xx = covid_result(iso3, files, country_folder, column_defs)
xx.check_zonal()
xx.check_for_zip()

True

In [22]:
# Re-scan every country folder, this time recording the full path of each
# shapefile, and collect the full paths of any files named "ADM...".
out_zip_folder = "/home/wb411133/data/Projects/CoVID_FINAL"
covid_folder = "/home/wb411133/data/Projects/CoVID"
outputs = {}
bad_files = []
for folder in all_folders:
    shapefile_paths = []
    for dirpath, _subdirs, filenames in os.walk(folder):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            if filename.endswith(".shp"):
                shapefile_paths.append(full_path)
            if filename.startswith("ADM"):
                bad_files.append(full_path)
    outputs[os.path.basename(folder)] = shapefile_paths

In [24]:
def process_all(iso3, files, out_z_folder):
    """Build the geometry and stats zip deliverables for one country.

    Does nothing beyond constructing the covid_result when both zips already
    exist in ``out_z_folder``. Otherwise creates urban fishnets if none of the
    input files is one, writes the GeoJSON geometries, zips them, copies the
    zonal results and zips those too once every geometry has its statistics.

    :param iso3: country code; also the sub-folder name under ``covid_folder``
    :param files: geometry file paths for the country
    :param out_z_folder: folder receiving ``<iso3>_GEOMETRY.zip`` / ``<iso3>_STATS.zip``
    :return: the covid_result instance (its ``ready`` flag tells whether stats were complete)
    """
    print(iso3)
    country_dir = os.path.join(covid_folder, iso3)
    geometry_zip = os.path.join(out_z_folder, "%s_GEOMETRY.zip" % iso3)
    stats_zip = os.path.join(out_z_folder, "%s_STATS.zip" % iso3)
    xx = covid_result(iso3, files, country_dir, column_defs)

    # Both deliverables already exist: nothing to rebuild.
    if os.path.exists(geometry_zip) and os.path.exists(stats_zip):
        return xx

    if not xx.check_for_fishnets():
        print("Creating fishnet for %s" % iso3)
        # Extent candidates in order of preference: (shapefile, prefix, output sub-folder)
        candidates = [
            ("urban_areas_hd.shp", "HD_URBAN", "hd_urban_fishnets"),
            ("urban_areas.shp", "URBAN", "urban_fishnets"),
        ]
        for shp_name, prefix, subdir in candidates:
            extent_file = os.path.join(country_dir, shp_name)
            if os.path.exists(extent_file):
                out_folder = os.path.join(country_dir, subdir)
                if not os.path.exists(out_folder):
                    os.makedirs(out_folder)
                cov.create_fishnet(extent_file, out_folder, prefix)
                break
        else:
            # Neither extent shapefile exists for this country
            print("ERROR with %s" % iso3)

    xx.write_geom_data()
    xx.zip_folder(xx.final_folder, geometry_zip)
    xx.check_zonal()
    if xx.check_for_zip():
        xx.zip_folder(xx.final_stats_folder, stats_zip)
    else:
        print("%s is not ready" % iso3)
    return xx
    
# Process every country and keep the results whose statistics were incomplete.
all_bad = []
for iso3, files in outputs.items():
    res = process_all(iso3, files, out_zip_folder)
    if res.ready:
        continue
    all_bad.append(res)




VNM
ARG
PAK
ZAF
COL
ZWE
MNG
SLE
CPV
KEN
GHA
AFG
YEM
ECU
PRY
MRT
MDV
KGZ
HTI
DJI
KHM
TJK
GMB
LKA
SEN
STP
SLV
VEN
MLI
RWA
BOL
TZA
MAR
IND
IDN
SDN
AGO
BEN
BWA
BWA is not ready
BFA
BDI
CMR
CAF
TCD
COM
COG
CIV
COD
SSD
ERI
ETH
GAB
GNB
GIN
LSO
LBR
MDG
MWI
MUS
MOZ
NAM
NER
NGA
SYC
SOM
SWZ
TGO
UGA
ZMB
LCA
PHL
GTM
BGD
BRA
BRA is not ready
MEX
MEX is not ready
EGY
EGY is not ready
UKR
UKR is not ready
PER
PER is not ready
LAO
PSE
NPL
PNG
DZA
BLR
BTN
BIH
NIC
FJI
FJI is not ready
GEO
HND
JOR
MHL
MDA
MMR
MKD
PAN
WSM
SLB
TUN
TUR
URY
UZB
ALB
HRV
IRN
SRB
TTO
ATG
CHN
IRQ


In [25]:
# List the country codes that were not ready, quoted one per line.
for not_ready in all_bad:
    print("'%s'" % not_ready.iso3)

'BWA'
'BRA'
'MEX'
'EGY'
'UKR'
'PER'
'FJI'


In [None]:
# Print an `aws s3 cp` command for every geometry zip in the output folder.
geometry_zips = [name for name in os.listdir(out_zip_folder) if "GEOMETRY" in name]
for f in geometry_zips:
    print(f"aws s3 cp {f} s3://covid-wb/data/{f} --profile covid")
    

In [None]:
# Process a single country by hand (same steps as process_all, but both zips
# are only written once there are at least two zonal CSVs per geometry file).
iso3 = 'IDN'
files = outputs[iso3]

final_geometry_zip = os.path.join(out_zip_folder, "%s_GEOMETRY.zip" % iso3)
final_stats_zip = os.path.join(out_zip_folder, "%s_STATS.zip" % iso3)
if not os.path.exists(final_geometry_zip) or not os.path.exists(final_stats_zip):
    xx = covid_result(iso3, files, os.path.join(covid_folder, iso3), column_defs)
    fishnets_exist = xx.check_for_fishnets()
    if not fishnets_exist:
        print("Creating fishnet for %s" % iso3)
        # Prefer the "hd" urban extent; fall back to the regular urban extent
        extent_file = os.path.join(covid_folder, iso3, "urban_areas_hd.shp")
        prefix = "HD_URBAN"
        out_folder = os.path.join(covid_folder, iso3, "hd_urban_fishnets")
        if not os.path.exists(extent_file):
            extent_file = os.path.join(covid_folder, iso3, "urban_areas.shp")
            prefix = "URBAN"
            out_folder = os.path.join(covid_folder, iso3, "urban_fishnets")
        if os.path.exists(extent_file):
            if not os.path.exists(out_folder):
                os.makedirs(out_folder)
            cov.create_fishnet(extent_file, out_folder, prefix)
        else:
            print("ERROR with %s" % iso3)
    xx.write_geom_data()
    res = xx.check_zonal()
    # A country with no input files has nothing to deliver; checking xx.files
    # first avoids a ZeroDivisionError and reports it as not ready instead.
    if xx.files and len(res)/len(xx.files) >= 2:
        xx.zip_folder(xx.final_folder, final_geometry_zip)
        xx.zip_folder(xx.final_stats_folder, final_stats_zip)
    else:
        print("%s is not ready" % iso3)

In [None]:
# Number of items in `res`; the preceding cell assigns it from xx.check_zonal()
# only when its processing branch runs.
len(res)

In [None]:
# Number of input geometry files held by the covid_result instance `xx`.
len(xx.files)