# Download and run zonal stats on WOfS water statistics

https://docs.digitalearthafrica.org/en/latest/data_specs/Landsat_WOfS_specs.html

For the WOfS daily data, the data need to be downloaded and then processed.

In [1]:
import sys, os, importlib, json, multiprocessing
import rasterio, boto3

import pandas as pd
import geopandas as gpd
import numpy as np
import GOSTRocks.rasterMisc as rMisc

from tqdm import tqdm
from osgeo import gdal
from shapely.geometry import shape, Polygon, box
from botocore import UNSIGNED
from botocore.config import Config
from GOSTRocks.misc import tPrint

# S3 location of the WOfS products that are downloaded further down
bucket = 'deafrica-services'
prefix = 'wofs_ls' # 'wofs_ls_summary_annual'
region = 'af-south-1'
# Unsigned client: requests are sent without AWS credentials.
# The region comes from `region` so it is only defined in one place.
s3client = boto3.client('s3', region_name=region, config=Config(signature_version=UNSIGNED))

In [2]:
# Project folder and the grid shapefiles stored inside it
in_folder = "/home/wb411133/projects/WOFS_Walker"
grid_folder = os.path.join(in_folder, "GRIDS")
pt_files = [
    os.path.join(grid_folder, shp_name)
    for shp_name in os.listdir(grid_folder)
    if shp_name.endswith(".shp")
]

# WOfS region footprints; region_code is split on "_" into two integers
in_extents = gpd.read_file("wofs_ls_summary_alltime-regions-deafrica-data.geojson")
region_parts = [code.split("_") for code in in_extents['region_code']]
# 181 These additions transform numbers for downloads
in_extents['COL'] = [int(parts[0]) + 84 for parts in region_parts]
in_extents['ROW'] = [int(parts[1]) + 96 for parts in region_parts]
in_extents['COL_ROW'] = [
    f"{col}_{row}" for col, row in zip(in_extents['COL'], in_extents['ROW'])
]
extents_index = in_extents.sindex

# Landsat WRS2 footprints plus a spatial index for fast intersection queries
landsat_extents = gpd.read_file("/home/public/Data/GLOBAL/WRS2_descending.shp")
landsat_index = landsat_extents.sindex


In [3]:
# Folder that receives the zonal-statistics CSV summaries.
# exist_ok avoids the separate check-then-create step.
final_folder = os.path.join(in_folder, "WOFS_Daily_summaries")
os.makedirs(final_folder, exist_ok=True)

In [4]:
# Inventory of WOfS objects; MONTH and DAY are read from the folder levels of
# each S3 key (third-from-last and second-to-last "/"-separated parts).
wofs_tiles = pd.read_csv(os.path.join(in_folder, "WOFS_AWS_SUMMARY.csv"), index_col=0)
split_keys = [key.split("/") for key in wofs_tiles['Key']]
wofs_tiles['MONTH'] = [parts[-3] for parts in split_keys]
wofs_tiles['DAY'] = [parts[-2] for parts in split_keys]
wofs_tiles.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  mask |= (ar1 == a)


Unnamed: 0,Key,LastModified,ETag,Size,StorageClass,COL,ROW,COL_ROW,TYPE,YEAR,MONTH,DAY
0,wofs_ls/1-0-0/148/072/2005/11/21/wofs_ls_14807...,2021-09-01 04:58:11+00:00,"""865518daa97eb5abada45389ca8275b0""",3.961378,STANDARD,148,72,148_072,water,2005,11,21
1,wofs_ls/1-0-0/148/072/2005/12/23/wofs_ls_14807...,2021-09-01 04:06:08+00:00,"""ada110dbc2125e9f887503c1d1eb6099""",3.956423,STANDARD,148,72,148_072,water,2005,12,23
2,wofs_ls/1-0-0/148/072/2006/01/24/wofs_ls_14807...,2021-09-01 04:09:59+00:00,"""8baba74aab96bd79fdf4e1e598f5eba0""",4.412727,STANDARD,148,72,148_072,water,2006,1,24
3,wofs_ls/1-0-0/148/072/2006/02/25/wofs_ls_14807...,2021-09-01 03:58:38+00:00,"""d960813048736658c36542711939e9f7-2""",8.448545,STANDARD,148,72,148_072,water,2006,2,25
4,wofs_ls/1-0-0/148/072/2006/04/30/wofs_ls_14807...,2021-09-01 04:10:04+00:00,"""58b52ddc11ff59c1b56d7d20334d0052""",3.539613,STANDARD,148,72,148_072,water,2006,4,30


# Download and create VRT files for WOFS data

In [5]:
# Year of WOfS imagery that the download and zonal-statistics cells work on
sel_year = 2017

In [6]:
# Download the selected year's WOfS images for every grid file, then build one
# VRT per day folder. Images are stored as <out_folder>/<MONTH>/<DAY>/<file>.
out_folder = os.path.join(in_folder, "IMAGES", str(sel_year))
os.makedirs(out_folder, exist_ok=True)

for pt_file in pt_files:
    # Read in the current points file
    pt_name = os.path.basename(pt_file)[:-4]
    final_year = os.path.join(final_folder, f'{pt_name}_{sel_year}.csv')
    if os.path.exists(final_year):
        # A yearly summary for this grid already exists; skip it
        continue
    curPt = gpd.read_file(pt_file)
    # Find intersecting Landsat tiles
    potential_tiles = landsat_extents.iloc[list(landsat_index.intersection(curPt.total_bounds))].sort_values(['PATH','ROW'])
    tPrint(f"Downloading imagery for {pt_name} in {sel_year}")
    for idx, row in potential_tiles.iterrows():
        # For each Landsat tile, identify the WOFS tiles
        sel_wofs = wofs_tiles.loc[(wofs_tiles['ROW'] == row['ROW']) &
                                  (wofs_tiles['COL'] == row['PATH']) &
                                  (wofs_tiles['TYPE'] == 'water') &
                                  (wofs_tiles['YEAR'] == sel_year)]
        # Download each image in the selected tile, organized by day and month
        # (an empty selection simply yields no iterations)
        for sel_idx, sel_row in sel_wofs.iterrows():
            image_folder = os.path.join(out_folder, sel_row['MONTH'], sel_row['DAY'])
            os.makedirs(image_folder, exist_ok=True)
            out_file = os.path.join(image_folder, os.path.basename(sel_row['Key']))
            # Files that are already on disk are not downloaded again
            if not os.path.exists(out_file):
                s3client.download_file(bucket, sel_row['Key'], out_file)
    tPrint('Download Complete')
    # Create VRT file for each YEAR_MONTH_DAY
    all_dirs = []
    all_vrts = []
    for root, dirs, files in os.walk(out_folder):
        # Directories directly under the year folder are months; only the
        # directories one level further down (the days) hold imagery.
        if os.path.basename(root) != str(sel_year):
            for d in dirs:
                all_dirs.append(os.path.join(root, d))
    # Build the VRTs only after the walk has finished. Doing it inside the walk
    # rebuilt every VRT collected so far on each visited directory.
    # NOTE(review): 'cubic' resampling on what look like categorical WOfS
    # values may be unintended -- confirm before relying on resampled output.
    vrt_options = gdal.BuildVRTOptions(resampleAlg='cubic', addAlpha=True)
    for c_dir in all_dirs:
        out_vrt = os.path.join(os.path.dirname(c_dir), f'WOFS_VRT_{"_".join(c_dir.split("/")[-3:])}.vrt')
        vrt_files = [os.path.join(c_dir, x) for x in os.listdir(c_dir)]
        my_vrt = gdal.BuildVRT(out_vrt, vrt_files, options=vrt_options)
        # Dropping the reference lets GDAL flush and close the VRT file
        my_vrt = None
        all_vrts.append(out_vrt)



10:02:17	Downloading imagery for grid_MLI05_ply in 2017
10:13:02	Download Complete
10:16:29	Downloading imagery for grid_MLI06_ply in 2017
10:23:56	Download Complete
10:29:47	Downloading imagery for grid_MLI07_ply in 2017
10:36:35	Download Complete
10:44:49	Downloading imagery for grid_BFA01_ply in 2017
10:54:00	Download Complete
11:04:47	Downloading imagery for grid_BFA02_ply in 2017
11:09:57	Download Complete
11:22:21	Downloading imagery for grid_MLI01_ply in 2017
11:29:58	Download Complete
11:44:22	Downloading imagery for grid_MLI02_ply in 2017
11:45:49	Download Complete
12:00:41	Downloading imagery for grid_MLI03_ply in 2017
12:00:46	Download Complete
12:16:12	Downloading imagery for grid_MLI04_ply in 2017
12:28:35	Download Complete


# Run grid-based zonal stats

In [7]:
# Raster values passed to the categorical zonal statistics as unqVals
unq_vals = [128, 130, 160]

# Grid shapefiles to summarise
grid_folder = "/home/wb411133/projects/WOFS_Walker/GRIDS"
grids = [
    os.path.join(grid_folder, shp_name)
    for shp_name in os.listdir(grid_folder)
    if shp_name.endswith(".shp")
]

# Every .vrt found anywhere below the selected year's image folder
vrt_folder = f"/home/wb411133/projects/WOFS_Walker/IMAGES/{sel_year}"
vrts = [
    os.path.join(walk_root, file_name)
    for walk_root, _sub_dirs, file_names in os.walk(vrt_folder)
    for file_name in file_names
    if file_name.endswith(".vrt")
]


In [8]:
# Group VRTs into months
# VRT names end in ..._<year>_<month>_<day>.vrt, so the month is the
# second-to-last "_"-separated token of the path.
vrt_months = {}
for vrt in vrts:
    cur_month = vrt.split("_")[-2]
    # setdefault creates the month's list on first use, with no exception handling
    vrt_months.setdefault(cur_month, []).append(vrt)

In [9]:
# Display the month -> list-of-VRT-paths mapping for a visual check
vrt_months

{'01': ['/home/wb411133/projects/WOFS_Walker/IMAGES/2017/01/WOFS_VRT_2017_01_03.vrt',
  '/home/wb411133/projects/WOFS_Walker/IMAGES/2017/01/WOFS_VRT_2017_01_11.vrt',
  '/home/wb411133/projects/WOFS_Walker/IMAGES/2017/01/WOFS_VRT_2017_01_19.vrt',
  '/home/wb411133/projects/WOFS_Walker/IMAGES/2017/01/WOFS_VRT_2017_01_27.vrt',
  '/home/wb411133/projects/WOFS_Walker/IMAGES/2017/01/WOFS_VRT_2017_01_02.vrt',
  '/home/wb411133/projects/WOFS_Walker/IMAGES/2017/01/WOFS_VRT_2017_01_10.vrt',
  '/home/wb411133/projects/WOFS_Walker/IMAGES/2017/01/WOFS_VRT_2017_01_18.vrt',
  '/home/wb411133/projects/WOFS_Walker/IMAGES/2017/01/WOFS_VRT_2017_01_26.vrt',
  '/home/wb411133/projects/WOFS_Walker/IMAGES/2017/01/WOFS_VRT_2017_01_01.vrt',
  '/home/wb411133/projects/WOFS_Walker/IMAGES/2017/01/WOFS_VRT_2017_01_09.vrt',
  '/home/wb411133/projects/WOFS_Walker/IMAGES/2017/01/WOFS_VRT_2017_01_17.vrt',
  '/home/wb411133/projects/WOFS_Walker/IMAGES/2017/01/WOFS_VRT_2017_01_25.vrt',
  '/home/wb411133/projects/WOFS_Wa

### Multi-processing

In [10]:
unq_vals = [128, 130, 160]


def run_zonal(x_cur_grid, x_vrts, x_out_file):
    """Run categorical zonal statistics for one grid file over a list of VRTs.

    For every VRT in ``x_vrts`` the grid features are summarised with
    ``rMisc.zonalStats(..., rastType='C', unqVals=unq_vals)``; the result
    becomes one column per value in ``unq_vals``, named ``<date>_<value>``
    where ``<date>`` is the trailing ``YYYY_MM_DD`` of the VRT file name.
    All columns are joined side by side, the grid's ``GRID_ID`` column is
    added, and the table is written to ``x_out_file`` as CSV.

    Parameters
    ----------
    x_cur_grid : str
        Path of the grid file, read with ``gpd.read_file``.
    x_vrts : iterable of str
        Paths of the VRT rasters to summarise.
    x_out_file : str
        Path of the CSV file to write.

    Returns
    -------
    None
        Nothing is written when ``x_vrts`` is empty.
    """
    curD = gpd.read_file(x_cur_grid)
    out_name = os.path.basename(x_out_file).replace(".csv", "")
    final = None
    # Iterate the VRTs passed in, not the module-level `vrts`, which other
    # cells rebind and which therefore may hold a different month.
    for cur_vrt in x_vrts:
        date = "_".join(cur_vrt.split("_")[-3:]).replace(".vrt", "")
        # The context manager closes the raster once its stats are computed
        with rasterio.open(cur_vrt) as curR:
            # Reproject the grid once the raster CRS is known
            if curD.crs != curR.crs:
                curD = curD.to_crs(curR.crs)
            res = rMisc.zonalStats(curD, curR, rastType='C', unqVals=unq_vals)
        res = pd.DataFrame(res, columns=[f'{date}_{x}' for x in unq_vals])
        # First result starts the table; later ones are joined on the index
        final = res if final is None else final.join(res)
    if final is None:
        tPrint(f'No VRTs supplied for {out_name}; nothing written')
        return
    final['GRID_ID'] = curD['GRID_ID']
    final.to_csv(x_out_file)
    tPrint(f'Completed {out_name}')
    


In [11]:
# Drop any module-level `final` left behind by an earlier cell
try:
    del final
except NameError:
    # Nothing to delete: `final` was never defined in this session
    pass

# One task per (grid, month) whose output CSV does not exist yet
mp_args = []
for cur_grid in grids:
    # The loop variable is not called `vrts`, so the module-level list of
    # all VRTs is left intact for the other cells that read it.
    for month, month_vrts in vrt_months.items():
        out_file = os.path.join(final_folder, os.path.basename(cur_grid).replace(".shp", f"_{month}_{sel_year}.csv"))
        if not os.path.exists(out_file):
            mp_args.append([cur_grid, month_vrts, out_file])

print(len(mp_args))
            

108


In [12]:
# Use at most 60 worker processes, and never more than there are tasks
nCores = min([60, len(mp_args)])
# multiprocessing.Pool(0) raises ValueError, so skip the pool when no task is pending
if nCores > 0:
    with multiprocessing.Pool(nCores) as pool:
        pool.starmap(run_zonal, mp_args)

16:10:22	Completed grid_BFA01_ply_07_2017
16:17:38	Completed grid_BFA01_ply_11_2017
16:19:03	Completed grid_BFA01_ply_08_2017
16:20:34	Completed grid_BFA01_ply_12_2017
16:21:04	Completed grid_BFA01_ply_10_2017
16:21:30	Completed grid_BFA01_ply_09_2017
16:22:16	Completed grid_BFA01_ply_02_2017
16:23:06	Completed grid_BFA01_ply_06_2017
16:23:44	Completed grid_BFA01_ply_04_2017
16:24:01	Completed grid_BFA01_ply_05_2017
16:24:29	Completed grid_BFA01_ply_03_2017
16:25:44	Completed grid_BFA01_ply_01_2017
17:57:03	Completed grid_MLI01_ply_01_2017
18:10:38	Completed grid_MLI01_ply_02_2017
18:11:51	Completed grid_MLI01_ply_04_2017
18:12:55	Completed grid_MLI01_ply_05_2017
18:12:58	Completed grid_MLI01_ply_03_2017
18:13:23	Completed grid_MLI01_ply_06_2017
18:17:04	Completed grid_MLI01_ply_08_2017
18:19:02	Completed grid_MLI01_ply_09_2017
18:19:49	Completed grid_MLI01_ply_07_2017
18:20:08	Completed grid_MLI01_ply_11_2017
18:22:52	Completed grid_MLI01_ply_12_2017
18:24:37	Completed grid_BFA02_ply_

In [None]:
# NOTE(review): presumably a marker cell separating the multiprocessing run
# from the sequential one below; it only prints a fixed string.
print("FUBAR")

### Run individually

In [None]:
# Sequential alternative to the multiprocessing run: one CSV per grid and
# month, written to a sub-folder of final_folder named after the grid.
for cur_grid in grids:
    grid_name = os.path.basename(cur_grid).replace(".shp", "")
    # Own variable name so the module-level `grid_folder` is not overwritten
    grid_out_folder = os.path.join(final_folder, grid_name)
    os.makedirs(grid_out_folder, exist_ok=True)
    # `month_vrts` keeps the module-level `vrts` list intact as well
    for month, month_vrts in vrt_months.items():
        out_file = os.path.join(grid_out_folder, os.path.basename(cur_grid).replace(".shp", f"_{month}.csv"))
        if os.path.exists(out_file):
            # Already processed in an earlier run
            continue
        curD = gpd.read_file(cur_grid)
        pbar = tqdm(month_vrts)
        pbar.set_description(f'Processing {grid_name} {month}')

        # Start every month from an empty result rather than deleting a
        # leftover `final` and relying on a bare except
        final = None
        for cur_vrt in pbar:
            date = "_".join(cur_vrt.split("_")[-3:]).replace(".vrt", "")
            # The context manager closes the raster once its stats are computed
            with rasterio.open(cur_vrt) as curR:
                if curD.crs != curR.crs:
                    curD = curD.to_crs(curR.crs)
                res = rMisc.zonalStats(curD, curR, rastType='C', unqVals=unq_vals)
            res = pd.DataFrame(res, columns=[f'{date}_{x}' for x in unq_vals])
            final = res if final is None else final.join(res)
        if final is None:
            # No VRTs for this month, so there is nothing to write
            continue
        final['GRID_ID'] = curD['GRID_ID']
        final.to_csv(out_file)

# Summarize Aggregates

## Evaluate zonal stats

In [None]:
# IPython help lookup (trailing "?"); this is not valid plain-Python syntax
pd.read_csv?

In [None]:

# loop through all the zonal stats and calculate maximum counts
# get_max is defined in the DEBUGGING section of this notebook and has to be
# run before this cell.
# Only regular .csv files are read: final_folder can also contain the per-grid
# sub-folders created by the "Run individually" cell, which read_csv cannot open.
zonal_files = [
    z_file for z_file in os.listdir(final_folder)
    if z_file.endswith(".csv") and os.path.isfile(os.path.join(final_folder, z_file))
]
for z_file in zonal_files:
    inD = pd.read_csv(os.path.join(final_folder, z_file), index_col=0)
    print(z_file)
    print(inD.apply(get_max, axis=0))
    


In [None]:
# Preview the first rows of the most recently read table
inD.head()

## extract extent of files

In [None]:
# Display the list of grid shapefile paths
pt_files

In [None]:
# get extents of grid_files
# Each entry pairs a grid file name with the union of all its geometries.
# `inD` is kept as a plain loop variable because a later cell reads its CRS.
all_res = []
for g_file in pt_files:
    inD = gpd.read_file(g_file)
    grid_name = os.path.basename(g_file)
    grid_union = inD.unary_union
    all_res.append([grid_name, grid_union])
    

In [None]:
# Bounding box of every VRT, paired with its file name.
# `xx` stays bound to the last opened raster because a later cell reads its CRS.
all_res2 = []
for month_vrt_list in vrt_months.values():
    for vrt in month_vrt_list:
        xx = rasterio.open(vrt)
        vrt_name = os.path.basename(vrt)
        all_res2.append([vrt_name, box(*xx.bounds)])

In [None]:
# Write the grid footprints to GeoJSON.
# The CRS is taken from the last grid file read (inD);
# NOTE(review): this assumes all grid files share one CRS -- confirm.
grid_extent_table = pd.DataFrame(all_res, columns=['File','geometry'])
final_extents = gpd.GeoDataFrame(grid_extent_table, geometry='geometry', crs=inD.crs)
final_extents.to_file("GRIDS_Extents.geojson", driver="GeoJSON")

In [None]:
# Write the VRT bounding boxes to GeoJSON.
# The CRS is taken from the last raster opened (xx);
# NOTE(review): this assumes all VRTs share one CRS -- confirm.
vrt_extent_table = pd.DataFrame(all_res2, columns=['File','geometry'])
final_extents2 = gpd.GeoDataFrame(vrt_extent_table, geometry='geometry', crs=xx.crs)
final_extents2.to_file("VRT_Extents.geojson", driver="GeoJSON")

# DEBUGGING

In [None]:
# Run a single grid/month through run_zonal and load the CSV it writes
debug_csv = '/home/wb411133/projects/WOFS_Walker/FUBAR_01.csv'
run_zonal(grids[0], vrt_months['01'], debug_csv)
xx = pd.read_csv(debug_csv, index_col=0)

def get_max(x):
    """Return ``x.max()``, or ``None`` when the maximum cannot be computed.

    Intended for ``DataFrame.apply`` so that columns whose values cannot be
    compared (for example mixed numbers and text) give ``None`` rather than
    stopping the whole summary.

    Only ordinary errors are suppressed; ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate so a long run can be interrupted.
    """
    try:
        return x.max()
    except Exception:
        return None

# Column-wise maxima of the debug table (None where get_max could not compute one)
list(xx.apply(get_max, axis=0))

In [None]:
# Print, for every VRT, each distinct pixel value next to its count
for vrt in vrts:
    print(os.path.basename(vrt))
    xx = rasterio.open(vrt)
    data = xx.read()
    unique, counts = np.unique(data, return_counts=True)
    # One row per distinct value: [value, count]
    value_count_table = np.column_stack((unique, counts))
    print(value_count_table)