In this notebook, we import an adminboundary file, and use it to summarize the .tif files we generated in the last step. These will give us our summary statistics by region

In [49]:
import pandas as pd
import os, sys
sys.path.append(r'C:\Users\charl\Documents\GitHub\GOST_PublicGoods\GOSTNets\GOSTNets')
sys.path.append(r'C:\Users\charl\Documents\GitHub\GOST')
import GOSTnet as gn
import importlib
import geopandas as gpd
import rasterio as rt
from rasterio import features
from shapely.wkt import loads
import numpy as np
import networkx as nx
from shapely.geometry import box, Point, Polygon
import glob

Generate a sub list of files that have the extension '.tif'

In [50]:
tif_file_list = []
for f in glob.glob(data_path+r'\**'):
    if f[-4:] == '.tif':
        tif_file_list.append(f)

Define zonalStats function - clone for GOSTrocks

In [51]:
def zonalStats(inShp, 
               inRaster, 
               bandNum=1, 
               mask_A = None, 
               reProj = False, 
               minVal = '', 
               maxVal = '', 
               verbose=False , 
               rastType='N', 
               unqVals=[]):
    import sys, os, inspect, logging, json
    import rasterio, affine

    import pandas as pd
    import geopandas as gpd
    import numpy as np

    from collections import Counter
    from shapely.geometry import box
    from affine import Affine
    from rasterio import features
    from rasterio.mask import mask
    from rasterio.features import rasterize
    from rasterio.warp import reproject, Resampling
    from osgeo import gdal
    
    ''' Run zonal statistics against an input shapefile
    
    INPUT VARIABLES
    inShp [string or geopandas object] - path to input shapefile
    inRaster [string or rasterio object] - path to input raster
    
    OPTIONAL
    bandNum [integer] - band in raster to analyze
    reProj [boolean] -  whether to reproject data to match, if not, raise an error
    minVal [number] - if defined, will only calculation statistics on values above this number
    verbose [boolean] - whether to be loud with responses
    rastType [string N or C] - N is numeric and C is categorical. Categorical returns counts of numbers
    unqVals [array of numbers] - used in categorical zonal statistics, tabulates all these numbers, will report 0 counts
    mask_A [numpy boolean mask] - mask the desired band using an identical shape boolean mask. Useful for doing conditional zonal stats
    
    RETURNS
    array of arrays, one for each feature in inShp
    '''   
    if isinstance(inShp, str):
        inVector = gpd.read_file(inShp) 
    else:
        inVector = inShp
    if isinstance(inRaster, str):
        curRaster = rasterio.open(inRaster, 'r+')
    else:
        curRaster = inRaster
        
    # If mask is not none, apply mask 
    if mask_A is not None:
        
        curRaster.write_mask(np.invert(mask_A))
    
    outputData=[]
    if inVector.crs != curRaster.crs:
        if reProj:
            inVector = inVector.to_crs(curRaster.crs)
        else:
            raise ValueError("Input CRS do not match")
    fCount = 0
    tCount = len(inVector['geometry'])
    #generate bounding box geometry for raster bbox
    b = curRaster.bounds
    rBox = box(b[0], b[1], b[2], b[3])
    for geometry in inVector['geometry']:
        #This test is used in case the geometry extends beyond the edge of the raster
        #   I think it is computationally heavy, but I don't know of an easier way to do it
        if not rBox.contains(geometry):
            geometry = geometry.intersection(rBox)            
        try:
            fCount = fCount + 1
            if fCount % 1000 == 0 and verbose:
                tPrint("Processing %s of %s" % (fCount, tCount) )
            # get pixel coordinates of the geometry's bounding box
            ul = curRaster.index(*geometry.bounds[0:2])
            lr = curRaster.index(*geometry.bounds[2:4])
            '''
            TODO: There is a problem with the indexing - if the shape falls outside the boundaries, it errors
                I want to change it to just grab what it can find, but my brain is wrecked and I cannot figure it out
            print(geometry.bounds)
            print(curRaster.shape)
            print(lr)
            print(ul)
            lr = (max(lr[0], 0), min(lr[1], curRaster.shape[1]))
            ul = (min(ul[0], curRaster.shape[0]), min(ul[1]))
            '''
            # read the subset of the data into a numpy array
            window = ((float(lr[0]), float(ul[0]+1)), (float(ul[1]), float(lr[1]+1)))
            
            if mask is not None:
                data = curRaster.read(bandNum, window=window, masked = True)
            else:
                data = curRaster.read(bandNum, window=window, masked = False)
            
            # create an affine transform for the subset data
            t = curRaster.transform
            shifted_affine = Affine(t.a, t.b, t.c+ul[1]*t.a, t.d, t.e, t.f+lr[0]*t.e)

            # rasterize the geometry
            mask = rasterize(
                [(geometry, 0)],
                out_shape=data.shape,
                transform=shifted_affine,
                fill=1,
                all_touched=False,
                dtype=np.uint8)

            # create a masked numpy array
            masked_data = np.ma.array(data=data, mask=mask.astype(bool))
            if rastType == 'N':                
                if minVal != '' or maxVal != '':
                    if minVal != '':
                        masked_data = np.ma.masked_where(masked_data < minVal, masked_data)
                    if maxVal != '':
                        masked_data = np.ma.masked_where(masked_data > maxVal, masked_data)                    
                    if masked_data.count() > 0:                        
                        results = [masked_data.sum(), masked_data.min(), masked_data.max(), masked_data.mean()]
                    else :
                        results = [-1, -1, -1, -1]                
                else:
                    results = [masked_data.sum(), masked_data.min(), masked_data.max(), masked_data.mean()]
            if rastType == 'C':
                if len(unqVals) > 0:                          
                    xx = dict(Counter(data.flatten()))
                    results = [xx.get(i, 0) for i in unqVals]                
                else:
                    results = np.unique(masked_data, return_counts=True)                    
            outputData.append(results)
        except Exception as e: 
            print(e)
            outputData.append([-1, -1, -1, -1])            
    return outputData   

Generate zonal statistics for each .tif file; send to zonal statistics output location. See in line comments for further detail

In [72]:
zonal_stats = 1
if zonal_stats == 0:
    pass
else:
    for tif_file in tif_file_list:
        
        for resolution in ['national','district']:
            
            # set output location
            out_loc = r'D:\GOST\SierraLeone\Output\Zonal'

            # output raster name
            out_fn = tif_file
            subset = tif_file.split('\\')[-1][:-4]

            # utility file location - where the admin boundary file sits
            utils = r'D:\GOST\SierraLeone\Util_files'

            # import the national boundary file as GeoPandas DF
            national_shp_name = os.path.join(utils, r'SL_bound.shp')
            national_shp = gpd.read_file(national_shp_name)

            # Reproject to WGS 84 if in any other projection
            if national_shp.crs != {'init': 'epsg:4326'}:
                national_shp = national_shp.to_crs({'init': 'epsg:4326'})

            # import the district level shapefile
            district_shp_name = os.path.join(utils, r'Admin2_Polys_SL.shp')
            district_shp = gpd.read_file(district_shp_name)

            # Reproject to WGS 84 if in any other projection
            if district_shp.crs != {'init': 'epsg:4326'}:
                district_shp = district_shp.to_crs({'init': 'epsg:4326'})

            # Here, we load the .tif files generated in Step 6. They all have a very similar layout - 
            # the first band is the population layer, the second band is the travel time to the closest facility
            inraster = out_fn
            ras = rt.open(inraster, mode = 'r+')
            pop = ras.read(1)
            tt_matrix = ras.read(2)

            # this analysis is set to run slightly differently for national / district summary levels. 
            # Choose the shapefile accordingly
            if resolution == 'national':
                target_shp = national_shp
            elif resolution == 'district':
                target_shp = district_shp

            # First, add on the total population of the district to each district shape
            base_pop = zonalStats(target_shp, 
                                    inraster, 
                                    bandNum = 1,
                                    reProj = False, 
                                    minVal = 0,
                                    maxVal = 10000, 
                                    verbose = True, 
                                    rastType='N')

            cols = ['total_pop','min','max','mean']

            temp_df = pd.DataFrame(base_pop, columns = cols)

            target_shp['total_pop'] = temp_df['total_pop']
            target_shp['total_pop'].loc[target_shp['total_pop'] == -1] = 0

            ## Now, calculate the population within a range of time thresholds from the destination set
            for time_thresh in [30,60,120, 240]: # these are the time thresholds

                mask_obj = np.ma.masked_where(tt_matrix > (time_thresh), tt_matrix).mask

                raw = zonalStats(target_shp, 
                                    inraster, 
                                    bandNum = 1,
                                    mask_A = mask_obj,
                                    reProj = False, 
                                    minVal = 0,
                                    maxVal = np.inf, 
                                    verbose = True, 
                                    rastType='N')

                cols = ['pop_%s' % time_thresh,'min','max','mean']

                temp_df = pd.DataFrame(raw, columns = cols)

                target_shp['pop_%s' % time_thresh] = temp_df['pop_%s' % time_thresh]
                target_shp['pop_%s' % time_thresh].loc[target_shp['pop_%s' % time_thresh] == -1] = 0
                target_shp['frac_%s' % time_thresh] = (target_shp['pop_%s' % time_thresh]) / (target_shp['total_pop']).fillna(0)
                target_shp['frac_%s' % time_thresh].replace([np.inf, -np.inf], 0)
                target_shp['frac_%s' % time_thresh] = target_shp['frac_%s' % time_thresh].fillna(0)

            # Save to file
            if resolution == 'national':
                print('saving national')
                outter = target_shp[['total_pop','pop_30','frac_30','pop_60','frac_60','pop_120','frac_120','pop_240','frac_240']]
                outter.to_csv(os.path.join(out_loc,'%s_zonal_%s.csv'% (subset, resolution)))
            else:
                print('saving district')
                target_shp['abs_pop_iso'] = target_shp['total_pop'] - target_shp['pop_30']
                target_shp.to_file(os.path.join(out_loc,'%s_zonal_%s.shp' % 
                                                (subset, resolution)), driver = 'ESRI Shapefile')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


saving national
saving district
saving national
saving district
saving national
saving district
saving national
saving district
saving national
saving district
saving national
saving district
saving national
saving district
saving national
saving district
saving national
saving district
saving national
saving district
saving national
saving district
saving national
saving district
saving national
saving district
saving national
saving district
saving national
saving district
saving national
saving district
saving national
saving district
saving national
saving district
saving national
saving district
saving national
saving district
