### Mexico Poverty Implementaton
This was the version of the script used to search for GBDX imagery over AOIs for David Newhouse's collaboration with GOST to test the utility of high resolution satellite imagery for poverty prediction.

It makes use of the same logic as the single AOI script, with a customised first section where overlapping AOIs are grouped together with a sequential 'buffer and collapse' process. 

If looking at searching / downloading GBDX imagery for the first time, it is recommended that you start by looking at either of the other scripts (single AOI / Multipoint) in this folder. 

### Library installation and script setup
This box needs to only be run once. It builds the environment to carry out the rest of the analysis.

In [7]:
# Run one time only - install pip and unusual Libraries
import pip
import time
import pandas as pd
import geopandas as gpd
import shapely
from shapely.wkt import loads
from shapely.geometry import MultiPolygon, MultiPoint, Polygon, box
from shapely.ops import cascaded_union, unary_union
from shapely.ops import nearest_points
import time
import json
from gbdxtools import Interface
from gbdxtools.task import env
from gbdxtools import CatalogImage
import sys, os
gbdx = Interface()
%matplotlib inline

### Simplify Complex Clustered Polygon Objects

This section aims to import the AOIs as described by raw shapefiles, and draw sensible bounding boxes around them

In [26]:
shps = []
pth = r'C:\Users\charl\Documents\GOST\LAC floods\RE__Fathom_global_flood_data_use'
for f in ['RC_CABA.shp','RC_Cordoba_Capital.shp','RC_Jujuy_Capital.shp','RC_RegionMetropolitanaBA.shp','RC_Resistencia.shp','RC_SantaFe_Capital.shp']:
    gdf = gpd.read_file(os.path.join(pth,f))
    shp = unary_union(gdf.geometry)
    shps.append(shp)
shape = gpd.GeoDataFrame({'geometry':shps}, geometry = 'geometry', crs = {'init':'epsg:4326'})

In [27]:
### rawAOI = r'agebs_val_muni.shp'
crs = {'init': 'epsg:4326'}
bufw = 0.015

# Define conversion function - objects to list of bounding boxes
def BoundingBoxList(MultiPolygonObj):
    boxlist = []
    for obj in MultiPolygonObj:
        coords = [n for n in obj.bounds]
        bbox = box(coords[0],coords[1],coords[2],coords[3])
        boxlist.append(bbox)
    return boxlist

polygons = MultiPolygon(shape['geometry'].loc[i] for i in shape.index)

exterior = cascaded_union(polygons)
exterior_boxxs = BoundingBoxList(exterior)

# Scientific Buffer Setting based on nearest neighbour median
dff = pd.DataFrame({'exterior': exterior})
dff['ext.centroid'] = dff['exterior'].apply(lambda x: x.centroid)

def func(x):
    m = dff.loc[dff['ext.centroid'] != x]
    l = MultiPoint(m['ext.centroid'].tolist())
    n = nearest_points(x, l)
    return x.distance(n[0])
    i += 1
    
dff['nn_distance'] = dff['ext.centroid'].apply(lambda x: func(x))
bufw = dff['nn_distance'].median()
print 'Buffer width set as %f' % bufw

# Group nearby AOIs
tight_bbox = MultiPolygon(exterior_boxxs)
reduced_boxes = cascaded_union(tight_bbox.buffer(bufw))

rboxxs = BoundingBoxList(reduced_boxes)
final_boxes = cascaded_union(MultiPolygon(rboxxs))
fboxxs = BoundingBoxList(final_boxes.buffer(-bufw))

pd.DataFrame({'AOI_geometry':fboxxs}).to_csv(os.path.join(pth, 'AOI_Collection.csv'))

print 'useful area of tight bbox: %d percent' % (exterior.area / tight_bbox.area * 100)
print 'useful area of reduced bbox: %d percent' % (exterior.area / reduced_boxes.area * 100)
print 'useful area of final bbox: %d percent' % (exterior.area / final_boxes.area * 100)
print 'number of AOIs: %d' % len(fboxxs)

Buffer width set as 0.000000
useful area of tight bbox: 68 percent
useful area of reduced bbox: 68 percent
useful area of final bbox: 68 percent
number of AOIs: 6


### Define the Search Parameters

In [8]:
# Define categorical search parameters
cutoff_cloud_cover = 25   # images with CC over this threshold discarded
cutoff_overlap = 0     # images with AOI overlap below this threshold discarded. [N.b.: keep small if AOI large.]
cutoff_date_low = '1-Jan-13'  # images older than this date discarded
cutoff_date_high = '1-Jan-16' # images newer than this date discarded
cutoff_nadir = 25 # Images at nadir angles greater than threshold discarded
cutoff_pan_res = 1 # Images below this resolution discarded
accepted_bands = ['PAN_MS1','PAN_MS1_MS2'] #  Images with any other band entry discarded


# Define continuous image ranking preferences
optimal_date =  '1-Jul-14' # Optimal date (enter as dd-mmm-yy)
optimal_pan_res = 0.4 # Optimal pan resolution, metres
optimal_nadir = 0 # optimal image angle. 0 = vertical

# Define continuous image ranking preference weights. Must sum to 1.
# If user cares more about scenes being contemporaneous, up 'date' weighting at expense of other categories. 
pref_weights = {
    'cloud_cover': 0.4,
    'overlap':0.25,
    'date': 0.25,
    'nadir': 0.1,
    'resolution': 0.0
    }

### Define Charles Rocks II Process

In [9]:
%matplotlib inline

def Process(AOI, 
            cutoff_cloud_cover, 
            cutoff_overlap, 
            cutoff_date_low,
            cutoff_date_high,
            cutoff_nadir, 
            cutoff_pan_res, 
            accepted_bands, 
            optimal_date, 
            optimal_pan_res, 
            optimal_nadir, 
            pref_weights, 
            AOI_counter
           ):
    
    # Define bbox object
    bbox = [AOI.bounds[i] for i in range(0,len(AOI.bounds))]
    
    # Define search function. Returns up to 1000 images where cloud cover smaller than 25%
    def search_unordered(bbox, _type, count=1000, cloud_cover=25):
        aoi = AOI.wkt
        query = "item_type:{} AND item_type:DigitalGlobeAcquisition".format(_type)
        query += " AND attributes.cloudCover_int:<{}".format(cloud_cover)
        return gbdx.vectors.query(aoi, query, count=count)

    # Run search on Area of Interest (AOI). Passes in AOI in Well Known Text format (wkt)
    records = search_unordered(AOI.wkt, 'DigitalGlobeAcquisition')

    # Create list object of all catalog IDs returned in search
    ids = [r['properties']['attributes']['catalogID'] for r in records]

    # Define Counters
    l = 0    # number of non-IDAHO images
    scenes = [] # list containing metadata dictionaries of all scenes in our AOI 

    # Toggle for printing images to screen
    download_thumbnails = 0

    # Loop catalog IDs
    for i in ids:

        # Fetch metadata dictionary for each catalog ID in ids list
        r = gbdx.catalog.get(i)

        # Check location of ID - is it in IDAHO?
        try:
            location = gbdx.catalog.get_data_location(i)
        except: 
            location == 'not_delivered'
        
        # Defines IDAHO variable as binary 1 / 0 depending on whether it is in IDAHO already or not
        if location == 'not_delivered':
            l = l + 1
            idaho = 0
        else:
            idaho = 1

            # Download image if image in IDAHO and toggle on
            if download_thumbnails == 1:
                image = CatalogImage(i, band_type="MS", bbox=bboxx)
                image.plot(w=10, h=10)
            else:
                pass

        # Calculate the percentage overlap with our AOI for each scene
        # load as a Shapely object the wkt representation of the scene footprint
        footprint = r['properties']['footprintWkt']
        shapely_footprint = shapely.wkt.loads(footprint)

        # Calculate the object that represents the difference between the AOI and the scene footprint 
        AA = AOI.difference(shapely_footprint)

        # Define frac as the fraction, between 0 and 1, of the AOI that the scene covers
        frac = 1 - ((AA).area / AOI.area)

        # Create BB - the proxy for the useful area. IF scene entirely contains AOI, then BB = AOI, else it is the intersection 
        # of the scene footprint and the AOI
        BB = AOI 
        if frac < 1:
            BB = AOI - AA
    
        # Similarly, AA, the difference area between AOI and the scene, can be set to null if the scene contains 100% of the AOI 
        if frac == 1:
            AA = ""

        # Append key metadata to list obejct 'scenes' for the current scene, as a dictionary. This then moves into the pandas dataframe.
        # Several objects here are from DigitalGlobe's metadata dictionary (anything with an r start)
        scenes.append({
            'ID':i, 
            'TimeStamp':r['properties']['timestamp'],
            'CloudCover':r['properties']['cloudCover'],
            'ImageBands':r['properties']['imageBands'],
            'On_IDAHO':idaho,
            'browseURL': r['properties']['browseURL'],
            'Overlap_%': frac * 100,
            'PanResolution': r['properties']['panResolution'],
            'MultiResolution': r['properties']['multiResolution'],
            'OffNadirAngle': r['properties']['offNadirAngle'],
            'Sensor':r['properties']['sensorPlatformName'],
            'Full_scene_WKT':r['properties']['footprintWkt'],
            'missing_area_WKT':AA,
            'useful_area_WKT':BB
            })

    # Define column order for dataframe of search results
    cols = ['ID','Sensor','ImageBands','TimeStamp','CloudCover','Overlap_%','PanResolution','MultiResolution','OffNadirAngle','On_IDAHO','browseURL','Full_scene_WKT','useful_area_WKT','missing_area_WKT']

    #Generate pandas dataframe from results
    out = pd.DataFrame(scenes,columns = cols)
    
    # Convert Timestamp field to pandas DateTime object
    out['TS'] = out['TimeStamp'].apply(lambda x: pd.Timestamp(x))

    # Add separate date and time columns for easy interpretation
    string = out['TimeStamp'].str.split('T')
    out['Date'] = string.str.get(0)
    out['Time'] = string.str.get(1)

    # Categorical Search: remove disqualified images. Copy of dataframe taken, renamed to 'out_1stcut'.
    out_1stcut = out.loc[(out['CloudCover'] <= cutoff_cloud_cover) & 
                         (out['Overlap_%'] >= cutoff_overlap) & 
                         (out['TS'] > pd.Timestamp(cutoff_date_low)) & 
                         (out['TS'] < pd.Timestamp(cutoff_date_high)) & 
                         (out['ImageBands'].isin(accepted_bands)) & 
                         (out['OffNadirAngle'] <= cutoff_nadir) & 
                         (out['PanResolution'] <= cutoff_pan_res)
                        ]

    # Apply ranking method over all non-disqualified search results for each field
    optimal_date = pd.to_datetime(optimal_date, utc = True)

    # each 1% of cloud cover = 1 point
    out_1stcut['points_CC'] = (out_1stcut['CloudCover'])  

    # each 1% of overlap missed = 1 point
    out_1stcut['points_Overlap'] = (100 - out_1stcut['Overlap_%'])  

    # each week away from the optimal date = 1 point 
    out_1stcut['points_Date'] = ((abs(out_1stcut['TS'] - optimal_date)).view('int64') / 60 / 60 / 24 / 1E9) / 7 

    # each degree off nadir = 1 point
    out_1stcut['points_Nadir'] = abs(out_1stcut['OffNadirAngle'] - optimal_nadir) 

    # each cm of resolution worse than the optimal resolution = 1 point
    out_1stcut['points_Res'] = (out_1stcut['PanResolution'] - optimal_pan_res).apply(lambda x: max(x,0)) * 100 

    # Define ranking algorithm - weight point components defined above by the preference weighting dictionary
    def Ranker(out_1stcut, pref_weights):
        a = out_1stcut['points_CC'] * pref_weights['cloud_cover']
        b = out_1stcut['points_Overlap'] * pref_weights['overlap']
        c = out_1stcut['points_Date'] * pref_weights['date'] 
        d = out_1stcut['points_Nadir'] * pref_weights['nadir']
        e = out_1stcut['points_Res'] * pref_weights['resolution']

        # Score is linear addition of the number of 'points' the scene wins as defined above. More points = worse fit to criteria
        rank = a + b + c + d + e
        return rank

    # Add new column - Rank Result - with the total number of points accrued by the scene 
    out_1stcut['RankResult'] = Ranker(out_1stcut, pref_weights)

    # Add a Preference order column - Pref_Order - based on Rank Result, sorted ascending (best scene first)
    out_1stcut = out_1stcut.sort_values(by = 'RankResult', axis = 0, ascending = True)
    out_1stcut = out_1stcut.reset_index()
    out_1stcut['Pref_order'] = out_1stcut.index + 1
    out_1stcut = out_1stcut.drop(['index'], axis = 1)
    
    cols = ['ID','Sensor','ImageBands','Date','Time','CloudCover','Overlap_%','PanResolution','MultiResolution','OffNadirAngle','On_IDAHO','Pref_order','RankResult','points_CC','points_Overlap','points_Date','points_Nadir','points_Res','browseURL','Full_scene_WKT','useful_area_WKT','missing_area_WKT']
    out_1stcut = out_1stcut[cols]
    
    # Create a new copy of the dataframe to work on
    finaldf = out_1stcut
    
    # Add column for used scene region area, expressed as .wkt
    finaldf['used_scene_region_WKT'] = 0
    finaldf['used_area'] = 0
    
    # Set initial value of AOI_remaining to the full AOI under consideration
    AOI_remaining = AOI

    # Create two lists - usedareas for the areas of scenes used in the final product, and AOI_rems to record sequential reduction in 
    # remaining AOI that needs to be filled
    usedareas = []
    AOI_rems = []

    # Set up loop for each image in dataframe of ranked images
    for s in finaldf.index:
        if AOI_remaining.area < (AOI.area / 100):
            pass
        else: 
            # pick up the WKT of the useful area as the useful_scene_region variable
            useful_scene_region = finaldf['useful_area_WKT'].loc[s]

            # Set up try loop - to catch if there is no intersection of AOI_remaining and useful_scene_region
            #try
            # define 'used_scene_region' as the useable bit of the image that overlaps the AOI
            used_scene_region = AOI_remaining.intersection(useful_scene_region)

            # calculate the area of that region
            used_area = used_scene_region.area
            # Check to see if this is a geometry collection. This shapely type if for 'jumbles' of outputs (e.g. Polygons + Lines)
            # This can be created if the intersection process decides that it also wants a 1-pixel strip from the bottom of the image
            # as well as the main chunk. This won't translate back to a shapefile, so we drop non-Polygon objects iteratively. 
            if used_scene_region.type == 'GeometryCollection':
                xlist = []

                # Iterate through all objects in the geometry collection
                for y in used_scene_region.geoms:

                    # Add polygons to a fresh list
                    if y.type == 'Polygon':
                        xlist.append(y)

                # Convert that list to a multipolygon object
                used_scene_region = MultiPolygon(xlist)

            # Append the used bit of the image to the usedareas list. 
            usedareas.append(used_scene_region)
            try: 
                # Add two new columns to the dataframe - the used scene geometry in wkt, and the area of the used scene
                finaldf['used_scene_region_WKT'].loc[s] = used_scene_region
                finaldf['used_area'].loc[s] = used_area
            except: 
                pass
            # Redefine the area of the AOI that needs to be filled by the next, lower-rank image
            AOI_remaining = AOI_remaining.difference(used_scene_region)

            # Add this to the AOI_rems list for troubelshooting and verification
            AOI_rems.append(AOI_remaining)

            print '\t...after image %s, %d percent remaining' % (s+1, (AOI_remaining.area/AOI.area*100))

    # Drop from the scene list any scene where the area used is less than 1% of the AOI
    finaldf = finaldf.loc[finaldf['used_area'] > (AOI.area / 100)]
    # Print summary statistics to consol
    print 'AOI %s Complete. Proportion of AOI covered: %d percent' % (AOI_counter, (finaldf['used_area'].sum() / AOI.area * 100))
    #Add counter
    finaldf['AOI_counter'] = AOI_counter
    return finaldf

### Run Process


In [None]:
AOI_counter = 1
list_of_dfs = []
print 'Beginning image identification process. Standby.'
for AOI in fboxxs:
    time.sleep(1)
    output = Process(
            AOI, 
            cutoff_cloud_cover, 
            cutoff_overlap, 
            cutoff_date_low,
            cutoff_date_high,
            cutoff_nadir, 
            cutoff_pan_res, 
            accepted_bands, 
            optimal_date, 
            optimal_pan_res, 
            optimal_nadir, 
            pref_weights, 
            AOI_counter
            )
    list_of_dfs.append(output)
    AOI_counter += 1
    finaldf = pd.concat(list_of_dfs)
    finaldf.to_csv('Scene_List.csv')
print 'Process complete'

Beginning image identification process. Standby.
	...after image 1, 0 percent remaining
AOI 1 Complete. Proportion of AOI covered: 100 percent


### Run Intersection Check

As the GBDX search function takes bounding boxes, we passed all AOIs to it in box format. It is possible that the above process filled in the box with images, AND that some of those images don't intersect any part of the true AOI. Hence, we remove any images from the ordering list if less than 2% of the used footprint intersects with areas we are interested in. 

In [364]:
finaldf = pd.read_csv('Scene_List.csv')
print 'length before: %s' % len(finaldf)

def check(x): 
    if exterior.intersection(x.buffer(0)).area > x.area/50:
        return 1
    else:
        return 0

finaldf['drop'] = finaldf['used_scene_region_WKT'].map(shapely.wkt.loads).apply(lambda x: check(x))
finaldf = finaldf.loc[finaldf['drop'] == 1]
print 'lenght after check %s' %  len(finaldf)

length before: 157
lenght after check 133


### Calculate Area Coverage

This looks at the original shapes - and how far the imagery found covers that area

In [365]:
def area(x):
    return exterior.intersection(x.buffer(0)).area
finaldf['AOI_coverage_area'] =  finaldf['used_scene_region_WKT'].map(shapely.wkt.loads).apply(lambda x: area(x))

### Final Statistical Print and File Save

In [366]:
# Mexico ITRF2008 / LCC
crs_targ = {'init': 'epsg:6372'}

def AreaCalc(obj, crs, crs_targ):
    df = gpd.GeoDataFrame(range(0, len(obj)), crs = crs, geometry = obj)
    df = df.to_crs(crs_targ)
    df['area'] = df.area / 1E6
    return df['area'].sum()

mehico = gpd.read_file(mex)
area_of_Mexico = AreaCalc(mehico['geometry'], crs, crs_targ)
area_of_AOIs = AreaCalc(shape['geometry'].tolist(), crs, crs_targ)
area_of_final_bboxs = AreaCalc(fboxxs, crs, crs_targ)
area_of_imagery_used = AreaCalc(finaldf['used_scene_region_WKT'].map(shapely.wkt.loads).tolist(), crs, crs_targ)

unique_images = len(finaldf['ID'].unique())
unique_images_on_idaho = len(finaldf.loc[finaldf['On_IDAHO'] == 1].groupby('ID'))
to_be_ordered = unique_images - unique_images_on_idaho

print 'Area of mexico: %d square kilometres.' % area_of_Mexico
print 'Area of AOIs: %d square kilometres.' % area_of_AOIs
print 'Anticipated area of compute (bounding boxes for AOIs): %d square kilometres' % area_of_final_bboxs
print 'Area of bounding boxes filled by imagery: %d square kilometres' % area_of_imagery_used
print 'As a pecentage of Mexico, AOIs = %f percent, imagery area = %f percent' % ((area_of_AOIs*100 / area_of_Mexico), (area_of_final_bboxs*100 / area_of_Mexico))
print 'Percentage coverage of target bounding boxes = %f percent' % (area_of_imagery_used*100 / area_of_final_bboxs)
print 'Percentage coverage of actual AOIs = %f percent' % (finaldf['AOI_coverage_area'].sum()*100 / exterior.area)
print 'Total images used: %d' % unique_images
print 'Images on IDAHO already: %d' % unique_images_on_idaho
print 'Images that need to be ordered: %d' % to_be_ordered

finaldf.to_csv('Final_Scene_List.csv')

Area of mexico: 1943096 square kilometres.
Area of AOIs: 2333 square kilometres.
Anticipated area of compute (bounding boxes for AOIs): 15062 square kilometres
Area of bounding boxes filled by imagery: 12827 square kilometres
As a pecentage of Mexico, AOIs = 0.120069 percent, imagery area = 0.775202 percent
Percentage coverage of target bounding boxes = 85.157103 percent
Percentage coverage of actual AOIs = 98.029653 percent
Total images used: 127
Images on IDAHO already: 29
Images that need to be ordered: 98


In [367]:
order_df = finaldf.drop_duplicates('ID')
order_df.to_csv('Unique IDs.csv')
order_df = order_df.loc[order_df['On_IDAHO'] == 0]
order_list = order_df['ID'].tolist()

### Order Imagery

Use this cell block to order up to IDAHO all imagery in ther 'order_list' variable

In [None]:
order_receipts = []
print 'Number of images to be ordered: %d' % len(order_list)

consent = 'I agree to ordering these image IDs to IDAHO'

if consent == 'I agree to ordering these image IDs to IDAHO':
    for x in order_list:
        order_id = gbdx.ordering.order(x)
        order_receipts.append(order_id)
else: 
    print 'please write out your consent in the consent variable above'

Number of images to be ordered: 98


### Check Ordering Status

Use this code block to check whether the images have yet been ordered up to IDAHO

In [374]:
for receipt in order_receipts[:10]:
    print gbdx.ordering.status(receipt)

[{u'acquisition_id': u'1050410010BB8200', u'state': u'placed', u'location': u'not_delivered'}]
[{u'acquisition_id': u'103001002F0A1400', u'state': u'placed', u'location': u'not_delivered'}]
[{u'acquisition_id': u'105041001232A700', u'state': u'placed', u'location': u'not_delivered'}]
[{u'acquisition_id': u'1050410010402000', u'state': u'placed', u'location': u'not_delivered'}]
[{u'acquisition_id': u'103001003B35C200', u'state': u'placed', u'location': u'not_delivered'}]
[{u'acquisition_id': u'105041001232A600', u'state': u'placed', u'location': u'not_delivered'}]
[{u'acquisition_id': u'1050410003F6D000', u'state': u'placed', u'location': u'not_delivered'}]
[{u'acquisition_id': u'1010010011C1AD00', u'state': u'placed', u'location': u'not_delivered'}]
[{u'acquisition_id': u'105041001119EA00', u'state': u'placed', u'location': u'not_delivered'}]
[{u'acquisition_id': u'103001003AC43300', u'state': u'placed', u'location': u'not_delivered'}]


In [231]:
import matplotlib.pyplot as plt
import numpy as np

AREA_TEST = []
AOI_NUMBER = []

def BoundingBoxList(MultiPolygonObj):
    boxlist = []
    for obj in MultiPolygonObj:
        coords = [n for n in obj.bounds]
        bbox = box(coords[0],coords[1],coords[2],coords[3])
        boxlist.append(bbox)
    return boxlist

for i in range(1, 21):
    rawAOI = r'agebs_val_muni.shp'
    crs = {'init': 'epsg:4326'}
    shape = gpd.read_file(rawAOI)
    bufw = float(i / 100.0)

    polygons = MultiPolygon(shape['geometry'].loc[i] for i in shape.index)

    exterior = cascaded_union(polygons)
    exterior_boxxs = BoundingBoxList(exterior)

    tight_bbox = MultiPolygon(exterior_boxxs)
    reduced_boxes = cascaded_union(tight_bbox.buffer(bufw))

    rboxxs = BoundingBoxList(reduced_boxes)
    final_boxes = cascaded_union(MultiPolygon(rboxxs))
    fboxxs = BoundingBoxList(final_boxes.buffer(-bufw))
    AREA_TEST.append(final_boxes.area)
    AOI_NUMBER.append(len(fboxxs))

X = range(1,21)
X[:] = [x / 100.0 for x in X]
plt.plot(X, AREA_TEST, color = 'blue')
plt.plot(X, AOI_NUMBER, color = 'green')
plt.xlabel('buf_width')
plt.ylabel('Value')
plt.show()

1
0.01
2
0.02
3
0.03
4
0.04
5
0.05
6
0.06
7
0.07
8
0.08
9
0.09
10
0.1
11
0.11
12
0.12
13
0.13
14
0.14
15
0.15
16
0.16
17
0.17
18
0.18
19
0.19
20
0.2
