# Training data download pipeline

Downloads 16x16 training data plots from Sentinel Hub, with the following steps:

*  Convert coordinates to UTM and expand each plot's bounding box to roughly 160 meters per side (imagery) and 180 meters per side (DEM)
*  Download all L1C steps, correct missing bands, and calculate cloud cover
*  Select L2A imagery corresponding to the best imagery per 15 days, with missing imagery calculated as the weighted average of the nearest time steps

In [1]:
import pandas as pd
import numpy as np
from random import shuffle
from osgeo import ogr, osr
from sentinelhub import WmsRequest, WcsRequest, MimeType, CRS, BBox, constants, DataSource, CustomUrlParam
from s2cloudless import S2PixelCloudDetector, CloudMaskRequest
import logging
from collections import Counter
import datetime
import os
import yaml

# Read the local YAML config; the file handle is only needed for parsing.
with open("../config.yaml", 'r') as stream:
    key = yaml.safe_load(stream)

# The value stored under 'key' is passed as instance_id to every
# Sentinel Hub request made below.
API_KEY = key['key']



In [153]:
# CSV with the plot coordinates (read below into `df`) and the folder that
# receives one saved array per plot.
DATA_LOCATION = '../data/kenya-test.csv'
OUTPUT_FOLDER = '../data/test-data-nov-27/'
# CRS attached to every BBox sent to Sentinel Hub.
EPSG = CRS.WGS84

# Plot IDs already present in OUTPUT_FOLDER: each file name minus its last
# four characters, parsed as an integer. Names containing ".DS" are skipped.
existing = [
    int(filename[:-4])
    for filename in os.listdir(OUTPUT_FOLDER)
    if ".DS" not in filename
]

In [154]:
%run ../src/slope.py

In [155]:
# setup function to reproject coordinates
def convertCoords(xy, src='', targ=''):
    """Reproject a single point from one coordinate reference system to another.

    Args:
        xy: Sequence whose first two items are the x and y coordinates
            (longitude then latitude for a geographic CRS).
        src: EPSG code of the source CRS (handed to ``ImportFromEPSG``).
        targ: Target CRS. A string is treated as a PROJ.4 definition, any
            other value as an EPSG code.

    Returns:
        ``[x, y]`` of the point expressed in the target CRS.
    """
    srcproj = osr.SpatialReference()
    srcproj.ImportFromEPSG(src)
    targproj = osr.SpatialReference()
    if isinstance(targ, str):
        targproj.ImportFromProj4(targ)
    else:
        targproj.ImportFromEPSG(targ)

    # GDAL >= 3 follows the authority axis order, which for EPSG:4326 is
    # (lat, lon). Every caller in this file passes (lon, lat), so pin the
    # traditional x/y order when the binding exposes it. Older GDAL releases
    # lack the attribute and already behave this way.
    if hasattr(osr, 'OAMS_TRADITIONAL_GIS_ORDER'):
        srcproj.SetAxisMappingStrategy(osr.OAMS_TRADITIONAL_GIS_ORDER)
        targproj.SetAxisMappingStrategy(osr.OAMS_TRADITIONAL_GIS_ORDER)

    transform = osr.CoordinateTransformation(srcproj, targproj)

    pt = ogr.Geometry(ogr.wkbPoint)
    pt.AddPoint(xy[0], xy[1])
    pt.Transform(transform)
    return [pt.GetX(), pt.GetY()]

def _utm_epsg(lon, lat):
    """Return the EPSG code of the WGS84 / UTM zone that contains (lon, lat)."""
    if not -180 <= lon <= 180:
        raise ValueError("Longitude {} is outside [-180, 180]".format(lon))
    # Zones are 6 degrees wide and numbered 1..60 eastwards from 180W;
    # lon == 180 is clamped into zone 60.
    zone = min(int((lon + 180) // 6) + 1, 60)
    # 326xx = northern hemisphere, 327xx = southern. A latitude of exactly 0
    # is treated as southern, as the original lookup table did.
    return (32600 if lat > 0 else 32700) + zone


def bounding_box(points, expansion = 160):
    """Grow a plot's lon/lat bounding box to about ``expansion`` meters per side.

    The two corners are projected to the UTM zone of the lower-left corner,
    padded equally on every side so that the average side length becomes
    ``expansion`` meters, and projected back to WGS84.

    The UTM zone is computed for any longitude. The previous hard-coded table
    only covered 18E-54E and left ``epsg`` unbound elsewhere. A longitude
    lying exactly on a zone boundary may be assigned the neighbouring zone
    compared with that table.

    Args:
        points: ``[(min_lon, min_lat), (max_lon, max_lat)]`` in WGS84.
        expansion: Target side length in meters.

    Returns:
        ``(bl, tr)``: the expanded bottom-left and top-right corners, each a
        ``[lon, lat]`` list in WGS84.

    Raises:
        ValueError: If the lower-left longitude is outside [-180, 180].
    """
    # Corners are ordered (longitude, latitude).
    bl = list(points[0])
    tr = list(points[1])

    epsg = _utm_epsg(bl[0], bl[1])

    bl = convertCoords(bl, 4326, epsg)
    tr = convertCoords(tr, 4326, epsg)
    # Side lengths (x, y) of the unexpanded plot, in meters.
    init = [b - a for a, b in zip(bl, tr)]
    # Padding added on each side so the mean side length equals `expansion`.
    EXPANSION = (expansion - np.mean(init)) / 2

    bl = [a - EXPANSION for a in bl]
    tr = [a + EXPANSION for a in tr]

    after = [b - a for a, b in zip(bl, tr)]
    # Sanity checks only: problems are reported but never abort the download.
    if max(init) > 130:
        print("ERROR: Initial field greater than 130m")
    if min(init) < 120:
        print("ERROR: Initial field less than 120m")

    if min(after) < (expansion - 4.5):
        print("ERROR")
    if max(after) > (expansion + 5):
        print("ERROR")

    bl = convertCoords(bl, epsg, 4326)
    tr = convertCoords(tr, epsg, 4326)
    return bl, tr

# Scratch cell: request one year of imagery for a single plot and look at the
# shape of the first returned array.
# NOTE(review): `val` and `calc_bbox` are only defined further down in the
# visible file (the download loop and the following cell), so this cell
# depends on notebook execution order and fails in a fresh top-to-bottom run.
location = calc_bbox(val)
location = bounding_box(location)
box = BBox(location, crs = EPSG)

# Unlike the WmsRequest calls below, this WcsRequest sets no width/height and
# uses maxcc=1.
image_request = WcsRequest(
                layer='ALL_BANDS_NDVI',
                bbox=box,
                time = ('2018-01-01', '2018-12-31'),
                image_format = MimeType.TIFF_d32f,
                maxcc=1,
                instance_id=API_KEY,
                custom_url_params = {constants.CustomUrlParam.UPSAMPLING: 'BICUBIC'},
                time_difference=datetime.timedelta(hours=24),
            )
img_bands = image_request.get_data()
img_bands[0].shape

In [156]:
def calc_bbox(plot_id):
    """Return the lon/lat extent of one plot from the global ``df`` table.

    Args:
        plot_id: Value to match against the ``PLOT_ID`` column.

    Returns:
        ``[(min_lon, min_lat), (max_lon, max_lat)]`` over the matching rows.
    """
    rows = df[df['PLOT_ID'] == plot_id]
    lons, lats = rows['LON'], rows['LAT']
    lower_left = (min(lons), min(lats))
    upper_right = (max(lons), max(lats))
    return [lower_left, upper_right]


# Load the plot table, drop the IMAGERY_TITLE column and any row with a
# missing value, then collect the distinct plot IDs in ascending order.
df = (
    pd.read_csv(DATA_LOCATION)
    .drop('IMAGERY_TITLE', axis = 1)
    .dropna(axis = 0)
)
plot_ids = sorted(df['PLOT_ID'].unique())

In [157]:
# Spot-check: expanded bounding box of the second plot ID (cell output follows).
bounding_box(calc_bbox(plot_ids[1]))

([35.5686634922896, 4.310535388619154],
 [35.570108785525406, 4.311976573448053])

In [158]:
# s2cloudless detector used by identify_clouds() to turn the CLOUD_DETECTION
# layer into per-pixel cloud probabilities.
cloud_detector = S2PixelCloudDetector(threshold=0.4, average_over=4, dilation_size=2)


# Year the composites are built for. `year` and `time` are captured as default
# argument values by the functions defined below, so those defs must be
# re-run after changing either value.
year = 2018
# Request window: 15 Dec of the previous year through 15 Jan of the next year.
# NOTE: this module-level name shadows the stdlib `time` module if that is
# ever imported in the notebook.
time = (str(year - 1) +'-12-15', str(year+1) +'-1-15')
print(time)

def calculate_proximal_steps(uniques, date, clean_steps):
    """Find the nearest usable time steps before and after ``date``.

    A step is usable when its entry in ``uniques`` is greater than 2 and its
    index appears in ``clean_steps``. The step ``date`` itself is never
    returned as a neighbour.

    Args:
        uniques: One count per time step (the caller passes the number of
            distinct pixel values of a band at each step).
        date: Index of the time step being repaired.
        clean_steps: Indices of the steps considered clean. Any array-like is
            accepted, including the ``(n, 1)`` output of ``np.argwhere``.

    Returns:
        ``(before, after)`` as Python ints, both offsets relative to ``date``
        (callers index with ``date + offset``). When only one side has a
        usable step, that offset is returned for both. When neither side has
        one, ``(0, 0)`` is returned so the caller falls back to the step
        itself.
    """
    unique_counts = np.asarray(uniques)
    clean = set(np.asarray(clean_steps).ravel().tolist())
    usable = [int(idx) for idx in np.flatnonzero(unique_counts > 2)
              if int(idx) in clean]

    earlier = [idx for idx in usable if idx < date]
    later = [idx for idx in usable if idx > date]
    # Use None (not 0 / falsiness) to mark a missing side, so a legitimate
    # offset can never be mistaken for "not found".
    arg_before = int(max(earlier) - date) if earlier else None
    arg_after = int(min(later) - date) if later else None

    if arg_before is None and arg_after is None:
        # No usable neighbour at all: offset 0 keeps the caller's index in
        # range (the old code returned the absolute index `date` here).
        arg_before = arg_after = 0
    elif arg_after is None:
        arg_after = arg_before
    elif arg_before is None:
        arg_before = arg_after
    print(arg_before, date, arg_after)
    return arg_before, arg_after


def identify_clouds(bbox, epsg = EPSG, time = time):
    """Estimate cloud cover for every acquisition over a bounding box.

    Requests the CLOUD_DETECTION layer as 16x16 images for the time window
    and runs the module-level ``cloud_detector`` on them.

    Args:
        bbox: Corner coordinates accepted by ``BBox`` (the caller passes the
            output of ``bounding_box``).
        epsg: CRS of ``bbox``.
        time: ``(start, end)`` date strings for the request.

    Returns:
        ``(cloud_steps, means, cloud_probs)``. ``cloud_probs`` is the
        detector's probability maps, ``means`` is their average over axes 1
        and 2 (one value per time step), and ``cloud_steps`` lists the step
        indices whose mean exceeds 0.25.

    Raises:
        Exception: Any failure is logged and then re-raised. Previously the
            function returned None on failure, which only surfaced later as
            an unrelated unpacking TypeError in the caller.
    """
    try:
        box = BBox(bbox, crs = epsg)
        cloud_request = WmsRequest(
            layer='CLOUD_DETECTION',
            bbox=box,
            time=time,
            width=16,
            height=16,
            image_format = MimeType.TIFF_d32f,
            maxcc=0.33,
            instance_id=API_KEY,
            custom_url_params = {constants.CustomUrlParam.UPSAMPLING: 'BICUBIC'},
            time_difference=datetime.timedelta(hours=24),
        )

        cloud_img = cloud_request.get_data()
        cloud_probs = cloud_detector.get_cloud_probability_maps(np.array(cloud_img))
        means = np.mean(cloud_probs, (1, 2))
        cloud_steps = [i for i, val in enumerate(means) if val > 0.25]
        return cloud_steps, means, cloud_probs
    except Exception as e:
        logging.fatal(e, exc_info=True)
        raise
    
    
def download_dem(val, epsg = EPSG):
    """Download the DEM around a plot and return its slope as a 16x16x1 array.

    The plot's box is expanded to 180 m and requested as an 18x18 raster, so
    one pixel of margin surrounds the 16x16 area that is returned.

    Args:
        val: Plot ID handed to ``calc_bbox``.
        epsg: CRS of the bounding box.

    Returns:
        Output of ``calcSlope`` reshaped to ``(18, 18, 1)`` and cropped to
        rows/columns 1..16, i.e. shape ``(16, 16, 1)``.
    """
    padded_corners = bounding_box(calc_bbox(val), expansion = 180)
    dem_request = WmsRequest(
        data_source=DataSource.DEM,
        layer='DEM',
        bbox=BBox(padded_corners, crs = epsg),
        width=18,
        height=18,
        instance_id=API_KEY,
        image_format=MimeType.TIFF_d32f,
        custom_url_params={CustomUrlParam.SHOWLOGO: False},
    )
    elevation = dem_request.get_data()[0]

    # The two constant 18x18 grids of 10 are presumably per-pixel cell sizes
    # in meters (180 m / 18 px) - confirm against calcSlope in src/slope.py.
    slope = calcSlope(
        elevation.reshape((1, 18, 18)),
        np.full((18, 18), 10),
        np.full((18, 18), 10),
        zScale = 1,
        minSlope = 0.02,
    )

    # Drop the one-pixel border to get back to the 16x16 plot footprint.
    return slope.reshape((18, 18, 1))[1:17, 1:17, :]

        
    
def _interpolate_band(img_bands, uniques, date, band, clean_steps):
    """Overwrite one band of one time step with the mean of its two proximal steps."""
    before, after = calculate_proximal_steps(uniques, date, clean_steps)
    # The returned values are offsets relative to `date`.
    before = img_bands[date + int(before), :, :, band]
    after = img_bands[date + int(after), :, :, band]
    img_bands[date, :, :, band] = (before + after) / 2


def download_tiles(bbox, clean_steps, epsg = EPSG, time = time):
    """Download the 16x16 ALL_BANDS_NDVI imagery and patch broken bands.

    For every clean time step, each of the first 10 bands is checked twice,
    in this order, and replaced by the average of its proximal usable steps
    when a check fires:

    1. its maximum is >= 1.0 or its minimum is <= 0.0;
    2. it (possibly already repaired by check 1) has 3 or fewer distinct values.

    Each firing check increments the "broken" counter, so one band can be
    counted twice.

    Args:
        bbox: Corner coordinates accepted by ``BBox``.
        clean_steps: Array of per-step mean cloud probabilities. Steps with a
            value <= 0.2 are treated as clean.
        epsg: CRS of ``bbox``.
        time: ``(start, end)`` date strings for the request.

    Returns:
        ``(img_bands, image_request)``: the (repaired) imagery as one array
        and the request object, which the caller uses to read acquisition
        dates.

    Raises:
        Exception: Any failure is logged and then re-raised. Previously the
            function returned None on failure, which only surfaced later as
            an unrelated unpacking TypeError in the caller.
    """
    try:
        clean_steps = np.argwhere(clean_steps <= 0.2)
        box = BBox(bbox, crs = epsg)
        image_request = WmsRequest(
                layer='ALL_BANDS_NDVI',
                bbox=box,
                time=time,
                width=16,
                height=16,
                image_format = MimeType.TIFF_d32f,
                maxcc=0.33,
                instance_id=API_KEY,
                custom_url_params = {constants.CustomUrlParam.UPSAMPLING: 'BICUBIC'},
                time_difference=datetime.timedelta(hours=24),
            )
        img_bands = image_request.get_data()
        img_bands = np.array(img_bands)
        print("There are {}/{} clean steps".format(len(clean_steps), len(img_bands)))
        num_broken_steps = 0
        for date in range(img_bands.shape[0]):
            if date not in clean_steps:
                continue
            for band in range(10):
                # Distinct-value count of this band at every time step, taken
                # before any repair of the current (date, band) cell.
                uniques = [len(np.unique(img_bands[i, :, :, band])) for i in range(img_bands.shape[0])]
                maxs = np.max(img_bands[date, :, :, band])
                mins = np.min(img_bands[date, :, :, band])
                if maxs >= 1.0 or mins <= 0.0:
                    num_broken_steps += 1
                    _interpolate_band(img_bands, uniques, date, band, clean_steps)
                if len(np.unique(img_bands[date, :, :, band])) <= 3:
                    num_broken_steps += 1
                    _interpolate_band(img_bands, uniques, date, band, clean_steps)
        print("{} broken normal steps".format(num_broken_steps))
        return img_bands, image_request

    except Exception as e:
        logging.fatal(e, exc_info=True)
        raise
    

        
def calculate_and_save_best_images(cloud_steps, img_bands, image_request, means, year = year):
    """Build one image per 15-day target date from the low-cloud acquisitions.

    Despite the name, nothing is written to disk here; the caller saves the
    result.

    Args:
        cloud_steps: Not used by this function.
        img_bands: Imagery indexed by time step along its first axis.
        image_request: Request object; only ``get_dates()`` is used.
        means: Per-step mean cloud probability; steps below 0.2 are eligible.
        year: Target year. Acquisitions from ``year - 1`` and ``year + 1``
            are placed before and after it on the day axis.

    Returns:
        ``(keep_steps, max_distance)``: the stacked per-target-date images and
        the largest gap (in the day units computed below) between the two
        images blended for any target date (0 if no blend was needed).
    """
    # Identify the date of the imagery
    # Dates become a day number that assumes 30-day months.
    # NOTE(review): the previous year is shifted by -360 but the next year by
    # +365, which is inconsistent with a 360-day year - confirm intent.
    image_dates = []
    for date in image_request.get_dates():
        if date.year == year - 1:
            image_dates.append(-360 + (date.month-1)*30 + date.day)
        if date.year == year:
            image_dates.append((date.month-1)*30 + date.day)
        if date.year == year + 1:
            image_dates.append(365 + (date.month-1)*30+date.day)
        #image_dates.append((date.year - 2018)*395 + date.month*30 + date.day)
    print(image_dates)

    biweekly_dates = [day for day in range(0, 360, 15)] # ideal imagery dates are every 15 days
    
    # Identify the dates where there is < 20% cloud cover
    satisfactory_ids = list(np.argwhere(np.array(means) < 0.2).reshape(-1, )) 
    satisfactory_dates = [value for idx, value in enumerate(image_dates) if idx in satisfactory_ids]
    
    
    # Maps each target day to the image(s) used for it and their blend weights.
    selected_images = {}
    for i in biweekly_dates:
        distances = [abs(date - i) for date in satisfactory_dates]
        # NOTE(review): np.min raises ValueError when satisfactory_dates is empty.
        closest = np.min(distances)
        closest_id = np.argmin(distances)
        # If there is imagery within 8 days, select it
        if closest < 8:
            date = satisfactory_dates[closest_id]
            # When several acquisitions share a day number, the first one wins.
            image_idx = int(np.argwhere(np.array(image_dates) == date)[0])
            selected_images[i] = {'image_date': [date], 'image_ratio': [1], 'image_idx': [image_idx]}
        # If there is not imagery within 8 days, look for the closest above and below imagery
        else:
            distances = np.array([(date - i) for date in satisfactory_dates])
            # Number of days above and below the selected date of the nearest clean imagery
            # `above` is the nearest EARLIER image (largest negative offset) and
            # `below` the nearest LATER one (smallest positive offset). If one
            # side has no candidate, argmax/argmin fall back to index 0.
            above = distances[np.where(distances < 0, distances, -np.inf).argmax()]
            below = distances[np.where(distances > 0, distances, np.inf).argmin()]
            # NOTE(review): the 100-day cut-off stands in for "no image on that
            # side", but it also discards a real image more than 100 days away.
            if abs(above) > 100: # If date is the last date, occassionally argmax would set above to - number
                above = below
            if abs(below) > 100:
                below = above
            if above != below:
                # Linear interpolation in time: the nearer image gets the larger weight.
                below_ratio = above / (above - below)
                above_ratio = 1 - below_ratio
            else:
                above_ratio = below_ratio = 0.5
                
            # Extract the image date and imagery index for the above and below values
            above_date = i + above
            above_image_idx = int(np.argwhere(np.array(image_dates) == above_date)[0])
            
            below_date = i + below
            below_image_idx = int(np.argwhere(np.array(image_dates) == below_date)[0])
            
            selected_images[i] = {'image_date': [above_date, below_date], 'image_ratio': [above_ratio, below_ratio],
                                 'image_idx': [above_image_idx, below_image_idx]}
                            
    # Largest gap between the two images of any blended target date.
    max_distance = 0
    
    for i in selected_images.keys():
        print(i, selected_images[i])
        if len(selected_images[i]['image_date']) == 2:
            dist = selected_images[i]['image_date'][1] - selected_images[i]['image_date'][0]
            if dist > max_distance:
                max_distance = dist
    
    print("Maximum time distance: {}".format(max_distance))
        
    # Compute the weighted average of the selected imagery for each time step
    keep_steps = []
    for i in selected_images.keys():
        info = selected_images[i]
        if len(info['image_idx']) == 1:
            step = img_bands[info['image_idx'][0]]
        if len(info['image_idx']) == 2:
            step1 = img_bands[info['image_idx'][0]] * info['image_ratio'][0]
            step2 = img_bands[info['image_idx'][1]] * info['image_ratio'][1]
            step = step1 + step2
        keep_steps.append(step)
        
    keep_steps = np.stack(keep_steps)
    return keep_steps, max_distance



('2016-12-15', '2018-1-15')


In [159]:
def calc_best(tiles, cloud_probs, request, offset_x, offset_y):
    """Run the best-image selection on one 16x16 window of a larger tile stack.

    Args:
        tiles: Sequence of per-step images, stacked along a new first axis.
        cloud_probs: Cloud probability maps indexed (step, row, column).
        request: Request object forwarded to
            ``calculate_and_save_best_images``.
        offset_x: First index of the window along axis 1.
        offset_y: First index of the window along axis 2.

    Returns:
        Whatever ``calculate_and_save_best_images`` returns for the window.
    """
    window_rows = slice(offset_x, offset_x + 16)
    window_cols = slice(offset_y, offset_y + 16)

    window_probs = cloud_probs[:, window_rows, window_cols]
    window_images = np.stack(tiles)[:, window_rows, window_cols]

    # Mean cloud probability of the window at each time step.
    means = np.mean(window_probs, (1, 2))
    cloud_steps = [step for step, prob in enumerate(means) if prob > 0.20]
    return calculate_and_save_best_images(cloud_steps, window_images, request, means)

In [160]:
# Plots with no saved array yet. `errors` collects, per failed plot, the
# imagery that had been downloaded for THAT plot when the failure occurred
# (None if the failure happened before the imagery was available).
to_download = [x for x in plot_ids if x not in existing]
errors = []
print("STARTING DOWNLOAD OF {} plots from {} to {}".format(len(to_download), DATA_LOCATION, OUTPUT_FOLDER))
for val in to_download:
    print("Downloading {}".format(val))
    location = calc_bbox(val)
    location = bounding_box(location, expansion = 160)
    # Reset per plot: the handler below used to read `img` even when it was
    # still unbound (first plot fails early -> NameError inside the handler,
    # aborting the whole batch) or left over from the previous plot.
    img = None
    try:
        cloud, means, probs = identify_clouds(location)
        dem = download_dem(val)
        img, image_request = download_tiles(location, means)
        tiles, max_distance = calculate_and_save_best_images(cloud, img, image_request, means)
        # Repeat the 16x16 slope layer for every time step and append it as
        # an extra band.
        dem = np.tile(dem.reshape((1, 16, 16, 1)), (tiles.shape[0], 1, 1, 1))
        tiles = np.concatenate([tiles, dem], axis = -1)
        # Only keep plots whose largest blend gap is at most 90 days.
        if max_distance <= 90:
            np.save(OUTPUT_FOLDER + str(val), tiles)
        else:
            print("Skipping {} because there is a {} distance".format(val, max_distance))

    except Exception as e:
        # Best effort: log the failure and move on to the next plot.
        logging.fatal(e, exc_info=True)
        errors.append(img)

STARTING DOWNLOAD OF 3 plots from ../data/kenya-test.csv to ../data/test-data-nov-27/
Downloading 135542384
There are 10/33 clean steps
0 broken normal steps
[-7, 12, 31, 63, 73, 83, 92, 112, 142, 161, 171, 181, 186, 196, 201, 206, 215, 225, 230, 235, 249, 259, 264, 274, 279, 303, 328, 338, 343, 348, 353, 358, 377]
0 {'image_date': [-7], 'image_ratio': [1], 'image_idx': [0]}
15 {'image_date': [12], 'image_ratio': [1], 'image_idx': [1]}
30 {'image_date': [12, 73], 'image_ratio': [0.7049180327868853, 0.29508196721311475], 'image_idx': [1, 4]}
45 {'image_date': [12, 73], 'image_ratio': [0.4590163934426229, 0.5409836065573771], 'image_idx': [1, 4]}
60 {'image_date': [12, 73], 'image_ratio': [0.21311475409836067, 0.7868852459016393], 'image_idx': [1, 4]}
75 {'image_date': [73], 'image_ratio': [1], 'image_idx': [4]}
90 {'image_date': [92], 'image_ratio': [1], 'image_idx': [6]}
105 {'image_date': [92, 92], 'image_ratio': [0.5, 0.5], 'image_idx': [6, 6]}
120 {'image_date': [92, 206], 'image_ra