# Download and process Sentinel 1 data

This notebook downloads and processes one year of Sentinel 1 data for training and testing plots labelled in Collect Earth Online.

## John Brandt
## July 12, 2021

## Package imports, API import, source scripts

In [1]:
import datetime
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import os
import scipy.sparse as sparse
import seaborn as sns
import yaml

from collections import Counter
from random import shuffle
from scipy.sparse.linalg import splu
from sentinelhub import WmsRequest, WcsRequest, MimeType
from sentinelhub import CRS, BBox, constants, DataSource, CustomUrlParam
from skimage.transform import resize
from sentinelhub.config import SHConfig

import reverse_geocoder as rg
import pycountry
import pycountry_convert as pc
import hickle as hkl
from shapely.geometry import Point, Polygon

with open("../config.yaml", 'r') as stream:
    key = (yaml.safe_load(stream))
    API_KEY = key['key']
    SHUB_SECRET = key['shub_secret']
    SHUB_KEY = key['shub_id']
    AWSKEY = key['awskey']
    AWSSECRET = key['awssecret']
            
shconfig = SHConfig()
shconfig.instance_id = API_KEY
shconfig.sh_client_id = SHUB_KEY
shconfig.sh_client_secret = SHUB_SECRET
        
%matplotlib inline
%run ../src/downloading/utils.py

In [2]:
time = ('2016-12-15', '2018-01-15')
YEAR = 2017
IMSIZE = 32

starting_days = np.cumsum([0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30])

In [3]:
df = pd.read_csv("../data/train-csv/chm/chm-2017-2016.csv")
df.rename(columns={'field_1':'plotid'}, inplace=True)
for i, val in df.iterrows():
    #print(df['filename'][i].split('gedi')[1])
    df.plotid[i] = int(df['filename'][i].split('gedi')[1][:-16])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [4]:
df.plotid

0       12783080020
1        6441080030
2       11559060020
3       11803050020
4        8647060020
           ...     
1981    12191050030
1982     6953080020
1983    14539050020
1984     9663050020
1985     9472080040
Name: plotid, Length: 1986, dtype: int64

In [5]:
existing = [int(x[:-4]) for x in os.listdir("../data/train-s1/") if x[-4:] == '.hkl'] 

for i, val in df.iterrows():
    #print(np.sum(df.plotid == val.plotid)
    if (np.sum(df.plotid == val.plotid)) > 1:
        print(df.iloc[i].plotid, np.sum(df.plotid == df.iloc[i].plotid))
        df.plotid[i] = df.plotid[i] + 1
    if np.sum(existing == df.plotid[i]) > 0:
        print(df.plotid[i], np.sum(df.plotid == df.iloc[i].plotid))
        df.plotid[i] = df.plotid[i] + 1
    #df[df.plotid.duplicated() == True].plotid

12783080020 1
6441080030 1
11559060020 1
11803050020 1
8647060020 1
5359060030 1
14977080030 1
11241060030 1
10997080030 1
5168060020 1
13026080030 1
8052080020 1
8351050030 1
7581050030 1
13664080020 1
4040080020 1
3767110030 1
7003060020 1
14522060020 1
6942110020 1
4657080030 1
11215080030 1
10487060020 1
14097080010 1
8378080030 1
14261110020 1
2270060010 1
11822050030 1
13857080030 1
8439060030 1
8439110030 1
12282060030 1
8318110030 1
13649080020 1
5698060030 1
4748050030 1
10044060030 2
10044060031 1
13915060030 1
11184060030 1
13887060040 1
6760080020 1
4151050020 1
4073050010 1
6072080010 1
2656080030 1
12130110030 1
10377050030 1
6544110030 1
3262110020 1
6072080020 1
11511060020 2
11511060021 1
10959060030 1
13765110040 1
10082110030 1
2038110020 1
4237060020 1
14288080020 1
11700060020 1
8332080040 1
12130050030 1
10143110030 1
12781080030

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


 1
3143110020 1
4922110020 1
9609110010 1
6696050010 1
9032080020 1
11150060030 1
2128080020 1
3183110020 1
11338050030 1
7104050020 1
12486060020 1
7959050020 1
4146110020 1
6538050040 1
7641110010 1
3645110030 1
5405080030 1
8128050020 1
10382050040 1
13715050020 1
13294050020 1
6258080030 1
11559080020 1
10077110020 1
12767110010 1
4878080010 1
9942080010 1
6224050030 1
8155110040 1
8413080040 1
11211110030 1
8160050040 1
9038110030 2
9038110031 1
6273110040 1
7611110010 1
12425080020 1
9061050030 1
14275110010 1
13074080030 1
9480050040 1
12957080040 1
3851060040 1
13769080020 1
8728060020 1
3346050030 1
12805050030 1
9168060030 2
9168060031 1
8969110020 1
5019060030 1
10507060020 1
7015110020 1
11303080030 1
11511060020 1
8856060020 1
11451080020 1
13609060020 1
6277110030 1
7638080010 1
9705080010 1
3363110030 1
5708110030 1
12415050010 1
9631050010 1
13227050040 1
11387110040 1
4252050020 1
5015060020 1
9807080020 1
14966080030 1
3767080030 1
4523060040 2
4523060041 1
9075050030

11924050020 1
6191060030 1
8429050040 1
5263050030 2
5263050031 1
4297080020 1
13796060040 2
13796060041 1
14651110020 1
8679060010 1
6461060040 1
11409050010 1
8893110020 1
8603060010 1
14337050010 1
3787060030 1
12427050020 1
6283060030 1
3218080010 1
4240060030 1
5297050020 1
3350110020 1
4271050030 1
6558080030 1
3485050040 1
2989060010 1
5001050020 1
7059110010 1
8873060020 1
14613080030 2
14613080031 1
11541110020 1
14337080010 1
6966080020 1
11424050010 1
9041110020 1
3320050020 1
4446080030 1
9228080040 1
13712050030 1
10551110020 1
13758050040 1
9887080030 1
4164080020 1
6455060030 1
7688060020 1
9869050020 1
8261080040 1
11296060040 1
4381050040 1
13605060010 1
8969110010 1
9899110020 1
3454050030 1
6728110020 1
9404110040 1
7047050010 1
14261080010 1
12477110020 1
3142060010 1
11157060030 1
5346050030 1
4973110020 1
12982050030 1
5280110030 1
3433060030 1
11993050020 1
5469060040 1
9866080020 1
7415110030 1
9683080020 1
11304080030 1
11451110020 1
4235050020 1
12415060010 1


4911060010 1
2133050010 1
2175110020 1
8958080020 1
6644080040 1
2255060010 1
7977110010 1
4851080010 1
2301080010 1
9080080020 1
13796060040 1
13140060040 1
9099080040 1
2629110040 1
8801060010 1
3324080010 1
4446080040 1
4690060040 1
5697060040 1
9373060040 1
4314060020 1
14659060030 1
13745060020 1
8488060040 1
12011060040 1
7580080010 1
2920110040 1
4517050030 1
9018080020 1
3881060010 1
13186050040 1
7962060010 1
5697050040 1
2629060040 1
9821080030 1
14698050020 1
11032110020 1
10029080040 1
9236080040 1
3107050010 1
13571060010 1
9083110030 1
13604080010 1
2378080010 1
7886080010 1
4469110030 1
12703050030 1
3080060010 1
8950110030 1
10810110020 1
9972110020 1
11685050020 1
8649060010 1
4210110010 1
2115060020 1
5392060030 1
14215050010 1
14215110010 1
10004050030 1
8473110040 1
14318050010 1
13709060030 1
11604060020 1
6292080030 1
5423060040 1
11265080040 1
5228080020 1
14514050020 1
4121060020 1
12710050020 1
5529060030 1
11977060040 1
5349060020 1
7391060040 1
10040110030 1


14116060030 1
10296050030 1
11894050020 1
4483080030 1
5182060020 1
5526080030 1
9252110030 1
5110080010 1
10732110020 1
4949060010 1
14590050020 1
10263050030 1
3705110030 1
14031110030 1
10985110030 1
11452060020 1
4945080020 1
6258110030 1
6894110010 1
2590080030 1
6783080020 1
13826050030 1
9421060040 1
3370110030 1
2081060020 1
3058110010 1
14070080040 1
8923050010 1
5347050030 1
5281080030 1
9710050020 1
9649110020 1
11558110020 1
5664060030 1
3619110030 1
2342080020 1
11265060030 1
2577050030 1
2175080020 1
4407050030 1
6816110020 1
6772060010 1
10238080030 1
11973060030 1
4961060020 1
10289080030 1
4523060040 1
11158060040 1
9746050010 1
4072080010 1
2283080020 1
11803060020 1
14924110030 1
8996110020 1
12670110020 1
10002050020 1
7804110020 1
3613110030 1
7857080020 1
10916080020 1
4958050010 1
4393060030 1
3355050030 1
5508110030 1
11512060020 1
13708050020 1
13796080030 1
9983110030 1
12822050020 1
4173050020 1
14718050030 1
5232110030 1
11287080030 1
11572110020 1
498311002

In [6]:
df[df.plotid.duplicated() == True].plotid

754    3729050042
Name: plotid, dtype: int64

In [7]:
df.plotid

0       12783080021
1        6441080031
2       11559060021
3       11803050021
4        8647060021
           ...     
1981    12191050031
1982     6953080021
1983    14539050021
1984     9663050021
1985     9472080041
Name: plotid, Length: 1986, dtype: int64

In [8]:
df.to_csv("../data/train-csv/chm/chm-2017-2016.csv", index = False)

# Bounding boxes

In [9]:
def identify_s1_layer(coords: list) -> str:
    coords = (coords[1], coords[0])
    results = rg.search(coords)
    admin1 = (results[-1]['admin1'])
    admin2 = results[-1]['admin2']
    country = results[-1]['cc']
    continent_name = pc.country_alpha2_to_continent_code(country)
    print(admin1, admin2, country, continent_name)
    if continent_name in ['AF', 'OC', 'EU']:
        layer = "SENT"
    if continent_name in ['SA']:
        if coords[0] > -7.11:
            layer = "SENT"
        else:
            layer = "SENT_DESC"
    if continent_name in ['AS']:
        if coords[0] > 23.3:
            layer = "SENT"
        else:
            layer = "SENT_DESC"
    if continent_name in ['NA']:
        layer = "SENT_DESC"
    return layer


def calc_bbox(plot_id: int, df: pd.DataFrame) -> list:
    """ Calculates the corners of a bounding box from an input
        pandas dataframe as output by Collect Earth Online

        Parameters:
         plot_id (int): plot_id of associated plot
         df (pandas.DataFrame): dataframe of associated CEO survey
    
        Returns:
         bounding_box (list): [(min(x), min(y)),
                              (max(x), max_y))]
    """
    subs = df[df['PLOT_ID'] == plot_id]
    return [(min(subs['LON']), min(subs['LAT'])),
            (max(subs['LON']), max(subs['LAT']))]


def bounding_box(points: list, expansion: int = 160) -> (tuple, 'CRS'):
    """ Calculates the corners of a bounding box with an
        input expansion in meters from a given bounding_box
        
        Subcalls:
         calculate_epsg, convertCoords

        Parameters:
         points (list): output of calc_bbox
         expansion (float): number of meters to expand or shrink the
                            points edges to be
    
        Returns:
         bl (tuple): x, y of bottom left corner with edges of expansion meters
         tr (tuple): x, y of top right corner with edges of expansion meters
    """
    bl = list(points[0])
    tr = list(points[1])
    inproj = Proj('epsg:4326')
    outproj_code = calculate_epsg(bl)
    outproj = Proj('epsg:' + str(outproj_code))
    
    bl_utm =  transform(inproj, outproj, bl[1], bl[0])
    tr_utm =  transform(inproj, outproj, tr[1], tr[0])

    distance1 = tr_utm[0] - bl_utm[0]
    distance2 = tr_utm[1] - bl_utm[1]
    expansion1 = (expansion - distance1)/2
    expansion2 = (expansion - distance2)/2
    
    bl_utm = [bl_utm[0] - expansion1, bl_utm[1] - expansion2]
    tr_utm = [tr_utm[0] + expansion1, tr_utm[1] + expansion2]
    
    zone = str(outproj_code)[3:]
    zone = zone[1:] if zone[0] == "0" else zone
    direction = 'N' if tr[1] >= 0 else 'S'
    utm_epsg = "UTM_" + zone + direction
    return (bl_utm, tr_utm), CRS[utm_epsg]

# Data download

In [10]:
def extract_dates(date_dict: dict, year: int) -> List:
    """ Transforms a SentinelHub date dictionary to a
         list of integer calendar dates
    """
    dates = []
    days_per_month = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30]
    starting_days = np.cumsum(days_per_month)
    for date in date_dict:
        if date.year == year - 1:
            dates.append(-365 + starting_days[(date.month-1)] + date.day)
        if date.year == year:
            dates.append(starting_days[(date.month-1)] + date.day)
        if date.year == year + 1:
            dates.append(365 + starting_days[(date.month-1)]+date.day)
    return dates


def identify_dates_to_download(dates: list) -> list:
    """ Identify the S1 dates to download"""
    days_per_month = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30]
    days_per_month = np.array(days_per_month)
    #days_per_month = np.reshape(days_per_month, (4, 3))
    #days_per_month = np.sum(days_per_month, axis = 1)

    starting_days = np.cumsum(days_per_month)

    dates = np.array(dates)
    dates_to_download = []
    for i in starting_days:
        s1_month = dates[dates > i]
        s1_month = s1_month[s1_month < (i + 30)]
        if len(s1_month) > 0:
            dates_to_download.append(s1_month[0])
    return dates_to_download


def download_sentinel_1(bbox, epsg, time = time, 
                        layer = "SENT", year = YEAR, 
                        image_format = MimeType.TIFF, 
                        data = DataSource.SENTINEL1_IW_ASC):
    """ Downloads all 10 and 20 meter L2A bands from sentinel-hub
        for input bbox and epsg, within time range
        
        Parameters:
         bbox (list): output of calc_bbox
         epsg (float): UTM EPSG associated with bbox 
         time (tuple): YY-MM-DD - YY-MM-DD bounds for downloading 
    
        Returns:
         s1 (arr): (Time, X, Y, 2) array of sentinel 1 data
         image_dates (list): number of days since time[0] for each
                              image in s1.shape[0]
    """
    try:
        print(f"The data is {data}")
        box = BBox(bbox, crs = epsg)
        image_request = WcsRequest(
                layer=layer,
                bbox=box,
                time=time,
                image_format = image_format,
                data_source= data,
                maxcc=1.0,
                resx='20m', resy='20m',
                config=shconfig,
                custom_url_params = {constants.CustomUrlParam.DOWNSAMPLING: 'NEAREST',
                                    constants.CustomUrlParam.UPSAMPLING: 'NEAREST'},
                time_difference=datetime.timedelta(hours=72),
            )
        
        
        s1_dates_dict = [x for x in image_request.get_dates()]
        s1_dates = extract_dates(s1_dates_dict, year)
        dates_to_download = identify_dates_to_download(s1_dates)
        
        steps_to_download = [i for i, val in enumerate(s1_dates) if val in dates_to_download]
        print(f"The following steps will be downloaded: {steps_to_download}, for {dates_to_download}")
        
        
        data_filter = steps_to_download
        if len(image_request.download_list) <= 3 or len(steps_to_download) <= 3:
            return np.empty((0,)), np.empty((0,))
        s1 = image_request.get_data(data_filter = data_filter)
        s1 = np.stack(s1)
        s1 = to_float32(s1)
        
        assert np.max(s1) <= 1.
        assert s1.shape[1] == 16.
        assert s1.shape[2] == 16.
        
        print(f"Sentinel 1 used {(2/3)*s1.shape[0] * (s1.shape[1]*s1.shape[2])/(512*512)} PU for"
              f" {s1.shape[0]} out of {len(image_request.download_list)} images")
        
        original = s1.shape
        #s1 = s1.repeat(3, axis = 0)
        # Store it with nearest upsample, but this will be converted to bilinear at train time
        s1 = resize(s1, (s1.shape[0], 32, 32, 2), 0)
        new = s1.shape
        print(f"{original} -> {new}")
        
        image_dates = []
        for date in image_request.get_dates():
            if date.year == year - 1:
                image_dates.append(-365 + starting_days[(date.month-1)] + date.day)
            if date.year == year:
                image_dates.append(starting_days[(date.month-1)] + date.day)
            if date.year == year + 1:
                image_dates.append(365 + starting_days[(date.month-1)]+date.day)
        image_dates = [val for idx, val in enumerate(image_dates) if idx in data_filter]
        image_dates = np.array(image_dates)
        
        s1c = np.copy(s1)
        s1c[np.where(s1c < 1.)] = 0
        n_pix_oob = np.sum(s1c, axis = (1, 2, 3))
        to_remove = np.argwhere(n_pix_oob > (32*32*2)/10)
        if len(to_remove) > 0:
            print(f'A total of {len(to_remove)} steps of {s1.shape[0]} were removed.')
            s1 = np.delete(s1, to_remove, 0)
            image_dates = np.delete(image_dates, to_remove)

        return s1, image_dates

    except Exception as e:
        logging.fatal(e, exc_info=True)

# Download function

In [11]:
def download_plots(data_location: str, output_folder: str, image_format = MimeType.TIFF) -> None:
    """ Downloads sentinel-1 data for the plot IDs associated
        with an input CSV from a collect earth online survey
        
        Parameters:
         data_location (os.path)
         output_folder (os.path)
        
        Subcalls:
         calc_bbox, bounding_box
         download_sentinel_1,
         calculate_and_save_best_images
         
        Creates:
         output_folder/{plot_id}.npy
    
        Returns:
         None
    """
    df = pd.read_csv(data_location, encoding = "ISO-8859-1")
    print(df.shape)
    df.columns = [x.upper() for x in df.columns]
    for column in ['IMAGERY_TITLE', 'STACKINGPROFILEDG', 'PL_PLOTID', 'IMAGERYYEARDG',
                  'IMAGERYMONTHPLANET', 'IMAGERYYEARPLANET', 'IMAGERYDATESECUREWATCH',
                  'IMAGERYENDDATESECUREWATCH', 'IMAGERYFEATUREPROFILESECUREWATCH',
                  'IMAGERYSTARTDATESECUREWATCH',
                  'IMAGERY_ATTRIBUTIONS',
                  'SAMPLE_GEOM']:
        if column in df.columns:
            df = df.drop(column, axis = 1)
    print(df.shape)
    df = df.rename(columns={df.columns[0]: 'PLOT_ID'})
    print(df.shape)
    df = df.dropna(axis = 0)
    print(df.shape)
    df.PLOT_ID = df.PLOT_ID.astype(int)
    #df = df[df['LAT'] > -24]
    #df = df[df['LAT'] < 24]
    plot_ids = sorted(df['PLOT_ID'].unique())
    existing = [int(x[:-4]) for x in os.listdir(output_folder) if ".DS" not in x]
    to_download = [x for x in plot_ids if x not in existing]
    existing = [x for x in plot_ids if x in existing]
    print(existing)
    print(f"Starting download of {len(to_download)}"
          f" plots from {data_location} to {output_folder}")
    errors = []

    for i, val in enumerate(to_download):
        print(f"Downloading {i+1}/{len(to_download)}, {val}")
        location_wgs = calc_bbox(val, df = df)
        print(location_wgs)
        location, epsg = bounding_box(location_wgs, expansion = IMSIZE*10)
        try:
            # Identify cloud steps, download DEM, and download L2A series
            s1_layer = identify_s1_layer(location_wgs[0])
            data_source = DataSource.SENTINEL1_IW_DES if s1_layer == "SENT_DESC" else DataSource.SENTINEL1_IW_ASC
            s1, s1_dates = download_sentinel_1(location, 
                                               layer = s1_layer, 
                                               epsg = epsg,
                                               data = data_source)
            if s1.shape[0] < 2:
                s1_layer = "SENT_DESC" if s1_layer == "SENT" else "SENT"
                data_source = DataSource.SENTINEL1_IW_DES if s1_layer == "SENT_DESC" else DataSource.SENTINEL1_IW_ASC
                print(f'Switching to {s1_layer}')
                s1, s1_dates = download_sentinel_1(location, 
                                                   layer = s1_layer,
                                                   epsg = epsg,
                                                   data = data_source)
            
            s1_a = np.copy(s1)
            print(s1.shape, len(s1_dates))
            s1, max_distance = calculate_and_save_best_images(s1, s1_dates)
            print(s1.shape)

            s1_b = np.copy(s1)
            # Retain only iamgery every month
            monthly = np.empty((12, IMSIZE, IMSIZE, 2))
            index = 0
            for start, end in zip(range(0, 24 + 2, 24 // 12), #0, 72, 6
                                  range(24 // 12, 24 + 2, 24 // 12)): # 6, 72, 6
                monthly[index] = np.median(s1[start:end], axis = 0)
                index += 1

            s1 = monthly
            s1_c = np.copy(s1)
            print(s1.shape)
            
            assert s1.shape[1] == IMSIZE
            assert s1.shape[2] == IMSIZE
            if max_distance < 200:
                hkl.dump(s1, output_folder + str(val) + ".hkl", mode='w', compression='gzip')
                print('\n')
            else:
                print(f"Skipping {val} because max distance is {max_distance}")
            
        except Exception as e:
            print(e)
            logging.fatal(e, exc_info=True)
            errors.append(i)

In [13]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

def to_float32(array: np.array) -> np.array:
    """Converts an int_x array to float32"""
    print(f'The original max value is {np.max(array)}')
    if not isinstance(array.flat[0], np.floating):
        assert np.max(array) > 1
        array = np.float32(array) / 65535.
    assert np.max(array) <= 1
    return array

for i in (os.listdir("../data/train-csv/")):
    if "elephantgrass-sample-data-2020" in i:
        print(i)
        download_plots("../data/test-csv/" + i, 
                       "../data/test-s1/",
                       image_format = MimeType.TIFF)

ceo-elephantgrass-sample-data-2020.csv
(5684, 12)
(5684, 9)
(5684, 9)
(5684, 9)
[202860, 202861, 202862, 202863, 202864, 202865, 202866, 202867, 202868, 202869, 202870, 202871, 202872, 202873, 202874, 202875, 202876, 202877, 202878, 202879, 202880, 202881, 202882, 202883, 202884, 202885, 202886, 202887, 202888]
Starting download of 0 plots from ../data/train-csv/ceo-elephantgrass-sample-data-2020.csv to ../data/train-s1/
