#### Purpose: Add Sentinel coverage information to the daily ground-truth image dataset so that capture metrics can be based on an overlap between flood extent and contemporary ground coverage

Creates and saves the dataframe "flooddays_with_sentinel_df"

Method:

1. Start with the dataframe of daily flood images (used as ground truth)
2. merge in the metadata from the Global Flood Database
3. merge in the DFO metadata from json files accompanying each GFD tif image
4. merge in the WGS84 coordinates of the ground truth image frame --> this used as a bounding box when requesting contemporary Sentinel-1 data from the hub API
5. merge in the Sentinel orbit information (ground coverage) obtained from API calls to the Sentinel Hub

In [1]:
import os
import pickle
import json
import rasterio
import pandas as pd
import requests
from oauthlib.oauth2 import BackendApplicationClient
from requests_oauthlib import OAuth2Session
from dotenv import load_dotenv
load_dotenv()  


gfd_root = "STEP 1 - Data Acquisition/Global Flood Database/"
tif_root = "STEP 1 - Data Acquisition/Global Flood Database/TIF/unzipped/"

#### 1. Start with the dataframe of daily flood images (used as ground truth)

In [2]:
# dataframe of daily flood images
with open('flooddays_df.pkl', 'rb') as f:
    flooddays_df = pickle.load(f) 
f.close()

flooddays_df.head()

Unnamed: 0_level_0,DFO_id,flood_day,tif_filename,flood_year,flood_start,reported_duration,observed_total_duration,snapshot_date,snapshot_extent_img,snapshot_extent_km2
DFO_day_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
DFO_4459_0,DFO_4459,0,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-03-30,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",8721.8125
DFO_4459_1,DFO_4459,1,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-03-31,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",8166.0625
DFO_4459_2,DFO_4459,2,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-04-01,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",7618.375
DFO_4459_3,DFO_4459,3,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-04-02,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",7086.9375
DFO_4459_4,DFO_4459,4,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-04-03,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",6669.75


#### 2. Merge in the metadata from the Global Flood Database

First pick up the metadata taken directly from Global Floods Database website...

In [3]:
with open(gfd_root+'meta_floods_df.pkl', 'rb') as f:
    meta_floods_df = pickle.load(f) 
f.close()

def to_int(s):
    # text data represents integers as strings with spaces as thousand separators
    return int(s.replace(" ",""))

meta_floods_df['killed'] = meta_floods_df['killed'].apply(lambda x : to_int(x))

meta_floods_df.head()

Unnamed: 0_level_0,cause,displaced_k,duration_days,exposed_mn,flood,killed,start_date,end_date
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
146,Heavy rain,24000000,33,7481712,18/09/2000,1468,2000-09-18,2000-10-21
145,Heavy rain,2000000,33,208434,08/07/2001,100,2001-07-08,2001-08-10
144,Heavy rain,300000,22,1701849,20/08/2001,65,2001-08-20,2001-09-11
143,Heavy rain,30000,17,250633,07/09/2001,146,2001-09-07,2001-09-24
142,Heavy rain,15000000,68,7191176,21/06/2002,503,2002-06-21,2002-08-28


In [4]:
for col in meta_floods_df.columns:
    print(col,type(meta_floods_df[col].iloc[0]))

cause <class 'str'>
displaced_k <class 'numpy.int64'>
duration_days <class 'numpy.int64'>
exposed_mn <class 'numpy.int64'>
flood <class 'str'>
killed <class 'numpy.int64'>
start_date <class 'pandas._libs.tslibs.timestamps.Timestamp'>
end_date <class 'pandas._libs.tslibs.timestamps.Timestamp'>


... then join it.

In [5]:
print("shape before :",flooddays_df.shape)
flooddays_df=flooddays_df.reset_index().merge(
                                        meta_floods_df.drop(columns="flood"), 
                                        left_on=["flood_start","reported_duration"], 
                                        right_on=["start_date","duration_days"]
                                        ).set_index('DFO_day_id')
print("shape after :",flooddays_df.shape)
flooddays_df.head()


shape before : (155, 10)
shape after : (188, 17)


Unnamed: 0_level_0,DFO_id,flood_day,tif_filename,flood_year,flood_start,reported_duration,observed_total_duration,snapshot_date,snapshot_extent_img,snapshot_extent_km2,cause,displaced_k,duration_days,exposed_mn,killed,start_date,end_date
DFO_day_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
DFO_4459_0,DFO_4459,0,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-03-30,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",8721.8125,Heavy rain,0,19,2681462,0,2017-03-30,2017-04-18
DFO_4459_1,DFO_4459,1,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-03-31,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",8166.0625,Heavy rain,0,19,2681462,0,2017-03-30,2017-04-18
DFO_4459_2,DFO_4459,2,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-04-01,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",7618.375,Heavy rain,0,19,2681462,0,2017-03-30,2017-04-18
DFO_4459_3,DFO_4459,3,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-04-02,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",7086.9375,Heavy rain,0,19,2681462,0,2017-03-30,2017-04-18
DFO_4459_4,DFO_4459,4,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-04-03,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",6669.75,Heavy rain,0,19,2681462,0,2017-03-30,2017-04-18


#### 3. Add the DFO data from the json files given with each tif image

First create the complimentary dataset...

In [6]:
#Create a dataframe of metadata with the same index as the flooddays dataframe
DFO_day_id = []
dfo_main_cause = []
dfo_severity = []
dfo_displaced = []
dfo_dead = []

for index, row in flooddays_df.iterrows():

    dfo_id = row['tif_filename'][0:8]
    json_filename = tif_root+dfo_id+'_properties.json'
    with open(json_filename) as f:
        img_meta = json.load(f)
    f.close()
    DFO_day_id.append(index)
    dfo_main_cause.append(img_meta['dfo_main_cause'])
    dfo_severity.append(img_meta['dfo_severity'])
    dfo_displaced.append(img_meta['dfo_displaced'])
    dfo_dead.append(img_meta['dfo_dead'])
                    
dfo_df = pd.DataFrame({'DFO_day_id' :DFO_day_id,
                #'dfo_main_cause' :dfo_main_cause,  # same info as in metadata above (field = "cause")
                #'dfo_displaced':dfo_displaced,     # same info as in metadata above (field = "displaced_k")
                #'dfo_dead':dfo_dead,               # same info as in metadata above (field = "killed")
                'dfo_severity':dfo_severity}).set_index('DFO_day_id')

dfo_df.head()

Unnamed: 0_level_0,dfo_severity
DFO_day_id,Unnamed: 1_level_1
DFO_4459_0,1.0
DFO_4459_1,1.0
DFO_4459_2,1.0
DFO_4459_3,1.0
DFO_4459_4,1.0


... then join it

In [7]:
print("shape before :",flooddays_df.shape)
#simple join because both dataframes have the unique DFO_day_id as index
flooddays_df=flooddays_df.join(dfo_df)
print("shape after :",flooddays_df.shape)
flooddays_df.head()

shape before : (188, 17)
shape after : (254, 18)


Unnamed: 0_level_0,DFO_id,flood_day,tif_filename,flood_year,flood_start,reported_duration,observed_total_duration,snapshot_date,snapshot_extent_img,snapshot_extent_km2,cause,displaced_k,duration_days,exposed_mn,killed,start_date,end_date,dfo_severity
DFO_day_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
DFO_4459_0,DFO_4459,0,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-03-30,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",8721.8125,Heavy rain,0,19,2681462,0,2017-03-30,2017-04-18,1.0
DFO_4459_1,DFO_4459,1,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-03-31,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",8166.0625,Heavy rain,0,19,2681462,0,2017-03-30,2017-04-18,1.0
DFO_4459_2,DFO_4459,2,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-04-01,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",7618.375,Heavy rain,0,19,2681462,0,2017-03-30,2017-04-18,1.0
DFO_4459_3,DFO_4459,3,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-04-02,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",7086.9375,Heavy rain,0,19,2681462,0,2017-03-30,2017-04-18,1.0
DFO_4459_4,DFO_4459,4,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-04-03,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",6669.75,Heavy rain,0,19,2681462,0,2017-03-30,2017-04-18,1.0


#### 4. merge in the WGS84 coordinates of the ground truth image frame (to be used as a bounding box when acquiring contemporary Sentinel-1 data)

First create a dataframe of bounding box coordinates per ground truth image, using transformation parameters found in the meratata

In [8]:
DFO_day_id = []
wsg84_bbox = []

for index, row in flooddays_df.iterrows():

    # get the image
    img_filename = tif_root+row['tif_filename']
    raster=rasterio.open(img_filename)
    spatial_transform = raster.meta['transform']

    # test using pixel coordinates of the centre of the image (to allow validation against the centroid info in the metadata json above)
    pixel_x0 = 0
    pixel_y0 = 0
    pixel_xmax = raster.meta['width']
    pixel_ymax = raster.meta['height']

    (wgs_x0, wgs_y0) = rasterio.transform.xy(spatial_transform,pixel_x0,pixel_y0)
    (wgs_xmax, wgs_ymax) = rasterio.transform.xy(spatial_transform,pixel_xmax,pixel_ymax)

    DFO_day_id.append(index)
    wsg84_bbox.append([wgs_x0, wgs_y0,wgs_xmax, wgs_ymax])

bbox_df = pd.DataFrame({'DFO_day_id' :DFO_day_id,'wsg84_bbox':wsg84_bbox}).set_index('DFO_day_id')

bbox_df.head()

Unnamed: 0_level_0,wsg84_bbox
DFO_day_id,Unnamed: 1_level_1
DFO_4459_0,"[87.98661339719169, 28.59898996405012, 93.7493..."
DFO_4459_1,"[87.98661339719169, 28.59898996405012, 93.7493..."
DFO_4459_2,"[87.98661339719169, 28.59898996405012, 93.7493..."
DFO_4459_3,"[87.98661339719169, 28.59898996405012, 93.7493..."
DFO_4459_4,"[87.98661339719169, 28.59898996405012, 93.7493..."


... then join it

In [9]:
print("shape before :",flooddays_df.shape)
#simple join because both dataframes have the unique DFO_day_id as index
flooddays_df=flooddays_df.join(bbox_df)
print("shape after :",flooddays_df.shape)
flooddays_df.head()

shape before : (254, 18)
shape after : (650, 19)


Unnamed: 0_level_0,DFO_id,flood_day,tif_filename,flood_year,flood_start,reported_duration,observed_total_duration,snapshot_date,snapshot_extent_img,snapshot_extent_km2,cause,displaced_k,duration_days,exposed_mn,killed,start_date,end_date,dfo_severity,wsg84_bbox
DFO_day_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
DFO_4459_0,DFO_4459,0,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-03-30,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",8721.8125,Heavy rain,0,19,2681462,0,2017-03-30,2017-04-18,1.0,"[87.98661339719169, 28.59898996405012, 93.7493..."
DFO_4459_1,DFO_4459,1,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-03-31,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",8166.0625,Heavy rain,0,19,2681462,0,2017-03-30,2017-04-18,1.0,"[87.98661339719169, 28.59898996405012, 93.7493..."
DFO_4459_2,DFO_4459,2,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-04-01,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",7618.375,Heavy rain,0,19,2681462,0,2017-03-30,2017-04-18,1.0,"[87.98661339719169, 28.59898996405012, 93.7493..."
DFO_4459_3,DFO_4459,3,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-04-02,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",7086.9375,Heavy rain,0,19,2681462,0,2017-03-30,2017-04-18,1.0,"[87.98661339719169, 28.59898996405012, 93.7493..."
DFO_4459_4,DFO_4459,4,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-04-03,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",6669.75,Heavy rain,0,19,2681462,0,2017-03-30,2017-04-18,1.0,"[87.98661339719169, 28.59898996405012, 93.7493..."


#### 5. Merge in the Sentinel orbit information (ground coverage) obtained from API calls to the Sentinel Hub

First set up the endpoint details and authentication

In [10]:
# client credentials valid for 90 days from 07/07
client_id = os.getenv('COPERNICUS_CLIENT_ID')
client_secret = os.getenv('COPERNICUS_CLIENT_SECRET')

# Create a session
client = BackendApplicationClient(client_id=client_id)
oauth = OAuth2Session(client=client)

token = oauth.fetch_token(token_url='https://identity.dataspace.copernicus.eu/auth/realms/CDSE/protocol/openid-connect/token',
                          client_secret=client_secret, include_client_id=True)
url = "https://sh.dataspace.copernicus.eu/api/v1/catalog/1.0.0/search"
headers = {"Authorization": "Bearer "+token['access_token']}

... then loop through all GFD images and fetch any Sentinel data available from the same time and place, defined by snapshot day and GFD bounding box

In [11]:
DFO_day_id = []
sentinel_coverage = []

for index, row in flooddays_df.iterrows():

    date_start_required_string_format  = str(row['snapshot_date']).replace(" ","T")+"Z"
    date_end_required_string_format = date_start_required_string_format.replace("00:00:00","23:59:59")
    data = {
        "bbox": row['wsg84_bbox'],
        "datetime": date_start_required_string_format+"/"+date_end_required_string_format,
        "collections": ["sentinel-1-grd"],
        "limit": 5,
        "next": 5,
    }

    response = requests.post(url, json=data, headers =headers)

    # one or more bounding box available for each request 
    dict_of_sentinel_coverages = {}
    for i in range(len(response.json()['features'])):
        dict_of_sentinel_coverages[i] = response.json()['features'][i]['bbox']

    DFO_day_id.append(index)
    sentinel_coverage.append(dict_of_sentinel_coverages)

sentinel_df = pd.DataFrame({'DFO_day_id' :DFO_day_id,'sentinel_coverage':sentinel_coverage}).set_index('DFO_day_id')

sentinel_df.head()


Unnamed: 0_level_0,sentinel_coverage
DFO_day_id,Unnamed: 1_level_1
DFO_4459_0,{}
DFO_4459_1,"{0: [89.43702692331432, 20.720710517424283, 92..."
DFO_4459_2,{}
DFO_4459_3,"{0: [89.54372177242541, 28.04839799173297, 92...."
DFO_4459_4,{}


Sanity check that we do get some empty and some non-empty responses

In [12]:
sentinel_df['sentinel_coverage_Nboxes'] = sentinel_df['sentinel_coverage'].apply(lambda x : len(x))
sentinel_df.groupby('sentinel_coverage_Nboxes').size()

sentinel_coverage_Nboxes
0     97
1     28
2     36
3     22
4     45
5    422
dtype: int64

... then join this on to the master dataframe

In [13]:
print("shape before :",flooddays_df.shape)
#simple join because both dataframes have the unique DFO_day_id as index
flooddays_df=flooddays_df.join(sentinel_df)
print("shape after :",flooddays_df.shape)
flooddays_df.head()

shape before : (650, 19)
shape after : (8570, 21)


Unnamed: 0_level_0,DFO_id,flood_day,tif_filename,flood_year,flood_start,reported_duration,observed_total_duration,snapshot_date,snapshot_extent_img,snapshot_extent_km2,...,displaced_k,duration_days,exposed_mn,killed,start_date,end_date,dfo_severity,wsg84_bbox,sentinel_coverage,sentinel_coverage_Nboxes
DFO_day_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DFO_4459_0,DFO_4459,0,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-03-30,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",8721.8125,...,0,19,2681462,0,2017-03-30,2017-04-18,1.0,"[87.98661339719169, 28.59898996405012, 93.7493...",{},0
DFO_4459_1,DFO_4459,1,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-03-31,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",8166.0625,...,0,19,2681462,0,2017-03-30,2017-04-18,1.0,"[87.98661339719169, 28.59898996405012, 93.7493...","{0: [89.43702692331432, 20.720710517424283, 92...",1
DFO_4459_2,DFO_4459,2,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-04-01,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",7618.375,...,0,19,2681462,0,2017-03-30,2017-04-18,1.0,"[87.98661339719169, 28.59898996405012, 93.7493...",{},0
DFO_4459_3,DFO_4459,3,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-04-02,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",7086.9375,...,0,19,2681462,0,2017-03-30,2017-04-18,1.0,"[87.98661339719169, 28.59898996405012, 93.7493...","{0: [89.54372177242541, 28.04839799173297, 92....",5
DFO_4459_4,DFO_4459,4,DFO_4459_From_20170330_to_20170418.tif,2017,2017-03-30,19,22,2017-04-03,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",6669.75,...,0,19,2681462,0,2017-03-30,2017-04-18,1.0,"[87.98661339719169, 28.59898996405012, 93.7493...",{},0


In [14]:
with open('flooddays_with_sentinel_df.pkl', 'wb') as f:  
    pickle.dump(flooddays_df, f) 

f.close()