# BDA training set @JEODPP

 - This notebook demonstrates how to access the **BDA** collection through the JEODPP public FTP area: https://jeodpp.jrc.ec.europa.eu/ftp/public/MachineLearning/SatImNet/.
 - It shows different ways of retrieving the **BDA** masks, which can be used as training data for semantic segmentation.

**For more information:** 

- @GitLab: https://jeodpp.jrc.ec.europa.eu/apps/gitlab/jeodpp-services/training-sets-for-earth-observation-applications/-/wikis/home
- @Connected: https://connected.cnect.cec.eu.int/groups/bigdataeoss 
- @Internet: https://jeodpp.jrc.ec.europa.eu/home/

**Contacts:**  jrc-jeodpp@ec.europa.eu

**Source data:** https://www.kaggle.com/c/airbus-ship-detection/overview

<img src="https://cidportal.jrc.ec.europa.eu/services/shared/html/JRClogo2.png" width="200" height="200" /> <img src="https://cidportal.jrc.ec.europa.eu/services/shared/html/JRCBigDataPlatform_512.png" width="200" height="200" /> 

In [None]:
import numpy as np
import os, fnmatch, urllib.request
import pandas as pd
import json 
import matplotlib.pyplot as plt

In [None]:
# Please, download the scripts Query.py and gdalRead.py into the working directory
from Query import Query
from gdalRead import gdalRead

In [None]:
%%HTML
<style type="text/css">
/* Draw a solid black 1px border around every cell of rendered DataFrame tables
   and force black text, overriding the notebook theme. */
table.dataframe td,
table.dataframe th {
    border: 1px black solid !important;
    color: black !important;
}
</style>

In [None]:
# Root URL of the SatImNet public repository; the cells below build file locations relative to it
mainfolder = 'https://jeodpp.jrc.ec.europa.eu/ftp/public/MachineLearning/SatImNet'
# Name of the collection explored in this notebook (used below as a URL sub-path and as a table column key)
collection = 'BDA'

# Get info for the dataset

In [None]:
# Read general info for all the datasets.
# The location is a URL, so its parts are joined with '/' explicitly:
# os.path.join would insert the platform separator (a backslash on Windows)
# and produce an invalid address.
df = pd.read_json('/'.join((mainfolder.rstrip('/'), 'Table.json')))
# Show the 'Feature' column first, keeping the other columns in their original order
cols = [c for c in df.columns if c != 'Feature']
df = df[['Feature'] + cols]
df

In [None]:
# Read specific info for BDA
pd.set_option('display.max_colwidth', 200)
# Index the table by 'Feature' so the collection column reads as "feature -> value".
# After the first run 'Feature' is already the index and no longer a column, so the
# explicit check keeps the cell re-runnable without hiding unrelated errors
# behind a bare `except`.
if 'Feature' in df.columns:
    df = df.set_index('Feature')
df[[collection]]

# Read the structure of the BDA dataset

In [None]:
# Download and parse the JSON descriptor of the collection.
# URL parts are joined with '/' rather than os.path.join, which would use the
# platform path separator (a backslash on Windows) and break the address.
content_url = '/'.join((mainfolder.rstrip('/'), collection, 'content_public.json'))
with urllib.request.urlopen(content_url) as response:
    content = json.loads(response.read().decode())

# Get class notation

In [None]:
# Take the 'classes' entry of the collection descriptor loaded above and display it as the cell output
classes = content['classes']
classes

# Search for images according to some criteria

In [None]:
# Search the collection tree for files with a specific number of bands (here: 1).
# Passing the string 'path' as a third argument to Query would retrieve the file paths only.
single_band_files = {'type': 'file', 'metainfo_numofbands': 1}
query = Query(content['tree'], single_band_files)
query

# Read the content of an image file

In [None]:
# Read a mask containing class labels
# The path chains the /vsizip/ and /vsicurl/ prefixes so that a single .tif stored
# inside a remote zip archive is addressed directly by its URL.
# NOTE(review): the file name carries a 'BEN_S2A_..._B02' prefix/suffix although the
# collection is BDA and the comment speaks of a label mask - confirm this is the intended file.
infile = '/vsizip//vsicurl/https://jeodpp.jrc.ec.europa.eu/ftp/public/MachineLearning/SatImNet/BDA/train/Urban.zip/BEN_S2A_MSIL2A_20170613T101031_43_85_B02.tif'
# presumably gdalRead returns (metadata, pixel array) - TODO confirm in gdalRead.py
InfoMask, Mask = gdalRead(infile)
InfoMask

In [None]:
# Render the mask read above on a 10 x 10 inch figure
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(Mask)
plt.show()

# Image slideshow

In [None]:
# Search the collection tree for files with a specific number of bands (here: 1).
# The string 'path' as third argument asks Query to retrieve the file paths only.
single_band_files = {'type': 'file', 'metainfo_numofbands': 1}
query = Query(content['tree'], single_band_files, 'path')

In [None]:
# Interactive slideshow over the file paths in `query`.
# At every prompt:
#   Enter -> show the next group of images
#   p     -> stop prompting and play through to the end
#   x     -> exit
from IPython.display import display, clear_output

N_PANELS = 5   # images shown side by side at each step
STRIDE = 10    # number of `query` entries to advance between steps

f, axarr = plt.subplots(1, N_PANELS)
f.set_size_inches(20, 5)
# The layout is the same for every step, so it is set once outside the loop
plt.subplots_adjust(hspace=0.05, wspace=0.05)

val = ''
for idx in range(0, len(query), STRIDE):
    for q in range(N_PANELS):
        # Drop the image drawn in the previous step; otherwise images pile up
        # on the same axes and memory grows with every step
        axarr[q].clear()
        axarr[q].axis('off')
        # Fewer than N_PANELS entries may remain at the end of `query`:
        # leave the surplus panels blank instead of raising IndexError
        if idx + q < len(query):
            _info, image = gdalRead(query[idx + q])
            axarr[q].imshow(image, cmap='gray')
    display(f)
    if val != 'p':
        val = input("Press Enter to continue...")
        if val == 'x':
            clear_output(wait=True)
            break
    clear_output(wait=True)

> **In order to read many images via the _vsicurl_ driver, please use the command _gdal.VSICurlClearCache()_ after every _gdalRead_ command.**