# BigEarthNet-v1.0 training set @JEODPP

- This notebook demonstrates how to access the **BigEarthNet-v1.0** collection stored on the JEODPP EOS storage system.
- More specifically, it shows different ways of retrieving the **BigEarthNet-v1.0** images (input images and masks containing the class labels), which can be used as training data.

**For more information:** 

- @GitLab: https://jeodpp.jrc.ec.europa.eu/apps/gitlab/jeodpp-services/training-sets-for-earth-observation-applications/-/wikis/home
- @Connected: https://connected.cnect.cec.eu.int/groups/bigdataeoss 
- @Internet: https://jeodpp.jrc.ec.europa.eu/home/

**Contacts:**  jrc-jeodpp@ec.europa.eu

**Source data:** http://bigearth.net/

<img src="https://cidportal.jrc.ec.europa.eu/services/shared/html/JRClogo2.png" width="200" height="200" /> <img src="https://cidportal.jrc.ec.europa.eu/services/shared/html/JRCBigDataPlatform_512.png" width="200" height="200" /> 

In [None]:
import json
import os, fnmatch, urllib.request
import posixpath

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Download the script gdalRead.py into the working directory before running this cell
from gdalRead import gdalRead

In [None]:
%%HTML
<style type="text/css">
/* Draw a solid black 1px border around every DataFrame cell and force black text. */
table.dataframe td,
table.dataframe th {
    border: 1px solid black !important;
    color: black !important;
}
</style>

In [None]:
# Base URL under which the index files are published, and the name of the
# collection (a sub-folder of that URL) explored in this notebook.
mainfolder = "https://jeodpp.jrc.ec.europa.eu/ftp/public/MachineLearning/SatImNet"
collection = "BigEarthNet-v1.0"

In [None]:
def Query(tree, condition, flagpath=''):
    """Collect the file records of an index tree that satisfy ``condition``.

    The tree is walked depth-first. A dict whose ``'type'`` is ``'file'`` is a
    record to test; any other dict is a container whose ``'branch'``,
    ``'zipfile'`` and ``'leaf'`` entries are visited in that order. A list
    whose first element contains ``'.txt'`` is treated as a reference to a
    JSON index stored under ``mainfolder/collection`` (module-level names);
    that index is downloaded and walked in place of the list. Any other list
    is walked element by element.

    Parameters
    ----------
    tree : dict or list
        Index structure to search. An empty list yields no records.
    condition : dict
        Criteria that must all hold for a record to be kept:

        * ``'class'``: list of labels that must all be present in the
          record's ``'class'`` list. An empty list keeps only records that
          have no labels.
        * ``'size'``: ``[low, high]`` (list or tuple) inclusive range, or a
          single value compared for equality.
        * ``'metainfo_<field>'``: for ``numofbands``, ``rows`` and
          ``columns`` an inclusive ``[low, high]`` range is accepted; any
          other value is compared for equality with
          ``record['metainfo'][<field>]``.
        * ``'path'`` (when given as ``str``) and ``'name'``: substring that
          must occur in the record's value.
        * any other key: compared for equality with ``record[key]``.
    flagpath : str, optional
        If it contains ``'path'`` only each record's ``'path'`` is returned,
        otherwise the whole record dict. The exact value ``'path_v'``
        additionally suppresses the summary line.

    Returns
    -------
    list
        Matching records (or their paths) in traversal order.
    """
    child_keys = ['branch', 'zipfile', 'leaf']
    range_fields = ('numofbands', 'rows', 'columns')
    paths_only = 'path' in flagpath
    found = []

    def is_range(value):
        # A two-element list/tuple is interpreted as an inclusive [low, high] range.
        return isinstance(value, (list, tuple)) and len(value) == 2

    def matches(leaf):
        # True when the file record satisfies every criterion in `condition`.
        for key, wanted in condition.items():
            if key == 'class':
                labels = leaf[key]
                if len(labels) > 0 and len(wanted) == 0:
                    return False
                # Subset test rather than a length comparison, so a label
                # repeated in the condition does not defeat the match.
                if not set(wanted).issubset(labels):
                    return False
            elif key == 'size' and is_range(wanted):
                if leaf[key] < wanted[0] or leaf[key] > wanted[1]:
                    return False
            elif 'metainfo' in key:
                field = key.split('_')[1]
                if field in range_fields and is_range(wanted):
                    try:
                        value = leaf['metainfo'][field]
                        outside = value < wanted[0] or value > wanted[1]
                    except (KeyError, TypeError):
                        # Best effort: a record whose metadata is missing or
                        # not comparable is reported and not rejected.
                        # .get() keeps the report itself from raising.
                        print(leaf.get('metainfo'), child_keys, leaf.get('name'))
                        outside = False
                    if outside:
                        return False
                elif wanted != leaf['metainfo'][field]:
                    return False
            elif key == 'path' and isinstance(wanted, str):
                if wanted not in leaf[key]:
                    return False
            elif key == 'name':
                if wanted not in leaf[key]:
                    return False
            elif wanted != leaf[key]:
                return False
        return True

    def walk(node):
        if isinstance(node, dict):
            if node['type'] == 'file':
                if matches(node):
                    found.append(node['path'] if paths_only else node)
                return
            for key in child_keys:
                if len(node[key]) > 0:
                    walk(node[key])
        elif isinstance(node, list) and len(node) > 0:
            # The emptiness check comes first so node[0] is never read on [].
            if '.txt' in node[0]:
                # Only the first element is followed when the list references
                # an external index.
                # posixpath keeps '/' separators on every platform; os.path
                # would insert backslashes into the URL on Windows.
                url = posixpath.join(mainfolder, collection, node[0])
                with urllib.request.urlopen(url) as fidx:
                    sub_index = json.loads(fidx.read().decode())
                # Recurse after the response is closed so the connection is
                # not held open during the traversal.
                walk(sub_index)
            else:
                for item in node:
                    walk(item)

    walk(tree)
    if flagpath != 'path_v':
        print('Found %d record%s.' % (len(found), '' if len(found)==1 else 's'))
    return found

# Get info for the dataset

In [None]:
# Read general info for all the datasets.
# posixpath.join keeps '/' separators in the URL on every platform
# (os.path.join would insert a backslash on Windows).
df = pd.read_json(posixpath.join(mainfolder, 'Table.json'))
# Show the 'Feature' column first, keeping the others in their original order.
cols = [name for name in df.columns if name != 'Feature']
df = df[['Feature'] + cols]
df

In [None]:
# Read specific info for BigEarthNet-v1.0
pd.set_option('display.max_colwidth', 200)
# Index the table by 'Feature' only while it is still a regular column, so
# re-running this cell is a no-op and unrelated errors are not silenced.
if 'Feature' in df.columns:
    df = df.set_index('Feature')
df[[collection]]

# Read the structure of the BigEarthNet-v1.0 dataset

In [None]:
# Download and parse the JSON index of the collection. posixpath.join keeps
# '/' separators in the URL on every platform (os.path.join would not on Windows).
index_url = posixpath.join(mainfolder, collection, 'content_public.json')
with urllib.request.urlopen(index_url) as f:
    content = json.loads(f.read().decode())

# Get class notation

In [None]:
# The 'classes' entry of the downloaded index holds the class notation.
classes = content['classes']
classes

# Search for images according to some criteria

In [None]:
# Pass the string 'path' as the 3rd argument (flagpath) to retrieve only the
# file paths instead of the full records.
# Example: search for the class 'Coastal lagoons'.
query = Query(
    tree=content['tree'],
    condition={'class': ['Coastal lagoons']},
    flagpath='path',
)
query

# Read the content of an image file

In [None]:
# GDAL virtual-file-system path: /vsicurl/ reads the remote zip over HTTP and
# /vsizip/ addresses the .tif stored inside it. Adjacent literals are joined
# by Python into a single string.
infile = (
    '/vsizip//vsicurl/'
    'https://jeodpp.jrc.ec.europa.eu/ftp/public/MachineLearning/SatImNet/BigEarthNet-v1.0/'
    '2017/07/201707_10.zip/'
    'S2A_MSIL2A_20170704T112111_10_20/'
    'S2A_MSIL2A_20170704T112111_10_20_B04.tif'
)
# gdalRead returns a pair: the first item is displayed below (presumably the
# raster metadata - confirm in gdalRead.py), the second is the image data.
Info, I = gdalRead(infile)
Info

In [None]:
# Display the image returned by gdalRead on a single 6x6 inch axes, with the
# axis decorations hidden.
fig, axarr = plt.subplots(figsize=(6, 6))
axarr.imshow(I)
axarr.set_axis_off()
fig.tight_layout(h_pad=0.1, w_pad=0.1)
plt.show()

> **To read many images via the _vsicurl_ virtual file system, please call _gdal.VSICurlClearCache()_ after every _gdalRead_ call.**