# DOTA training set @JEODPP

- This notebook demonstrates how to access the **DOTA** collection stored on the JEODPP EOS storage system.
- More specifically, it contains different ways of retrieving the **DOTA** images (input images and masks containing the class labels) which can be used as training data.

**For more information:** 

- @GitLab: https://jeodpp.jrc.ec.europa.eu/apps/gitlab/jeodpp-services/training-sets-for-earth-observation-applications/-/wikis/home
- @Connected: https://connected.cnect.cec.eu.int/groups/bigdataeoss 
- @Internet: https://jeodpp.jrc.ec.europa.eu/home/

**Contacts:**  jrc-jeodpp@ec.europa.eu

**Source data:** https://captain-whu.github.io/DOTA/dataset.html

<img src="https://cidportal.jrc.ec.europa.eu/services/shared/html/JRClogo2.png" width="200" height="200" /> <img src="https://cidportal.jrc.ec.europa.eu/services/shared/html/JRCBigDataPlatform_512.png" width="200" height="200" /> 

In [None]:
import numpy as np
import os, fnmatch, urllib.request
import pandas as pd
import json 
import matplotlib.pyplot as plt

In [None]:
# Please, download the scripts Query.py and gdalRead.py into the working directory
from Query import Query
from gdalRead import gdalRead

In [None]:
%%HTML
<style type="text/css">
/* Give every cell (td) and header cell (th) of tables carrying the
   "dataframe" class a solid black 1px border and black text.
   !important is used so these rules win over the front-end's own table styling. */
table.dataframe td, table.dataframe th {
    border: 1px  black solid !important;
  color: black !important;
}
</style>

In [None]:
# Base URL under which the general table (Table.json) and the per-collection folders are published
mainfolder = 'https://jeodpp.jrc.ec.europa.eu/ftp/public/MachineLearning/SatImNet'
# Collection handled by this notebook; used both as the sub-folder name below `mainfolder`
# and as the column selected from the general info table
collection = 'DOTA'

# Get info for the dataset

In [None]:
# Read general info for all the datasets.
# `mainfolder` is a URL, so its parts are joined with '/' explicitly: os.path.join
# uses the platform separator and would insert a backslash on Windows.
df = pd.read_json('/'.join([mainfolder, 'Table.json']))
# Reorder the columns so that 'Feature' comes first and the rest keep their order
cols = list(df.columns)
cols.remove('Feature')
df = df[['Feature']+cols]
df

In [None]:
# Read specific info for DOTA
pd.set_option('display.max_colwidth', 200)
# Index the table by 'Feature'. The explicit column check replaces a bare
# `except: pass`: the cell can still be re-run after 'Feature' has already become
# the index, but unrelated errors are no longer swallowed silently.
if 'Feature' in df.columns:
    df = df.set_index('Feature')
# Show only the column that describes the selected collection
df[[collection]]

# Read the structure of the DOTA dataset

In [None]:
# Download and parse the JSON description of the collection.
# The address is a URL, so it is assembled with '/' rather than os.path.join,
# which would produce backslashes on Windows.
content_url = '/'.join([mainfolder, collection, 'content_public.json'])
with urllib.request.urlopen(content_url) as f:
    content = json.loads(f.read().decode())

# Get class notation

In [None]:
# Class notation as published in the collection description; further below the value
# stored under a class name is compared against the pixel values of a mask
classes = content['classes']
classes

# Search for images according to some criteria

In [None]:
# Query(tree, criteria, mode) searches the collection tree (content['tree']) for the
# entries matching the given criteria.
# Use the string 'path' as 3rd argument in case you would like to retrieve the file paths only.
# Search for png files and two classes: 'plane', 'small-vehicle'
# NOTE(review): whether a file must contain both classes or any of them is decided
# inside Query.py - confirm there.
query = Query(content['tree'], 
               {'genre': 'png', 'class': ['plane', 'small-vehicle']}, 'path')
query

In [None]:
# Search for files having specific size in terms of rows and columns.
# No 3rd argument is passed here, so the result comes back in Query's default form;
# add the string 'path' as 3rd argument in case you would like to retrieve the file paths only.
# NOTE(review): the two-element lists are presumably [min, max] bounds - confirm in Query.py.
query = Query(content['tree'], 
               {'type': 'file', 'metainfo_columns': [2000, 5000], 'metainfo_rows': [2000, 4000]})
query

# Read the content of an image file

In [None]:
# Read a mask containing class labels.
# The path chains two GDAL virtual file systems: /vsicurl/ accesses the remote zip
# archive over HTTP and /vsizip/ opens the PNG stored inside that archive.
infile = '/vsizip//vsicurl/https://jeodpp.jrc.ec.europa.eu/ftp/public/MachineLearning/SatImNet/DOTA/train/labelTxt-v1.0/labelTxt.zip/P0119.png'
# gdalRead returns a pair: information about the file and its pixel data
# (the second item is passed to imshow further below)
InfoMask, Mask = gdalRead(infile)
InfoMask

In [None]:
# Read the respective image: search the collection tree for the entry that has the
# same file name as the mask and read the first match.
query = Query(content['tree'], {'name': os.path.basename(infile), 'class': []}, 'path')
if len(query) > 0:
    Info, Image = gdalRead(query[0])
    print(Info)
else:
    # Without a match `Image` is not (re)defined, so the plotting cells below would
    # fail or show an image left over from a previous run; say so explicitly.
    print('No image found for mask ' + os.path.basename(infile))

In [None]:
# Display the image (left panel) and its mask (right panel) side by side
fig, axarr = plt.subplots(1, 2, figsize=(15, 10))
for panel, raster in zip(axarr, (Image, Mask)):
    panel.axis('off')  # hide ticks and frame
    panel.imshow(raster)
plt.tight_layout(h_pad=0.1, w_pad=0.1)
plt.show()

In [None]:
# Overlay: draw the image first, then the mask on top of it with 70% opacity
overlay_ax = plt.figure(num=None, figsize=(20, 20)).gca()
overlay_ax.imshow(Image)
overlay_ax.imshow(Mask, alpha=0.7)
plt.show()

# Image slideshow

In [None]:
# Use the string 'path' as 3rd argument in case you would like to retrieve the file paths only.
# Search for png files and one class: 'swimming-pool'
query = Query(content['tree'], 
               {'genre': 'png', 'class': ['swimming-pool']}, 'path')

In [None]:
# Interactive slideshow over the query results: each page shows 5 rows, with the
# image on the left and the binary 'swimming-pool' mask on the right.
# At the prompt: press Enter for the next page, type x to exit, or type p to run
# through all the remaining pages without being asked again.
from IPython.display import display, clear_output
f, axarr = plt.subplots(5, 2)
f.set_size_inches(20, 18)
# The spacing does not depend on the page, so it is set once on the figure
f.subplots_adjust(hspace=0.05, wspace=0.001)
val = 0
# Pages start every 10 results and each page shows every 2nd result of its window
# (idx, idx+2, ..., idx+8), as in the original stepping.
for idx in np.arange(0,len(query),10):
    for q in range(5):
        # Reset both panels of the row so that images do not pile up on the same
        # axes from page to page
        for ax in axarr[q]:
            ax.clear()
            ax.axis('off')
        k = idx + 2*q
        if k >= len(query):
            # The last page may be incomplete: leave the remaining rows empty
            # instead of indexing past the end of the results
            continue
        _, I = gdalRead(query[k])
        queryI = Query(content['tree'], {'name': os.path.basename(query[k]), 'class': []}, 'path_v')
        if len(queryI) > 0:
            # Show the image only when a file with the same name was found
            _, In = gdalRead(queryI[0])
            axarr[q,0].imshow(In)
        # Boolean mask: True where the pixel carries the 'swimming-pool' label
        axarr[q,1].imshow(I==classes['swimming-pool'])
    display(f)
    if val != 'p':
        val = input("Press Enter to continue...")
        if val == 'x':
            clear_output(wait = True)
            break
    clear_output(wait = True)

> **In order to read many images via the _vsicurl_ driver, please use the command _gdal.VSICurlClearCache()_ after every _gdalRead_ command.**