# Metadata collection

In [1]:
import functools
import itertools
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests
import validators
import webdataset as wds
import xmltodict
from dotenv import load_dotenv
from tqdm import tqdm

In [2]:
# Read a .env file (python-dotenv) so that settings such as DATA_ROOT, which is
# looked up in the next section, can be supplied without exporting them manually.
load_dotenv()

True

## Load Dataset

In [3]:
# Base directory used for the shard locations and for every CSV export below.
# NOTE(review): this is None when DATA_ROOT is not defined, in which case the
# paths built from it start with "file:None" — confirm the variable is set.
root = os.environ.get('DATA_ROOT')

In [4]:
# Shard locations for the three splits. The `{a..b}` range is kept in a plain
# string so the f-string does not try to parse it as a replacement field
# (presumably WebDataset expands it into the individual shard names).
train_loc = f'file:{root}/dataset/poc/aerpoc-train-' + '{000000..000022}.tar'
val_loc = f'file:{root}/dataset/poc/aerpoc-val-' + '{000000..000007}.tar'
test_loc = f'file:{root}/dataset/poc/aerpoc-test-' + '{000000..000007}.tar'

In [5]:
def _metadata_stream(shards: str):
    """Build a pipeline that yields only the decoded JSON metadata of each sample.

    Args:
        shards: WebDataset shard location (may contain a brace range).

    Returns:
        An iterable WebDataset pipeline whose items are the parsed ``json``
        entries of the samples, read in shard order (``shardshuffle=False``).
    """
    return (wds.WebDataset(shards, shardshuffle=False)
        # No image handler: the JPEG payload is discarded by the final map, so
        # decoding it to PIL would only cost time. The default handlers still
        # parse the ``json`` entry.
        .decode()
        # Keep requiring both entries so samples without an image are still rejected.
        .to_tuple("jpg", "json")
        .map(lambda pair: pair[1]))


ds_train = _metadata_stream(train_loc)
ds_val = _metadata_stream(val_loc)
ds_test = _metadata_stream(test_loc)

In [6]:
# Peek at the first training sample to see the layout of the metadata
# (a FeatureCollection whose features carry the 'Link' used further down).
next(iter(ds_train))

{'type': 'FeatureCollection',
 'features': [{'id': '0',
   'type': 'Feature',
   'properties': {'state': 'California',
    'tile_id': 'R04C09',
    'block_r': 194,
    'block_c': 65,
    'index_right': 72,
    'Link': 'https://earthexplorer.usgs.gov/scene/metadata/full/5e83d8e4870f4473/ARA0016004A0640/',
    'Date': '1956/11/27',
    'Photo_ID': 'A0016004A0640'},
   'geometry': {'type': 'Polygon',
    'coordinates': [[[-120.03820629053243, 39.97114942129131],
      [-120.03820629053243, 39.978048482673344],
      [-120.04510535191447, 39.978048482673344],
      [-120.04510535191447, 39.97114942129131],
      [-120.03820629053243, 39.97114942129131]]]}},
  {'id': '1',
   'type': 'Feature',
   'properties': {'state': 'California',
    'tile_id': 'R04C09',
    'block_r': 194,
    'block_c': 65,
    'index_right': 55,
    'Link': 'https://earthexplorer.usgs.gov/scene/metadata/full/5e83d8e4870f4473/ARA010908610979/',
    'Date': '1954/09/05',
    'Photo_ID': 'A010908610979'},
   'geometry':

## Extract Metadata

In [21]:
def get_fields(xml: dict) -> dict:
    """Flatten the metadata fields of a parsed scene document.

    Args:
        xml: Scene document as returned by ``xmltodict.parse``; the fields are
            read from ``eemetadata:scene`` -> ``eemetadata:metadataFields`` ->
            ``eemetadata:metadataField``.

    Returns:
        Mapping of each field's ``@name`` to its text value, or ``None`` when
        the field carries no text.

    Raises:
        KeyError: If the expected element path or a field's ``@name`` is missing.
    """
    fields = xml['eemetadata:scene']['eemetadata:metadataFields']['eemetadata:metadataField']
    # xmltodict yields a bare dict instead of a list when an element occurs once.
    if isinstance(fields, dict):
        fields = [fields]

    flat = {}
    for field in fields:
        value = field.get('eemetadata:metadataValue')
        # An element with attributes parses to a dict holding '#text'; without
        # attributes it is a plain string, and an empty element is None.
        if isinstance(value, dict):
            value = value.get('#text')
        flat[field['@name']] = value
    return flat

@functools.lru_cache(maxsize=15_000)
def get_xml(link: str) -> dict:
    """Download the XML view of a scene metadata page and parse it.

    Results are memoised per link (up to 15,000 entries), so a link that shows
    up in several samples is only requested once. Failed calls raise and are
    therefore not cached.

    Args:
        link: URL of the scene metadata page.

    Returns:
        The response body parsed by ``xmltodict.parse``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If connecting or reading exceeds the timeout.
    """
    params = {'responseType': 'viewXml'}
    # requests applies no timeout by default; bound connect/read time so that a
    # stalled connection cannot block a worker thread indefinitely.
    res = requests.get(link, params=params, timeout=(10, 60))
    res.raise_for_status()
    return xmltodict.parse(res.content)

def get_links(meta: dict) -> list[str]:
    """Return the distinct 'Link' values of a sample's features.

    Links are returned in order of first appearance, so the result is the same
    on every run (a set would order them by hash, which varies between runs).

    Args:
        meta: Sample metadata with a ``features`` list whose items hold
            ``properties['Link']``.

    Returns:
        The unique links, first occurrence first.

    Raises:
        KeyError: If ``features``, ``properties`` or ``Link`` is missing.
    """
    return list(dict.fromkeys(
        feature['properties']['Link'] for feature in meta['features']
    ))

def get_empty_fields(link: str, ttype: str, props: dict) -> dict:
    """Build a placeholder row for a link whose scene metadata is not fetched.

    Every scene field is present with value ``None``; only the bookkeeping
    columns (split, link, block position, tile, state) are filled in.

    Args:
        link: The link the row stands for (may be ``None``).
        ttype: Split label stored under ``TrainType``.
        props: Sample properties providing ``block_r``, ``block_c``,
            ``tile_id`` and ``state``.

    Returns:
        A dict with the scene fields followed by the bookkeeping columns.
    """
    # NOTE(review): the double space in 'Entity  ID' and the lowercase
    # 'NE Corner lat' are kept exactly as written — confirm they match the
    # field names delivered by the metadata service.
    scene_fields = (
        'Entity  ID', 'Agency', 'Vendor ID', 'Recording Technique', 'Project',
        'Event', 'Roll', 'Frame', 'Acquisition Date', 'Scale',
        'High Resolution Download Avail', 'Strip Number', 'Image Type',
        'Quality', 'Cloud Cover', 'Photo ID', 'Flying Height in Feet',
        'Film Length and Width', 'Focal Length', 'Stereo Overlap', 'Other',
        'Center Latitude', 'Center Longitude',
        'NW Corner Lat', 'NW Corner Long',
        'NE Corner lat', 'NE Corner Long',
        'SE Corner Lat', 'SE Corner Long',
        'SW Corner Lat', 'SW Corner Long',
        'Center Latitude dec', 'Center Longitude dec',
        'NW Corner Lat dec', 'NW Corner Long dec',
        'NE Corner Lat dec', 'NE Corner Long dec',
        'SE Corner Lat dec', 'SE Corner Long dec',
        'SW Corner Lat dec', 'SW Corner Long dec',
    )

    row = dict.fromkeys(scene_fields)
    row.update({
        'TrainType': ttype,
        'Link': link,
        'BlockR': props['block_r'],
        'BlockC': props['block_c'],
        'TileId': props['tile_id'],
        'State': props['state'],
    })
    return row

def fetch_metadata(link: str, ttype: str, props: dict) -> dict:
    """Produce one metadata row for a link.

    A missing link, or one that ``validators.url`` rejects, yields the
    placeholder row from ``get_empty_fields``. Otherwise the scene document is
    downloaded, flattened, and extended with the bookkeeping columns.

    Args:
        link: Scene metadata URL, or ``None``.
        ttype: Split label stored under ``TrainType``.
        props: Sample properties providing ``block_r``, ``block_c``,
            ``tile_id`` and ``state``.

    Returns:
        The row as a dict.
    """
    usable = link is not None and validators.url(link)
    if not usable:
        return get_empty_fields(link, ttype, props)

    record = get_fields(get_xml(link))
    record.update({
        'TrainType': ttype,
        'Link': link,
        'BlockR': props['block_r'],
        'BlockC': props['block_c'],
        'TileId': props['tile_id'],
        'State': props['state'],
    })
    return record

def get_all_metadata(ds, ttype: str, workers: int=10) -> list[dict]:
    """Fetch a metadata row for every distinct link of every sample in ``ds``.

    All fetches are queued on a thread pool while ``ds`` is consumed; the
    progress bar then tracks their completion.

    Args:
        ds: Iterable of sample metadata dicts.
        ttype: Split label passed through to each row.
        workers: Number of worker threads.

    Returns:
        The rows in completion order, which need not match the input order.
    """
    with ThreadPoolExecutor(max_workers=workers) as pool:
        pending = []

        for sample in ds:
            urls = get_links(sample)
            # Every link of a sample is tagged with the properties of the
            # sample's first feature.
            first_props = sample['features'][0]['properties']
            pending.extend(
                pool.submit(fetch_metadata, url, ttype, first_props)
                for url in urls
            )

        rows = []
        with tqdm(total=len(pending)) as bar:
            for finished in as_completed(pending):
                rows.append(finished.result())
                bar.update(1)

    return rows

def correct_types(df):
    """Convert the numeric metadata columns of ``df`` to numeric dtypes.

    Values that cannot be parsed become NaN (``errors='coerce'``). The frame
    is modified in place and also returned.

    Args:
        df: Metadata frame containing all of the numeric columns listed below.

    Returns:
        The same ``df`` object with the listed columns converted.
    """
    numeric_columns = (
        'Recording Technique', 'Frame', 'Scale',
        'High Resolution Download Avail', 'Image Type', 'Quality',
        'Cloud Cover', 'Flying Height in Feet', 'Stereo Overlap',
        'Center Latitude dec', 'Center Longitude dec',
        'NW Corner Lat dec', 'NW Corner Long dec',
        'NE Corner Lat dec', 'NE Corner Long dec',
        'SE Corner Lat dec', 'SE Corner Long dec',
        'SW Corner Lat dec', 'SW Corner Long dec',
    )

    for column in numeric_columns:
        converted = pd.to_numeric(df[column], errors='coerce')
        df[column] = converted

    return df

In [30]:
# Single shared iterator over the training stream: iter_to() draws from it, and
# a later cell passes `it` itself to get_all_metadata, so that second pass
# continues where the first chunk stopped.
it = iter(ds_train)

def iter_to(limit=1):
    """Lazily yield up to ``limit`` further samples from the shared iterator ``it``.

    The result simply ends if ``it`` runs out early. Calling ``next(it)`` inside
    a generator expression would instead surface exhaustion as a RuntimeError.

    Args:
        limit: Maximum number of samples to take; values below zero take none.

    Returns:
        An iterator over at most ``limit`` items drawn from ``it``.
    """
    return itertools.islice(it, max(limit, 0))

In [None]:
# Fetch the metadata of the whole training split in one pass.
# NOTE(review): the recorded output of this cell stops at 32%, and the cells
# below collect the same split in two chunks — presumably this cell is
# superseded; confirm before re-running it.
meta_train = get_all_metadata(ds_train, 'Train', workers=16)

 32%|███▏      | 20420/64152 [26:15<56:15, 12.96it/s]  


In [33]:
# First chunk of the training split: the next 22,000 samples from the shared
# iterator, exported to their own CSV so the work is not lost if a later step fails.
meta_train_1 = get_all_metadata(iter_to(22_000), 'Train', workers=16)
df_train = pd.DataFrame(meta_train_1).pipe(correct_types)
df_train.to_csv(f'{root}/metadata_train_1.csv', index=False)
del meta_train_1  # the raw rows are no longer needed once the frame exists
df_train.head()

Unnamed: 0,Entity ID,Agency,Vendor ID,Recording Technique,Project,Event,Roll,Frame,Acquisition Date,Scale,...,SE Corner Lat dec,SE Corner Long dec,SW Corner Lat dec,SW Corner Long dec,TrainType,Link,BlockR,BlockC,TileId,State
0,AR1VASL00020079,1,,1.0,VASL0,,2,79.0,1963/05/14,20000.0,...,40.288895,-122.247969,40.287993,-122.30022,Train,https://earthexplorer.usgs.gov/scene/metadata/...,53,31,R04C05,California
1,ARA010907209088,A,,1.0,01090,,72,9088.0,1954/09/05,60000.0,...,40.030219,-120.338994,40.023114,-120.498849,Train,https://earthexplorer.usgs.gov/scene/metadata/...,133,133,R04C08,California
2,ARA010907509636,A,,1.0,01090,,75,9636.0,1954/09/05,60000.0,...,40.256337,-120.301364,40.257533,-120.462025,Train,https://earthexplorer.usgs.gov/scene/metadata/...,49,153,R04C08,California
3,AR1VDYK00050168,1,,1.0,VDYK0,,5,168.0,1975/09/24,78000.0,...,40.107275,-121.477772,40.107166,-121.681031,Train,https://earthexplorer.usgs.gov/scene/metadata/...,92,124,R04C06,California
4,ARA010907709738,A,,1.0,01090,,77,9738.0,1954/09/05,60000.0,...,40.099927,-121.665096,40.102513,-121.825503,Train,https://earthexplorer.usgs.gov/scene/metadata/...,98,30,R04C06,California


In [34]:
# Second chunk of the training split: whatever is left on the shared iterator
# `it` after the first chunk, exported to its own CSV.
meta_train_2 = get_all_metadata(it, 'Train', workers=16)
df_train = pd.DataFrame(meta_train_2).pipe(correct_types)
df_train.to_csv(f'{root}/metadata_train_2.csv', index=False)
del meta_train_2  # the raw rows are no longer needed once the frame exists
df_train.head()

100%|██████████| 32132/32132 [18:40<00:00, 28.69it/s]  


Unnamed: 0,Entity ID,Agency,Vendor ID,Recording Technique,Project,Event,Roll,Frame,Acquisition Date,Scale,...,SE Corner Lat dec,SE Corner Long dec,SW Corner Lat dec,SW Corner Long dec,TrainType,Link,BlockR,BlockC,TileId,State
0,ARA001450675409,A,,1.0,00145,,67,5409.0,1954/11/20,63000.0,...,33.94424,-115.21538,33.9357,-115.37779,Train,https://earthexplorer.usgs.gov/scene/metadata/...,229,10,R15C18,California
1,AR1VBOI00010025,1,,1.0,VBOI0,,1,25.0,1966/11/01,24000.0,...,32.531777,-116.991522,32.528436,-117.048155,Train,https://earthexplorer.usgs.gov/scene/metadata/...,144,204,R18C14,California
2,AR1VAD000030004,1,,1.0,VAD00,,3,4.0,1953/06/27,37400.0,...,39.504774,-120.66907,39.504812,-120.765655,Train,https://earthexplorer.usgs.gov/scene/metadata/...,148,3,R05C08,California
3,ARA001450060562,A,,1.0,00145,,6,562.0,1954/07/09,63000.0,...,36.04365,-118.824,36.04869,-118.98009,Train,https://earthexplorer.usgs.gov/scene/metadata/...,215,64,R11C11,California
4,ARA001450796313,A,,1.0,00145,,79,6313.0,1955/02/03,63000.0,...,32.88778,-118.30744,32.78573,-118.44353,Train,https://earthexplorer.usgs.gov/scene/metadata/...,234,44,R17C12,California


In [35]:
# Merge the two partial exports into the final training table.
train_parts = [
    pd.read_csv(f'{root}/metadata_train_1.csv'),
    pd.read_csv(f'{root}/metadata_train_2.csv'),
]
# Both parts were written without an index, so each one is read back numbered
# from 0; renumber on concat so the combined frame has no duplicate labels.
df_train = pd.concat(train_parts, ignore_index=True)
df_train.to_csv(f'{root}/metadata_train.csv', index=False)
del train_parts
df_train.head()

Unnamed: 0,Entity ID,Agency,Vendor ID,Recording Technique,Project,Event,Roll,Frame,Acquisition Date,Scale,...,SE Corner Lat dec,SE Corner Long dec,SW Corner Lat dec,SW Corner Long dec,TrainType,Link,BlockR,BlockC,TileId,State
0,AR1VASL00020079,1,,1.0,VASL0,,2,79.0,1963/05/14,20000.0,...,40.288895,-122.247969,40.287993,-122.30022,Train,https://earthexplorer.usgs.gov/scene/metadata/...,53,31,R04C05,California
1,ARA010907209088,A,,1.0,01090,,72,9088.0,1954/09/05,60000.0,...,40.030219,-120.338994,40.023114,-120.498849,Train,https://earthexplorer.usgs.gov/scene/metadata/...,133,133,R04C08,California
2,ARA010907509636,A,,1.0,01090,,75,9636.0,1954/09/05,60000.0,...,40.256337,-120.301364,40.257533,-120.462025,Train,https://earthexplorer.usgs.gov/scene/metadata/...,49,153,R04C08,California
3,AR1VDYK00050168,1,,1.0,VDYK0,,5,168.0,1975/09/24,78000.0,...,40.107275,-121.477772,40.107166,-121.681031,Train,https://earthexplorer.usgs.gov/scene/metadata/...,92,124,R04C06,California
4,ARA010907709738,A,,1.0,01090,,77,9738.0,1954/09/05,60000.0,...,40.099927,-121.665096,40.102513,-121.825503,Train,https://earthexplorer.usgs.gov/scene/metadata/...,98,30,R04C06,California


In [66]:
# Validation split: fetch every row with the default worker count, fix the
# numeric columns and export the table.
meta_val = get_all_metadata(ds_val, 'Val')
df_val = pd.DataFrame(meta_val).pipe(correct_types)
df_val.to_csv(f'{root}/metadata_val.csv', index=False)
del meta_val  # the raw rows are no longer needed once the frame exists
df_val.head()

Unnamed: 0,Entity ID,Agency,Vendor ID,Recording Technique,Project,Event,Roll,Frame,Acquisition Date,Scale,...,SE Corner Lat dec,SE Corner Long dec,SW Corner Lat dec,SW Corner Long dec,TrainType,Link,BlockR,BlockC,TileId,State
0,ARA001450564381,A,,1,00145,,56,4381,1954/09/28,63000,...,32.70684,-115.28248,32.70759,-115.4491,Val,https://earthexplorer.usgs.gov/scene/metadata/...,71,212,R18C17,California
1,ARA001450735849,A,,1,00145,,73,5849,1954/12/26,63000,...,32.67667,-115.77771,32.68028,-115.94621,Val,https://earthexplorer.usgs.gov/scene/metadata/...,74,234,R18C16,California
2,ARA001450564427,A,,1,00145,,56,4427,1954/09/28,63000,...,32.68127,-115.62463,32.67125,-115.79147,Val,https://earthexplorer.usgs.gov/scene/metadata/...,74,54,R18C17,California
3,AR1VDXP00010038,1,,1,VDXP0,,1,38,1975/10/16,80000,...,32.807555,-114.968419,32.807439,-115.15816,Val,https://earthexplorer.usgs.gov/scene/metadata/...,7,110,R18C18,California
4,ARA001450786211,A,,1,00145,,78,6211,1955/01/27,63000,...,32.71725,-114.51154,32.71849,-114.68477,Val,https://earthexplorer.usgs.gov/scene/metadata/...,18,102,R18C19,California


In [None]:
# Test split: fetch every row with the default worker count, fix the numeric
# columns and export the table.
meta_test = get_all_metadata(ds_test, 'Test')
df_test = pd.DataFrame(meta_test).pipe(correct_types)
df_test.to_csv(f'{root}/metadata_test.csv', index=False)
del meta_test  # the raw rows are no longer needed once the frame exists
df_test.head()

100%|██████████| 21630/21630 [1:10:56<00:00,  5.08it/s]


Unnamed: 0,Entity ID,Agency,Vendor ID,Recording Technique,Project,Event,Roll,Frame,Acquisition Date,Scale,...,SE Corner Lat dec,SE Corner Long dec,SW Corner Lat dec,SW Corner Long dec,TrainType,Link,BlockR,BlockC,TileId,State
0,AR1VBOI00010049,1,,1,VBOI0,,1,49,1966/11/01,24000,...,32.856368,-117.204078,32.84338,-117.258933,Test,https://earthexplorer.usgs.gov/scene/metadata/...,1,120,R18C14,California
1,AR1VBOI00010065,1,,1,VBOI0,,1,65,1966/11/01,24000,...,32.855472,-117.213536,32.856649,-117.270495,Test,https://earthexplorer.usgs.gov/scene/metadata/...,1,120,R18C14,California
2,ARA001450796314,A,,1,00145,,79,6314,1955/02/03,63000,...,32.83455,-118.26036,32.73298,-118.3959,Test,https://earthexplorer.usgs.gov/scene/metadata/...,7,69,R18C12,California
3,ARA001450735880,A,,1,00145,,73,5880,1954/12/26,63000,...,32.77611,-116.08686,32.77507,-116.25208,Test,https://earthexplorer.usgs.gov/scene/metadata/...,7,78,R18C16,California
4,ARA001450796314,A,,1,00145,,79,6314,1955/02/03,63000,...,32.83455,-118.26036,32.73298,-118.3959,Test,https://earthexplorer.usgs.gov/scene/metadata/...,20,69,R18C12,California


## Cleanup

In [None]:
# Release the three dataset pipelines.
# NOTE(review): assumes the installed webdataset version provides close() on
# these pipeline objects — this cell has no recorded execution, so confirm.
ds_train.close()
ds_val.close()
ds_test.close()