In [1]:
# Collect all bathymetry data download URLS from the NCEI website

In [1]:
import requests
from requests_html import HTMLSession
import pandas as pd
from html.parser import HTMLParser
import os.path
import re
import random

In [3]:
# Collect and retrieve all of the urls for the various MB voyage indices
base = r'https://www.ngdc.noaa.gov/ships/'
session = HTMLSession()
b = session.get(base)
ships = [session.get(u) for u in b.html.absolute_links]
all_voyage_index_links = [l for c in ships for l in c.html.absolute_links ] 
print('%s indices found' % len(all_voyage_index_links))

4197 indices found


In [5]:
# Collect and retrieve all of the urls for the various bag voyage indices
base = r'https://www.ngdc.noaa.gov/nos/'
session = HTMLSession()
b = session.get(base)
ships = [session.get(u) for u in b.html.absolute_links]
all_nos_index_links = [l for c in ships for l in c.html.absolute_links ] 
print('%s indices found' % len(all_nos_index_links))

18447 indices found


In [6]:
all_index_links = all_voyage_index_links + all_nos_index_links

In [7]:
# Collect the surveys listed by the bathymetric data viewer
fp = r'C:\Users\tristan.sebens\Documents\surveys\survey_names.txt'
with open(fp) as f:
    surveys = [l.split(' ')[0] for l in f]
print('%s surveys found' % len(surveys))

255 surveys found


In [8]:
# Filter out the indexes of the voyages we don't need
def is_needed_survey(u):
    return any([s in u for s in surveys])

survey_indexes = [u for u in all_index_links if is_needed_survey(u)]
print('Found %s matching indexes' % len(survey_indexes))

Found 207 matching indexes


In [9]:
# Collect file links from all of the needed survey indexes
survey_htmls = [session.get(u) for u in survey_indexes]
voyage_data = [l for u in survey_htmls for l in u.html.absolute_links]

In [10]:
len(voyage_data)

55310

In [11]:
def get_exts(u):
    return u.split('/')[-1].split('.')[1:]

def is_type(u, t):
    return t in get_exts(u)

def get_mb_type(exts):
    for e in exts:
        if 'mb' in e:
            return e
    return None

def is_mb(u):
    return any(['mb' in e for e in get_exts(u)])

def is_xyz(u):
    return is_type(u, 'xyz')

def is_xml(u):
    return is_type(u, 'xml')
        
def is_acc_ext(u, acc_exts):
    return any([e in acc_exts for e in get_exts(u)])

In [12]:
acc_non_mb_exts = ['xyz', 'xyb', 'bag', 'ascii', 'xml']
dl_links = [l for l in voyage_data if is_acc_ext(l, acc_non_mb_exts)]
o_fp = r'C:\Users\tristan.sebens\Documents\surveys\dl_links.txt'
with open(o_fp, 'w') as f:
    [f.write(l + '\n') for l in dl_links]

In [40]:
o_fp = r'C:\Users\tristan.sebens\Documents\surveys\xml_links.txt'
with open(o_fp, 'w') as f:
    [f.write(l + '\n') for l in dl_links if is_xml(l)]

In [14]:
# Determine total size of selected files
s = pd.DataFrame(dl_links, columns=['url',])

In [15]:
s.iloc[100]['url']


'http://data.ngdc.noaa.gov/platforms/ocean/ships/healy/HLY1603/multibeam/data/version2/MB/em122/0094_20160923_080008_HEALY-EM122156.xyz'

In [16]:
# Populate the size field of the data files
def get_size(row):
    try:
        return requests.head(row['url']).headers['content-length']
    except: 
        return -1
    
s['size'] = s.apply(lambda row: get_size(row), axis=1)

In [38]:
s['size'].sum(axis=0)

TypeError: can only concatenate str (not "int") to str

In [23]:
print('Expected size on disk: %s GB' % int(total_size) / 1073741824)

ValueError: invalid literal for int() with base 10: 'http://www.ngdc.noaa.gov/metadata/published/NOAA/NESDIS/NGDC/MGG/Multibeam/iso/xml/HLY11TF_Multibeam.xmlhttp://www.ngdc.noaa.gov/metadata/published/NOAA/NESDIS/NGDC/MGG/Multibeam/iso/xml/HLY0404_Mult