# dbCamHD Update

This notebook updates the dbcamhd.json metadata database.

#### Load the current database

In [None]:
import pandas as pd
# Load the newline-delimited JSON records and order them by timestamp.
# convert_dates=False keeps the 'timestamp' column exactly as stored: by
# default read_json may auto-parse columns whose label begins with
# 'timestamp' into datetimes, which would not match the numeric epoch-second
# timestamps that are appended to this frame later in the notebook.
dbcamhd = (
    pd.read_json('dbcamhd.json', orient="records", lines=True, convert_dates=False)
    .sort_values(by=['timestamp'])
)
dbcamhd.tail()

#### Get the last date in the database

In [None]:
from datetime import date
# The last row (the frame is sorted by timestamp) supplies the starting date.
# Path components 7, 8 and 9 of its filename are read as year, month and day.
last_filename_parts = dbcamhd.filename.iloc[-1].split('/')
year, month, day = (int(last_filename_parts[position]) for position in (7, 8, 9))
start_date = date(year, month, day)
start_date

#### Get a list of the files since that date and convert to dataframe

In [None]:
import pycamhd as camhd

In [None]:
%%time
file_list, file_sizes = camhd.get_file_list(start_date=start_date);

In [None]:
# Build the frame of newly listed files: one row per file, with its name and size.
new_file_columns = dict(filename=file_list, filesize=file_sizes)
dbcamhd_new = pd.DataFrame(new_file_columns)
dbcamhd_new.tail()

#### Get some additional information about the files
This cell takes a couple of hours to run in a single thread. An open question is how much faster it would go using Dask Delayed and a pool of Dask workers.

In [None]:
%%time
timestamp = []
frame_count = []
moov = []

for i in dbcamhd_new.index:
    filename = dbcamhd_new['filename'][i]
    try:
        moov_atom = camhd.get_moov_atom(filename)
        timestamp.append(camhd.get_timestamp(filename, moov_atom))
        frame_count.append(camhd.get_frame_count(filename, moov_atom))
        moov.append(True)
    except:
        timestamp.append(0)
        frame_count.append(0)
        moov.append(False)

#### Add these to the new dataframe

In [None]:
# Attach the per-file results as new columns, aligned on the frame's own index.
collected_columns = {
    'moov': moov,
    'timestamp': timestamp,
    'frame_count': frame_count,
}
for column_name, column_values in collected_columns.items():
    dbcamhd_new[column_name] = pd.Series(column_values, index=dbcamhd_new.index)
dbcamhd_new.tail()

#### Add deployment numbers to the new dataframe

See the [asset management](https://github.com/ooi-integration/asset-management/blob/master/deployment/RS03ASHS_Deploy.csv) page for deployment information.

In [None]:
# Deployment numbers keyed by the moment each one begins. Rows earlier than the
# first boundary are labelled 2; later boundaries overwrite earlier labels, so
# they must be applied in chronological order.
deployment_starts = [
    ('2016-07-26 21:18:00', 3),
    ('2017-08-14 06:00:00', 4),
    ('2018-07-04 00:00:00', 5),
]

# Timestamps are interpreted as seconds since the epoch.
dt = pd.to_datetime(dbcamhd_new.timestamp, unit='s')
dbcamhd_new['deployment'] = dbcamhd_new.timestamp*0
dbcamhd_new.loc[dt < deployment_starts[0][0], 'deployment'] = 2
for boundary, deployment_number in deployment_starts:
    dbcamhd_new.loc[dt >= boundary, 'deployment'] = deployment_number

#### Concatenate and deduplicate

In [None]:
# Stack the existing records on top of the new ones, then keep only the first
# occurrence of each filename (so an existing record wins over a new duplicate).
combined_records = pd.concat([dbcamhd, dbcamhd_new], ignore_index=True, sort=True)
dbcamhd = combined_records.drop_duplicates(subset=['filename'])

#### Save dataframe to JSON file

In [None]:
# Write the merged database back to disk as newline-delimited JSON records,
# overwriting the file that was loaded at the top of the notebook.
dbcamhd.to_json(
    'dbcamhd.json',
    lines=True,
    orient="records",
)

### References

https://github.com/tjcrone/pycamhd<br>
https://rawdata.oceanobservatories.org/files/RS03ASHS/PN03B/06-CAMHDA301/<br>
https://pandas.pydata.org/