## Processing pipeline
Start with a zip file and:
 - Unzip it
 - For each file inside it
   - ODV Parse it
   - Filter it or what not
   - append to parsed list
   - Combine with metadata pulled from other places
   - Create columns specifically for DwC
 - Merge parsed list into single DF
 - write to DwC
   - Take column matching dictionaries
   - Rename columns to match DwC standards
   - Write to CSV

In [1]:
# Std
import zipfile
import pathlib
import os
# NonStd
import dask.dataframe as ddf
import pandas as pd
# Custom
import pyodv

### Vars and paths


In [2]:
input_zip = './input/order_60837_1.zip'
temp_folder = './temp/'
output_dwc = './output/'


### Unzip
Input:
  - Zipped file path
  - Location to unzip to

Output
  - Unzipped files in folder

In [3]:
zipped_path = pathlib.Path(input_zip)
file_name = zipped_path.stem
unzipped_path = pathlib.Path(temp_folder).joinpath(file_name)

with zipfile.ZipFile(input_zip, 'r') as zip_ref:
    zip_ref.extractall(unzipped_path)
    

### Parse and Merge files
Input:
  - Unzipped dir
Output:
  - Parsed and merged dataframe object.

In [4]:
config = {'occurrenceStatus_hardcode': 'present'}

odv_list  = []
df_list = [] 
ref_list = []
for filename in os.listdir(unzipped_path):
    f = os.path.join(unzipped_path, filename)
    # checking if it is a file
    if os.path.isfile(f):
        try:
            print(f'===== {f} =====')
            parsed_file = pyodv.ODV_Struct(f) 
            odv_list.append(parsed_file) 
            ref_list.append(parsed_file.refs[0])
            this_df = pd.concat([parsed_file.df_data, parsed_file.df_var],axis=1)
            this_df['scope'] = parsed_file.refs[0]['@sdn:scope']
            this_df['defined_by'] = parsed_file.refs[0]['@xlink:href']
            this_df['occurrenceStatus'] = config.get('occurrenceStatus_hardcode', 'present')
            
            df_list.append(this_df)
        except Exception as err:
            print(err)
            print('==============================================================')
            
        

===== temp/order_60837_1/000729_BODV_Bent_138791_V1.txt =====
--Problem Parsing Refs--
syntax error: line 1, column 0
SeaDataNet ODV   - Generated by Replication Manager 1.0.51 (with Octopus 1.7.1) - 2022-04-05 12:34:50+0200
Has 13 params
Has 1 refs.
Good file shape
===== temp/order_60837_1/000729_BODV_Bent_137824_V1.txt =====
--Problem Parsing Refs--
syntax error: line 1, column 0
SeaDataNet ODV   - Generated by Replication Manager 1.0.51 (with Octopus 1.7.1) - 2022-04-05 12:29:54+0200
Has 13 params
Has 1 refs.
Good file shape
===== temp/order_60837_1/000729_BODV_Bent_131628_V1.txt =====
--Problem Parsing Refs--
syntax error: line 1, column 0
SeaDataNet ODV   - Generated by Replication Manager 1.0.51 (with Octopus 1.7.1) - 2022-04-05 12:30:42+0200
Has 13 params
Has 1 refs.
Good file shape
===== temp/order_60837_1/000729_BODV_Bent_133214_V1.txt =====
--Problem Parsing Refs--
syntax error: line 1, column 0
SeaDataNet ODV   - Generated by Replication Manager 1.0.51 (with Octopus 1.7.1) -

In [5]:
metadata_path = os.path.join(unzipped_path, 'data.csv')
metadata_df = pd.read_csv(metadata_path)
metadata_df['LOCAL_CDI_ID_split'] = metadata_df['LOCAL_CDI_ID'].str.split(pat="/").str[0]
merged_df = pd.concat(df_list, axis=0) 
merged_df = merged_df.join(metadata_df.set_index('LOCAL_CDI_ID_split'), on='LOCAL_CDI_ID', how='left', rsuffix = '_meta')
merged_df.reset_index(level=None, drop=True, inplace = True)

In [7]:
pd.set_option("display.max_columns", None)
merged_df.head()

Unnamed: 0,Cruise,Station,Bot. Depth [m],LOCAL_CDI_ID,yyyy-mm-ddThh:mm:ss.sss,Type,Longitude [degrees_east],EDMO_code,Latitude [degrees_north],MinimumObservationDepth [m],MaximumObservationDepth [m],SampleID [#],SamplingEffort [m^2],SubsampleID [#],SubSamplingCoefficient [none],ScientificName [none],ScientificNameID [#],Sex [none],LifeStage [none],ObservedIndividualCount [#],WWBiom_Samp_BE007117 [g],DWBiom_Samp_BE007117 [g],scope,defined_by,occurrenceStatus,Data Set name,Discipline,Category,Variables measured,Data format,Data format version,Data size,Data Set creation Date,Latitude 1,Latitude 2,Longitude 1,Longitude 2,Datum,Measuring area type,Water depth (m),Depth reference,Minimum instrument depth (m),Maximum instrument depth (m),Start Date,Start time,End Date,End time,Vertical resolution,Vertical resolution unit,Instrument / gear type,Track resolution,Track resolution unit,Frequency,Frequency unit,Platform type,Cruise name,Alternative cruise name,Cruise start Date,Station name,Alternative station name,Station start Date,Originator,Data Holding centre,Project name,Project type,EDMED references,CSR references,CSR platform name,CSR platform code,Publication references,Data Distributor,Database reference,Access/ordering of data,Access restriction,CDI-record creation date,CDI-record id,LOCAL_CDI_ID_meta,CDI-partner
0,unknown,LISD0025,,Bent_138791,1997-04-22T00:00:00.000,*,8.5661,729.0,55.0934,0.0,0.0,31147,0.0079,,,Hydrobia sp.,urn:lsid:marinespecies.org:taxname:138081,U,U,20.0,,,729:Bent_138791,https://cdi.seadatanet.org/report/edmo/729/Ben...,present,Benthosset_138791,Biological oceanography (DS01)|Marine geology ...,Other biological measurements (B027)|Rock and ...,Reference numbers (ACYC)|Sampling parameters (...,bodv,0.4,5227,20220405,8.5661,8.5661,55.093399,55.093403,World Geodetic System 84 (4326),point (004),0.287,Lowest Astronomical Tide (D01),,,19970422,00:00:00,19970422,00:00:00,,,benthos samplers (24),,,,,research vessel (31),,,,LISD0025,LISD0025,19800505,"Aarhus University, Department of Bioscience, M...","Aarhus University, Department of Bioscience, M...",,,Time series of zoobenthos in Danish waters (19...,,,,,"Aarhus University,Danish Centre for Environmen...",,web data access with registration,Creative Commons Attribution 4.0 International...,20220405,2049364,Bent_138791/v1,"Aarhus University, Department of Bioscience, M..."
1,unknown,LISD0025,,Bent_138791,1997-04-22T00:00:00.000,*,8.5661,729.0,55.0934,0.0,0.0,31148,0.0079,,,Hydrobia sp.,urn:lsid:marinespecies.org:taxname:138081,U,U,17.0,,,729:Bent_138791,https://cdi.seadatanet.org/report/edmo/729/Ben...,present,Benthosset_138791,Biological oceanography (DS01)|Marine geology ...,Other biological measurements (B027)|Rock and ...,Reference numbers (ACYC)|Sampling parameters (...,bodv,0.4,5227,20220405,8.5661,8.5661,55.093399,55.093403,World Geodetic System 84 (4326),point (004),0.287,Lowest Astronomical Tide (D01),,,19970422,00:00:00,19970422,00:00:00,,,benthos samplers (24),,,,,research vessel (31),,,,LISD0025,LISD0025,19800505,"Aarhus University, Department of Bioscience, M...","Aarhus University, Department of Bioscience, M...",,,Time series of zoobenthos in Danish waters (19...,,,,,"Aarhus University,Danish Centre for Environmen...",,web data access with registration,Creative Commons Attribution 4.0 International...,20220405,2049364,Bent_138791/v1,"Aarhus University, Department of Bioscience, M..."
2,unknown,LISD0025,,Bent_138791,1997-04-22T00:00:00.000,*,8.5661,729.0,55.0934,0.0,0.0,31149,0.0079,,,Hydrobia sp.,urn:lsid:marinespecies.org:taxname:138081,U,U,28.0,,,729:Bent_138791,https://cdi.seadatanet.org/report/edmo/729/Ben...,present,Benthosset_138791,Biological oceanography (DS01)|Marine geology ...,Other biological measurements (B027)|Rock and ...,Reference numbers (ACYC)|Sampling parameters (...,bodv,0.4,5227,20220405,8.5661,8.5661,55.093399,55.093403,World Geodetic System 84 (4326),point (004),0.287,Lowest Astronomical Tide (D01),,,19970422,00:00:00,19970422,00:00:00,,,benthos samplers (24),,,,,research vessel (31),,,,LISD0025,LISD0025,19800505,"Aarhus University, Department of Bioscience, M...","Aarhus University, Department of Bioscience, M...",,,Time series of zoobenthos in Danish waters (19...,,,,,"Aarhus University,Danish Centre for Environmen...",,web data access with registration,Creative Commons Attribution 4.0 International...,20220405,2049364,Bent_138791/v1,"Aarhus University, Department of Bioscience, M..."
3,unknown,LISD0025,,Bent_138791,1997-04-22T00:00:00.000,*,8.5661,729.0,55.0934,0.0,0.0,31150,0.0079,,,Hydrobia sp.,urn:lsid:marinespecies.org:taxname:138081,U,U,50.0,,,729:Bent_138791,https://cdi.seadatanet.org/report/edmo/729/Ben...,present,Benthosset_138791,Biological oceanography (DS01)|Marine geology ...,Other biological measurements (B027)|Rock and ...,Reference numbers (ACYC)|Sampling parameters (...,bodv,0.4,5227,20220405,8.5661,8.5661,55.093399,55.093403,World Geodetic System 84 (4326),point (004),0.287,Lowest Astronomical Tide (D01),,,19970422,00:00:00,19970422,00:00:00,,,benthos samplers (24),,,,,research vessel (31),,,,LISD0025,LISD0025,19800505,"Aarhus University, Department of Bioscience, M...","Aarhus University, Department of Bioscience, M...",,,Time series of zoobenthos in Danish waters (19...,,,,,"Aarhus University,Danish Centre for Environmen...",,web data access with registration,Creative Commons Attribution 4.0 International...,20220405,2049364,Bent_138791/v1,"Aarhus University, Department of Bioscience, M..."
4,unknown,LISD0025,,Bent_138791,1997-04-22T00:00:00.000,*,8.5661,729.0,55.0934,0.0,0.0,31151,0.0079,,,Hydrobia sp.,urn:lsid:marinespecies.org:taxname:138081,U,U,63.0,,,729:Bent_138791,https://cdi.seadatanet.org/report/edmo/729/Ben...,present,Benthosset_138791,Biological oceanography (DS01)|Marine geology ...,Other biological measurements (B027)|Rock and ...,Reference numbers (ACYC)|Sampling parameters (...,bodv,0.4,5227,20220405,8.5661,8.5661,55.093399,55.093403,World Geodetic System 84 (4326),point (004),0.287,Lowest Astronomical Tide (D01),,,19970422,00:00:00,19970422,00:00:00,,,benthos samplers (24),,,,,research vessel (31),,,,LISD0025,LISD0025,19800505,"Aarhus University, Department of Bioscience, M...","Aarhus University, Department of Bioscience, M...",,,Time series of zoobenthos in Danish waters (19...,,,,,"Aarhus University,Danish Centre for Environmen...",,web data access with registration,Creative Commons Attribution 4.0 International...,20220405,2049364,Bent_138791/v1,"Aarhus University, Department of Bioscience, M..."


### Write to DwC
When writing to DwC archive there are a couple of xml files created, a folder with the data in it and then it's all zipped.

Taking a look here: https://ipt.gbif.org/manual/en/ipt/2.5/dwca-guide#publishing-dwc-a-manually

Steps required are:
  - Prepare data files
  - Create a Metafile
  - Create a metadata file (eml.xml)
  - Ensure the data files, the metafile (meta.xml) and metadata file (eml.xml) are in the same directory or folder. Compress the folder using one of the support compression formats. The result is a DwC-A.


In [8]:
event_mapping = {
        'eventID':'SampleID [#]',
        'eventDate':'yyyy-mm-ddThh:mm:ss.sss',
        'decimalLatitude':'Latitude [degrees_north]',
        'decimalLongitude':'Longitude [degrees_east]',
        'institutionCode':'EDMO_code',
        'datasetName':'Data Set name',
        'maximumDepthInMeters':'MinimumObservationDepth [m]',
        'minimumDepthInMeters':'MaximumObservationDepth [m]',
        'coordinateUncertaintyInMeters':None,
        'footprintWKT':None,
        'type':None,
        'parenEventID':None,
        'dataGeneralizations':None,
        'eventRemarks':None,
        'samplingProtocol':None,
        'locationID':'Station',
        'locality':None,
        'locationRemarks':None,
        }

In [9]:
occ_mapping = {}

In [10]:
emof_mapping = {}

In [11]:
def odv_dwc_mapping(df, map_dict):
    dwc_col_list = []
    for dwc_colname, odv_colname in map_dict.items():
        if odv_colname is not None:
            try:
                dwc_col = df[odv_colname]
                dwc_col.name = dwc_colname
                dwc_col_list.append(dwc_col)
            except Exception as err:
                print(err)
    mapped_df = pd.concat(dwc_col_list, axis=1)
    return mapped_df

In [12]:
dwc_event = odv_dwc_mapping(merged_df, event_mapping)
# dwc_occ = odv_dwc_mapping(merged_df, event_mapping)

In [13]:
dwc_event

Unnamed: 0,eventID,eventDate,decimalLatitude,decimalLongitude,institutionCode,datasetName,maximumDepthInMeters,minimumDepthInMeters,locationID
0,31147,1997-04-22T00:00:00.000,55.0934,8.5661,729.0,Benthosset_138791,0.0,0.0,LISD0025
1,31148,1997-04-22T00:00:00.000,55.0934,8.5661,729.0,Benthosset_138791,0.0,0.0,LISD0025
2,31149,1997-04-22T00:00:00.000,55.0934,8.5661,729.0,Benthosset_138791,0.0,0.0,LISD0025
3,31150,1997-04-22T00:00:00.000,55.0934,8.5661,729.0,Benthosset_138791,0.0,0.0,LISD0025
4,31151,1997-04-22T00:00:00.000,55.0934,8.5661,729.0,Benthosset_138791,0.0,0.0,LISD0025
...,...,...,...,...,...,...,...,...,...
2921,28448,2002-09-09T00:00:00.000,55.1826,8.5902,729.0,Benthosset_137951,0.0,0.0,JUVD0011
2922,28446,2002-09-09T00:00:00.000,55.1826,8.5902,729.0,Benthosset_137951,0.0,0.0,JUVD0011
2923,28448,2002-09-09T00:00:00.000,55.1826,8.5902,729.0,Benthosset_137951,0.0,0.0,JUVD0011
2924,28444,2002-09-09T00:00:00.000,55.1826,8.5902,729.0,Benthosset_137951,0.0,0.0,JUVD0011
