# Timing tests for writing and reading glider data files

Import packages, and check working directory + environment

In [2]:
import os
import sys
import pandas as pd
import logging
import pickle
import multiprocessing as mp
import pyarrow.feather as feather
import pyarrow.parquet as pq

sys.path.append("C:/SMW/Gliders_Moorings/Gliders/gdm")
from gdm import GliderDataModel
from gdm.gliders.slocum import load_slocum_dba #, get_dbas

In [3]:
# Work from the Glider-Data root: the deployment paths built later are relative to it.
# NOTE(review): this absolute path is machine-specific; adjust it when running elsewhere.
os.chdir(os.path.join('C:/SMW/Gliders_Moorings/Gliders', 'Glider-Data'))
print(os.getcwd())
# Record the active conda environment alongside the timing results.
!conda info

C:\SMW\Gliders_Moorings\Gliders\Glider-Data

     active environment : gdm
    active env location : C:\Users\sam.woodman\Anaconda3\envs\gdm
            shell level : 2
       user config file : C:\Users\sam.woodman\.condarc
 populated config files : C:\Users\sam.woodman\.condarc
          conda version : 4.11.0
    conda-build version : 3.21.7
         python version : 3.8.8.final.0
       virtual packages : __win=0=0
                          __archspec=1=x86_64
       base environment : C:\Users\sam.woodman\Anaconda3  (writable)
      conda av data dir : C:\Users\sam.woodman\Anaconda3\etc\conda
  conda av metadata url : None
           channel URLs : https://conda.anaconda.org/default/win-64
                          https://conda.anaconda.org/default/noarch
                          https://conda.anaconda.org/conda-forge/win-64
                          https://conda.anaconda.org/conda-forge/noarch
          package cache : C:\Users\sam.woodman\Anaconda3\pkgs
                      

## Glider data processing

Set variables and file paths

In [3]:
# Deployment being processed; these three values drive the paths built below.
project = 'FREEBYRD'
# Deployment name; the year is later sliced out of this string by character position.
deployment = 'amlr01-20211121'
# 'delayed' selects the 'debd' ascii folder; any other value selects 'stbd'.
mode = 'delayed'

In [7]:
# Leave one core free for the rest of the system, but never drop below one
# worker: mp.Pool() raises ValueError when asked for zero processes, which is
# what `cpu_count() - 1` yields on a single-core machine.
num_cores = max(1, mp.cpu_count() - 1)
print('num_cores: {:}'.format(num_cores))

# Delayed-mode data is read from the 'debd' ascii folder, anything else from 'stbd'.
binary_folder = 'debd' if mode == 'delayed' else 'stbd'

### Set path/file variables
# The year is taken from fixed character positions of the deployment name.
# NOTE(review): assumes a six-character glider name followed by '-YYYYMMDD' - confirm.
year = deployment[7:11]
# NOTE: logging.info() output is only visible when logging is configured at
# INFO level; with the default (WARNING) these messages are not shown.
logging.info('Year, extracted from deployment: {:}'.format(year))
deployment_mode = deployment + '-' + mode
# All paths below are relative, so they resolve against the current working directory.
deployment_path = os.path.join(project, year, deployment, 'glider')
logging.info('Deployment path: {:}'.format(deployment_path))

ascii_path = os.path.join(deployment_path, 'data', 'in', 'ascii', binary_folder)
config_path = os.path.join(deployment_path, 'config', 'ngdac')
nc_ngdac_path = os.path.join(deployment_path, 'data', 'out', 'nc', 'ngdac', mode)
nc_trajectory_path = os.path.join(deployment_path, 'data', 'out', 'nc', 'trajectory', mode)

# NOTE: the pickle timing cell later reassigns pkl_file_path to a file under tmp_path.
pkl_file_path = os.path.join(deployment_path, 'data', 'pkl', deployment_mode + '-gdm.pkl')
# Scratch folder that the storage-format tests write their files to.
tmp_path = os.path.join(deployment_path, 'data', 'tmp')

num_cores: 7


Create gdm object and get dba files

In [5]:
# Build the data model from the NGDAC config directory; data and profiles are attached later.
gdm = GliderDataModel(config_path)
# Bare expression so the notebook displays the object's repr.
gdm

<GliderNetCDF(cfg=True, data=(0, 0), profiles=0)>

In [6]:
# Add data from dba files to gdm
dba_files_list = list(map(lambda x: os.path.join(ascii_path, x), os.listdir(ascii_path)))
dba_files = pd.DataFrame(dba_files_list, columns = ['dba_file'])
dba_files.head()

Unnamed: 0,dba_file
0,FREEBYRD\2021\amlr01-20211121\glider\data\in\a...
1,FREEBYRD\2021\amlr01-20211121\glider\data\in\a...
2,FREEBYRD\2021\amlr01-20211121\glider\data\in\a...
3,FREEBYRD\2021\amlr01-20211121\glider\data\in\a...
4,FREEBYRD\2021\amlr01-20211121\glider\data\in\a...


Now we load the dba files

In [8]:
%%time
pool = mp.Pool(num_cores)
load_slocum_dba_list = pool.map(load_slocum_dba, dba_files_list)
pool.close()   

load_slocum_dba_list_unzipped = list(zip(*load_slocum_dba_list))
dba = pd.concat(load_slocum_dba_list_unzipped[0]).sort_index()
pro_meta = pd.concat(load_slocum_dba_list_unzipped[1]).sort_index()            

gdm.data = dba 
gdm.profiles = pro_meta
gdm

Wall time: 9min 56s


<GliderNetCDF(cfg=True, data=(1203725, 1806), profiles=1583)>

## File storage testing

Now that we have the glider data loaded, we can test file storage options. All files will be written to the glider/data/tmp folder

#### pickle

First we test saving and reading the file as a [pkl file](https://docs.python.org/3/library/pickle.html). This is the 'easiest' as it saves the whole gdm object, with no additional processing. 

However, the pkl file type is specific to Python and thus not language-agnostic like Apache Arrow

First we save the file:

In [17]:
%%time
pkl_file_path = os.path.join(tmp_path, deployment_mode + '.pkl')
with open(pkl_file_path, 'wb') as outp:
    pickle.dump(gdm, outp, pickle.HIGHEST_PROTOCOL)
del outp

Wall time: 24.4 s


And then we read it back in

In [18]:
%%time
with open(pkl_file_path, 'rb') as inp:
    gdm_pkl = pickle.load(inp)
del inp

Wall time: 29.1 s


Compare equality with gdm, and then delete the object to free memory

In [19]:
# Show both object summaries, then whether the data and profiles frames
# survived the pickle round trip unchanged (one item per output line).
print(
    gdm,
    gdm_pkl,
    gdm.data.equals(gdm_pkl.data),
    gdm.profiles.equals(gdm_pkl.profiles),
    sep='\n',
)

<GliderNetCDF(cfg=True, data=(1203725, 1806), profiles=1583)>
<GliderNetCDF(cfg=True, data=(1203725, 1806), profiles=1583)>
True
True


In [20]:
# Drop the unpickled copy so its memory can be reclaimed before the next test.
del gdm_pkl

#### feather

[Feather](https://arrow.apache.org/docs/python/feather.html) is part of the [Apache Arrow Platform](https://arrow.apache.org/docs/index.html), and provides a "fast, language-agnostic data frame storage solution".

Some cons of feather:

- Feather is only for data frames, and thus the pieces of the gdm object (data and profile dataframes) must be saved separately. Therefore, the gdm object must be recreated when the data are read.

- Feather files do not store the indexes of Pandas data frames, and thus the indices must be moved to a column before saving, and the index must be set when recreating the gdm object

NOTE: feather was unable to allocate enough memory on Sam's local computer. Will try in GCP.

In [26]:
# %%time
# feather_data_file_path = os.path.join(tmp_path, deployment_mode + '-data.feather')
# feather_prof_file_path = os.path.join(tmp_path, deployment_mode + '-profiles.feather')

# gdm.data.reset_index().to_feather(feather_data_file_path, version = 1)
# gdm.profiles.reset_index().to_feather(feather_prof_file_path)
# NOTE: left commented out because the feather write could not allocate enough
# memory on the local machine.
# NOTE(review): only the data write passes `version = 1`; confirm whether the
# profiles write should match before re-enabling.

In [27]:
# %%time
# gdm_feather = GliderDataModel(config_path)
# gdm_feather.data = feather.read_feather(feather_data_file_path).set_index('time')
# gdm_feather.profiles = feather.read_feather(feather_prof_file_path).set_index('midpoint_time')
# NOTE: disabled together with the feather write; the files it would read are
# not produced while that write is commented out.

#### parquet

[Parquet](https://arrow.apache.org/docs/python/parquet.html) is part of the [Apache Arrow Platform](https://arrow.apache.org/docs/index.html), and provides a "standardized open-source columnar storage format". Parquet file formats are generally considered to be more stable than feather.

Like feather, parquet is for data frames, and thus gdm.data and gdm.profiles must be saved separately.

In [9]:
%%time
parquet_data_file_path = os.path.join(tmp_path, deployment_mode + '-data.parquet')
parquet_prof_file_path = os.path.join(tmp_path, deployment_mode + '-profiles.parquet')

gdm.data.to_parquet(parquet_data_file_path, version="2.6", index = True)
gdm.profiles.to_parquet(parquet_prof_file_path, version="2.6", index = True)

Wall time: 31.9 s


In [10]:
%%time
gdm_pq = GliderDataModel(config_path)
gdm_pq.data = pd.read_parquet(parquet_data_file_path)
gdm_pq.profiles = pd.read_parquet(parquet_prof_file_path)

Wall time: 1min 19s


Test equality with original gdm object

In [15]:
# Show both object summaries, then report whether each frame survived the
# parquet round trip unchanged (one item per output line).
print(
    gdm,
    gdm_pq,
    'data dataframes are equal: {:}'.format(gdm.data.equals(gdm_pq.data)),
    'profiles dataframes are equal: {:}'.format(gdm.profiles.equals(gdm_pq.profiles)),
    sep='\n',
)

<GliderNetCDF(cfg=True, data=(1203725, 1806), profiles=1583)>
<GliderNetCDF(cfg=True, data=(1203725, 1806), profiles=1583)>
data dataframes are equal: True
profiles dataframes are equal: True


In [None]:
# Drop the parquet-backed copy so its memory can be reclaimed.
del gdm_pq

## Summary

Overall, pickle is the simplest to use because it saves the entire gdm object. However, pickle cannot be read into R or Matlab, and the saved file is very large (16 GB for amlr01-20211121-delayed). 

Thus, moving forward Sam will use parquet. Parquet can save dataframes with their index (indices), which minimizes the reconstruction that needs to happen. Plus, for whatever reason it doesn't run into the memory allocation issues that feather hits. 