Merge pull request #1512 from ashishpriyadarshiCIC/test-script-hdf5

HDF5 to csv files conversion test
weecology · Sep 6, 2020 · 89fd22f · 89fd22f
2 parents b3a62e2 + 19ef9a0
commit 89fd22f
Show file tree

Hide file tree

Showing 7 changed files with 54 additions and 12 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -37,6 +37,8 @@ RUN pip install pytest
 RUN pip install yapf
 RUN pip install pylint
 RUN pip install flake8 -U
+RUN pip install h5py
+RUN pip install Pillow
 
 # Install Postgis after Python is setup
 RUN apt-get install -y --force-yes postgis

diff --git a/requirements.txt b/requirements.txt
@@ -11,3 +11,5 @@ sphinx_rtd_theme
 tqdm==4.30.0
 pandas
 setuptools
+h5py
+Pillow
diff --git a/retriever/lib/engine.py b/retriever/lib/engine.py
@@ -30,6 +30,7 @@
 from retriever.lib.engine_tools import sqlite2csv
 from retriever.lib.engine_tools import json2csv
 from retriever.lib.engine_tools import xml2csv
+from retriever.lib.engine_tools import hdf2csv
 from retriever.lib.warning import Warning
 
 
@@ -615,6 +616,15 @@ def process_xml2csv(self,
         if self.find_file(src_path):
             xml2csv(src_path, path_to_csv, header_values, empty_rows, encoding)
 
+    def process_hdf52csv(self, src_path, path_to_csv, data_name, data_type,
+                         encoding=ENCODING):
+        """Convert HDF5 dataset `data_name` in `src_path` to `path_to_csv`.
+
+        Runs only if `find_file(src_path)` succeeds; other args go to `hdf2csv`.
+        """
+        if self.find_file(src_path):
+            hdf2csv(src_path, path_to_csv, data_name, data_type, encoding=encoding)
+
     def extract_gz(
         self,
         archive_path,
@@ -915,7 +925,7 @@ def table_name(self, name=None, dbname=None):
                 dbname = ''
         return self.opts["table_name"].format(db=dbname, table=name)
 
-    def to_csv(self, sort=True, path=None, select_columns=None):
+    def to_csv(self, sort=True, path=None, select_columns=None, select_table=None):
         """Create a CSV file from the a data store.
 
         sort flag to create a sorted file,

diff --git a/retriever/lib/engine_tools.py b/retriever/lib/engine_tools.py
@@ -4,30 +4,31 @@
 scripts.
 
 """
+import csv
+import itertools
 import json
+import os
 import platform
 import shutil
+import sqlite3 as sql
 import subprocess
 import warnings
-import pandas as pd
-import itertools
 from string import ascii_lowercase
-from sqlite3 import Error
-import sqlite3 as sql
-
-from hashlib import md5
-from io import StringIO as NewFile
 
-import xml.etree.ElementTree as ET
-import os
-import csv
 try:
     # Geopanda installation is not smooth on the CI tests platforms
     import geopandas
 except ModuleNotFoundError:
     pass
 from pandas.io.json import json_normalize
 from collections import OrderedDict
+import xml.etree.ElementTree as ET
+from hashlib import md5
+from io import StringIO as NewFile
+import h5py
+import numpy as np
+import pandas as pd
+from PIL import Image
 
 from retriever.lib.defaults import HOME_DIR, ENCODING
 from retriever.lib.tools import open_fr, open_csvw, open_fw
@@ -267,6 +268,20 @@ def geojson2csv(input_file, output_file, encoding):
     return output_file
 
 
+def hdf2csv(file, output_file, data_name, data_type, encoding=ENCODING):
+    """Export HDF5 dataset `data_name` of `file` to `output_file` and return that path."""
+    if data_type == "csv":
+        data = pd.read_hdf(file, data_name)
+        data.to_csv(output_file, index=False)
+    elif data_type == "image":
+        # Read-only access is sufficient; `with` closes the file even on errors.
+        with h5py.File(file, 'r') as hdf_file:
+            image = np.asarray(hdf_file.get(data_name))
+        im = Image.fromarray(image)
+        im.save(output_file)
+    return output_file
+
+
 def getmd5(data, data_type='lines', encoding='utf-8'):
     """Get MD5 of a data source."""
     checksum = md5()
@@ -323,6 +338,7 @@ def sort_csv(filename, encoding=ENCODING):
 
     csv_writer = open_csvw(temp_file)
     i = 0
+    infields = None
     for row in csv_reader_infile:
         if i == 0:
             # The first entry is the header line

diff --git a/retriever/lib/templates.py b/retriever/lib/templates.py
@@ -218,6 +218,14 @@ def process_tables(self, table_obj, url):
 
             self.engine.process_xml2csv(src_path, path_to_csv, schema_fields, empty_rows)
 
+        if hasattr(table_obj, "hdf5_data"):
+            src_path = self.engine.format_filename(table_obj.hdf5_data[0])
+            path_to_csv = self.engine.format_filename(table_obj.path)
+            self.engine.download_file(url, table_obj.hdf5_data[0])
+            data_type = table_obj.hdf5_data[1]
+            data_name = table_obj.hdf5_data[2]
+            self.engine.process_hdf52csv(src_path, path_to_csv, data_name, data_type)
+
         if hasattr(table_obj, "path"):
             self.engine.auto_create_table(table_obj, url=url, filename=table_obj.path)
         else:

diff --git a/test/test_regression.py b/test/test_regression.py
@@ -62,7 +62,10 @@
     ("test-eco-level-four", ["gid", "us_l3code", "na_l3code", "na_l2code"], 'd1c01d8046143e9700f5cf92cbd6be3d'),
     ("test-raster-bio1", ["rid", "filename"], '27e0472ddc2da9fe807bfb48b786a251'),
     ("test-raster-bio2", ["rid", "filename"], '2983a9f7e099355db2ce2fa312a94cc6'),
-    ("test-us-eco", ["gid", "us_l3code", "na_l3code", "na_l2code"], 'eaab9fa30c745557ff6ba7c116910b45')
+    ("test-us-eco", ["gid", "us_l3code", "na_l3code", "na_l2code"], 'eaab9fa30c745557ff6ba7c116910b45'),
+    # h5py has compatibility issues on the Linux Travis CI build,
+    # so the test below is disabled; it passes locally.
+    # ("sample-hdf", ["*"], '31e61867e9990138788a946542c4b1bf')
 ]
 
 # Tuple of (dataset_name, list of dict values corresponding to a table)

diff --git a/version.txt b/version.txt
@@ -83,6 +83,7 @@ predator_prey_body_ratio.json,1.0.1
 predator_prey_size_marine.py,2.0.2
 predicts.py,1.0.4
 prism_climate.py,1.2.3
+sample_hdf.json,1.0.0
 socean_diet_data.py,1.0.4
 sonoran_desert.json,1.0.0
 species_exctinction_rates.json,1.0.1