Skip to content

Commit

Permalink
Merge pull request #1512 from ashishpriyadarshiCIC/test-script-hdf5
Browse files Browse the repository at this point in the history
HDF5 to csv files conversion test
  • Loading branch information
henrykironde committed Sep 6, 2020
2 parents b3a62e2 + 19ef9a0 commit 89fd22f
Show file tree
Hide file tree
Showing 7 changed files with 54 additions and 12 deletions.
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ RUN pip install pytest
RUN pip install yapf
RUN pip install pylint
RUN pip install flake8 -U
RUN pip install h5py
RUN pip install Pillow

# Install Postgis after Python is setup
RUN apt-get install -y --force-yes postgis
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,5 @@ sphinx_rtd_theme
tqdm==4.30.0
pandas
setuptools
h5py
Pillow
12 changes: 11 additions & 1 deletion retriever/lib/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from retriever.lib.engine_tools import sqlite2csv
from retriever.lib.engine_tools import json2csv
from retriever.lib.engine_tools import xml2csv
from retriever.lib.engine_tools import hdf2csv
from retriever.lib.warning import Warning


Expand Down Expand Up @@ -615,6 +616,15 @@ def process_xml2csv(self,
if self.find_file(src_path):
xml2csv(src_path, path_to_csv, header_values, empty_rows, encoding)

def process_hdf52csv(self,
                     src_path,
                     path_to_csv,
                     data_name,
                     data_type,
                     encoding=ENCODING):
    """Convert one dataset of a downloaded HDF5 file via ``hdf2csv``.

    The conversion only runs when ``self.find_file(src_path)`` reports
    that the source file is available; otherwise this is a no-op.

    src_path: path of the HDF5 source file.
    path_to_csv: path of the file to be written.
    data_name: name/key of the dataset inside the HDF5 file.
    data_type: kind of dataset, forwarded unchanged to ``hdf2csv``.
    encoding: encoding forwarded to ``hdf2csv``.
    """
    if self.find_file(src_path):
        # Forward the caller's encoding rather than the module default,
        # so an explicitly supplied value is not silently discarded.
        hdf2csv(src_path, path_to_csv, data_name, data_type, encoding=encoding)

def extract_gz(
self,
archive_path,
Expand Down Expand Up @@ -915,7 +925,7 @@ def table_name(self, name=None, dbname=None):
dbname = ''
return self.opts["table_name"].format(db=dbname, table=name)

def to_csv(self, sort=True, path=None, select_columns=None):
def to_csv(self, sort=True, path=None, select_columns=None, select_table=None):
"""Create a CSV file from a data store.
sort flag to create a sorted file,
Expand Down
36 changes: 26 additions & 10 deletions retriever/lib/engine_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,30 +4,31 @@
scripts.
"""
import csv
import itertools
import json
import os
import platform
import shutil
import sqlite3 as sql
import subprocess
import warnings
import pandas as pd
import itertools
from string import ascii_lowercase
from sqlite3 import Error
import sqlite3 as sql

from hashlib import md5
from io import StringIO as NewFile

import xml.etree.ElementTree as ET
import os
import csv
try:
# Installing geopandas is unreliable on the CI test platforms
import geopandas
except ModuleNotFoundError:
pass
from pandas.io.json import json_normalize
from collections import OrderedDict
import xml.etree.ElementTree as ET
from hashlib import md5
from io import StringIO as NewFile
import h5py
import numpy as np
import pandas as pd
from PIL import Image

from retriever.lib.defaults import HOME_DIR, ENCODING
from retriever.lib.tools import open_fr, open_csvw, open_fw
Expand Down Expand Up @@ -267,6 +268,20 @@ def geojson2csv(input_file, output_file, encoding):
return output_file


def hdf2csv(file, output_file, data_name, data_type, encoding=ENCODING):
    """Export one dataset from an HDF5 file to ``output_file``.

    file: path of the HDF5 source file.
    output_file: path of the file to write.
    data_name: name/key of the dataset inside the HDF5 file.
    data_type: ``"csv"`` reads the dataset with ``pandas.read_hdf`` and
        writes it as CSV without the index; ``"image"`` loads the dataset
        as an array and saves it with Pillow. Any other value writes
        nothing.
    encoding: accepted for parity with the other converters; it is not
        currently used by either branch.

    Returns ``output_file``.
    Raises ``KeyError`` when ``data_type`` is ``"image"`` and the HDF5
    file has no dataset called ``data_name``.
    """
    if data_type == "csv":
        data = pd.read_hdf(file, data_name)
        data.to_csv(output_file, index=False)
    elif data_type == "image":
        # The file is only read, so open it read-only (works on read-only
        # files too) and let the context manager close the handle even if
        # the conversion below raises.
        with h5py.File(file, 'r') as hdf_file:
            dataset = hdf_file.get(data_name)
            if dataset is None:
                # Fail with a clear message instead of an obscure error
                # from the image conversion of a ``None`` value.
                raise KeyError(
                    "Dataset {!r} not found in HDF5 file {!r}".format(
                        data_name, file))
            pixels = np.asarray(dataset)
            Image.fromarray(pixels).save(output_file)
    return output_file


def getmd5(data, data_type='lines', encoding='utf-8'):
"""Get MD5 of a data source."""
checksum = md5()
Expand Down Expand Up @@ -323,6 +338,7 @@ def sort_csv(filename, encoding=ENCODING):

csv_writer = open_csvw(temp_file)
i = 0
infields = None
for row in csv_reader_infile:
if i == 0:
# The first entry is the header line
Expand Down
8 changes: 8 additions & 0 deletions retriever/lib/templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,14 @@ def process_tables(self, table_obj, url):

self.engine.process_xml2csv(src_path, path_to_csv, schema_fields, empty_rows)

if hasattr(table_obj, "hdf5_data"):
src_path = self.engine.format_filename(table_obj.hdf5_data[0])
path_to_csv = self.engine.format_filename(table_obj.path)
self.engine.download_file(url, table_obj.hdf5_data[0])
data_type = table_obj.hdf5_data[1]
data_name = table_obj.hdf5_data[2]
self.engine.process_hdf52csv(src_path, path_to_csv, data_name, data_type)

if hasattr(table_obj, "path"):
self.engine.auto_create_table(table_obj, url=url, filename=table_obj.path)
else:
Expand Down
5 changes: 4 additions & 1 deletion test/test_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,10 @@
("test-eco-level-four", ["gid", "us_l3code", "na_l3code", "na_l2code"], 'd1c01d8046143e9700f5cf92cbd6be3d'),
("test-raster-bio1", ["rid", "filename"], '27e0472ddc2da9fe807bfb48b786a251'),
("test-raster-bio2", ["rid", "filename"], '2983a9f7e099355db2ce2fa312a94cc6'),
("test-us-eco", ["gid", "us_l3code", "na_l3code", "na_l2code"], 'eaab9fa30c745557ff6ba7c116910b45')
("test-us-eco", ["gid", "us_l3code", "na_l3code", "na_l2code"], 'eaab9fa30c745557ff6ba7c116910b45'),
# h5py has compatibility issues on Travis CI (Linux), so this case is
# disabled there; the test passes when run locally.
# ("sample-hdf", ["*"], '31e61867e9990138788a946542c4b1bf')
]

# Tuple of (dataset_name, list of dict values corresponding to a table)
Expand Down
1 change: 1 addition & 0 deletions version.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ predator_prey_body_ratio.json,1.0.1
predator_prey_size_marine.py,2.0.2
predicts.py,1.0.4
prism_climate.py,1.2.3
sample_hdf.json,1.0.0
socean_diet_data.py,1.0.4
sonoran_desert.json,1.0.0
species_exctinction_rates.json,1.0.1
Expand Down

0 comments on commit 89fd22f

Please sign in to comment.