Skip to content

Commit

Permalink
Merge pull request #1514 from ashishpriyadarshiCIC/hdf5-engine-new
Browse files Browse the repository at this point in the history
HDF5 engine
  • Loading branch information
henrykironde committed Sep 19, 2020
2 parents b8eaf50 + f4a7ea0 commit 3ce10ab
Show file tree
Hide file tree
Showing 9 changed files with 112 additions and 20 deletions.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,6 @@ sphinx_rtd_theme
tqdm==4.30.0
pandas
setuptools
tables
h5py
Pillow
16 changes: 1 addition & 15 deletions retriever/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,21 +231,7 @@ def main():
else:
raise Exception("no dataset specified.")
if scripts:
if args.dataset.endswith('.zip') or (hasattr(args, 'hash_value') and
args.hash_value):
_install(vars(args), debug=debug, use_cache=use_cache)
return
for dataset in scripts:
print("=> Installing", dataset.name)
try:
dataset.download(engine, debug=debug)
dataset.engine.final_cleanup()
except KeyboardInterrupt:
pass
except Exception as e:
print(e)
if debug:
raise
_install(vars(args), debug=debug, use_cache=use_cache)
print("Done!")
else:
print("Run 'retriever ls' to see a list of currently available datasets.")
Expand Down
2 changes: 1 addition & 1 deletion retriever/engines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

engines = [
"mysql", "postgres", "sqlite", "msaccess", "csvengine", "download_only", "jsonengine",
"xmlengine"
"xmlengine", "hdf5"
]

engine_module_list = [
Expand Down
71 changes: 71 additions & 0 deletions retriever/engines/hdf5.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import os

import pandas as pd
from pandas import HDFStore
import sqlite3 as dbapi

from retriever.lib.defaults import DATA_DIR
from retriever.lib.dummy import DummyConnection
from retriever.lib.models import Engine


class engine(Engine):
"""Engine instance for writing data to a HDF5 file."""

name = "HDF5"
abbreviation = "hdf5"
insert_limit = 1000
required_opts = [
("file", "Enter the filename of your HDF5 file", "hdf5.h5"),
("table_name", "Format of table name", "{db}_{table}"),
("data_dir", "Install directory", DATA_DIR),
]

def create_db(self):
"""Override create_db since an SQLite dataset needs to be created
first followed by the creation of an empty HDFStore file.
"""
file_path = os.path.join(self.opts["data_dir"], self.opts["file"])
self.file = HDFStore(file_path)

def create_table(self):
"""Don't create table for HDF5
HDF5 doesn't create tables. Each database is a file which has been
created. This overloads`create_table` to do nothing in this case.
"""
return None

def insert_data_from_file(self, filename):
"""Fill the table by fetching the dataframe from the
SQLite engine and putting it into the HDFStore file.
"""
table_name = self.table_name()
df = self.fetch_table(table_name)
self.file.put(table_name, df, data_columns=True)

def fetch_table(self, table_name):
"""Return a table from sqlite dataset as pandas dataframe."""
connection = self.get_sqlite_connection()
sql_query = "SELECT * FROM {};".format(table_name)
return pd.read_sql_query(sql_query, connection)

def get_sqlite_connection(self):
# self.get_input()
file = self.opts["file"]
file = (file.split("."))[0] + ".db"
db_file = self.opts["data_dir"]
full_path = os.path.join(db_file, file)
return dbapi.connect(os.path.normpath(full_path))

def get_connection(self):
"""Gets the db connection."""
self.get_input()
return DummyConnection()

def disconnect(self):
"""Close the file after being written"""
self.file.close()
file = self.opts["file"]
file = (file.split("."))[0] + ".db"
os.remove(file)
3 changes: 2 additions & 1 deletion retriever/lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .install import install_postgres
from .install import install_sqlite
from .install import install_xml
from .install import install_hdf5
from .provenance import commit, commit_log
from .repository import check_for_updates
from .engine_tools import reset_retriever
Expand All @@ -25,6 +26,6 @@
'check_for_updates', 'commit', 'commit_log', 'create_package', 'datasets',
'dataset_names', 'download', 'reload_scripts', 'reset_retriever', 'install_csv',
'install_mysql', 'install_postgres', 'install_sqlite', 'install_msaccess',
'install_json', 'install_xml', 'fetch', 'get_script_upstream',
'install_json', 'install_xml', 'install_hdf5', 'fetch', 'get_script_upstream',
'get_dataset_names_upstream', 'get_script_citation', "__version__"
]
33 changes: 33 additions & 0 deletions retriever/lib/install.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,20 @@ def _install(args, use_cache, debug):
data_sets_scripts = name_matches(script_list, args['dataset'])
if data_sets_scripts:
for data_sets_script in data_sets_scripts:
print("=> Installing", data_sets_script.name)
try:
if engine.name == "HDF5":
sqlite_opts = {
'command': 'install',
'dataset': data_sets_script,
'engine': 'sqlite',
'file': (args["file"].split("."))[0] + ".db",
'table_name': args["table_name"],
'data_dir': args["data_dir"]
}
sqlite_engine = choose_engine(sqlite_opts)
data_sets_script.download(sqlite_engine, debug=debug)
data_sets_script.engine.final_cleanup()
engine.script_table_registry = OrderedDict()
data_sets_script.download(engine, debug=debug)
data_sets_script.engine.final_cleanup()
Expand Down Expand Up @@ -211,3 +224,23 @@ def install_xml(dataset,
'hash_value': hash_value
}
return _install(args, use_cache, debug)


def install_hdf5(dataset,
file='hdf5.h5',
table_name='{db}_{table}',
data_dir=DATA_DIR,
debug=False,
use_cache=True,
hash_value=None):
"""Install datasets into hdf5."""
args = {
'command': 'install',
'dataset': dataset,
'engine': 'hdf5',
'file': file,
'table_name': table_name,
'data_dir': data_dir,
'hash_value': hash_value
}
return _install(args, use_cache, debug)
2 changes: 1 addition & 1 deletion test/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
mysqldb_host = "mysqldb_retriever"

mysql_engine, postgres_engine, sqlite_engine, msaccess_engine, \
csv_engine, download_engine, json_engine, xml_engine = engine_list
csv_engine, download_engine, json_engine, xml_engine, _ = engine_list

simple_csv = {
'name': 'simple_csv',
Expand Down
2 changes: 1 addition & 1 deletion test/test_provenance.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

file_location = os.path.normpath(os.path.dirname(os.path.realpath(__file__)))
mysql_engine, postgres_engine, sqlite_engine, msaccess_engine, \
csv_engine, download_engine, json_engine, xml_engine = engine_list
csv_engine, download_engine, json_engine, xml_engine, _ = engine_list

test_commit_details = [
(
Expand Down
2 changes: 1 addition & 1 deletion test/test_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
mysqldb_host = "mysqldb_retriever"

mysql_engine, postgres_engine, sqlite_engine, msaccess_engine, \
csv_engine, download_engine, json_engine, xml_engine = engine_list
csv_engine, download_engine, json_engine, xml_engine, _ = engine_list
file_location = os.path.dirname(os.path.realpath(__file__))
retriever_root_dir = os.path.abspath(os.path.join(file_location, os.pardir))
working_script_dir = os.path.abspath(os.path.join(retriever_root_dir, "scripts"))
Expand Down

0 comments on commit 3ce10ab

Please sign in to comment.