Skip to content

Commit

Permalink
Merge pull request #1395 from henrykironde/resource-excel-archive
Browse files Browse the repository at this point in the history
Update file processing
  • Loading branch information
ethanwhite committed Oct 29, 2019
2 parents 2f1a4d9 + dc8deb5 commit 34bc859
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 11 deletions.
8 changes: 7 additions & 1 deletion retriever/lib/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
from collections import OrderedDict
from math import ceil
from tqdm import tqdm
from retriever.lib.tools import open_fr, open_fw, open_csvw, walk_relative_path
from retriever.lib.tools import (open_fr, open_fw, open_csvw,
walk_relative_path, excel_csv)
from setuptools import archive_util
from retriever.lib.defaults import DATA_DIR, DATA_SEARCH_PATHS, DATA_WRITE_PATH, ENCODING
from retriever.lib.cleanup import no_cleanup
Expand Down Expand Up @@ -547,6 +548,11 @@ def executemany(self, statement, values, commit=True):
if commit:
self.connection.commit()

def excel_to_csv(self, src_path, path_to_csv, excel_info=None, encoding=ENCODING):
    """Convert an excel file to a csv file.

    The conversion only runs when the source file can be found and
    sheet information (``excel_info``) is supplied; otherwise this
    is a no-op.
    """
    if not (self.find_file(src_path) and excel_info):
        return
    excel_csv(src_path, path_to_csv, excel_info, encoding)

def extract_gz(self, archive_path, archivedir_write_path, file_name=None,
open_archive_file=None, archive=None):
"""Extract gz files.
Expand Down
45 changes: 35 additions & 10 deletions retriever/lib/templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,8 @@ def download(self, engine=None, debug=False):
elif self.url:
url = self.url

# Extract compressed source files
if hasattr(self, "archived"):
# Extract archived files if a resource or the script has archived
if hasattr(self, "archived") or hasattr(table_obj, "archived"):
self.process_archived_data(table_obj, url)

# Create tables
Expand All @@ -137,9 +137,9 @@ def download(self, engine=None, debug=False):
self.engine.disconnect_files()

def process_tabular_insert(self, table_obj, url):
    """Insert tabular data through the active engine.

    When the script is archived, or the table resource declares a
    local ``path``, load from the extracted local file; otherwise
    stream the data directly from ``url``.
    """
    if hasattr(self, "archived") or hasattr(table_obj, "path"):
        path_to_file = self.engine.format_filename(table_obj.path)
        self.engine.insert_data_from_file(path_to_file)
    else:
        self.engine.insert_data_from_url(url)

Expand All @@ -150,27 +150,52 @@ def process_spatial_insert(self, table_obj):
self.engine.insert_vector(self.engine.format_filename(table_obj.path))

def process_tables(self, table_obj, url):
    """Obtain the clean file and create a table.

    If the table has ``xls_sheets`` info, download the excel source
    and convert the referenced sheet to csv first. Then create the
    table from the local file when ``path`` is set, or from the url
    otherwise.
    """
    if hasattr(table_obj, "xls_sheets"):
        # xls_sheets is [index of sheet, excel file name]
        src_path = self.engine.format_filename(table_obj.xls_sheets[1])
        path_to_csv = self.engine.format_filename(table_obj.path)
        self.engine.download_file(url, table_obj.xls_sheets[1])
        self.engine.excel_to_csv(src_path, path_to_csv,
                                 table_obj.xls_sheets, self.encoding)

    if hasattr(table_obj, "path"):
        self.engine.auto_create_table(table_obj, url=url, filename=table_obj.path)
    else:
        self.engine.auto_create_table(table_obj, url=url)

def process_archived_data(self, table_obj, url):
"""Pre-process archived files.
Archive info is specified for a single resource or entire data package.
Extract the files from the archived source based on
the specifications. Either extract a single file or all files.
If the archived data is excel, use the
xls_sheets to obtain the files to be extracted.
"""
archive_type = self.archived
archive_type = "zip"
keep_in_dir = False
archive_name = None
files = None

# First check the resource for the archived info, else check the table object
if hasattr(table_obj, "archived"):
archive_type = table_obj.archived
elif hasattr(self, "archived"):
archive_type = self.archived

if hasattr(self, "extract_all"):
if self.extract_all:
files = None
else:
files = [table_obj.path]

if hasattr(table_obj, "xls_sheets"):
# xls_sheets has [index of sheet, excel_filename]
files = [table_obj.xls_sheets[1]]

if hasattr(self, "keep_in_dir"):
keep_in_dir = self.keep_in_dir
if hasattr(self, "archive_name"):
Expand Down
25 changes: 25 additions & 0 deletions retriever/lib/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,33 @@
import os
import sys

import xlrd

from retriever.lib.defaults import ENCODING


def excel_csv(src_path, path_to_csv, excel_info=None, encoding=ENCODING):
    """Convert one sheet of an excel workbook to a csv file.

    Read the workbook at ``src_path`` and write the sheet selected by
    ``excel_info`` (``[sheet_index, excel_file_name]``) to
    ``path_to_csv``. try/finally guarantees the workbook and the
    output file are released even if writing a row fails.
    """
    book = xlrd.open_workbook(src_path, encoding_override=encoding, on_demand=True)
    try:
        sheet_object = book.sheet_by_index(excel_info[0])
        new_data = open_fw(path_to_csv)
        try:
            csv_writer = open_csvw(new_data)
            for index in range(sheet_object.nrows):
                row = sheet_object.row(index)
                # Get each row and format the cell values as strings.
                row_as_list = [to_str(column_value.value) for column_value in row]
                csv_writer.writerow(row_as_list)
        finally:
            new_data.close()
    finally:
        book.release_resources()
        del book


def open_fr(file_name, encoding=ENCODING, encode=True):
"""Open file for reading respecting Python version and OS differences.
Expand Down Expand Up @@ -49,6 +73,7 @@ def open_csvw(csv_file):


def to_str(object, object_encoding=sys.stdout, object_decoder=ENCODING):
    """Convert encoded values to string."""
    stream_encoding = object_encoding.encoding
    encoded_bytes = str(object).encode(stream_encoding, errors='backslashreplace')
    return encoded_bytes.decode(object_decoder)

Expand Down

0 comments on commit 34bc859

Please sign in to comment.