Skip to content

Commit

Permalink
Merge pull request #1395 from henrykironde/resource-excel-archive
Browse files Browse the repository at this point in the history
Update file processing
  • Loading branch information
ethanwhite committed Oct 29, 2019
2 parents 2f1a4d9 + dc8deb5 commit 34bc859
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 11 deletions.
8 changes: 7 additions & 1 deletion retriever/lib/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
from collections import OrderedDict
from math import ceil
from tqdm import tqdm
from retriever.lib.tools import open_fr, open_fw, open_csvw, walk_relative_path
from retriever.lib.tools import (open_fr, open_fw, open_csvw,
walk_relative_path, excel_csv)
from setuptools import archive_util
from retriever.lib.defaults import DATA_DIR, DATA_SEARCH_PATHS, DATA_WRITE_PATH, ENCODING
from retriever.lib.cleanup import no_cleanup
Expand Down Expand Up @@ -547,6 +548,11 @@ def executemany(self, statement, values, commit=True):
if commit:
self.connection.commit()

def excel_to_csv(self, src_path, path_to_csv, excel_info=None, encoding=ENCODING):
    """Convert an excel file to a csv file.

    The conversion only runs when the source file can be found and
    sheet information (``excel_info``) is supplied; otherwise this
    is a no-op.
    """
    if not (self.find_file(src_path) and excel_info):
        return
    excel_csv(src_path, path_to_csv, excel_info, encoding)

def extract_gz(self, archive_path, archivedir_write_path, file_name=None,
open_archive_file=None, archive=None):
"""Extract gz files.
Expand Down
45 changes: 35 additions & 10 deletions retriever/lib/templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,8 @@ def download(self, engine=None, debug=False):
elif self.url:
url = self.url

# Extract compressed source files
if hasattr(self, "archived"):
# Extract archived files if a resource or the script has archived
if hasattr(self, "archived") or hasattr(table_obj, "archived"):
self.process_archived_data(table_obj, url)

# Create tables
Expand All @@ -137,9 +137,9 @@ def download(self, engine=None, debug=False):
self.engine.disconnect_files()

def process_tabular_insert(self, table_obj, url):
    """Insert tabular data through the active engine.

    When the script is archived, or the table resource declares a
    local ``path``, load from the extracted local file; otherwise
    stream the data directly from ``url``.
    """
    if hasattr(self, "archived") or hasattr(table_obj, "path"):
        path_to_file = self.engine.format_filename(table_obj.path)
        self.engine.insert_data_from_file(path_to_file)
    else:
        self.engine.insert_data_from_url(url)

Expand All @@ -150,27 +150,52 @@ def process_spatial_insert(self, table_obj):
self.engine.insert_vector(self.engine.format_filename(table_obj.path))

def process_tables(self, table_obj, url):
    """Obtain the clean file and create a table.

    If the table has ``xls_sheets`` info, download the excel source
    and convert the referenced sheet to csv first. Then create the
    table from the local file when ``path`` is set, or from the url
    otherwise.
    """
    if hasattr(table_obj, "xls_sheets"):
        # xls_sheets is [index of sheet, excel file name]
        src_path = self.engine.format_filename(table_obj.xls_sheets[1])
        path_to_csv = self.engine.format_filename(table_obj.path)
        self.engine.download_file(url, table_obj.xls_sheets[1])
        self.engine.excel_to_csv(src_path, path_to_csv,
                                 table_obj.xls_sheets, self.encoding)

    if hasattr(table_obj, "path"):
        self.engine.auto_create_table(table_obj, url=url, filename=table_obj.path)
    else:
        self.engine.auto_create_table(table_obj, url=url)

def process_archived_data(self, table_obj, url):
"""Pre-process archived files.
Archive info is specified for a single resource or entire data package.
Extract the files from the archived source based on
the specifications. Either extract a single file or all files.
If the archived data is excel, use the
xls_sheets to obtain the files to be extracted.
"""
archive_type = self.archived
archive_type = "zip"
keep_in_dir = False
archive_name = None
files = None

# First check the resource for the archived info, else check the table object
if hasattr(table_obj, "archived"):
archive_type = table_obj.archived
elif hasattr(self, "archived"):
archive_type = self.archived

if hasattr(self, "extract_all"):
if self.extract_all:
files = None
else:
files = [table_obj.path]

if hasattr(table_obj, "xls_sheets"):
# xls_sheets has [index of sheet, excel_filename]
files = [table_obj.xls_sheets[1]]

if hasattr(self, "keep_in_dir"):
keep_in_dir = self.keep_in_dir
if hasattr(self, "archive_name"):
Expand Down
25 changes: 25 additions & 0 deletions retriever/lib/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,33 @@
import os
import sys

import xlrd

from retriever.lib.defaults import ENCODING


def excel_csv(src_path, path_to_csv, excel_info=None, encoding=ENCODING):
    """Convert one sheet of an excel workbook to a csv file.

    Read the workbook at ``src_path`` and write the sheet selected by
    ``excel_info`` (``[sheet_index, excel_file_name]``) to
    ``path_to_csv``. try/finally guarantees the workbook and the
    output file are released even if writing a row fails.
    """
    book = xlrd.open_workbook(src_path, encoding_override=encoding, on_demand=True)
    try:
        sheet_object = book.sheet_by_index(excel_info[0])
        new_data = open_fw(path_to_csv)
        try:
            csv_writer = open_csvw(new_data)
            for index in range(sheet_object.nrows):
                row = sheet_object.row(index)
                # Get each row and format the cell values as strings.
                row_as_list = [to_str(column_value.value) for column_value in row]
                csv_writer.writerow(row_as_list)
        finally:
            new_data.close()
    finally:
        book.release_resources()
        del book


def open_fr(file_name, encoding=ENCODING, encode=True):
"""Open file for reading respecting Python version and OS differences.
Expand Down Expand Up @@ -49,6 +73,7 @@ def open_csvw(csv_file):


def to_str(object, object_encoding=sys.stdout, object_decoder=ENCODING):
    """Convert encoded values to string."""
    stream_encoding = object_encoding.encoding
    encoded_bytes = str(object).encode(stream_encoding, errors='backslashreplace')
    return encoded_bytes.decode(object_decoder)

Expand Down

0 comments on commit 34bc859

Please sign in to comment.