Skip to content

Commit

Permalink
Merge pull request #502 from Deborah-Digges/issue-290
Browse files (browse the repository at this point in the history)
Issue 290: Out of memory error when downloading large dataset
  • Loading branch information
henrykironde committed Jun 13, 2016
2 parents 7df67ca + f4d34a0 commit 003887e
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 21 deletions.
29 changes: 12 additions & 17 deletions lib/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,15 +154,15 @@ def auto_create_table(self, table, url=None, filename=None, pk=None):

source = (skip_rows,
(self.table.column_names_row - 1,
(open, (file_path, "rb"))))
(open, (file_path, "rU"))))
lines = gen_from_source(source)

header = next(lines)
lines.close()

source = (skip_rows,
(self.table.header_rows,
(open, (file_path, "rb"))))
(open, (file_path, "rU"))))

if not self.table.delimiter:
self.auto_get_delimiter(header)
Expand Down Expand Up @@ -383,20 +383,14 @@ def database_name(self, name=None):

return db_name

def download_file(self, url, filename, clean_line_endings=True):
def download_file(self, url, filename):
"""Downloads a file to the raw data directory."""
if not self.find_file(filename):
path = self.format_filename(filename)
self.create_raw_data_dir()
print("Downloading " + filename + "...")
file = urllib.request.urlopen(url)
local_file = open(path, 'wb')
if clean_line_endings and (filename.split('.')[-1].lower() not in ["exe", "zip", "xls"]):
local_file.write(file.read().replace("\r\n", "\n").replace("\r", "\n"))
else:
local_file.write(file.read())
local_file.close()
file.close()
response = urllib.urlretrieve(url, path)


def download_files_from_archive(self, url, filenames, filetype="zip",
keep_in_dir=False, archivename=None):
Expand Down Expand Up @@ -425,21 +419,22 @@ def download_files_from_archive(self, url, filenames, filetype="zip",
else:
self.create_raw_data_dir()
if not downloaded:
self.download_file(url, archivename, clean_line_endings=False)
self.download_file(url, archivename)
downloaded = True

if filetype == 'zip':
archive = zipfile.ZipFile(archivename)
open_archive_file = archive.open(filename)
open_archive_file = archive.open(filename, 'r')
elif filetype == 'gz':
# gzip archives can only contain a single file
open_archive_file = gzip.open(archivename)
open_archive_file = gzip.open(archivename, 'r')
elif filetype == 'tar':
archive = tarfile.open(filename)
archive = tarfile.open(filename, 'r')
open_archive_file = archive.extractfile(filename)

fileloc = self.format_filename(os.path.join(archivebase,
os.path.basename(filename)))

unzipped_file = open(fileloc, 'wb')
for line in open_archive_file:
unzipped_file.write(line)
Expand Down Expand Up @@ -605,7 +600,7 @@ def insert_data_from_file(self, filename):
for inserting bulk data from files can override this function."""
data_source = (skip_rows,
(self.table.header_rows,
(open, (filename, 'r'))))
(open, (filename, 'rU'))))
self.add_to_table(data_source)

def insert_data_from_url(self, url):
Expand Down Expand Up @@ -684,7 +679,7 @@ def filename_from_url(url):
def gen_from_source(source):
"""Returns a generator from a source tuple.
Source tuples are of the form (callable, args) where callable(*args)
returns either a generator or another source tuple.
returns either a generator or another source tuple.
This allows indefinite regeneration of data sources.
"""
while isinstance(source, tuple):
Expand Down
7 changes: 4 additions & 3 deletions lib/templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,10 @@ def __init__(self, **kwargs):
def download(self, engine=None, debug=False):
if engine.name != "Download Only":
raise Exception("This dataset contains only non-tabular data files, and can only be used with the 'download only' engine.\nTry 'retriever download datasetname instead.")
Script.download(self, engine, debug)
for filename, url in list(self.urls.items()):
self.engine.download_file(url, filename, clean_line_endings=False)
Script.download(self, engine, debug)

for filename, url in self.urls.items():
self.engine.download_file(url, filename)
if os.path.exists(self.engine.format_filename(filename)):
shutil.copy(self.engine.format_filename(filename), DATA_DIR)
else:
Expand Down
2 changes: 1 addition & 1 deletion scripts/gentry.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def download(self, engine=None, debug=False):
# Currently all_Excel.zip is missing CURUYUQU.xls
# Download it separately and add it to the file list
if not self.engine.find_file('CURUYUQU.xls'):
self.engine.download_file("http://www.mobot.org/mobot/gentry/123/samerica/CURUYUQU.xls", "CURUYUQU.xls", clean_line_endings=False)
self.engine.download_file("http://www.mobot.org/mobot/gentry/123/samerica/CURUYUQU.xls", "CURUYUQU.xls")
filelist.append('CURUYUQU.xls')

lines = []
Expand Down

0 comments on commit 003887e

Please sign in to comment.