Merge pull request #502 from Deborah-Digges/issue-290

Issue 290: Out of memory error when downloading large dataset
weecology · Jun 13, 2016 · 003887e
2 parents 7df67ca + f4d34a0
commit 003887e
Show file tree

Hide file tree

Showing 3 changed files with 17 additions and 21 deletions.
diff --git a/lib/engine.py b/lib/engine.py
@@ -154,15 +154,15 @@ def auto_create_table(self, table, url=None, filename=None, pk=None):
 
         source = (skip_rows,
                   (self.table.column_names_row - 1,
-                   (open, (file_path, "rb"))))
+                   (open, (file_path, "rU"))))
         lines = gen_from_source(source)
 
         header = next(lines)
         lines.close()
 
         source = (skip_rows,
                   (self.table.header_rows,
-                   (open, (file_path, "rb"))))
+                   (open, (file_path, "rU"))))
 
         if not self.table.delimiter:
             self.auto_get_delimiter(header)
@@ -383,20 +383,14 @@ def database_name(self, name=None):
 
         return db_name
 
-    def download_file(self, url, filename, clean_line_endings=True):
+    def download_file(self, url, filename):
         """Downloads a file to the raw data directory."""
         if not self.find_file(filename):
             path = self.format_filename(filename)
             self.create_raw_data_dir()
             print("Downloading " + filename + "...")
-            file = urllib.request.urlopen(url)
-            local_file = open(path, 'wb')
-            if clean_line_endings and (filename.split('.')[-1].lower() not in ["exe", "zip", "xls"]):
-                local_file.write(file.read().replace("\r\n", "\n").replace("\r", "\n"))
-            else:
-                local_file.write(file.read())
-            local_file.close()
-            file.close()
+            response = urllib.urlretrieve(url, path)
+
 
     def download_files_from_archive(self, url, filenames, filetype="zip",
                                     keep_in_dir=False, archivename=None):
@@ -425,21 +419,22 @@ def download_files_from_archive(self, url, filenames, filetype="zip",
             else:
                 self.create_raw_data_dir()
                 if not downloaded:
-                    self.download_file(url, archivename, clean_line_endings=False)
+                    self.download_file(url, archivename)
                     downloaded = True
 
                 if filetype == 'zip':
                     archive = zipfile.ZipFile(archivename)
-                    open_archive_file = archive.open(filename)
+                    open_archive_file = archive.open(filename, 'r')
                 elif filetype == 'gz':
                     # gzip archives can only contain a single file
-                    open_archive_file = gzip.open(archivename)
+                    open_archive_file = gzip.open(archivename, 'r')
                 elif filetype == 'tar':
-                    archive = tarfile.open(filename)
+                    archive = tarfile.open(filename, 'r')
                     open_archive_file = archive.extractfile(filename)
 
                 fileloc = self.format_filename(os.path.join(archivebase,
                                                             os.path.basename(filename)))
+
                 unzipped_file = open(fileloc, 'wb')
                 for line in open_archive_file:
                     unzipped_file.write(line)
@@ -605,7 +600,7 @@ def insert_data_from_file(self, filename):
         for inserting bulk data from files can override this function."""
         data_source = (skip_rows,
                        (self.table.header_rows,
-                        (open, (filename, 'r'))))
+                        (open, (filename, 'rU'))))
         self.add_to_table(data_source)
 
     def insert_data_from_url(self, url):
@@ -684,7 +679,7 @@ def filename_from_url(url):
 def gen_from_source(source):
     """Returns a generator from a source tuple.
     Source tuples are of the form (callable, args) where callable(*args)
-    returns either a generator or another source tuple.
+    returns either a generator or another source tuple. 
     This allows indefinite regeneration of data sources.
     """
     while isinstance(source, tuple):

diff --git a/lib/templates.py b/lib/templates.py
@@ -117,9 +117,10 @@ def __init__(self, **kwargs):
     def download(self, engine=None, debug=False):
         if engine.name != "Download Only":
             raise Exception("This dataset contains only non-tabular data files, and can only be used with the 'download only' engine.\nTry 'retriever download datasetname instead.")
-        Script.download(self, engine, debug)
-        for filename, url in list(self.urls.items()):
-            self.engine.download_file(url, filename, clean_line_endings=False)
+        Script.download(self, engine, debug) 
+
+        for filename, url in self.urls.items():
+            self.engine.download_file(url, filename)
             if os.path.exists(self.engine.format_filename(filename)):
                 shutil.copy(self.engine.format_filename(filename), DATA_DIR)
             else:

diff --git a/scripts/gentry.py b/scripts/gentry.py
@@ -56,7 +56,7 @@ def download(self, engine=None, debug=False):
         # Currently all_Excel.zip is missing CURUYUQU.xls
         # Download it separately and add it to the file list
         if not self.engine.find_file('CURUYUQU.xls'):
-            self.engine.download_file("http://www.mobot.org/mobot/gentry/123/samerica/CURUYUQU.xls", "CURUYUQU.xls", clean_line_endings=False)
+            self.engine.download_file("http://www.mobot.org/mobot/gentry/123/samerica/CURUYUQU.xls", "CURUYUQU.xls")
             filelist.append('CURUYUQU.xls')
 
         lines = []