From 91c7ca6620a153deded7aab3c3520e7935c9be25 Mon Sep 17 00:00:00 2001 From: Victor Calderon Date: Mon, 3 Dec 2018 18:10:09 -0600 Subject: [PATCH] Fixed 'url_file_list' --- cosmo_utils/utils/web_utils.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cosmo_utils/utils/web_utils.py b/cosmo_utils/utils/web_utils.py index c3dd5f0..16a0f71 100644 --- a/cosmo_utils/utils/web_utils.py +++ b/cosmo_utils/utils/web_utils.py @@ -100,11 +100,17 @@ def url_file_list(url, ext): page = requests.get(url).text # Converting to BeautifulSoup format soup = BeautifulSoup(page, 'html.parser') - # Obtaining list of files - files_arr = np.array([url + '/' + node.get('href') for node in - soup.find_all('a') if node.get('href').endswith(ext)]) + ## Obtaining list of files + # Keeping only 'href' values that are strings + files_arr_pre = np.array([ xx.get('href') for xx in soup.find_all('a') + if isinstance(xx.get('href'), str)]) + # Keeping only those ending with the given extension + files_pre_ext = np.array([xx for xx in files_arr_pre if xx.endswith(ext)]) + # Prepending `url` unless the link already contains '//' + files_pre_web = np.array([(url + '/' + xx) if not ('//' in xx) else xx + for xx in files_pre_ext]) # Sorting out file array - files_arr = np.sort(files_arr) + files_arr = np.sort(files_pre_web) return files_arr