Skip to content

Commit

Permalink
Use Pandas for Excel to CSV (#1630)
Browse files Browse the repository at this point in the history
* Use openpyxl and Pandas to transform Excel to CSV

* Add install openpyxl

* Update scripts

plant-life-hist-eu: Use encoding to process files
fao-global-capture-product: Use pandas to_csv
  • Loading branch information
henrykironde committed Nov 5, 2021
1 parent 326aa0d commit ea5150d
Show file tree
Hide file tree
Showing 7 changed files with 18 additions and 43 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docker-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ jobs:
- name: Install python dependencies
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest yapf codecov pytest-cov pytest-xdist -U
python -m pip install flake8 pytest yapf codecov pytest-cov pytest-xdist openpyxl -U
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Install retriever
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ jobs:
- name: Install python dependencies
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest yapf codecov pytest-cov pytest-xdist -U
python -m pip install flake8 pytest yapf codecov pytest-cov pytest-xdist openpyxl -U
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Install retriever
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
future
xlrd>=0.7
openpyxl
argcomplete
kaggle
PyMySQL>=0.4
Expand Down
17 changes: 3 additions & 14 deletions retriever/lib/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys

import xlrd
import pandas as pd

from retriever.lib.defaults import ENCODING

Expand All @@ -14,20 +15,8 @@ def excel_csv(src_path, path_to_csv, excel_info=None, encoding=ENCODING):
Read src_path excel file and write the excel sheet to path_to_csv
excel_info contains the index of the sheet and the excel file name
"""
book = xlrd.open_workbook(src_path, encoding_override=encoding, on_demand=True)
sheet_object = book.sheet_by_index(excel_info[0])
rows = sheet_object.nrows
dest_path = path_to_csv
new_data = open_fw(dest_path)
csv_writer = open_csvw(new_data)
for index in range(0, rows):
row = sheet_object.row(index)
# Get each row and format the cell value.
row_as_list = [to_str(column_value.value) for column_value in row]
csv_writer.writerow(row_as_list)
new_data.close()
book.release_resources()
del book
df = pd.read_excel(src_path, sheet_name=excel_info[1])
df.to_csv(path_to_csv, sep=',', encoding=encoding, index=False, header=True)


def open_fr(file_name, encoding=ENCODING, encode=True):
Expand Down
28 changes: 6 additions & 22 deletions scripts/fao_global_capture_product.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
reload(sys)
if hasattr(sys, 'setdefaultencoding'):
sys.setdefaultencoding("UTF-8")
import xlrd
import pandas as pd

from retriever.lib.models import Table, Cleanup, correct_invalid_value
from retriever.lib.templates import Script
Expand All @@ -33,7 +33,7 @@ def __init__(self, **kwargs):
self.retriever_minimum_version = '2.1.dev'
self.urls = {
"capture": "http://www.fao.org/fishery/static/Data/Capture_2018.1.2.zip"}
self.version = '1.0.0'
self.version = '1.1.0'
self.ref = "http://www.fao.org/fishery/statistics/global-capture-production/"
self.citation = "FAO. 2018. FAO yearbook. Fishery and Aquaculture Statistics " \
"2016/FAO annuaire. Statistiques des pêches et de l'aquaculture " \
Expand All @@ -49,32 +49,16 @@ def __init__(self, **kwargs):
def download(self, engine=None, debug=False):
Script.download(self, engine, debug)
engine = self.engine

engine.download_files_from_archive(self.urls["capture"], archive_type="zip")

# Convert xlsx to csv.
xlsx_file = self.engine.format_filename("DSD_FI_CAPTURE.xlsx")
file_path = self.engine.format_filename("DSD_CAPTURE.csv")
book = xlrd.open_workbook(xlsx_file)
sh = book.sheet_by_index(0)
rows = sh.nrows

# Creating data files
new_data = open_fw(file_path)
csv_writer = open_csvw(new_data)
csv_writer.writerow(["Order", "Concept_id",
"Role_Type", "Codelist_id",
"Codelist_Code_id", "Description"])

for index in range(2, rows):
row = sh.row(index)
# Get each row and format the cell value.
# Data starts at index 2
row_as_list = [to_str(column_value.value) for column_value in row]
csv_writer.writerow(row_as_list)
new_data.close()
file_path = self.engine.format_filename("DSD_FI_CAPTURE.csv")
df = pd.read_excel(xlsx_file)
df.to_csv(file_path, sep=',', encoding=self.encoding, index=False, header=False)

file_names = [
('DSD_FI_CAPTURE.csv', 'capture_data'),
('CL_FI_UNIT.csv', 'unit_data'),
('CL_FI_WATERAREA_GROUPS.csv', 'waterarea_groups'),
('DSD_CAPTURE.csv', 'dsd_capture_data'),
Expand Down
7 changes: 4 additions & 3 deletions scripts/plant_life_hist_eu.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __init__(self, **kwargs):
self.title = "A database on the life history traits of the Northwest European flora"
self.name = "plant-life-hist-eu"
self.retriever_minimum_version = '2.0.dev'
self.version = '1.4.3'
self.version = '1.5.0'
self.ref = "http://www.uni-oldenburg.de/en/biology/landeco/research/projects/leda/"
self.urls = {
"Age_of_first_flowering": "http://www.uni-oldenburg.de/fileadmin/user_upload/biologie/ag/landeco/download/LEDA/Data_files/age_of_first_flowering.txt",
Expand Down Expand Up @@ -53,6 +53,7 @@ def __init__(self, **kwargs):
"Woodiness": "http://www.uni-oldenburg.de/fileadmin/user_upload/biologie/ag/landeco/download/LEDA/Data_files/ssd.txt",
"Terminal_velocity": "http://www.uni-oldenburg.de/fileadmin/user_upload/biologie/ag/landeco/download/LEDA/Data_files/TV.txt",
}
self.encoding = "latin-1"
self.citation = "KLEYER, M., BEKKER, R.M., KNEVEL, I.C., BAKKER, J.P, THOMPSON, K., SONNENSCHEIN, M., POSCHLOD, P., VAN GROENENDAEL, J.M., KLIMES, L., KLIMESOVA, J., KLOTZ, S., RUSCH, G.M., HERMY, M., ADRIAENS, D., BOEDELTJE, G., BOSSUYT, B., DANNEMANN, A., ENDELS, P., GoeTZENBERGER, L., HODGSON, J.G., JACKEL, A-K., KueHN, I., KUNZMANN, D., OZINGA, W.A., RoeMERMANN, C., STADLER, M., SCHLEGELMILCH, J., STEENDAM, H.J., TACKENBERG, O., WILMANN, B., CORNELISSEN, J.H.C., ERIKSSON, O., GARNIER, E., PECO, B. (2008): The LEDA Traitbase: A database of life-history traits of Northwest European flora. Journal of Ecology 96: 1266-1274"
self.keywords = ['plants', 'observational']
self.description = "The LEDA Traitbase provides information on plant traits that describe three key features of plant dynamics: persistence, regeneration and dispersal. "
Expand All @@ -70,8 +71,8 @@ def download(self, engine=None, debug=False):
for key in self.urls:
self.engine.download_file(self.urls[key], self.urls[key].rpartition('/')[-1])
new_file_path = self.engine.format_filename("new" + key)
old_data = open_fr(self.engine.find_file(self.urls[key].rpartition('/')[-1]))
new_data = open_fw(new_file_path)
old_data = open_fr(self.engine.find_file(self.urls[key].rpartition('/')[-1]), encoding=self.encoding)
new_data = open_fw(new_file_path, encoding=self.encoding)
with old_data as file_block:

# after the metadata lines, set data to True
Expand Down
4 changes: 2 additions & 2 deletions version.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ croche_vegetation_data.json,1.0.2
dicerandra_frutescens.json,1.0.2
ecoregions_us.json,1.0.1
elton_traits.json,1.2.1
fao_global_capture_product.py,1.0.0
fao_global_capture_product.py,1.1.0
fish_parasite_hosts.json,1.2.1
flensburg_food_web.py,1.0.4
forest_biomass_china.json,1.2.1
Expand Down Expand Up @@ -73,7 +73,7 @@ pantheria.py,1.3.3
partners_in_flight.json,1.0.1
phytoplankton_size.json,1.2.1
plant_comp_ok.json,1.2.1
plant_life_hist_eu.py,1.4.3
plant_life_hist_eu.py,1.5.0
plant_occur_oosting.json,1.2.1
plant_taxonomy_us.json,1.1.3
poker_hands.json,1.2.2
Expand Down

0 comments on commit ea5150d

Please sign in to comment.