Skip to content

Commit

Permalink
Use Pandas for Excel to CSV (#1630)
Browse files Browse the repository at this point in the history
* Use openpyxl and Pandas to transform Excel to CSV

* Add install openpyxl

* Update scripts

plant-life-hist-eu: Use encoding to process files
fao-global-capture-product: Use pandas to_csv
  • Loading branch information
henrykironde committed Nov 5, 2021
1 parent 326aa0d commit ea5150d
Show file tree
Hide file tree
Showing 7 changed files with 18 additions and 43 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docker-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ jobs:
- name: Install python dependencies
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest yapf codecov pytest-cov pytest-xdist -U
python -m pip install flake8 pytest yapf codecov pytest-cov pytest-xdist openpyxl -U
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Install retriever
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ jobs:
- name: Install python dependencies
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest yapf codecov pytest-cov pytest-xdist -U
python -m pip install flake8 pytest yapf codecov pytest-cov pytest-xdist openpyxl -U
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Install retriever
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
future
xlrd>=0.7
openpyxl
argcomplete
kaggle
PyMySQL>=0.4
Expand Down
17 changes: 3 additions & 14 deletions retriever/lib/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys

import xlrd
import pandas as pd

from retriever.lib.defaults import ENCODING

Expand All @@ -14,20 +15,8 @@ def excel_csv(src_path, path_to_csv, excel_info=None, encoding=ENCODING):
Read src_path excel file and write the excel sheet to path_to_csv
excel_info contains the index of the sheet and the excel file name
"""
book = xlrd.open_workbook(src_path, encoding_override=encoding, on_demand=True)
sheet_object = book.sheet_by_index(excel_info[0])
rows = sheet_object.nrows
dest_path = path_to_csv
new_data = open_fw(dest_path)
csv_writer = open_csvw(new_data)
for index in range(0, rows):
row = sheet_object.row(index)
# Get each row and format the cell value.
row_as_list = [to_str(column_value.value) for column_value in row]
csv_writer.writerow(row_as_list)
new_data.close()
book.release_resources()
del book
df = pd.read_excel(src_path, sheet_name=excel_info[1])
df.to_csv(path_to_csv, sep=',', encoding=encoding, index=False, header=True)


def open_fr(file_name, encoding=ENCODING, encode=True):
Expand Down
28 changes: 6 additions & 22 deletions scripts/fao_global_capture_product.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
reload(sys)
if hasattr(sys, 'setdefaultencoding'):
sys.setdefaultencoding("UTF-8")
import xlrd
import pandas as pd

from retriever.lib.models import Table, Cleanup, correct_invalid_value
from retriever.lib.templates import Script
Expand All @@ -33,7 +33,7 @@ def __init__(self, **kwargs):
self.retriever_minimum_version = '2.1.dev'
self.urls = {
"capture": "http://www.fao.org/fishery/static/Data/Capture_2018.1.2.zip"}
self.version = '1.0.0'
self.version = '1.1.0'
self.ref = "http://www.fao.org/fishery/statistics/global-capture-production/"
self.citation = "FAO. 2018. FAO yearbook. Fishery and Aquaculture Statistics " \
"2016/FAO annuaire. Statistiques des pêches et de l'aquaculture " \
Expand All @@ -49,32 +49,16 @@ def __init__(self, **kwargs):
def download(self, engine=None, debug=False):
Script.download(self, engine, debug)
engine = self.engine

engine.download_files_from_archive(self.urls["capture"], archive_type="zip")

# Convert xlsx to csv.
xlsx_file = self.engine.format_filename("DSD_FI_CAPTURE.xlsx")
file_path = self.engine.format_filename("DSD_CAPTURE.csv")
book = xlrd.open_workbook(xlsx_file)
sh = book.sheet_by_index(0)
rows = sh.nrows

# Creating data files
new_data = open_fw(file_path)
csv_writer = open_csvw(new_data)
csv_writer.writerow(["Order", "Concept_id",
"Role_Type", "Codelist_id",
"Codelist_Code_id", "Description"])

for index in range(2, rows):
row = sh.row(index)
# Get each row and format the cell value.
# Data starts at index 2
row_as_list = [to_str(column_value.value) for column_value in row]
csv_writer.writerow(row_as_list)
new_data.close()
file_path = self.engine.format_filename("DSD_FI_CAPTURE.csv")
df = pd.read_excel(xlsx_file)
df.to_csv(file_path, sep=',', encoding=self.encoding, index=False, header=False)

file_names = [
('DSD_FI_CAPTURE.csv', 'capture_data'),
('CL_FI_UNIT.csv', 'unit_data'),
('CL_FI_WATERAREA_GROUPS.csv', 'waterarea_groups'),
('DSD_CAPTURE.csv', 'dsd_capture_data'),
Expand Down
7 changes: 4 additions & 3 deletions scripts/plant_life_hist_eu.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __init__(self, **kwargs):
self.title = "A database on the life history traits of the Northwest European flora"
self.name = "plant-life-hist-eu"
self.retriever_minimum_version = '2.0.dev'
self.version = '1.4.3'
self.version = '1.5.0'
self.ref = "http://www.uni-oldenburg.de/en/biology/landeco/research/projects/leda/"
self.urls = {
"Age_of_first_flowering": "http://www.uni-oldenburg.de/fileadmin/user_upload/biologie/ag/landeco/download/LEDA/Data_files/age_of_first_flowering.txt",
Expand Down Expand Up @@ -53,6 +53,7 @@ def __init__(self, **kwargs):
"Woodiness": "http://www.uni-oldenburg.de/fileadmin/user_upload/biologie/ag/landeco/download/LEDA/Data_files/ssd.txt",
"Terminal_velocity": "http://www.uni-oldenburg.de/fileadmin/user_upload/biologie/ag/landeco/download/LEDA/Data_files/TV.txt",
}
self.encoding = "latin-1"
self.citation = "KLEYER, M., BEKKER, R.M., KNEVEL, I.C., BAKKER, J.P, THOMPSON, K., SONNENSCHEIN, M., POSCHLOD, P., VAN GROENENDAEL, J.M., KLIMES, L., KLIMESOVA, J., KLOTZ, S., RUSCH, G.M., HERMY, M., ADRIAENS, D., BOEDELTJE, G., BOSSUYT, B., DANNEMANN, A., ENDELS, P., GoeTZENBERGER, L., HODGSON, J.G., JACKEL, A-K., KueHN, I., KUNZMANN, D., OZINGA, W.A., RoeMERMANN, C., STADLER, M., SCHLEGELMILCH, J., STEENDAM, H.J., TACKENBERG, O., WILMANN, B., CORNELISSEN, J.H.C., ERIKSSON, O., GARNIER, E., PECO, B. (2008): The LEDA Traitbase: A database of life-history traits of Northwest European flora. Journal of Ecology 96: 1266-1274"
self.keywords = ['plants', 'observational']
self.description = "The LEDA Traitbase provides information on plant traits that describe three key features of plant dynamics: persistence, regeneration and dispersal. "
Expand All @@ -70,8 +71,8 @@ def download(self, engine=None, debug=False):
for key in self.urls:
self.engine.download_file(self.urls[key], self.urls[key].rpartition('/')[-1])
new_file_path = self.engine.format_filename("new" + key)
old_data = open_fr(self.engine.find_file(self.urls[key].rpartition('/')[-1]))
new_data = open_fw(new_file_path)
old_data = open_fr(self.engine.find_file(self.urls[key].rpartition('/')[-1]), encoding=self.encoding)
new_data = open_fw(new_file_path, encoding=self.encoding)
with old_data as file_block:

# after the metadata lines, set data to True
Expand Down
4 changes: 2 additions & 2 deletions version.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ croche_vegetation_data.json,1.0.2
dicerandra_frutescens.json,1.0.2
ecoregions_us.json,1.0.1
elton_traits.json,1.2.1
fao_global_capture_product.py,1.0.0
fao_global_capture_product.py,1.1.0
fish_parasite_hosts.json,1.2.1
flensburg_food_web.py,1.0.4
forest_biomass_china.json,1.2.1
Expand Down Expand Up @@ -73,7 +73,7 @@ pantheria.py,1.3.3
partners_in_flight.json,1.0.1
phytoplankton_size.json,1.2.1
plant_comp_ok.json,1.2.1
plant_life_hist_eu.py,1.4.3
plant_life_hist_eu.py,1.5.0
plant_occur_oosting.json,1.2.1
plant_taxonomy_us.json,1.1.3
poker_hands.json,1.2.2
Expand Down

0 comments on commit ea5150d

Please sign in to comment.