From eeeebf8171540a7cd14b373d20b08efbac7e3cd2 Mon Sep 17 00:00:00 2001 From: "Jennifer (Jenny) Bryan" Date: Mon, 26 Mar 2018 13:46:08 -0700 Subject: [PATCH] Parse _rels/.rels to get paths to package parts (#437) Fixes #435 Be able to read sheets produced by the Los Angeles Police Department * Parse _rels/.rels to get paths to package parts * Add NEWS bullet * Rename deSlash() --> removeLeadingSlashes(); address "all slash" edge case --- NEWS.md | 2 + src/XlsxWorkBook.h | 111 +++++++++++++++++++++++----- src/utils.h | 24 ++++++ src/zip.cpp | 1 - tests/testthat/test-compatibility.R | 14 ++++ 5 files changed, 133 insertions(+), 19 deletions(-) diff --git a/NEWS.md b/NEWS.md index c3ebba99..27f7ea7e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # readxl 1.0.0.9000 +* xlsx structured as a "minimal conformant SpreadsheetML package" can be read. Most obvious feature of such sheets is the lack of an `xl/` directory in the unzipped form. (xlsx, #435, #437) + * Reading xls sheet with exactly 65,536 rows no longer enters an infinite loop. (xls, #373, #416, #432 @vkapartzianis) * Datetimes coerced to character from xls have much higher precision, comparable to the xlsx behaviour. (xls, #430, #431) diff --git a/src/XlsxWorkBook.h b/src/XlsxWorkBook.h index e37f0ccc..aa67e84f 100644 --- a/src/XlsxWorkBook.h +++ b/src/XlsxWorkBook.h @@ -10,16 +10,58 @@ class XlsxWorkBook { - // holds objects related to sheet position, name, Id, and xml target file - class SheetRelations { + // holds objects related to package parts, such as file paths, but also + // worksheet position, name, and id + class PackageRelations { + // non-worksheet package parts + std::map part_; + + // worksheets int n_; Rcpp::CharacterVector names_; Rcpp::CharacterVector id_; std::map target_; + // populate workbook element of part_ + void parse_package_rels(const std::string& path) { + std::string rels_xml_file = zip_buffer(path, "_rels/.rels"); + rapidxml::xml_document<> rels_xml; + rels_xml.parse(&rels_xml_file[0]); + + rapidxml::xml_node<>* relationships = rels_xml.first_node("Relationships"); + if (relationships == NULL) { + Rcpp::stop("Spreadsheet has no package-level relationships"); + } + + std::map scratch; + for (rapidxml::xml_node<>* relationship = relationships->first_node(); + relationship; relationship = relationship->next_sibling()) { + + rapidxml::xml_attribute<>* type = relationship->first_attribute("Type"); + rapidxml::xml_attribute<>* target = relationship->first_attribute("Target"); + + if (type != NULL && target != NULL) { + // keys are derived by abbreviating Type + // in XML: http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument + // key is just the last part: officeDocument + scratch[baseName(type->value())] = target->value(); + } + } + + // identify and store the workbook part = the main part + // ECMA-376 Part 1 section 8.5 SpreadsheetML + std::map::iterator m = scratch.find("officeDocument"); + if (m == scratch.end()) { + Rcpp::stop("No workbook part found"); + } + // store 'xl/workbook.xml', not '/xl/workbook.xml' (rare, but have seen) + part_["officeDocument"] = removeLeadingSlashes(m->second); + } + // populates n_, names_, id_ void parse_workbook(const std::string& path) { - std::string workbookXml = zip_buffer(path, "xl/workbook.xml"); + std::string workbookXml = zip_buffer(path, part_["officeDocument"]); + rapidxml::xml_document<> workbook; workbook.parse(&workbookXml[0]); @@ -59,7 +101,13 @@ class XlsxWorkBook { // populates target_ void parse_workbook_rels(const std::string& path) { - std::string rels_xml_file = zip_buffer(path, "xl/_rels/workbook.xml.rels"); + const std::string workbook_path = part_["officeDocument"]; + const std::string workbook_dir = dirName(workbook_path); + std::string rels_xml_path = + workbook_dir + "/_rels/" + baseName(workbook_path) + ".rels"; + rels_xml_path = removeLeadingSlashes(rels_xml_path); + std::string rels_xml_file = zip_buffer(path, rels_xml_path); + rapidxml::xml_document<> rels_xml; rels_xml.parse(&rels_xml_file[0]); @@ -70,25 +118,42 @@ class XlsxWorkBook { for (rapidxml::xml_node<>* relationship = relationships->first_node(); relationship; relationship = relationship->next_sibling()) { + rapidxml::xml_attribute<>* id = relationship->first_attribute("Id"); + rapidxml::xml_attribute<>* type = relationship->first_attribute("Type"); rapidxml::xml_attribute<>* target = relationship->first_attribute("Target"); - if (id != NULL && target != NULL) { - static const std::string prefix = "/xl/"; - std::string target_value = target->value(); - if (target_value.substr(0, prefix.size()) == prefix) { - target_value = target_value.substr(prefix.size()); + + if (id != NULL && type != NULL && target != NULL) { + // store 'xl/blah' instead of '/xl/blah' (rare, but we've seen) + std::string target_value = removeLeadingSlashes(target->value()); + // abbreviate Type + // in XML: http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings + // whereas we only want: sharedStrings + std::string type_value = baseName(type->value()); + + // prepend workbook_dir iff needed; yes, we've seen both forms + // xl/blah --> xl/blah + // blah --> xl/blah + if (target_value.substr(0, workbook_dir.size()) != workbook_dir) { + target_value = workbook_dir + "/" + target_value; + } + + if (type_value.compare("worksheet") == 0) { // worksheet + target_[id->value()] = target_value; + } else { // not a worksheet, e.g. styles or sharedStrings + part_[type_value] = target_value; } - target_[id->value()] = target_value; } } } public: - SheetRelations(const std::string& path) : + PackageRelations(const std::string& path) : n_(100), names_(n_), id_(n_) { + parse_package_rels(path); parse_workbook(path); parse_workbook_rels(path); } @@ -109,7 +174,17 @@ class XlsxWorkBook { } return it->second; } - }; // end of class SheetRelations + + std::string part(std::string type) const { + std::map::const_iterator it = part_.find(type); + if (it == part_.end()) { + // e.g., sharedStrings may not be present + // downstream functions should handle non-existent path + return ""; + } + return it->second; + } + }; // end of class PackageRelations // common to Xls[x]WorkBook std::string path_; @@ -117,7 +192,7 @@ class XlsxWorkBook { std::set dateFormats_; // specific to XlsxWorkBook - SheetRelations rel_; + PackageRelations rel_; std::vector stringTable_; public: @@ -152,7 +227,7 @@ class XlsxWorkBook { } std::string sheetPath(int sheet_i) const { - return "xl/" + rel_.target(sheet_i); + return rel_.target(sheet_i); } const std::vector& stringTable() const { @@ -162,11 +237,11 @@ class XlsxWorkBook { private: void cacheStringTable() { - if (!zip_has_file(path_, "xl/sharedStrings.xml")) { + if (!zip_has_file(path_, rel_.part("sharedStrings"))) { return; } - std::string sharedStringsXml = zip_buffer(path_, "xl/sharedStrings.xml"); + std::string sharedStringsXml = zip_buffer(path_, rel_.part("sharedStrings")); rapidxml::xml_document<> sharedStrings; sharedStrings.parse(&sharedStringsXml[0]); @@ -191,7 +266,7 @@ class XlsxWorkBook { } void cacheDateFormats() { - std::string stylesXml = zip_buffer(path_, "xl/styles.xml"); + std::string stylesXml = zip_buffer(path_, rel_.part("styles")); rapidxml::xml_document<> styles; styles.parse(&stylesXml[0]); @@ -246,7 +321,7 @@ class XlsxWorkBook { } bool uses1904() { - std::string workbookXml = zip_buffer(path_, "xl/workbook.xml"); + std::string workbookXml = zip_buffer(path_, rel_.part("officeDocument")); rapidxml::xml_document<> workbook; workbook.parse(&workbookXml[0]); diff --git a/src/utils.h b/src/utils.h index 487a189a..a349f496 100644 --- a/src/utils.h +++ b/src/utils.h @@ -145,4 +145,28 @@ inline std::string trim(const std::string& s) { return s.substr(begin, end - begin + 1); } +inline std::string dirName(const std::string& path) { + std::size_t found = path.find_last_of("/"); + if (found == std::string::npos) { + return ""; + } + return path.substr(0, found); +} + +inline std::string baseName(const std::string& path) { + std::size_t found = path.find_last_of("/"); + if (found == std::string::npos) { + return path; + } + return path.substr(found + 1); +} + +inline std::string removeLeadingSlashes(const std::string& s) { + size_t start = s.find_first_not_of("/"); + if (start == std::string::npos) { + return ""; + } + return s.substr(start); +} + #endif diff --git a/src/zip.cpp b/src/zip.cpp index 2116064d..2898d51f 100644 --- a/src/zip.cpp +++ b/src/zip.cpp @@ -48,4 +48,3 @@ void zip_xml(const std::string& zip_path, std::string buffer = zip_buffer(zip_path, file_path); Rcout << xml_print(buffer); } - diff --git a/tests/testthat/test-compatibility.R b/tests/testthat/test-compatibility.R index 9db0d848..92caf301 100644 --- a/tests/testthat/test-compatibility.R +++ b/tests/testthat/test-compatibility.R @@ -44,6 +44,7 @@ test_that("we can read the BIFF5, LABEL record sheet", { expect_identical(df$Time[c(1, 14)], c("01:00", "14:00")) }) + ## https://github.com/tidyverse/readxl/pull/429 ## A2 + B2 test_that("formula cell with no v node does not cause crash", { @@ -52,3 +53,16 @@ test_that("formula cell with no v node does not cause crash", { ) expect_identical(df$`A + B`, NA) }) + +## https://github.com/tidyverse/readxl/issues/435 +## https://source.opennews.org/articles/how-we-found-new-patterns-la-homeless-arrest/ +## LAPD uses a tool to produce xlsx that implements the minimal SpreadsheetML +## package structure described on pp65-66 of ECMA 5th edition +test_that("we can read LAPD arrest sheets", { + expect_silent( + lapd <- read_excel(test_sheet("los-angeles-arrests-xlsx.xlsx"), skip = 2) + ) + expect_identical(dim(lapd), c(193L, 36L)) + expect_match(lapd$ARR_LOC[9], "HOLLYWOOD") + expect_identical(lapd$CHG_DESC[27], "EX CON W/ A GUN") +})