Skip to content

Commit

Permalink
Parse _rels/.rels to get paths to package parts (#437)
Browse files Browse the repository at this point in the history
Fixes #435 Be able to read sheets produced by the Los Angeles Police Department

* Parse _rels/.rels to get paths to package parts

* Add NEWS bullet

* Rename deSlash() --> removeLeadingSlashes(); address "all slash" edge case
  • Loading branch information
jennybc committed Mar 26, 2018
1 parent 20c61ce commit eeeebf8
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 19 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# readxl 1.0.0.9000

* xlsx structured as a "minimal conformant SpreadsheetML package" can be read. Most obvious feature of such sheets is the lack of an `xl/` directory in the unzipped form. (xlsx, #435, #437)

* Reading xls sheet with exactly 65,536 rows no longer enters an infinite loop. (xls, #373, #416, #432 @vkapartzianis)

* Datetimes coerced to character from xls have much higher precision, comparable to the xlsx behaviour. (xls, #430, #431)
Expand Down
111 changes: 93 additions & 18 deletions src/XlsxWorkBook.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,58 @@

class XlsxWorkBook {

// holds objects related to sheet position, name, Id, and xml target file
class SheetRelations {
// holds objects related to package parts, such as file paths, but also
// worksheet position, name, and id
class PackageRelations {
// non-worksheet package parts
std::map<std::string, std::string> part_;

// worksheets
int n_;
Rcpp::CharacterVector names_;
Rcpp::CharacterVector id_;
std::map<std::string, std::string> target_;

// populate workbook element of part_
void parse_package_rels(const std::string& path) {
std::string rels_xml_file = zip_buffer(path, "_rels/.rels");
rapidxml::xml_document<> rels_xml;
rels_xml.parse<rapidxml::parse_strip_xml_namespaces>(&rels_xml_file[0]);

rapidxml::xml_node<>* relationships = rels_xml.first_node("Relationships");
if (relationships == NULL) {
Rcpp::stop("Spreadsheet has no package-level relationships");
}

std::map<std::string, std::string> scratch;
for (rapidxml::xml_node<>* relationship = relationships->first_node();
relationship; relationship = relationship->next_sibling()) {

rapidxml::xml_attribute<>* type = relationship->first_attribute("Type");
rapidxml::xml_attribute<>* target = relationship->first_attribute("Target");

if (type != NULL && target != NULL) {
// keys are derived by abbreviating Type
// in XML: http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument
// key is just the last part: officeDocument
scratch[baseName(type->value())] = target->value();
}
}

// identify and store the workbook part = the main part
// ECMA-376 Part 1 section 8.5 SpreadsheetML
std::map<std::string, std::string>::iterator m = scratch.find("officeDocument");
if (m == scratch.end()) {
Rcpp::stop("No workbook part found");
}
// store 'xl/workbook.xml', not '/xl/workbook.xml' (rare, but have seen)
part_["officeDocument"] = removeLeadingSlashes(m->second);
}

// populates n_, names_, id_
void parse_workbook(const std::string& path) {
std::string workbookXml = zip_buffer(path, "xl/workbook.xml");
std::string workbookXml = zip_buffer(path, part_["officeDocument"]);

rapidxml::xml_document<> workbook;
workbook.parse<rapidxml::parse_strip_xml_namespaces>(&workbookXml[0]);

Expand Down Expand Up @@ -59,7 +101,13 @@ class XlsxWorkBook {

// populates target_
void parse_workbook_rels(const std::string& path) {
std::string rels_xml_file = zip_buffer(path, "xl/_rels/workbook.xml.rels");
const std::string workbook_path = part_["officeDocument"];
const std::string workbook_dir = dirName(workbook_path);
std::string rels_xml_path =
workbook_dir + "/_rels/" + baseName(workbook_path) + ".rels";
rels_xml_path = removeLeadingSlashes(rels_xml_path);
std::string rels_xml_file = zip_buffer(path, rels_xml_path);

rapidxml::xml_document<> rels_xml;
rels_xml.parse<rapidxml::parse_strip_xml_namespaces>(&rels_xml_file[0]);

Expand All @@ -70,25 +118,42 @@ class XlsxWorkBook {

for (rapidxml::xml_node<>* relationship = relationships->first_node();
relationship; relationship = relationship->next_sibling()) {

rapidxml::xml_attribute<>* id = relationship->first_attribute("Id");
rapidxml::xml_attribute<>* type = relationship->first_attribute("Type");
rapidxml::xml_attribute<>* target = relationship->first_attribute("Target");
if (id != NULL && target != NULL) {
static const std::string prefix = "/xl/";
std::string target_value = target->value();
if (target_value.substr(0, prefix.size()) == prefix) {
target_value = target_value.substr(prefix.size());

if (id != NULL && type != NULL && target != NULL) {
// store 'xl/blah' instead of '/xl/blah' (rare, but we've seen)
std::string target_value = removeLeadingSlashes(target->value());
// abbreviate Type
// in XML: http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings
// whereas we only want: sharedStrings
std::string type_value = baseName(type->value());

// prepend workbook_dir iff needed; yes, we've seen both forms
// xl/blah --> xl/blah
// blah --> xl/blah
if (target_value.substr(0, workbook_dir.size()) != workbook_dir) {
target_value = workbook_dir + "/" + target_value;
}

if (type_value.compare("worksheet") == 0) { // worksheet
target_[id->value()] = target_value;
} else { // not a worksheet, e.g. styles or sharedStrings
part_[type_value] = target_value;
}
target_[id->value()] = target_value;
}
}
}

public:
SheetRelations(const std::string& path) :
PackageRelations(const std::string& path) :
n_(100),
names_(n_),
id_(n_)
{
parse_package_rels(path);
parse_workbook(path);
parse_workbook_rels(path);
}
Expand All @@ -109,15 +174,25 @@ class XlsxWorkBook {
}
return it->second;
}
}; // end of class SheetRelations

std::string part(std::string type) const {
std::map<std::string, std::string>::const_iterator it = part_.find(type);
if (it == part_.end()) {
// e.g., sharedStrings may not be present
// downstream functions should handle non-existent path
return "";
}
return it->second;
}
}; // end of class PackageRelations

// common to Xls[x]WorkBook
std::string path_;
bool is1904_;
std::set<int> dateFormats_;

// specific to XlsxWorkBook
SheetRelations rel_;
PackageRelations rel_;
std::vector<std::string> stringTable_;

public:
Expand Down Expand Up @@ -152,7 +227,7 @@ class XlsxWorkBook {
}

std::string sheetPath(int sheet_i) const {
return "xl/" + rel_.target(sheet_i);
return rel_.target(sheet_i);
}

const std::vector<std::string>& stringTable() const {
Expand All @@ -162,11 +237,11 @@ class XlsxWorkBook {
private:

void cacheStringTable() {
if (!zip_has_file(path_, "xl/sharedStrings.xml")) {
if (!zip_has_file(path_, rel_.part("sharedStrings"))) {
return;
}

std::string sharedStringsXml = zip_buffer(path_, "xl/sharedStrings.xml");
std::string sharedStringsXml = zip_buffer(path_, rel_.part("sharedStrings"));
rapidxml::xml_document<> sharedStrings;
sharedStrings.parse<rapidxml::parse_strip_xml_namespaces>(&sharedStringsXml[0]);

Expand All @@ -191,7 +266,7 @@ class XlsxWorkBook {
}

void cacheDateFormats() {
std::string stylesXml = zip_buffer(path_, "xl/styles.xml");
std::string stylesXml = zip_buffer(path_, rel_.part("styles"));
rapidxml::xml_document<> styles;
styles.parse<rapidxml::parse_strip_xml_namespaces>(&stylesXml[0]);

Expand Down Expand Up @@ -246,7 +321,7 @@ class XlsxWorkBook {
}

bool uses1904() {
std::string workbookXml = zip_buffer(path_, "xl/workbook.xml");
std::string workbookXml = zip_buffer(path_, rel_.part("officeDocument"));
rapidxml::xml_document<> workbook;
workbook.parse<rapidxml::parse_strip_xml_namespaces>(&workbookXml[0]);

Expand Down
24 changes: 24 additions & 0 deletions src/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,4 +145,28 @@ inline std::string trim(const std::string& s) {
return s.substr(begin, end - begin + 1);
}

inline std::string dirName(const std::string& path) {
std::size_t found = path.find_last_of("/");
if (found == std::string::npos) {
return "";
}
return path.substr(0, found);
}

inline std::string baseName(const std::string& path) {
std::size_t found = path.find_last_of("/");
if (found == std::string::npos) {
return path;
}
return path.substr(found + 1);
}

inline std::string removeLeadingSlashes(const std::string& s) {
size_t start = s.find_first_not_of("/");
if (start == std::string::npos) {
return "";
}
return s.substr(start);
}

#endif
1 change: 0 additions & 1 deletion src/zip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,3 @@ void zip_xml(const std::string& zip_path,
std::string buffer = zip_buffer(zip_path, file_path);
Rcout << xml_print(buffer);
}

14 changes: 14 additions & 0 deletions tests/testthat/test-compatibility.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ test_that("we can read the BIFF5, LABEL record sheet", {
expect_identical(df$Time[c(1, 14)], c("01:00", "14:00"))
})


## https://github.com/tidyverse/readxl/pull/429
## <c r="C2" s="1" t="str"><f>A2 + B2</f></c>
test_that("formula cell with no v node does not cause crash", {
Expand All @@ -52,3 +53,16 @@ test_that("formula cell with no v node does not cause crash", {
)
expect_identical(df$`A + B`, NA)
})

## https://github.com/tidyverse/readxl/issues/435
## https://source.opennews.org/articles/how-we-found-new-patterns-la-homeless-arrest/
## LAPD uses a tool to produce xlsx that implements the minimal SpreadsheetML
## package structure described on pp65-66 of ECMA 5th edition
test_that("we can read LAPD arrest sheets", {
expect_silent(
lapd <- read_excel(test_sheet("los-angeles-arrests-xlsx.xlsx"), skip = 2)
)
expect_identical(dim(lapd), c(193L, 36L))
expect_match(lapd$ARR_LOC[9], "HOLLYWOOD")
expect_identical(lapd$CHG_DESC[27], "EX CON W/ A GUN")
})

0 comments on commit eeeebf8

Please sign in to comment.