From eeeebf8171540a7cd14b373d20b08efbac7e3cd2 Mon Sep 17 00:00:00 2001
From: "Jennifer (Jenny) Bryan" <jenny.f.bryan@gmail.com>
Date: Mon, 26 Mar 2018 13:46:08 -0700
Subject: [PATCH] Parse _rels/.rels to get paths to package parts (#437)

Fixes #435 Be able to read sheets produced by the Los Angeles Police Department

* Parse _rels/.rels to get paths to package parts

* Add NEWS bullet

* Rename deSlash() --> removeLeadingSlashes(); address "all slash" edge case
---
 NEWS.md                             |   2 +
 src/XlsxWorkBook.h                  | 111 +++++++++++++++++++++++-----
 src/utils.h                         |  24 ++++++
 src/zip.cpp                         |   1 -
 tests/testthat/test-compatibility.R |  14 ++++
 5 files changed, 133 insertions(+), 19 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index c3ebba99..27f7ea7e 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,7 @@
 # readxl 1.0.0.9000
 
+* xlsx structured as a "minimal conformant SpreadsheetML package" can be read. Most obvious feature of such sheets is the lack of an `xl/` directory in the unzipped form. (xlsx, #435, #437)
+
 * Reading xls sheet with exactly 65,536 rows no longer enters an infinite loop. (xls, #373, #416, #432 @vkapartzianis)
 
 * Datetimes coerced to character from xls have much higher precision, comparable to the xlsx behaviour. (xls, #430, #431)
diff --git a/src/XlsxWorkBook.h b/src/XlsxWorkBook.h
index e37f0ccc..aa67e84f 100644
--- a/src/XlsxWorkBook.h
+++ b/src/XlsxWorkBook.h
@@ -10,16 +10,58 @@
 
 class XlsxWorkBook {
 
-  // holds objects related to sheet position, name, Id, and xml target file
-  class SheetRelations {
+  // holds objects related to package parts, such as file paths, but also
+  // worksheet position, name, and id
+  class PackageRelations {
+    // non-worksheet package parts
+    std::map<std::string, std::string> part_;
+
+    // worksheets
     int n_;
     Rcpp::CharacterVector names_;
     Rcpp::CharacterVector id_;
     std::map<std::string, std::string> target_;
 
+    // populate workbook element of part_
+    void parse_package_rels(const std::string& path) {
+      std::string rels_xml_file = zip_buffer(path, "_rels/.rels");
+      rapidxml::xml_document<> rels_xml;
+      rels_xml.parse<rapidxml::parse_strip_xml_namespaces>(&rels_xml_file[0]);
+
+      rapidxml::xml_node<>* relationships = rels_xml.first_node("Relationships");
+      if (relationships == NULL) {
+        Rcpp::stop("Spreadsheet has no package-level relationships");
+      }
+
+      std::map<std::string, std::string> scratch;
+      for (rapidxml::xml_node<>* relationship = relationships->first_node();
+           relationship; relationship = relationship->next_sibling()) {
+
+        rapidxml::xml_attribute<>* type = relationship->first_attribute("Type");
+        rapidxml::xml_attribute<>* target = relationship->first_attribute("Target");
+
+        if (type != NULL && target != NULL) {
+          // keys are derived by abbreviating Type
+          // in XML: http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument
+          // key is just the last part: officeDocument
+          scratch[baseName(type->value())] = target->value();
+        }
+      }
+
+      // identify and store the workbook part = the main part
+      // ECMA-376 Part 1 section 8.5 SpreadsheetML
+      std::map<std::string, std::string>::iterator m = scratch.find("officeDocument");
+      if (m == scratch.end()) {
+        Rcpp::stop("No workbook part found");
+      }
+      // store 'xl/workbook.xml', not '/xl/workbook.xml' (rare, but have seen)
+      part_["officeDocument"] = removeLeadingSlashes(m->second);
+    }
+
     // populates n_, names_, id_
     void parse_workbook(const std::string& path) {
-      std::string workbookXml = zip_buffer(path, "xl/workbook.xml");
+      std::string workbookXml = zip_buffer(path, part_["officeDocument"]);
+
       rapidxml::xml_document<> workbook;
       workbook.parse<rapidxml::parse_strip_xml_namespaces>(&workbookXml[0]);
 
@@ -59,7 +101,13 @@ class XlsxWorkBook {
 
     // populates target_
     void parse_workbook_rels(const std::string& path) {
-      std::string rels_xml_file = zip_buffer(path, "xl/_rels/workbook.xml.rels");
+      const std::string workbook_path = part_["officeDocument"];
+      const std::string workbook_dir = dirName(workbook_path);
+      std::string rels_xml_path =
+        workbook_dir + "/_rels/" + baseName(workbook_path) + ".rels";
+      rels_xml_path = removeLeadingSlashes(rels_xml_path);
+      std::string rels_xml_file = zip_buffer(path, rels_xml_path);
+
       rapidxml::xml_document<> rels_xml;
       rels_xml.parse<rapidxml::parse_strip_xml_namespaces>(&rels_xml_file[0]);
 
@@ -70,25 +118,42 @@ class XlsxWorkBook {
 
       for (rapidxml::xml_node<>* relationship = relationships->first_node();
            relationship; relationship = relationship->next_sibling()) {
+
         rapidxml::xml_attribute<>* id = relationship->first_attribute("Id");
+        rapidxml::xml_attribute<>* type = relationship->first_attribute("Type");
         rapidxml::xml_attribute<>* target = relationship->first_attribute("Target");
-        if (id != NULL && target != NULL) {
-          static const std::string prefix = "/xl/";
-          std::string target_value = target->value();
-          if (target_value.substr(0, prefix.size()) == prefix) {
-            target_value = target_value.substr(prefix.size());
+
+        if (id != NULL && type != NULL && target != NULL) {
+          // store 'xl/blah' instead of '/xl/blah' (rare, but we've seen)
+          std::string target_value = removeLeadingSlashes(target->value());
+          // abbreviate Type
+          // in XML: http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings
+          // whereas we only want: sharedStrings
+          std::string type_value = baseName(type->value());
+
+          // prepend workbook_dir iff needed; yes, we've seen both forms
+          // xl/blah --> xl/blah
+          //    blah --> xl/blah
+          if (target_value.substr(0, workbook_dir.size()) != workbook_dir) {
+            target_value = workbook_dir + "/" + target_value;
+          }
+
+          if (type_value.compare("worksheet") == 0) { // worksheet
+            target_[id->value()] = target_value;
+          } else { // not a worksheet, e.g. styles or sharedStrings
+            part_[type_value] = target_value;
           }
-          target_[id->value()] = target_value;
         }
       }
     }
 
   public:
-    SheetRelations(const std::string& path) :
+    PackageRelations(const std::string& path) :
     n_(100),
     names_(n_),
     id_(n_)
     {
+      parse_package_rels(path);
       parse_workbook(path);
       parse_workbook_rels(path);
     }
@@ -109,7 +174,17 @@ class XlsxWorkBook {
       }
       return it->second;
     }
-  }; // end of class SheetRelations
+
+    std::string part(std::string type) const {
+      std::map<std::string, std::string>::const_iterator it = part_.find(type);
+      if (it == part_.end()) {
+        // e.g., sharedStrings may not be present
+        // downstream functions should handle non-existent path
+        return "";
+      }
+      return it->second;
+    }
+  }; // end of class PackageRelations
 
   // common to Xls[x]WorkBook
   std::string path_;
@@ -117,7 +192,7 @@ class XlsxWorkBook {
   std::set<int> dateFormats_;
 
   // specific to XlsxWorkBook
-  SheetRelations rel_;
+  PackageRelations rel_;
   std::vector<std::string> stringTable_;
 
 public:
@@ -152,7 +227,7 @@ class XlsxWorkBook {
   }
 
   std::string sheetPath(int sheet_i) const {
-    return "xl/" + rel_.target(sheet_i);
+    return rel_.target(sheet_i);
   }
 
   const std::vector<std::string>& stringTable() const {
@@ -162,11 +237,11 @@ class XlsxWorkBook {
 private:
 
   void cacheStringTable() {
-    if (!zip_has_file(path_, "xl/sharedStrings.xml")) {
+    if (!zip_has_file(path_, rel_.part("sharedStrings"))) {
       return;
     }
 
-    std::string sharedStringsXml = zip_buffer(path_, "xl/sharedStrings.xml");
+    std::string sharedStringsXml = zip_buffer(path_, rel_.part("sharedStrings"));
     rapidxml::xml_document<> sharedStrings;
     sharedStrings.parse<rapidxml::parse_strip_xml_namespaces>(&sharedStringsXml[0]);
 
@@ -191,7 +266,7 @@ class XlsxWorkBook {
   }
 
   void cacheDateFormats() {
-    std::string stylesXml = zip_buffer(path_, "xl/styles.xml");
+    std::string stylesXml = zip_buffer(path_, rel_.part("styles"));
     rapidxml::xml_document<> styles;
     styles.parse<rapidxml::parse_strip_xml_namespaces>(&stylesXml[0]);
 
@@ -246,7 +321,7 @@ class XlsxWorkBook {
   }
 
   bool uses1904() {
-    std::string workbookXml = zip_buffer(path_, "xl/workbook.xml");
+    std::string workbookXml = zip_buffer(path_, rel_.part("officeDocument"));
     rapidxml::xml_document<> workbook;
     workbook.parse<rapidxml::parse_strip_xml_namespaces>(&workbookXml[0]);
 
diff --git a/src/utils.h b/src/utils.h
index 487a189a..a349f496 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -145,4 +145,28 @@ inline std::string trim(const std::string& s) {
   return s.substr(begin, end - begin + 1);
 }
 
+inline std::string dirName(const std::string& path) {
+  std::size_t found = path.find_last_of("/");
+  if (found == std::string::npos) {
+    return "";
+  }
+  return path.substr(0, found);
+}
+
+inline std::string baseName(const std::string& path) {
+  std::size_t found = path.find_last_of("/");
+  if (found == std::string::npos) {
+    return path;
+  }
+  return path.substr(found + 1);
+}
+
+inline std::string removeLeadingSlashes(const std::string& s) {
+  size_t start = s.find_first_not_of("/");
+  if (start == std::string::npos) {
+    return "";
+  }
+  return s.substr(start);
+}
+
 #endif
diff --git a/src/zip.cpp b/src/zip.cpp
index 2116064d..2898d51f 100644
--- a/src/zip.cpp
+++ b/src/zip.cpp
@@ -48,4 +48,3 @@ void zip_xml(const std::string& zip_path,
   std::string buffer = zip_buffer(zip_path, file_path);
   Rcout << xml_print(buffer);
 }
-
diff --git a/tests/testthat/test-compatibility.R b/tests/testthat/test-compatibility.R
index 9db0d848..92caf301 100644
--- a/tests/testthat/test-compatibility.R
+++ b/tests/testthat/test-compatibility.R
@@ -44,6 +44,7 @@ test_that("we can read the BIFF5, LABEL record sheet", {
   expect_identical(df$Time[c(1, 14)], c("01:00", "14:00"))
 })
 
+
 ## https://github.com/tidyverse/readxl/pull/429
 ## <c r="C2" s="1" t="str"><f>A2 + B2</f></c>
 test_that("formula cell with no v node does not cause crash", {
@@ -52,3 +53,16 @@ test_that("formula cell with no v node does not cause crash", {
   )
   expect_identical(df$`A + B`, NA)
 })
+
+## https://github.com/tidyverse/readxl/issues/435
+## https://source.opennews.org/articles/how-we-found-new-patterns-la-homeless-arrest/
+## LAPD uses a tool to produce xlsx that implements the minimal SpreadsheetML
+## package structure described on pp65-66 of ECMA 5th edition
+test_that("we can read LAPD arrest sheets", {
+  expect_silent(
+    lapd <- read_excel(test_sheet("los-angeles-arrests-xlsx.xlsx"), skip = 2)
+  )
+  expect_identical(dim(lapd), c(193L, 36L))
+  expect_match(lapd$ARR_LOC[9], "HOLLYWOOD")
+  expect_identical(lapd$CHG_DESC[27], "EX CON W/ A GUN")
+})