Skip to content
Permalink
Browse files

Account for leap year bug; fixes #264 (#292)

* Account for leap year bug; fixes #264

* Put leap day adjustment inside POSIXctFromSerial
  • Loading branch information
jennybc committed Mar 13, 2017
1 parent 63ef215 commit c9a54ae9ce0394808f6d22e8ef1a7a647b2d92bb
@@ -27,6 +27,6 @@ Suggests:
rprojroot (>= 1.1),
testthat
Roxygen: list(markdown = TRUE)
RoxygenNote: 6.0.0.9000
RoxygenNote: 6.0.1.9000
URL: https://github.com/tidyverse/readxl
BugReports: https://github.com/tidyverse/readxl/issues
@@ -1,5 +1,7 @@
# readxl 0.1.1.9000

* The [Lotus 1-2-3 leap year bug](https://support.microsoft.com/en-us/help/214326/excel-incorrectly-assumes-that-the-year-1900-is-a-leap-year) is accounted for. Date-times prior to March 1, 1900 import correctly. Date-times on the non-existent leap day February 29, 1900 import as NA and throw a warning. (#264, #148, #292 @jennybc)

* Selective column type guessing: `col_types` now accepts `"guess"` to allow the user to specify some column types, while allowing others to be guessed (#286 @jennybc)

* Numeric data that appears in a `"date"` column is coerced to a date. Also throws a warning. (#277, #266 @jennybc)
@@ -277,7 +277,7 @@ class XlsCell {
}
}

double asDate(int offset) const {
double asDate(bool is1904) const {
switch(type_) {

case CELL_UNKNOWN:
@@ -288,7 +288,7 @@ class XlsCell {

case CELL_DATE:
case CELL_NUMERIC:
return dateRound((cell_->d - offset) * 86400);
return POSIXctFromSerial(cell_->d, is1904);
}
}

@@ -16,7 +16,7 @@ class XlsWorkBook {

// common to Xls[x]WorkBook
std::string path_;
double offset_;
bool is1904_;
std::set<int> dateStyles_;

// kept as data + accessor in XlsWorkBook vs. member function in XlsxWorkBook
@@ -39,7 +39,7 @@ class XlsWorkBook {
sheets_[i] = Rf_mkCharCE((char*) pWB_->sheets.sheet[i].name, CE_UTF8);
}

offset_ = dateOffset(pWB_->is1904);
is1904_ = pWB_->is1904;

int n_formats = pWB_->formats.count;
for (int i = 0; i < n_formats; ++i) {
@@ -65,8 +65,8 @@ class XlsWorkBook {
return sheets_;
}

double offset() const {
return offset_;
bool is1904() const {
return is1904_;
}

const std::set<int>& dateStyles() const {
@@ -205,7 +205,7 @@ class XlsWorkSheet {
i + 1, j + 1,
xcell->asStdString());
}
REAL(col)[row] = xcell->asDate(wb_.offset());
REAL(col)[row] = xcell->asDate(wb_.is1904());
break;

case COL_NUMERIC:
@@ -261,7 +261,7 @@ class XlsWorkSheet {
SET_VECTOR_ELT(col, row, Rf_ScalarLogical(xcell->asInteger()));
break;
case CELL_DATE: {
Rcpp::RObject cell_val = Rf_ScalarReal(xcell->asDate(wb_.offset()));
Rcpp::RObject cell_val = Rf_ScalarReal(xcell->asDate(wb_.is1904()));
cell_val.attr("class") = Rcpp::CharacterVector::create("POSIXct", "POSIXt");
cell_val.attr("tzone") = "UTC";
SET_VECTOR_ELT(col, row, cell_val);
@@ -255,7 +255,7 @@ class XlsxCell {
}
}

double asDate(int offset) const {
double asDate(bool is1904) const {
switch(type_) {

case CELL_UNKNOWN:
@@ -268,7 +268,7 @@ class XlsxCell {
case CELL_NUMERIC:
{
rapidxml::xml_node<>* v = cell_->first_node("v");
return dateRound((atof(v->value()) - offset) * 86400);
return POSIXctFromSerial(atof(v->value()), is1904);
}
}
}
@@ -108,7 +108,7 @@ class XlsxWorkBook {

// common to Xls[x]WorkBook
std::string path_;
double offset_;
bool is1904_;
std::set<int> dateStyles_;

// specific to XlsxWorkBook
@@ -121,7 +121,7 @@ class XlsxWorkBook {
path_(path),
rel_(path)
{
offset_ = dateOffset(is1904());
is1904_ = uses1904();
cacheStringTable();
cacheDateStyles();
}
@@ -138,8 +138,8 @@ class XlsxWorkBook {
return rel_.names();
}

double offset() const {
return offset_;
bool is1904() const {
return is1904_;
}

const std::set<int>& dateStyles() const {
@@ -229,7 +229,7 @@ class XlsxWorkBook {
}
}

bool is1904() {
bool uses1904() {
std::string workbookXml = zip_buffer(path_, "xl/workbook.xml");
rapidxml::xml_document<> workbook;
workbook.parse<0>(&workbookXml[0]);
@@ -219,7 +219,7 @@ class XlsxWorkSheet {
Rcpp::warning("Expecting date in [%i, %i]: got '%s'",
i + 1, j + 1, xcell->asStdString(wb_.stringTable()));
}
REAL(col)[row] = xcell->asDate(wb_.offset());
REAL(col)[row] = xcell->asDate(wb_.is1904());
break;

case COL_NUMERIC:
@@ -278,7 +278,7 @@ class XlsxWorkSheet {
}
case CELL_DATE: {
Rcpp::RObject cell_val =
Rf_ScalarReal(xcell->asDate(wb_.offset()));
Rf_ScalarReal(xcell->asDate(wb_.is1904()));
cell_val.attr("class") = Rcpp::CharacterVector::create("POSIXct", "POSIXt");
cell_val.attr("tzone") = "UTC";
SET_VECTOR_ELT(col, row, cell_val);
@@ -4,14 +4,35 @@
#include <cerrno>
#include "StringSet.h"

// The date offset needed to align Excel dates with R's use of 1970-01-01
// depends on the "date system".
//
// xls ------------------------------------------------------------------------
// Page and section numbers below refer to
// [MS-XLS]: Excel Binary File Format (.xls) Structure
// version, date, and download URL given in XlsCell.h
//
// 2.4.77 Date1904 p257 ... it boils down to a boolean
// 0 --> 1900 date system
// 1 --> 1904 date system
//
// xlsx ------------------------------------------------------------------------
// Page and section numbers below refer to
// ECMA-376
// version, date, and download URL given in XlsxCell.h
//
// 18.2.28 workbookPr (Workbook Properties) p1582
// date1904:
// Value that indicates whether to use a 1900 or 1904 date system when
// converting serial date-times in the workbook to dates.
// A value of 1 or true indicates the workbook uses the 1904 date system.
// A value of 0 or false indicates the workbook uses the 1900 date system. (See
// 18.17.4.1 for the definition of the date systems.)
// The default value for this attribute is false.
// in xl/workbook.xml, node workbook, child node workbookPr
// attribute date1904:
// 0 or false --> 1900 date system (this is the default)
// 1 or true --> 1904 date system
//
// 18.17.4.1 p2067 holds definition of the date systems
//
// Date systems ---------------------------------------------------------------
// 1900 system: first possible date is 1900-01-01 00:00:00,
// which has serial value of **1**
// 1904 system: origin 1904-01-01 00:00:00
inline double dateOffset(bool is1904) {
// as.numeric(as.Date("1899-12-30"))
// as.numeric(as.Date("1904-01-01"))
@@ -29,6 +50,27 @@ inline double dateRound(double dttm) {
return ms / 10000;
}

// this is even more horrible
// Convert an Excel serial date-time to a date-time relative to R's origin of
// 1970-01-01: subtract dateOffset(is1904), scale days to seconds (* 86400),
// and pass the result through dateRound().
//
// In the 1900 date system this also corrects for Excel's faithful
// re-implementation of the Lotus 1-2-3 bug, in which February 29, 1900 is
// included in the date system, even though 1900 was not actually a leap year
// https://support.microsoft.com/en-us/help/214326/excel-incorrectly-assumes-that-the-year-1900-is-a-leap-year
// How we address this:
//   serial < 60        (*prior* to the non-existent leap day): add a day
//   60 <= serial < 61  (on the non-existent leap day): make negative and,
//                      below, warn and return NA
//   otherwise:         do nothing
//
// xlDate  serial date-time as read from the workbook
// is1904  true if the workbook uses the 1904 date system, in which case no
//         leap day adjustment is applied
//
// Returns NA_REAL (with a warning) for the impossible leap day.
// NOTE: a serial that is still negative after the adjustment above (e.g. any
// negative serial in the 1904 system) takes the same NA + warning path.
inline double POSIXctFromSerial(double xlDate, bool is1904) {
  if (!is1904 && xlDate < 61) {
    // xlDate + 1, not ++xlDate: assigning the result of a pre-increment back
    // to the same variable modifies it twice in one expression, which is
    // undefined behaviour before C++11 and redundant afterwards
    xlDate = (xlDate < 60) ? xlDate + 1 : -1;
  }
  if (xlDate < 0) {
    Rcpp::warning("NA inserted for impossible 1900-02-29 datetime");
    return NA_REAL;
  }
  return dateRound((xlDate - dateOffset(is1904)) * 86400);
}

// Simple parser: does not check that order of numbers and letters is correct
inline std::pair<int, int> parseRef(const char* ref) {
int col = 0, row = 0;
Binary file not shown.
Binary file not shown.
@@ -21,3 +21,29 @@ test_that("date subsecond rounding works", {
df <- read_excel(test_sheet("datetime-rounding.xls"))
expect_identical(as.character(df$dttm), df$dttm_string)
})

## Lotus 1-2-3 leap year bug
## #264, #148
test_that("we get correct dates prior to March 1, 1900, in 1900 date system", {
  ## identical checks for each file format: xlsx first, then xls
  sheets <- c("dates-leap-year-1900-xlsx.xlsx", "dates-leap-year-1900-xls.xls")
  for (sheet in sheets) {
    ## reading must warn about the impossible leap day
    expect_warning(
      df <- read_excel(
        test_sheet(sheet),
        col_types = c("date", "text", "logical")
      ),
      "NA inserted for impossible 1900-02-29 datetime"
    )
    ## the text column holds the expected date-time for every row
    expected <- as.POSIXct(
      df$dttm_string,
      format = "%Y-%m-%d %H:%M:%S",
      tz = "UTC"
    )
    is_leap_day <- df$dttm_string == "1900-02-29 08:00:00"
    ## every real date-time matches; the non-existent leap day is NA
    expect_identical(df$dttm[!is_leap_day], expected[!is_leap_day])
    expect_true(is.na(df$dttm[is_leap_day]))
  }
})

0 comments on commit c9a54ae

Please sign in to comment.
You can’t perform that action at this time.