From 7e8027da6ecb8c5b5f07a529e5f80c23cd3be6ad Mon Sep 17 00:00:00 2001 From: ripley Date: Mon, 21 Apr 2008 17:32:58 +0000 Subject: [PATCH] see what happens if CHARSXPs are not allowed to have embedded nuls git-svn-id: https://svn.r-project.org/R/trunk@45416 00db46b3-68df-0310-9c12-caf00c1e9a41 --- NEWS | 6 +- doc/manual/R-ints.texi | 17 ++++++ src/library/base/man/Comparison.Rd | 6 -- src/library/base/man/abbreviate.Rd | 3 - src/library/base/man/agrep.Rd | 2 - src/library/base/man/cat.Rd | 3 +- src/library/base/man/char.expand.Rd | 2 - src/library/base/man/charmatch.Rd | 2 - src/library/base/man/chartr.Rd | 2 - src/library/base/man/duplicated.Rd | 3 - src/library/base/man/encodeString.Rd | 6 +- src/library/base/man/format.Rd | 2 - src/library/base/man/formatc.Rd | 2 - src/library/base/man/grep.Rd | 2 - src/library/base/man/iconv.Rd | 2 - src/library/base/man/identical.Rd | 5 +- src/library/base/man/make.names.Rd | 2 - src/library/base/man/make.unique.Rd | 2 - src/library/base/man/match.Rd | 2 - src/library/base/man/nchar.Rd | 9 --- src/library/base/man/paste.Rd | 2 - src/library/base/man/pmatch.Rd | 2 - src/library/base/man/rawConversion.Rd | 3 +- src/library/base/man/readChar.Rd | 6 +- src/library/base/man/scan.Rd | 6 +- src/library/base/man/serialize.Rd | 10 +--- src/library/base/man/sprintf.Rd | 2 - src/library/base/man/strsplit.Rd | 2 - src/library/base/man/strwrap.Rd | 2 - src/library/base/man/substr.Rd | 2 - src/library/base/man/unique.Rd | 3 - src/library/base/man/utf8Conversion.Rd | 3 +- src/main/envir.c | 37 +++++++++++- src/main/printutils.c | 79 +++++++++++++------------- src/main/scan.c | 32 +++++------ src/main/serialize.c | 7 +-- tests/reg-tests-2.R | 4 +- tests/reg-tests-2.Rout.save | 11 ++-- tests/reg-tests-3.R | 11 ---- tests/reg-tests-3.Rout.save | 16 +----- 40 files changed, 140 insertions(+), 180 deletions(-) diff --git a/NEWS b/NEWS index c5f1f8a4b2a..87833ac41a6 100644 --- a/NEWS +++ b/NEWS @@ -35,8 +35,6 @@ NEW FEATURES o tools::texi2dvi() 
has a new argument 'texinputs' to allow the TeX and bibtex input paths to be specified (even on MiKTeX). - o Encoding<-() now handles character strings with embedded nuls. - o setEPS() and setPS() gain '...' to allow other arguments to be passed to ps.options(), including overriding 'width' and 'height'. @@ -98,6 +96,10 @@ DEPRECATED & DEFUNCT o In package installation, SaveImage: yes is now ignored, and any use of the field will give a warning. + o unserialize() no longer accepts character strings as input -- + that was a format prior to R 2.4.0 which needs embedded nulls + in character strings. + o The C macro 'allocString' has been removed -- use 'mkChar', or 'allocVector' directly if really necessary. diff --git a/doc/manual/R-ints.texi b/doc/manual/R-ints.texi index b6adf8df127..c2d24ffa8d7 100644 --- a/doc/manual/R-ints.texi +++ b/doc/manual/R-ints.texi @@ -1283,6 +1283,23 @@ used to hold the finalizer function of a C finalizer (uncached) -- now @code{CHARSXP}s via @code{allocString} (removed in @R 2.8.0) and @code{allocVector(CHARSXP ...)} (deprecated in @R 2.8.0). +Currently @code{CHARSXP}s with embedded nulls can be created by + +@itemize +@item +parsing a character string containing @code{\0}. (New in @R{} 2.8.0.) +@item +using @code{scan(allowEscapes=TRUE)} on a string containing +@code{\0}. (New in @R{} 2.8.0.) +@item +@code{readChar}, @code{rawToChar} or @code{intToUtf8}. +@item +@code{load}ing a saved CHARSXP. (Broken for version 2 saves in @R{} 2.6.x.) +@end itemize + +@noindent +This may change before release. 
+ @node Warnings and errors, S4 objects, The CHARSXP cache, R Internal Structures @section Warnings and errors diff --git a/src/library/base/man/Comparison.Rd b/src/library/base/man/Comparison.Rd index b82faef3ab3..ab3ac2c8aa8 100644 --- a/src/library/base/man/Comparison.Rd +++ b/src/library/base/man/Comparison.Rd @@ -58,12 +58,6 @@ x != y coerced to the type of the other, the (decreasing) order of precedence being character, complex, numeric, integer, logical and raw. - When comparisons are made between character strings, parts of the - strings after embedded \code{nul} characters are ignored. (This is - necessary as the position of \code{nul} in the collation sequence is - undefined, and we want one of \code{<}, \code{==} and \code{>} to be - true for any comparison.) - Missing values (\code{\link{NA}}) and \code{\link{NaN}} values are regarded as non-comparable even to themselves, so comparisons involving them will always result in \code{NA}. Missing values can diff --git a/src/library/base/man/abbreviate.Rd b/src/library/base/man/abbreviate.Rd index 3c95b96fef3..e82c796b4c8 100644 --- a/src/library/base/man/abbreviate.Rd +++ b/src/library/base/man/abbreviate.Rd @@ -42,9 +42,6 @@ abbreviate(names.arg, minlength = 4, use.classes = TRUE, If \code{use.classes} is \code{FALSE} then the only distinction is to be between letters and space. This has NOT been implemented. - - Elements of \code{names.arg} with embedded nul bytes will be truncated - at the first nul. } \value{ A character vector containing abbreviations for the strings in its diff --git a/src/library/base/man/agrep.Rd b/src/library/base/man/agrep.Rd index f056f7b8a45..d3499c82208 100644 --- a/src/library/base/man/agrep.Rd +++ b/src/library/base/man/agrep.Rd @@ -55,8 +55,6 @@ agrep(pattern, x, ignore.case = FALSE, value = FALSE, space it only supports the first 65536 characters of UTF-8 (where all the characters for human languages lie). 
Note that it can be quite slow in UTF-8, and \code{useBytes = TRUE} will be much faster. - - Inputs with embedded nul bytes will be truncated at the first nul. } \value{ Either a vector giving the indices of the elements that yielded a diff --git a/src/library/base/man/cat.Rd b/src/library/base/man/cat.Rd index cb8ecc4317a..5e14162cf99 100644 --- a/src/library/base/man/cat.Rd +++ b/src/library/base/man/cat.Rd @@ -55,8 +55,7 @@ cat(\dots , file = "", sep = " ", fill = FALSE, labels = NULL, are handled. Character strings are output \sQuote{as is} (unlike \code{\link{print.default}} which escapes non-printable characters and backslash --- use \code{\link{encodeString}} if you want to output - encoded strings using \code{cat}). (Character strings with embedded - nuls are truncated at the first nul.) Other types of \R object should be + encoded strings using \code{cat}). Other types of \R object should be converted (e.g. by \code{\link{as.character}} or \code{\link{format}}) before being passed to \code{cat}. diff --git a/src/library/base/man/char.expand.Rd b/src/library/base/man/char.expand.Rd index 0a978389e1c..38415e7e5d9 100644 --- a/src/library/base/man/char.expand.Rd +++ b/src/library/base/man/char.expand.Rd @@ -21,8 +21,6 @@ This function is particularly useful when abbreviations are allowed in function arguments, and need to be uniquely expanded with respect to a target table of possible values. - - Inputs with embedded nul bytes will be truncated at the first nul. } \seealso{ \code{\link{charmatch}} and \code{\link{pmatch}} for performing diff --git a/src/library/base/man/charmatch.Rd b/src/library/base/man/charmatch.Rd index a8180e0c889..845b30e09cf 100644 --- a/src/library/base/man/charmatch.Rd +++ b/src/library/base/man/charmatch.Rd @@ -32,8 +32,6 @@ charmatch(x, table, nomatch = NA_integer_) returned and if no match is found then \code{nomatch} is returned. \code{NA} values are treated as the string constant \code{"NA"}. 
- - Inputs with embedded nul bytes will be truncated at the first nul. } \value{ An integer vector of the same length as \code{x}, giving the diff --git a/src/library/base/man/chartr.Rd b/src/library/base/man/chartr.Rd index 2e2ae052c9a..5cbbfead839 100644 --- a/src/library/base/man/chartr.Rd +++ b/src/library/base/man/chartr.Rd @@ -41,8 +41,6 @@ casefold(x, upper = FALSE) \code{casefold} is a wrapper for \code{tolower} and \code{toupper} provided for compatibility with S-PLUS. - - Inputs with embedded nul bytes will be truncated at the first nul. } \value{ A character vector of the same length and with the same attributes as diff --git a/src/library/base/man/duplicated.Rd b/src/library/base/man/duplicated.Rd index 0d834ed7d25..69b4d8047a3 100644 --- a/src/library/base/man/duplicated.Rd +++ b/src/library/base/man/duplicated.Rd @@ -58,9 +58,6 @@ duplicated(x, incomparables = FALSE, \dots) Missing values are regarded as equal, but \code{NaN} is not equal to \code{NA_real_}. - - Strings with embedded nuls of the same length will be considered - equal if they agree when truncated at the first nul. 
} \section{Warning}{ Using this for lists is potentially slow, especially if the elements diff --git a/src/library/base/man/encodeString.Rd b/src/library/base/man/encodeString.Rd index 2bb7282d485..d7628e07deb 100644 --- a/src/library/base/man/encodeString.Rd +++ b/src/library/base/man/encodeString.Rd @@ -1,6 +1,6 @@ % File src/library/base/man/encodeString.Rd % Part of the R package, http://www.R-project.org -% Copyright 1995-2007 R Core Development Team +% Copyright 1995-2008 R Core Development Team % Distributed under GPL 2 or later \name{encodeString} @@ -33,8 +33,8 @@ encodeString(x, width = 0, quote = "", na.encode = TRUE, \details{ This escapes backslash and the control characters \code{\a} (bell), \code{\b} (backspace), \code{\f} (formfeed), \code{\n} (line feed), - \code{\r} (carriage return), \code{\t} (tab), \code{\v} (vertical tab) - and \code{\0} (nul) as well as any non-printable characters in a + \code{\r} (carriage return), \code{\t} (tab) and \code{\v} (vertical tab) + as well as any non-printable characters in a single-byte locale, which are printed in octal notation (\code{\xyz} with leading zeroes). #ifdef unix diff --git a/src/library/base/man/format.Rd b/src/library/base/man/format.Rd index 798b7b8c83a..fc63f2d9ceb 100644 --- a/src/library/base/man/format.Rd +++ b/src/library/base/man/format.Rd @@ -104,8 +104,6 @@ format(x, \dots) Raw vectors are converted to their 2-digit hexadecimal representation by \code{\link{as.character}}. - - Character inputs with embedded nul bytes will be truncated at the first nul. 
} \value{ An object of similar structure to \code{x} containing character diff --git a/src/library/base/man/formatc.Rd b/src/library/base/man/formatc.Rd index 55c0ec2ac8f..f6698edc86c 100644 --- a/src/library/base/man/formatc.Rd +++ b/src/library/base/man/formatc.Rd @@ -126,8 +126,6 @@ prettyNum(x, big.mark = "", big.interval = 3, unexpectedly if \code{x} is a \code{character} vector not resulting from something like \code{format()}: in particular it assumes that a period is a decimal mark. - - Character inputs with embedded nul bytes will be truncated at the first nul. } \author{ \code{formatC} was originally written by Bill Dunlap, later much diff --git a/src/library/base/man/grep.Rd b/src/library/base/man/grep.Rd index 67a0c4ed877..5f9a75a34f6 100644 --- a/src/library/base/man/grep.Rd +++ b/src/library/base/man/grep.Rd @@ -93,8 +93,6 @@ gregexpr(pattern, text, ignore.case = FALSE, extended = TRUE, PCRE only supports caseless matching for a non-ASCII pattern in a UTF-8 locale (and not for \code{useBytes = TRUE} in any locale). - - Inputs with embedded nul bytes will be truncated at the first nul. } \value{ For \code{grep} a vector giving either the indices of the elements of diff --git a/src/library/base/man/iconv.Rd b/src/library/base/man/iconv.Rd index fed65968def..f8c2ee9148e 100644 --- a/src/library/base/man/iconv.Rd +++ b/src/library/base/man/iconv.Rd @@ -60,8 +60,6 @@ iconvlist() As from \R 2.7.0 \code{"UTF8"} will be accepted as meaning the (more correct) \code{"UTF-8"}. - - Inputs \code{x} with embedded nul bytes will be handled completely. } \value{ A character vector of the same length and the same attributes as diff --git a/src/library/base/man/identical.Rd b/src/library/base/man/identical.Rd index 8ed28614591..19bd9100246 100644 --- a/src/library/base/man/identical.Rd +++ b/src/library/base/man/identical.Rd @@ -50,9 +50,8 @@ identical(x, y) \code{\link{NA_real_}}, but all \code{NaN}s are equal (and all \code{NA} of the same type are equal). 
- Comparison of character strings allows for embedded \code{nul} - characters. Comparison of attributes view them as a set (and not a - vector, so order is not tested). + Comparison of attributes view them as a set (and not a vector, so + order is not tested). } \value{ A single logical value, \code{TRUE} or \code{FALSE}, never \code{NA} diff --git a/src/library/base/man/make.names.Rd b/src/library/base/man/make.names.Rd index c0632c81ac0..d8fb0ce5190 100644 --- a/src/library/base/man/make.names.Rd +++ b/src/library/base/man/make.names.Rd @@ -43,8 +43,6 @@ make.names(names, unique = FALSE, allow_ = TRUE) \code{allow_ = FALSE} is also useful when creating names for export to applications which do not allow underline in names (for example, S-PLUS and some DBMSs). - - Inputs with embedded nul bytes will be truncated at the first nul. } \seealso{ \code{\link{make.unique}}, diff --git a/src/library/base/man/make.unique.Rd b/src/library/base/man/make.unique.Rd index 5dbd66fbc9b..4d2c0e2f174 100644 --- a/src/library/base/man/make.unique.Rd +++ b/src/library/base/man/make.unique.Rd @@ -31,8 +31,6 @@ make.unique(names, sep = ".") If character vector \code{A} is already unique, then \code{make.unique(c(A, B))} preserves \code{A}. - - Inputs with embedded nul bytes will be truncated at the first nul. } \author{Thomas P Minka} \seealso{ diff --git a/src/library/base/man/match.Rd b/src/library/base/man/match.Rd index 757c7b07727..ca1d7746e36 100644 --- a/src/library/base/man/match.Rd +++ b/src/library/base/man/match.Rd @@ -61,8 +61,6 @@ x \%in\% table For all types, \code{NA} matches \code{NA} and no other value. For real and complex values, \code{NaN} values are regarded as matching any other \code{NaN} value, but not matching \code{NA}. - - Character inputs with embedded nul bytes will be truncated at the first nul. } \references{ Becker, R. A., Chambers, J. M. and Wilks, A. R. 
(1988) diff --git a/src/library/base/man/nchar.Rd b/src/library/base/man/nchar.Rd index 3f65ebd6117..9663620c1eb 100644 --- a/src/library/base/man/nchar.Rd +++ b/src/library/base/man/nchar.Rd @@ -42,10 +42,6 @@ nzchar(x) These will often be the same, and almost always will be in single-byte locales. There will be differences between the first two with multibyte character sequences, e.g. in UTF-8 locales. - If the byte stream contains embedded \code{nul} bytes, - \code{type = "bytes"} looks at all the bytes whereas the other two - types look only at the string as printed by \code{cat}, up to the - first \code{nul} byte. The internal equivalent of the default method of \code{\link{as.character}} is performed on \code{x} (so there is no @@ -72,11 +68,6 @@ nzchar(x) will be used to \code{print()} the string. Use \code{\link{encodeString}} to find the characters used to print the string. - - Embedded \code{nul} bytes are included in the byte count (but not the - final \code{nul}). In contrast, characters are counted up to the - string terminator (the first \code{nul} that is not part of a - character representation). } \references{ Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) diff --git a/src/library/base/man/paste.Rd b/src/library/base/man/paste.Rd index 80e001ea22f..8971682e4d0 100644 --- a/src/library/base/man/paste.Rd +++ b/src/library/base/man/paste.Rd @@ -36,8 +36,6 @@ paste(\dots, sep = " ", collapse = NULL) If a value is specified for \code{collapse}, the values in the result are then concatenated into a single string, with the elements being separated by the value of \code{collapse}. - - Character inputs with embedded nul bytes will be truncated at the first nul. } \value{ A character vector of the concatenated values. 
This will be of length diff --git a/src/library/base/man/pmatch.Rd b/src/library/base/man/pmatch.Rd index 3cd19e2b168..0dac8908579 100644 --- a/src/library/base/man/pmatch.Rd +++ b/src/library/base/man/pmatch.Rd @@ -48,8 +48,6 @@ pmatch(x, table, nomatch = NA_integer_, duplicates.ok = FALSE) does match empty strings, and it does not allow multiple exact matches. \code{NA} values are treated as if they were the string constant \code{"NA"}. - - Inputs with embedded nul bytes will be truncated at the first nul. } \value{ An integer vector (possibly including \code{NA} if \code{nomatch = diff --git a/src/library/base/man/rawConversion.Rd b/src/library/base/man/rawConversion.Rd index fac566658b6..2a47d68991f 100644 --- a/src/library/base/man/rawConversion.Rd +++ b/src/library/base/man/rawConversion.Rd @@ -39,7 +39,8 @@ packBits(x, type = c("raw", "integer")) \code{rawToChar} converts raw bytes either to a single character string or a character vector of single bytes. (Note that a single - character string could contain embedded nuls.) + character string could contain embedded nuls, in which case it will be + truncated at the first nul with a warning.) \code{rawToBits} returns a raw vector of 8 times the length of a raw vector with entries 0 or 1. \code{intToBits} returns a raw vector diff --git a/src/library/base/man/readChar.Rd b/src/library/base/man/readChar.Rd index 556ca75777c..2867e7cb9a6 100644 --- a/src/library/base/man/readChar.Rd +++ b/src/library/base/man/readChar.Rd @@ -53,10 +53,8 @@ writeChar(object, con, should be returned. Character strings containing ASCII \code{nul}(s) will be read - correctly by \code{readChar} and appear with embedded nuls in the - character vector returned. \code{writeChar} can write strings with - embedded \code{nul}s, and for such strings inteprets \code{nchar} as - the number of bytes to be written. + correctly by \code{readChar} but truncated at the first + \code{nul} with a warning. 
If the character length requested for \code{readChar} is longer than the data available on the connection, what is available is diff --git a/src/library/base/man/scan.Rd b/src/library/base/man/scan.Rd index f344d47efc7..2df6ca016f8 100644 --- a/src/library/base/man/scan.Rd +++ b/src/library/base/man/scan.Rd @@ -243,10 +243,8 @@ scan(file = "", what = double(0), nmax = -1, n = -1, sep = "", chars: use an explicit separator to avoid this. Having \code{nul} bytes in fields may lead to interpretation of the - field being terminated at the \code{nul} (so they are fine in - character fields). \R 2.8.0 handles these better than earlier - versions, but they not normally present in text files -- see - \code{\link{readBin}}. + field being terminated at the \code{nul}. They are not normally present + in text files -- see \code{\link{readBin}} and \code{\link{readChar}}. } \references{ Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) diff --git a/src/library/base/man/serialize.Rd b/src/library/base/man/serialize.Rd index 57f41474d5f..2c7aa958b09 100644 --- a/src/library/base/man/serialize.Rd +++ b/src/library/base/man/serialize.Rd @@ -25,8 +25,8 @@ unserialize(connection, refhook = NULL) \arguments{ \item{object}{\R object to serialize.} \item{connection}{an open connection or (for \code{serialize}) - \code{NULL} or (for \code{unserialize}) a raw vector or a length-one - character vector (see \sQuote{Details}).} + \code{NULL} or (for \code{unserialize}) a raw vector + (see \sQuote{Details}).} \item{file}{a connection or the name of the file where the R object is saved to or read from.} \item{ascii}{a logical. If \code{TRUE}, an ASCII representation is @@ -51,8 +51,7 @@ unserialize(connection, refhook = NULL) across separate calls to \code{serialize}. \code{unserialize} reads an object (as written by \code{serialize}) - from \code{connection} or a raw vector or (for compatibility with - earlier versions of \code{serialize}) a length-one character vector. 
+ from \code{connection} or a raw vector. The \code{refhook} functions can be used to customize handling of non-system reference objects (all external pointers and weak @@ -89,9 +88,6 @@ unserialize(connection, refhook = NULL) \examples{ x <- serialize(list(1,2,3), NULL) unserialize(x) -## test earlier interface as a length-one character vector -y <- rawToChar(x) -unserialize(y) } \keyword{internal} \keyword{file} diff --git a/src/library/base/man/sprintf.Rd b/src/library/base/man/sprintf.Rd index e479386b05a..51504ad258b 100644 --- a/src/library/base/man/sprintf.Rd +++ b/src/library/base/man/sprintf.Rd @@ -109,8 +109,6 @@ gettextf(fmt, \dots, domain = NULL) There is a limit of 8192 bytes on elements of \code{fmt} and also on strings included by a \code{\%s} conversion specification. - - Character inputs with embedded nul bytes will be truncated at the first nul. } \value{ diff --git a/src/library/base/man/strsplit.Rd b/src/library/base/man/strsplit.Rd index a4db307743f..879857675f0 100644 --- a/src/library/base/man/strsplit.Rd +++ b/src/library/base/man/strsplit.Rd @@ -84,8 +84,6 @@ strsplit(x, split, extended = TRUE, fixed = FALSE, perl = FALSE) (non-empty) string, the first element of the output is \code{""}, but if there is a match at the end of the string, the output is the same as with the match removed. - - Inputs with embedded nul bytes will be truncated at the first nul. } \section{Warning}{ The standard regular expression code has been reported to be very slow diff --git a/src/library/base/man/strwrap.Rd b/src/library/base/man/strwrap.Rd index 16cc4251218..823f0f781db 100644 --- a/src/library/base/man/strwrap.Rd +++ b/src/library/base/man/strwrap.Rd @@ -41,8 +41,6 @@ strwrap(x, width = 0.9 * getOption("width"), indent = 0, Indentation is relative to the number of characters in the prefix string. - - Inputs with embedded nul bytes will be truncated at the first nul. } \examples{ ## Read in file 'THANKS'. 
diff --git a/src/library/base/man/substr.Rd b/src/library/base/man/substr.Rd index dc1a6e49102..cf350aac913 100644 --- a/src/library/base/man/substr.Rd +++ b/src/library/base/man/substr.Rd @@ -49,8 +49,6 @@ substring(text, first, last = 1000000) <- value the current locale (see \code{\link{Encoding}} if the corresponding input had a declared encoding and the current locale is either Latin-1 or UTF-8. - - Inputs with embedded nul bytes will be truncated at the first nul. } \value{ For \code{substr}, a character vector of the same length and with the diff --git a/src/library/base/man/unique.Rd b/src/library/base/man/unique.Rd index 20b0c4d7f16..a8397c7a83a 100644 --- a/src/library/base/man/unique.Rd +++ b/src/library/base/man/unique.Rd @@ -57,9 +57,6 @@ unique(x, incomparables = FALSE, \dots) Missing values are regarded as equal, but \code{NaN} is not equal to \code{NA_real_}. - - Strings with embedded nuls of the same length will be considered - equal if they agree when truncated at the first nul. } \value{ For a vector, an object of the same type of \code{x}, but with only diff --git a/src/library/base/man/utf8Conversion.Rd b/src/library/base/man/utf8Conversion.Rd index 9856018ce8d..e278e691635 100644 --- a/src/library/base/man/utf8Conversion.Rd +++ b/src/library/base/man/utf8Conversion.Rd @@ -30,7 +30,8 @@ intToUtf8(x, multiple = FALSE) \code{intToUtf8} converts a vector of (numeric) UTF-8 code points either to a single character string or a character vector of single characters. (Note that a single character string could contain - embedded nuls.) The \code{\link{Encoding}} is declared as + embedded nuls, in which case it will be truncated at the first nul, + with a warning.) The \code{\link{Encoding}} is declared as \code{"UTF-8"}. 
} \examples{\dontrun{ diff --git a/src/main/envir.c b/src/main/envir.c index 74bad400467..21a3665ba10 100644 --- a/src/main/envir.c +++ b/src/main/envir.c @@ -3230,7 +3230,24 @@ SEXP mkChar(const char *name) #ifndef USE_CHAR_HASHING SEXP mkCharLenCE(const char *name, int len, cetype_t enc) { - SEXP c = allocCharsxp(len); + int slen = strlen(name); + SEXP c; + if (slen < len) { + /* This is tricky: we want to make a reasonable job of + representing this string, and EncodeString() is the most + comprehensive */ + c = allocCharsxp(len); + memcpy(CHAR_RW(c), name, len); + switch(enc) { + case CE_UTF8: SET_UTF8(c); break; + case CE_LATIN1: SET_LATIN1(c); break; + default: break; + } + warning(_("truncating string with embedded nuls: '%s'"), + EncodeString(c, 0, 0, Rprt_adj_none)); + len = slen; + } + c = allocCharsxp(len); memcpy(CHAR_RW(c), name, len); if (enc && strIsASCII(name)) enc = 0; switch(enc) { @@ -3368,7 +3385,7 @@ SEXP mkCharLenCE(const char *name, int len, cetype_t enc) { SEXP cval, chain; unsigned int hashcode; - int need_enc; + int need_enc, slen = strlen(name); switch(enc){ case CE_NATIVE: @@ -3380,6 +3397,22 @@ SEXP mkCharLenCE(const char *name, int len, cetype_t enc) default: error("unknown encoding: %d", enc); } + if (slen < len) { + SEXP c; + /* This is tricky: we want to make a reasonable job of + representing this string, and EncodeString() is the most + comprehensive */ + c = allocCharsxp(len); + memcpy(CHAR_RW(c), name, len); + switch(enc) { + case CE_UTF8: SET_UTF8(c); break; + case CE_LATIN1: SET_LATIN1(c); break; + default: break; + } + warning(_("truncating string with embedded nuls: '%s'"), + EncodeString(c, 0, 0, Rprt_adj_none)); + len = slen; + } if (enc && IsASCII(name, len)) enc = CE_NATIVE; switch(enc) { diff --git a/src/main/printutils.c b/src/main/printutils.c index 030f281c771..2d330e148a0 100644 --- a/src/main/printutils.c +++ b/src/main/printutils.c @@ -209,7 +209,7 @@ const char *EncodeReal2(double x, int w, int d, int e) void 
z_prec_r(Rcomplex *r, Rcomplex *x, double digits); -const char +const char *EncodeComplex(Rcomplex x, int wr, int dr, int er, int wi, int di, int ei, char cdec) { @@ -255,7 +255,7 @@ const char On Windows with surrogate pairs it will not be canonical, but AFAIK they do not occur in any MBCS (so it would only matter if we implement - UTF-8, and then only if Windows has surrogate pairs switched on, + UTF-8, and then only if Windows has surrogate pairs switched on, which Western versions at least do not.). */ @@ -316,7 +316,7 @@ int Rstrwid(const char *str, int slen, cetype_t ienc, int quote) case L'\r': case L'\t': case L'\v': - case L'\0': + case L'\0': /* historical */ len += 2; break; default: /* print in octal */ @@ -354,19 +354,19 @@ int Rstrwid(const char *str, int slen, cetype_t ienc, int quote) len++; break; } } else switch(*p) { - case '\a': - case '\b': - case '\f': - case '\n': - case '\r': - case '\t': - case '\v': - case '\0': - len += 2; break; - default: - /* print in octal */ - len += 4; break; - } + case '\a': + case '\b': + case '\f': + case '\n': + case '\r': + case '\t': + case '\v': + case '\0': /* historical */ + len += 2; break; + default: + /* print in octal */ + len += 4; break; + } p++; } else { /* 8 bit char */ #ifdef Win32 /* It seems Windows does not know what is printable! */ @@ -438,7 +438,7 @@ const char *EncodeString(SEXP s, int w, int quote, Rprt_adj justify) } } else #endif - { + { p = translateChar(s); if(p == CHAR(s)) { i = Rstrlen(s, quote); @@ -475,18 +475,18 @@ const char *EncodeString(SEXP s, int w, int quote, Rprt_adj justify) mbstate_t mb_st; wchar_t wc; unsigned int k; /* not wint_t as it might be signed */ - Rboolean Unicode_warning = FALSE; + Rboolean Unicode_warning = FALSE; if(ienc != CE_UTF8) mbs_init(&mb_st); #ifdef Win32 - else if(WinUTF8out) { memcpy(q, UTF8in, 3); q += 3; } + else if(WinUTF8out) { memcpy(q, UTF8in, 3); q += 3; } #endif for (i = 0; i < cnt; i++) { res = (ienc == CE_UTF8) ? 
utf8toucs(&wc, p): mbrtowc(&wc, p, MB_CUR_MAX, NULL); if(res >= 0) { /* res = 0 is a terminator */ k = wc; - /* To be portable, treat \0 explicitly */ + /* historical: To be portable, treat \0 explicitly */ if(res == 0) {k = 0; wc = L'\0';} if(0x20 <= k && k < 0x7f && iswprint(wc)) { switch(wc) { @@ -510,7 +510,7 @@ const char *EncodeString(SEXP s, int w, int quote, Rprt_adj justify) case L'\r': *q++ = '\\'; *q++ = 'r'; break; case L'\t': *q++ = '\\'; *q++ = 't'; break; case L'\v': *q++ = '\\'; *q++ = 'v'; break; - case L'\0': *q++ = '\\'; *q++ = '0'; break; + case L'\0': *q++ = '\\'; *q++ = '0'; break; /* historical */ default: /* print in octal */ @@ -524,7 +524,7 @@ const char *EncodeString(SEXP s, int w, int quote, Rprt_adj justify) /* The problem here is that wc may be printable according to the Unicode tables, but it may not be printable on the ouput - device concerned. */ + device concerned. */ for(j = 0; j < res; j++) *q++ = *p++; } else { #ifndef Win32 @@ -566,22 +566,22 @@ const char *EncodeString(SEXP s, int w, int quote, Rprt_adj justify) default: *q++ = *p; break; } } else switch(*p) { - /* ANSI Escapes */ - case '\a': *q++ = '\\'; *q++ = 'a'; break; - case '\b': *q++ = '\\'; *q++ = 'b'; break; - case '\f': *q++ = '\\'; *q++ = 'f'; break; - case '\n': *q++ = '\\'; *q++ = 'n'; break; - case '\r': *q++ = '\\'; *q++ = 'r'; break; - case '\t': *q++ = '\\'; *q++ = 't'; break; - case '\v': *q++ = '\\'; *q++ = 'v'; break; - case '\0': *q++ = '\\'; *q++ = '0'; break; - - default: - /* print in octal */ - snprintf(buf, 5, "\\%03o", (unsigned char) *p); - for(j = 0; j < 4; j++) *q++ = buf[j]; - break; - } + /* ANSI Escapes */ + case '\a': *q++ = '\\'; *q++ = 'a'; break; + case '\b': *q++ = '\\'; *q++ = 'b'; break; + case '\f': *q++ = '\\'; *q++ = 'f'; break; + case '\n': *q++ = '\\'; *q++ = 'n'; break; + case '\r': *q++ = '\\'; *q++ = 'r'; break; + case '\t': *q++ = '\\'; *q++ = 't'; break; + case '\v': *q++ = '\\'; *q++ = 'v'; break; + case '\0': *q++ = '\\'; 
*q++ = '0'; break; /* historical */ + + default: + /* print in octal */ + snprintf(buf, 5, "\\%03o", (unsigned char) *p); + for(j = 0; j < 4; j++) *q++ = buf[j]; + break; + } p++; } else { /* 8 bit char */ #ifdef Win32 /* It seems Windows does not know what is printable! */ @@ -598,7 +598,7 @@ const char *EncodeString(SEXP s, int w, int quote, Rprt_adj justify) } #ifdef Win32 - if(WinUTF8out && ienc == CE_UTF8) { memcpy(q, UTF8out, 3); q += 3; } + if(WinUTF8out && ienc == CE_UTF8) { memcpy(q, UTF8out, 3); q += 3; } #endif if(quote) *q++ = quote; if(b > 0 && justify != Rprt_adj_right) { @@ -833,4 +833,3 @@ void attribute_hidden VectorIndex(int i, int w) /* print index label "[`i']" , using total width `w' (left filling blanks) */ Rprintf("%*s[%ld]", w-IndexWidth(i)-2, "", i); } - diff --git a/src/main/scan.c b/src/main/scan.c index ee7d436d475..aaaadf8aee8 100644 --- a/src/main/scan.c +++ b/src/main/scan.c @@ -80,13 +80,13 @@ typedef struct { char convbuf[100]; } LocalData; -static SEXP insertString(char *str, int len, LocalData *l) +static SEXP insertString(char *str, LocalData *l) { if (!strIsASCII(str)) { - if (l->con->UTF8out || l->isUTF8) return mkCharLenCE(str, len, CE_UTF8); - else if (l->isLatin1) return mkCharLenCE(str, len, CE_LATIN1); + if (l->con->UTF8out || l->isUTF8) return mkCharCE(str, CE_UTF8); + else if (l->isLatin1) return mkCharCE(str, CE_LATIN1); } - return mkCharLen(str, len); + return mkChar(str); } static R_INLINE Rboolean Rspace(unsigned int c) @@ -313,7 +313,7 @@ static void scan_cleanup(void *data) */ static char * fillBuffer(SEXPTYPE type, int strip, int *bch, LocalData *d, - R_StringBuffer *buffer, int *nbytes) + R_StringBuffer *buffer) { /* The basic reader function, called from scanVector() and scanFrame(). Reads into _buffer_ which later will be read out by extractItem(). 
@@ -448,7 +448,6 @@ fillBuffer(SEXPTYPE type, int strip, int *bch, LocalData *d, } *bufp = '\0'; *bch = filled; - *nbytes = m; return buffer->data; } @@ -475,7 +474,7 @@ static R_INLINE void expected(char *what, char *got, LocalData *d) error(_("scan() expected '%s', got '%s'"), what, got); } -static void extractItem(char *buffer, SEXP ans, int i, int nbytes, LocalData *d) +static void extractItem(char *buffer, SEXP ans, int i, LocalData *d) { char *endp; switch(TYPEOF(ans)) { @@ -521,7 +520,7 @@ static void extractItem(char *buffer, SEXP ans, int i, int nbytes, LocalData *d) if (isNAstring(buffer, 1, d)) SET_STRING_ELT(ans, i, NA_STRING); else - SET_STRING_ELT(ans, i, insertString(buffer, nbytes, d)); + SET_STRING_ELT(ans, i, insertString(buffer, d)); break; case RAWSXP: if (isNAstring(buffer, 0, d)) @@ -541,7 +540,7 @@ static SEXP scanVector(SEXPTYPE type, int maxitems, int maxlines, int flush, SEXP stripwhite, int blskip, LocalData *d) { SEXP ans, bns; - int blocksize, c, i, n, linesread, nprev,strip, bch, nbytes; + int blocksize, c, i, n, linesread, nprev,strip, bch; char *buffer; R_StringBuffer strBuf = {NULL, 0, MAXELTSIZE}; @@ -579,14 +578,14 @@ static SEXP scanVector(SEXPTYPE type, int maxitems, int maxlines, PROTECT(ans); copyVector(ans, bns); } - buffer = fillBuffer(type, strip, &bch, d, &strBuf, &nbytes); + buffer = fillBuffer(type, strip, &bch, d, &strBuf); if (nprev == n && strlen(buffer)==0 && ((blskip && bch =='\n') || bch == R_EOF)) { if (d->ttyflag || bch == R_EOF) break; } else { - extractItem(buffer, ans, n, nbytes, d); + extractItem(buffer, ans, n, d); if (++n == maxitems) { if (d->ttyflag && bch != '\n') { /* MBCS-safe */ while ((c = scanchar(FALSE, d)) != '\n') @@ -652,7 +651,7 @@ static SEXP scanFrame(SEXP what, int maxitems, int maxlines, int flush, { SEXP ans, new, old, w; char *buffer = NULL; - int blksize, c, i, ii, j, n, nc, linesread, colsread, strip, bch, nbytes; + int blksize, c, i, ii, j, n, nc, linesread, colsread, strip, bch; int 
badline, nstring = 0; R_StringBuffer buf = {NULL, 0, MAXELTSIZE}; @@ -701,7 +700,7 @@ static SEXP scanFrame(SEXP what, int maxitems, int maxlines, int flush, if (fill) { buffer[0] = '\0'; for (ii = colsread; ii < nc; ii++) { - extractItem(buffer, VECTOR_ELT(ans, ii), n, nbytes, d); + extractItem(buffer, VECTOR_ELT(ans, ii), n, d); } n++; ii = 0; @@ -730,8 +729,7 @@ static SEXP scanFrame(SEXP what, int maxitems, int maxlines, int flush, } } - buffer = fillBuffer(TYPEOF(VECTOR_ELT(ans, ii)), - strip, &bch, d, &buf, &nbytes); + buffer = fillBuffer(TYPEOF(VECTOR_ELT(ans, ii)), strip, &bch, d, &buf); if (colsread == 0 && strlen(buffer) == 0 && ((blskip && bch =='\n') || bch == R_EOF)) { @@ -739,7 +737,7 @@ static SEXP scanFrame(SEXP what, int maxitems, int maxlines, int flush, break; } else { - extractItem(buffer, VECTOR_ELT(ans, ii), n, nbytes, d); + extractItem(buffer, VECTOR_ELT(ans, ii), n, d); ii++; colsread++; if (length(stripwhite) == length(what)) @@ -765,7 +763,7 @@ static SEXP scanFrame(SEXP what, int maxitems, int maxlines, int flush, warning(_("number of items read is not a multiple of the number of columns")); buffer[0] = '\0'; /* this is an NA */ for (ii = colsread; ii < nc; ii++) { - extractItem(buffer, VECTOR_ELT(ans, ii), n, nbytes, d); + extractItem(buffer, VECTOR_ELT(ans, ii), n, d); } n++; } diff --git a/src/main/serialize.c b/src/main/serialize.c index 4082d268b72..faf7c3a30da 100644 --- a/src/main/serialize.c +++ b/src/main/serialize.c @@ -2066,11 +2066,8 @@ SEXP attribute_hidden R_unserialize(SEXP icon, SEXP fun) hook = fun != R_NilValue ? CallHook : NULL; if (TYPEOF(icon) == STRSXP && LENGTH(icon) > 0) { - struct membuf_st mbs; - void *data = (void *)CHAR(STRING_ELT(icon, 0)); /* FIXME, is this right? 
*/ - int length = LENGTH(STRING_ELT(icon, 0)); - InitMemInPStream(&in, &mbs, data, length, hook, fun); - return R_Unserialize(&in); + /* was the format in R < 2.4.0, removed in R 2.8.0 */ + error("character vectors are no longer accepted by unserialize()"); } else if (TYPEOF(icon) == RAWSXP) { struct membuf_st mbs; void *data = RAW(icon); diff --git a/tests/reg-tests-2.R b/tests/reg-tests-2.R index 98aec7330d1..418c54626d8 100644 --- a/tests/reg-tests-2.R +++ b/tests/reg-tests-2.R @@ -485,7 +485,7 @@ rowsum(matrix(1:12, 3,4), c("Y","X","Y")) ## PR#1115 (saving strings with ascii=TRUE) x <- y <- unlist(as.list( parse(text=paste("\"\\", - as.character(structure(0:255,class="octmode")), + as.character(structure(1:255,class="octmode")), "\"",sep="")))) save(x, ascii=T, file=(fn <- tempfile())) load(fn) @@ -1292,7 +1292,7 @@ readBin(zz, "integer", n=100, size = 1) # read as small integers seek(zz, 0, "start") readBin(zz, "character", 100) # is confused by embedded nul. seek(zz, 0, "start") -readChar(zz, length(xx)) # correct +readChar(zz, length(xx)) # truncates at embedded nul seek(zz) # make sure current position is reported properly close(zz) unlink("testbin") diff --git a/tests/reg-tests-2.Rout.save b/tests/reg-tests-2.Rout.save index a3452a53fec..a1248ed3bda 100644 --- a/tests/reg-tests-2.Rout.save +++ b/tests/reg-tests-2.Rout.save @@ -1,5 +1,5 @@ -R version 2.7.0 Under development (unstable) (2008-03-21 r44828) +R version 2.8.0 Under development (unstable) (2008-04-21 r45412) Copyright (C) 2008 The R Foundation for Statistical Computing ISBN 3-900051-07-0 @@ -1422,7 +1422,7 @@ Y 4 10 16 22 > ## PR#1115 (saving strings with ascii=TRUE) > x <- y <- unlist(as.list( + parse(text=paste("\"\\", -+ as.character(structure(0:255,class="octmode")), ++ as.character(structure(1:255,class="octmode")), + "\"",sep="")))) > save(x, ascii=T, file=(fn <- tempfile())) > load(fn) @@ -4046,8 +4046,11 @@ In readBin(zz, "character", 100) : incomplete string at end of file has been 
discarded > seek(zz, 0, "start") [1] 18 -> readChar(zz, length(xx)) # correct -[1] "A test string\0more" +> readChar(zz, length(xx)) # truncates at embedded nul +[1] "A test string" +Warning message: +In readChar(zz, length(xx)) : + truncating string with embedded nuls: 'A test string\0more' > seek(zz) # make sure current position is reported properly [1] 18 > close(zz) diff --git a/tests/reg-tests-3.R b/tests/reg-tests-3.R index 2bc3bbf37fb..a4cd04a202d 100644 --- a/tests/reg-tests-3.R +++ b/tests/reg-tests-3.R @@ -84,14 +84,3 @@ if(require(MASS)) { print(1.001, digits=16) ## 2.4.1 gave 1.001000000000000 ## 2.5.0 errs on the side of caution. - - -## iconv to UCS-2 -x <- "A test string" -y <- try(iconv(x, "", "UCS-2BE")) -if(!inherits(y, "try-error")) { - print(y) - z <- iconv(y, "UCS-2BE", "") - stopifnot(identical(x, z)) -} -## works in R >= 2.6.0 diff --git a/tests/reg-tests-3.Rout.save b/tests/reg-tests-3.Rout.save index 490dceba33d..4d9fd9e2585 100644 --- a/tests/reg-tests-3.Rout.save +++ b/tests/reg-tests-3.Rout.save @@ -1,6 +1,6 @@ -R version 2.7.0 Under development (unstable) (2007-10-01 r43032) -Copyright (C) 2007 The R Foundation for Statistical Computing +R version 2.8.0 Under development (unstable) (2008-04-21 r45415) +Copyright (C) 2008 The R Foundation for Statistical Computing ISBN 3-900051-07-0 R is free software and comes with ABSOLUTELY NO WARRANTY. @@ -608,15 +608,3 @@ climb 1 320.99 > ## 2.4.1 gave 1.001000000000000 > ## 2.5.0 errs on the side of caution. > -> -> ## iconv to UCS-2 -> x <- "A test string" -> y <- try(iconv(x, "", "UCS-2BE")) -> if(!inherits(y, "try-error")) { -+ print(y) -+ z <- iconv(y, "UCS-2BE", "") -+ stopifnot(identical(x, z)) -+ } -[1] "\0A\0 \0t\0e\0s\0t\0 \0s\0t\0r\0i\0n\0g" -> ## works in R >= 2.6.0 ->