From 7e8027da6ecb8c5b5f07a529e5f80c23cd3be6ad Mon Sep 17 00:00:00 2001
From: ripley <ripley@00db46b3-68df-0310-9c12-caf00c1e9a41>
Date: Mon, 21 Apr 2008 17:32:58 +0000
Subject: [PATCH] see what happens if CHARSXPs are not allowed to have embedded
 nuls

git-svn-id: https://svn.r-project.org/R/trunk@45416 00db46b3-68df-0310-9c12-caf00c1e9a41
---
 NEWS                                   |  6 +-
 doc/manual/R-ints.texi                 | 17 ++++++
 src/library/base/man/Comparison.Rd     |  6 --
 src/library/base/man/abbreviate.Rd     |  3 -
 src/library/base/man/agrep.Rd          |  2 -
 src/library/base/man/cat.Rd            |  3 +-
 src/library/base/man/char.expand.Rd    |  2 -
 src/library/base/man/charmatch.Rd      |  2 -
 src/library/base/man/chartr.Rd         |  2 -
 src/library/base/man/duplicated.Rd     |  3 -
 src/library/base/man/encodeString.Rd   |  6 +-
 src/library/base/man/format.Rd         |  2 -
 src/library/base/man/formatc.Rd        |  2 -
 src/library/base/man/grep.Rd           |  2 -
 src/library/base/man/iconv.Rd          |  2 -
 src/library/base/man/identical.Rd      |  5 +-
 src/library/base/man/make.names.Rd     |  2 -
 src/library/base/man/make.unique.Rd    |  2 -
 src/library/base/man/match.Rd          |  2 -
 src/library/base/man/nchar.Rd          |  9 ---
 src/library/base/man/paste.Rd          |  2 -
 src/library/base/man/pmatch.Rd         |  2 -
 src/library/base/man/rawConversion.Rd  |  3 +-
 src/library/base/man/readChar.Rd       |  6 +-
 src/library/base/man/scan.Rd           |  6 +-
 src/library/base/man/serialize.Rd      | 10 +---
 src/library/base/man/sprintf.Rd        |  2 -
 src/library/base/man/strsplit.Rd       |  2 -
 src/library/base/man/strwrap.Rd        |  2 -
 src/library/base/man/substr.Rd         |  2 -
 src/library/base/man/unique.Rd         |  3 -
 src/library/base/man/utf8Conversion.Rd |  3 +-
 src/main/envir.c                       | 37 +++++++++++-
 src/main/printutils.c                  | 79 +++++++++++++-------------
 src/main/scan.c                        | 32 +++++------
 src/main/serialize.c                   |  7 +--
 tests/reg-tests-2.R                    |  4 +-
 tests/reg-tests-2.Rout.save            | 11 ++--
 tests/reg-tests-3.R                    | 11 ----
 tests/reg-tests-3.Rout.save            | 16 +-----
 40 files changed, 140 insertions(+), 180 deletions(-)

diff --git a/NEWS b/NEWS
index c5f1f8a4b2a..87833ac41a6 100644
--- a/NEWS
+++ b/NEWS
@@ -35,8 +35,6 @@ NEW FEATURES
     o	tools::texi2dvi() has a new argument 'texinputs' to allow the
 	TeX and bibtex input paths to be specified (even on MiKTeX).
 
-    o	Encoding<-() now handles character strings with embedded nuls.
-
     o	setEPS() and setPS() gain '...' to allow other arguments to be
     	passed to ps.options(), including overriding 'width' and 'height'.
 
@@ -98,6 +96,10 @@ DEPRECATED & DEFUNCT
     o	In package installation, SaveImage: yes is now ignored, and
 	any use of the field will give a warning.
 
+    o	unserialize() no longer accepts character strings as input --
+	that was a format prior to R 2.4.0 which needs embedded nulls
+	in character strings.
+
     o	The C macro 'allocString' has been removed -- use 'mkChar', 
 	or 'allocVector' directly if really necessary.
 
diff --git a/doc/manual/R-ints.texi b/doc/manual/R-ints.texi
index b6adf8df127..c2d24ffa8d7 100644
--- a/doc/manual/R-ints.texi
+++ b/doc/manual/R-ints.texi
@@ -1283,6 +1283,23 @@ used to hold the finalizer function of a C finalizer (uncached) -- now
 @code{CHARSXP}s via @code{allocString} (removed in @R 2.8.0) and
 @code{allocVector(CHARSXP ...)} (deprecated in @R 2.8.0).
 
+Currently @code{CHARSXP}s with embedded nulls can be created by
+
+@itemize
+@item 
+parsing a character string containing @code{\0}. (New in @R{} 2.8.0.)
+@item 
+using @code{scan(allowEscapes=TRUE)} on a string containing
+@code{\0}. (New in @R{} 2.8.0.)
+@item 
+calling @code{readChar}, @code{rawToChar} or @code{intToUtf8}.
+@item 
+@code{load}ing a saved CHARSXP.  (Broken for version 2 saves in @R{} 2.6.x.)
+@end itemize
+
+@noindent
+This may change before release.
+
 @node Warnings and errors, S4 objects, The CHARSXP cache, R Internal Structures
 @section Warnings and errors
 
diff --git a/src/library/base/man/Comparison.Rd b/src/library/base/man/Comparison.Rd
index b82faef3ab3..ab3ac2c8aa8 100644
--- a/src/library/base/man/Comparison.Rd
+++ b/src/library/base/man/Comparison.Rd
@@ -58,12 +58,6 @@ x != y
   coerced to the type of the other, the (decreasing) order of precedence
   being character, complex, numeric, integer, logical and raw.
 
-  When comparisons are made between character strings, parts of the
-  strings after embedded \code{nul} characters are ignored.  (This is
-  necessary as the position of \code{nul} in the collation sequence is
-  undefined, and we want one of \code{<}, \code{==} and \code{>} to be
-  true for any comparison.)
-
   Missing values (\code{\link{NA}}) and \code{\link{NaN}} values are
   regarded as non-comparable even to themselves, so comparisons
   involving them will always result in \code{NA}.  Missing values can
diff --git a/src/library/base/man/abbreviate.Rd b/src/library/base/man/abbreviate.Rd
index 3c95b96fef3..e82c796b4c8 100644
--- a/src/library/base/man/abbreviate.Rd
+++ b/src/library/base/man/abbreviate.Rd
@@ -42,9 +42,6 @@ abbreviate(names.arg, minlength = 4, use.classes = TRUE,
 
   If \code{use.classes} is \code{FALSE} then the only distinction is to
   be between letters and space.  This has NOT been implemented.
-  
-  Elements of \code{names.arg} with embedded nul bytes will be truncated
-  at the first nul.
 }
 \value{
   A character vector containing abbreviations for the strings in its
diff --git a/src/library/base/man/agrep.Rd b/src/library/base/man/agrep.Rd
index f056f7b8a45..d3499c82208 100644
--- a/src/library/base/man/agrep.Rd
+++ b/src/library/base/man/agrep.Rd
@@ -55,8 +55,6 @@ agrep(pattern, x, ignore.case = FALSE, value = FALSE,
   space it only supports the first 65536 characters of UTF-8 (where all
   the characters for human languages lie).  Note that it can be quite
   slow in UTF-8, and \code{useBytes = TRUE} will be much faster.
-
-  Inputs with embedded nul bytes will be truncated at the first nul.
 }
 \value{
   Either a vector giving the indices of the elements that yielded a
diff --git a/src/library/base/man/cat.Rd b/src/library/base/man/cat.Rd
index cb8ecc4317a..5e14162cf99 100644
--- a/src/library/base/man/cat.Rd
+++ b/src/library/base/man/cat.Rd
@@ -55,8 +55,7 @@ cat(\dots , file = "", sep = " ", fill = FALSE, labels = NULL,
   are handled.  Character strings are output \sQuote{as is} (unlike
   \code{\link{print.default}} which escapes non-printable characters and
   backslash --- use \code{\link{encodeString}} if you want to output
-  encoded strings using \code{cat}).  (Character strings with embedded
-  nuls are truncated at the first nul.)  Other types of \R object should be
+  encoded strings using \code{cat}).  Other types of \R object should be
   converted (e.g. by \code{\link{as.character}} or \code{\link{format}})
   before being passed to \code{cat}.
 
diff --git a/src/library/base/man/char.expand.Rd b/src/library/base/man/char.expand.Rd
index 0a978389e1c..38415e7e5d9 100644
--- a/src/library/base/man/char.expand.Rd
+++ b/src/library/base/man/char.expand.Rd
@@ -21,8 +21,6 @@
   This function is particularly useful when abbreviations are allowed in
   function arguments, and need to be uniquely expanded with respect to a
   target table of possible values.
-
-  Inputs with embedded nul bytes will be truncated at the first nul.
 }
 \seealso{
   \code{\link{charmatch}} and \code{\link{pmatch}} for performing
diff --git a/src/library/base/man/charmatch.Rd b/src/library/base/man/charmatch.Rd
index a8180e0c889..845b30e09cf 100644
--- a/src/library/base/man/charmatch.Rd
+++ b/src/library/base/man/charmatch.Rd
@@ -32,8 +32,6 @@ charmatch(x, table, nomatch = NA_integer_)
   returned and if no match is found then \code{nomatch} is returned.
 
   \code{NA} values are treated as the string constant \code{"NA"}.
-
-  Inputs with embedded nul bytes will be truncated at the first nul.
 }
 \value{
   An integer vector of the same length as \code{x}, giving the
diff --git a/src/library/base/man/chartr.Rd b/src/library/base/man/chartr.Rd
index 2e2ae052c9a..5cbbfead839 100644
--- a/src/library/base/man/chartr.Rd
+++ b/src/library/base/man/chartr.Rd
@@ -41,8 +41,6 @@ casefold(x, upper = FALSE)
 
   \code{casefold} is a wrapper for \code{tolower} and \code{toupper}
   provided for compatibility with S-PLUS.
-
-  Inputs with embedded nul bytes will be truncated at the first nul.
 }
 \value{
   A character vector of the same length and with the same attributes as
diff --git a/src/library/base/man/duplicated.Rd b/src/library/base/man/duplicated.Rd
index 0d834ed7d25..69b4d8047a3 100644
--- a/src/library/base/man/duplicated.Rd
+++ b/src/library/base/man/duplicated.Rd
@@ -58,9 +58,6 @@ duplicated(x, incomparables = FALSE, \dots)
 
   Missing values are regarded as equal, but \code{NaN} is not equal to
   \code{NA_real_}.
-
-  Strings with embedded nuls of the same length will be considered
-  equal if they agree when truncated at the first nul.
 }
 \section{Warning}{
   Using this for lists is potentially slow, especially if the elements
diff --git a/src/library/base/man/encodeString.Rd b/src/library/base/man/encodeString.Rd
index 2bb7282d485..d7628e07deb 100644
--- a/src/library/base/man/encodeString.Rd
+++ b/src/library/base/man/encodeString.Rd
@@ -1,6 +1,6 @@
 % File src/library/base/man/encodeString.Rd
 % Part of the R package, http://www.R-project.org
-% Copyright 1995-2007 R Core Development Team
+% Copyright 1995-2008 R Core Development Team
 % Distributed under GPL 2 or later
 
 \name{encodeString}
@@ -33,8 +33,8 @@ encodeString(x, width = 0, quote = "", na.encode = TRUE,
 \details{
   This escapes backslash and the control characters \code{\a} (bell),
   \code{\b} (backspace), \code{\f} (formfeed), \code{\n} (line feed),
-  \code{\r} (carriage return), \code{\t} (tab), \code{\v} (vertical tab)
-  and \code{\0} (nul) as well as any non-printable characters in a
+  \code{\r} (carriage return), \code{\t} (tab) and \code{\v} (vertical tab)
+  as well as any non-printable characters in a
   single-byte locale, which are printed in octal notation
   (\code{\xyz} with leading zeroes).
 #ifdef unix
diff --git a/src/library/base/man/format.Rd b/src/library/base/man/format.Rd
index 798b7b8c83a..fc63f2d9ceb 100644
--- a/src/library/base/man/format.Rd
+++ b/src/library/base/man/format.Rd
@@ -104,8 +104,6 @@ format(x, \dots)
 
   Raw vectors are converted to their 2-digit hexadecimal representation
   by \code{\link{as.character}}.
-
-  Character inputs with embedded nul bytes will be truncated at the first nul.
 }
 \value{
   An object of similar structure to \code{x} containing character
diff --git a/src/library/base/man/formatc.Rd b/src/library/base/man/formatc.Rd
index 55c0ec2ac8f..f6698edc86c 100644
--- a/src/library/base/man/formatc.Rd
+++ b/src/library/base/man/formatc.Rd
@@ -126,8 +126,6 @@ prettyNum(x, big.mark = "",   big.interval = 3,
   unexpectedly if \code{x} is a \code{character} vector not resulting from
   something like \code{format(<number>)}: in particular it assumes that
   a period is a decimal mark.
-
-  Character inputs with embedded nul bytes will be truncated at the first nul.
 }
 \author{
   \code{formatC} was originally written by Bill Dunlap, later much
diff --git a/src/library/base/man/grep.Rd b/src/library/base/man/grep.Rd
index 67a0c4ed877..5f9a75a34f6 100644
--- a/src/library/base/man/grep.Rd
+++ b/src/library/base/man/grep.Rd
@@ -93,8 +93,6 @@ gregexpr(pattern, text, ignore.case = FALSE, extended = TRUE,
 
   PCRE only supports caseless matching for a non-ASCII pattern in a
   UTF-8 locale (and not for \code{useBytes = TRUE} in any locale).
-
-  Inputs with embedded nul bytes will be truncated at the first nul.
 }
 \value{
   For \code{grep} a vector giving either the indices of the elements of
diff --git a/src/library/base/man/iconv.Rd b/src/library/base/man/iconv.Rd
index fed65968def..f8c2ee9148e 100644
--- a/src/library/base/man/iconv.Rd
+++ b/src/library/base/man/iconv.Rd
@@ -60,8 +60,6 @@ iconvlist()
 
   As from \R 2.7.0 \code{"UTF8"} will be accepted as meaning the (more
   correct) \code{"UTF-8"}.
-
-  Inputs \code{x} with embedded nul bytes will be handled completely.
 }
 \value{
   A character vector of the same length and the same attributes as
diff --git a/src/library/base/man/identical.Rd b/src/library/base/man/identical.Rd
index 8ed28614591..19bd9100246 100644
--- a/src/library/base/man/identical.Rd
+++ b/src/library/base/man/identical.Rd
@@ -50,9 +50,8 @@ identical(x, y)
   \code{\link{NA_real_}}, but all \code{NaN}s are equal (and all \code{NA}
   of the same type are equal).
 
-  Comparison of character strings allows for embedded \code{nul}
-  characters.  Comparison of attributes view them as a set (and not a
-  vector, so order is not tested).
+  Comparison of attributes views them as a set (and not a vector, so
+  order is not tested).
 }
 \value{
   A single logical value, \code{TRUE} or \code{FALSE}, never \code{NA}
diff --git a/src/library/base/man/make.names.Rd b/src/library/base/man/make.names.Rd
index c0632c81ac0..d8fb0ce5190 100644
--- a/src/library/base/man/make.names.Rd
+++ b/src/library/base/man/make.names.Rd
@@ -43,8 +43,6 @@ make.names(names, unique = FALSE, allow_ = TRUE)
   \code{allow_ = FALSE} is also useful when creating names for export to
   applications which do not allow underline in names (for example,
   S-PLUS and some DBMSs).
-
-  Inputs with embedded nul bytes will be truncated at the first nul.
 }
 \seealso{
   \code{\link{make.unique}},
diff --git a/src/library/base/man/make.unique.Rd b/src/library/base/man/make.unique.Rd
index 5dbd66fbc9b..4d2c0e2f174 100644
--- a/src/library/base/man/make.unique.Rd
+++ b/src/library/base/man/make.unique.Rd
@@ -31,8 +31,6 @@ make.unique(names, sep = ".")
     
   If character vector \code{A} is already unique, then
   \code{make.unique(c(A, B))} preserves \code{A}.
-
-  Inputs with embedded nul bytes will be truncated at the first nul.
 }
 \author{Thomas P Minka}
 \seealso{
diff --git a/src/library/base/man/match.Rd b/src/library/base/man/match.Rd
index 757c7b07727..ca1d7746e36 100644
--- a/src/library/base/man/match.Rd
+++ b/src/library/base/man/match.Rd
@@ -61,8 +61,6 @@ x \%in\% table
   For all types, \code{NA} matches \code{NA} and no other value.
   For real and complex values, \code{NaN} values are regarded
   as matching any other \code{NaN} value, but not matching \code{NA}.
-
-  Character inputs with embedded nul bytes will be truncated at the first nul.
 }
 \references{
   Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988)
diff --git a/src/library/base/man/nchar.Rd b/src/library/base/man/nchar.Rd
index 3f65ebd6117..9663620c1eb 100644
--- a/src/library/base/man/nchar.Rd
+++ b/src/library/base/man/nchar.Rd
@@ -42,10 +42,6 @@ nzchar(x)
   These will often be the same, and almost always will be in single-byte
   locales.  There will be differences between the first two with
   multibyte character sequences, e.g. in UTF-8 locales.
-  If the byte stream contains embedded \code{nul} bytes,
-  \code{type = "bytes"} looks at all the bytes whereas the other two
-  types look only at the string as printed by \code{cat}, up to the
-  first \code{nul} byte.
   
   The internal equivalent of the default method of
   \code{\link{as.character}} is performed on \code{x} (so there is no
@@ -72,11 +68,6 @@ nzchar(x)
   will be used to \code{print()} the string.  Use
   \code{\link{encodeString}} to find the characters used to print the
   string.
-
-  Embedded \code{nul} bytes are included in the byte count (but not the
-  final \code{nul}).  In contrast, characters are counted up to the
-  string terminator (the first \code{nul} that is not part of a
-  character representation).
 }
 \references{
   Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988)
diff --git a/src/library/base/man/paste.Rd b/src/library/base/man/paste.Rd
index 80e001ea22f..8971682e4d0 100644
--- a/src/library/base/man/paste.Rd
+++ b/src/library/base/man/paste.Rd
@@ -36,8 +36,6 @@ paste(\dots, sep = " ", collapse = NULL)
   If a value is specified for \code{collapse}, the values in the result
   are then concatenated into a single string, with the elements being
   separated by the value of \code{collapse}.
-
-  Character inputs with embedded nul bytes will be truncated at the first nul.
 }
 \value{
   A character vector of the concatenated values.  This will be of length
diff --git a/src/library/base/man/pmatch.Rd b/src/library/base/man/pmatch.Rd
index 3cd19e2b168..0dac8908579 100644
--- a/src/library/base/man/pmatch.Rd
+++ b/src/library/base/man/pmatch.Rd
@@ -48,8 +48,6 @@ pmatch(x, table, nomatch = NA_integer_, duplicates.ok = FALSE)
   does match empty strings, and it does not allow multiple exact matches.
 
   \code{NA} values are treated as if they were the string constant \code{"NA"}.
-
-  Inputs with embedded nul bytes will be truncated at the first nul.
 }
 \value{
   An integer vector (possibly including \code{NA} if \code{nomatch =
diff --git a/src/library/base/man/rawConversion.Rd b/src/library/base/man/rawConversion.Rd
index fac566658b6..2a47d68991f 100644
--- a/src/library/base/man/rawConversion.Rd
+++ b/src/library/base/man/rawConversion.Rd
@@ -39,7 +39,8 @@ packBits(x, type = c("raw", "integer"))
 
   \code{rawToChar} converts raw bytes either to a single character
   string or a character vector of single bytes.  (Note that a single
-  character string could contain embedded nuls.)
+  character string could contain embedded nuls, in which case it will be
+  truncated at the first nul with a warning.)
 
   \code{rawToBits} returns a raw vector of 8 times the length of a raw
   vector with entries 0 or 1.  \code{intToBits} returns a raw vector
diff --git a/src/library/base/man/readChar.Rd b/src/library/base/man/readChar.Rd
index 556ca75777c..2867e7cb9a6 100644
--- a/src/library/base/man/readChar.Rd
+++ b/src/library/base/man/readChar.Rd
@@ -53,10 +53,8 @@ writeChar(object, con,
   should be returned.  
 
   Character strings containing ASCII \code{nul}(s) will be read
-  correctly by \code{readChar} and appear with embedded nuls in the
-  character vector returned.  \code{writeChar} can write strings with
-  embedded \code{nul}s, and for such strings inteprets \code{nchar} as
-  the number of bytes to be written.
+  correctly by \code{readChar} but truncated at the first
+  \code{nul} with a warning.
   
   If the character length requested for \code{readChar} is longer than
   the data available on the connection, what is available is
diff --git a/src/library/base/man/scan.Rd b/src/library/base/man/scan.Rd
index f344d47efc7..2df6ca016f8 100644
--- a/src/library/base/man/scan.Rd
+++ b/src/library/base/man/scan.Rd
@@ -243,10 +243,8 @@ scan(file = "", what = double(0), nmax = -1, n = -1, sep = "",
   chars: use an explicit separator to avoid this.
 
   Having \code{nul} bytes in fields may lead to interpretation of the
-  field being terminated at the \code{nul} (so they are fine in
-  character fields).  \R 2.8.0 handles these better than earlier
-  versions, but they not normally present in text files -- see
-  \code{\link{readBin}}.
+  field being terminated at the \code{nul}.  They are not normally present
+  in text files -- see \code{\link{readBin}} and \code{\link{readChar}}.
 }
 \references{
   Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988)
diff --git a/src/library/base/man/serialize.Rd b/src/library/base/man/serialize.Rd
index 57f41474d5f..2c7aa958b09 100644
--- a/src/library/base/man/serialize.Rd
+++ b/src/library/base/man/serialize.Rd
@@ -25,8 +25,8 @@ unserialize(connection, refhook = NULL)
 \arguments{
   \item{object}{\R object to serialize.}
   \item{connection}{an open connection or (for \code{serialize})
-    \code{NULL} or (for \code{unserialize}) a raw vector or a length-one
-    character vector (see \sQuote{Details}).}
+    \code{NULL} or (for \code{unserialize}) a raw vector
+    (see \sQuote{Details}).}
   \item{file}{a connection or the name of the file where the R object
     is saved to or read from.}
   \item{ascii}{a logical.  If \code{TRUE}, an ASCII representation is
@@ -51,8 +51,7 @@ unserialize(connection, refhook = NULL)
   across separate calls to \code{serialize}.
 
   \code{unserialize} reads an object (as written by \code{serialize})
-  from \code{connection} or a raw vector or (for compatibility with
-  earlier versions of \code{serialize}) a length-one character vector.
+  from \code{connection} or a raw vector.
 
   The \code{refhook} functions can be used to customize handling of
   non-system reference objects (all external pointers and weak
@@ -89,9 +88,6 @@ unserialize(connection, refhook = NULL)
 \examples{
 x <- serialize(list(1,2,3), NULL)
 unserialize(x)
-## test earlier interface as a length-one character vector
-y <- rawToChar(x)
-unserialize(y)
 }
 \keyword{internal}
 \keyword{file}
diff --git a/src/library/base/man/sprintf.Rd b/src/library/base/man/sprintf.Rd
index e479386b05a..51504ad258b 100644
--- a/src/library/base/man/sprintf.Rd
+++ b/src/library/base/man/sprintf.Rd
@@ -109,8 +109,6 @@ gettextf(fmt, \dots, domain = NULL)
 
   There is a limit of 8192 bytes on elements of \code{fmt} and also on
   strings included by a \code{\%s} conversion specification.
-
-  Character inputs with embedded nul bytes will be truncated at the first nul.
 }
 
 \value{
diff --git a/src/library/base/man/strsplit.Rd b/src/library/base/man/strsplit.Rd
index a4db307743f..879857675f0 100644
--- a/src/library/base/man/strsplit.Rd
+++ b/src/library/base/man/strsplit.Rd
@@ -84,8 +84,6 @@ strsplit(x, split, extended = TRUE, fixed = FALSE, perl = FALSE)
   (non-empty) string, the first element of the output is \code{""}, but
   if there is a match at the end of the string, the output is the same
   as with the match removed. 
-
-  Inputs with embedded nul bytes will be truncated at the first nul.
 }
 \section{Warning}{
   The standard regular expression code has been reported to be very slow
diff --git a/src/library/base/man/strwrap.Rd b/src/library/base/man/strwrap.Rd
index 16cc4251218..823f0f781db 100644
--- a/src/library/base/man/strwrap.Rd
+++ b/src/library/base/man/strwrap.Rd
@@ -41,8 +41,6 @@ strwrap(x, width = 0.9 * getOption("width"), indent = 0,
   
   Indentation is relative to the number of characters in the prefix
   string.
-
-  Inputs with embedded nul bytes will be truncated at the first nul.
 }
 \examples{
 ## Read in file 'THANKS'.
diff --git a/src/library/base/man/substr.Rd b/src/library/base/man/substr.Rd
index dc1a6e49102..cf350aac913 100644
--- a/src/library/base/man/substr.Rd
+++ b/src/library/base/man/substr.Rd
@@ -49,8 +49,6 @@ substring(text, first, last = 1000000) <- value
   the current locale (see \code{\link{Encoding}} if the corresponding
   input had a declared encoding and the current locale is either Latin-1
   or UTF-8.
-
-  Inputs with embedded nul bytes will be truncated at the first nul.
 }
 \value{
   For \code{substr}, a character vector of the same length and with the
diff --git a/src/library/base/man/unique.Rd b/src/library/base/man/unique.Rd
index 20b0c4d7f16..a8397c7a83a 100644
--- a/src/library/base/man/unique.Rd
+++ b/src/library/base/man/unique.Rd
@@ -57,9 +57,6 @@ unique(x, incomparables = FALSE, \dots)
 
   Missing values are regarded as equal, but \code{NaN} is not equal to
   \code{NA_real_}.
-
-  Strings with embedded nuls of the same length will be considered
-  equal if they agree when truncated at the first nul.
 }
 \value{
   For a vector, an object of the same type of \code{x}, but with only
diff --git a/src/library/base/man/utf8Conversion.Rd b/src/library/base/man/utf8Conversion.Rd
index 9856018ce8d..e278e691635 100644
--- a/src/library/base/man/utf8Conversion.Rd
+++ b/src/library/base/man/utf8Conversion.Rd
@@ -30,7 +30,8 @@ intToUtf8(x, multiple = FALSE)
   \code{intToUtf8} converts a vector of (numeric) UTF-8 code points
   either to a single character string or a character vector of single
   characters.  (Note that a single character string could contain
-  embedded nuls.)  The \code{\link{Encoding}} is declared as
+  embedded nuls, in which case it will be truncated at the first nul,
+  with a warning.)  The \code{\link{Encoding}} is declared as
   \code{"UTF-8"}.
 }
 \examples{\dontrun{
diff --git a/src/main/envir.c b/src/main/envir.c
index 74bad400467..21a3665ba10 100644
--- a/src/main/envir.c
+++ b/src/main/envir.c
@@ -3230,7 +3230,24 @@ SEXP mkChar(const char *name)
 #ifndef USE_CHAR_HASHING
 SEXP mkCharLenCE(const char *name, int len, cetype_t enc)
 {
-    SEXP c = allocCharsxp(len);
+    int slen = strlen(name);
+    SEXP c;
+    if (slen < len) {
+	/* This is tricky: we want to make a reasonable job of
+	   representing this string, and EncodeString() is the most
+	   comprehensive */
+	c = allocCharsxp(len);
+	memcpy(CHAR_RW(c), name, len);
+	switch(enc) {
+	case CE_UTF8: SET_UTF8(c); break;
+	case CE_LATIN1: SET_LATIN1(c); break;
+	default: break;
+	}
+	warning(_("truncating string with embedded nuls: '%s'"), 
+		EncodeString(c, 0, 0, Rprt_adj_none));
+	len = slen;
+    }
+    c = allocCharsxp(len);
     memcpy(CHAR_RW(c), name, len);
     if (enc && strIsASCII(name)) enc = 0;
     switch(enc) {
@@ -3368,7 +3385,7 @@ SEXP mkCharLenCE(const char *name, int len, cetype_t enc)
 {
     SEXP cval, chain;
     unsigned int hashcode;
-    int need_enc;
+    int need_enc, slen = strlen(name);
 
     switch(enc){
     case CE_NATIVE:
@@ -3380,6 +3397,22 @@ SEXP mkCharLenCE(const char *name, int len, cetype_t enc)
     default:
         error("unknown encoding: %d", enc);
     }
+    if (slen < len) {
+	SEXP c;
+	/* This is tricky: we want to make a reasonable job of
+	   representing this string, and EncodeString() is the most
+	   comprehensive */
+	c = allocCharsxp(len);
+	memcpy(CHAR_RW(c), name, len);
+	switch(enc) {
+	case CE_UTF8: SET_UTF8(c); break;
+	case CE_LATIN1: SET_LATIN1(c); break;
+	default: break;
+	}
+	warning(_("truncating string with embedded nuls: '%s'"), 
+		EncodeString(c, 0, 0, Rprt_adj_none));
+	len = slen;
+    }
 
     if (enc && IsASCII(name, len)) enc = CE_NATIVE;
     switch(enc) {
diff --git a/src/main/printutils.c b/src/main/printutils.c
index 030f281c771..2d330e148a0 100644
--- a/src/main/printutils.c
+++ b/src/main/printutils.c
@@ -209,7 +209,7 @@ const char *EncodeReal2(double x, int w, int d, int e)
 
 void z_prec_r(Rcomplex *r, Rcomplex *x, double digits);
 
-const char 
+const char
 *EncodeComplex(Rcomplex x, int wr, int dr, int er, int wi, int di, int ei,
 	       char cdec)
 {
@@ -255,7 +255,7 @@ const char
 
    On Windows with surrogate pairs it will not be canonical, but AFAIK
    they do not occur in any MBCS (so it would only matter if we implement
-   UTF-8, and then only if Windows has surrogate pairs switched on, 
+   UTF-8, and then only if Windows has surrogate pairs switched on,
    which Western versions at least do not.).
 */
 
@@ -316,7 +316,7 @@ int Rstrwid(const char *str, int slen, cetype_t ienc, int quote)
 		    case L'\r':
 		    case L'\t':
 		    case L'\v':
-		    case L'\0':
+		    case L'\0': /* historical */
 			len += 2; break;
 		    default:
 			/* print in octal */
@@ -354,19 +354,19 @@ int Rstrwid(const char *str, int slen, cetype_t ienc, int quote)
 			len++; break;
 		    }
 		} else switch(*p) {
-		case '\a':
-		case '\b':
-		case '\f':
-		case '\n':
-		case '\r':
-		case '\t':
-		case '\v':
-		case '\0':
-		    len += 2; break;
-		default:
-		    /* print in octal */
-		    len += 4; break;
-		}
+		    case '\a':
+		    case '\b':
+		    case '\f':
+		    case '\n':
+		    case '\r':
+		    case '\t':
+		    case '\v':
+		    case '\0': /* historical */
+			len += 2; break;
+		    default:
+			/* print in octal */
+			len += 4; break;
+		    }
 		p++;
 	    } else { /* 8 bit char */
 #ifdef Win32 /* It seems Windows does not know what is printable! */
@@ -438,7 +438,7 @@ const char *EncodeString(SEXP s, int w, int quote, Rprt_adj justify)
 	    }
 	} else
 #endif
-        {
+	{
 	    p = translateChar(s);
 	    if(p == CHAR(s)) {
 		i = Rstrlen(s, quote);
@@ -475,18 +475,18 @@ const char *EncodeString(SEXP s, int w, int quote, Rprt_adj justify)
 	mbstate_t mb_st;
 	wchar_t wc;
 	unsigned int k; /* not wint_t as it might be signed */
-        Rboolean Unicode_warning = FALSE;
+	Rboolean Unicode_warning = FALSE;
 
 	if(ienc != CE_UTF8)  mbs_init(&mb_st);
 #ifdef Win32
-	else if(WinUTF8out) { memcpy(q, UTF8in, 3); q += 3; } 
+	else if(WinUTF8out) { memcpy(q, UTF8in, 3); q += 3; }
 #endif
 	for (i = 0; i < cnt; i++) {
 	    res = (ienc == CE_UTF8) ? utf8toucs(&wc, p):
 		mbrtowc(&wc, p, MB_CUR_MAX, NULL);
 	    if(res >= 0) { /* res = 0 is a terminator */
 		k = wc;
-		/* To be portable, treat \0 explicitly */
+		/* historical: To be portable, treat \0 explicitly */
 		if(res == 0) {k = 0; wc = L'\0';}
 		if(0x20 <= k && k < 0x7f && iswprint(wc)) {
 		    switch(wc) {
@@ -510,7 +510,7 @@ const char *EncodeString(SEXP s, int w, int quote, Rprt_adj justify)
 		    case L'\r': *q++ = '\\'; *q++ = 'r'; break;
 		    case L'\t': *q++ = '\\'; *q++ = 't'; break;
 		    case L'\v': *q++ = '\\'; *q++ = 'v'; break;
-		    case L'\0': *q++ = '\\'; *q++ = '0'; break;
+		    case L'\0': *q++ = '\\'; *q++ = '0'; break; /* historical */
 
 		    default:
 			/* print in octal */
@@ -524,7 +524,7 @@ const char *EncodeString(SEXP s, int w, int quote, Rprt_adj justify)
 			/* The problem here is that wc may be
 			   printable according to the Unicode tables,
 			   but it may not be printable on the ouput
-			   device concerned. */ 
+			   device concerned. */
 			for(j = 0; j < res; j++) *q++ = *p++;
 		    } else {
 #ifndef Win32
@@ -566,22 +566,22 @@ const char *EncodeString(SEXP s, int w, int quote, Rprt_adj justify)
 		    default: *q++ = *p; break;
 		    }
 		} else switch(*p) {
-		    /* ANSI Escapes */
-		case '\a': *q++ = '\\'; *q++ = 'a'; break;
-		case '\b': *q++ = '\\'; *q++ = 'b'; break;
-		case '\f': *q++ = '\\'; *q++ = 'f'; break;
-		case '\n': *q++ = '\\'; *q++ = 'n'; break;
-		case '\r': *q++ = '\\'; *q++ = 'r'; break;
-		case '\t': *q++ = '\\'; *q++ = 't'; break;
-		case '\v': *q++ = '\\'; *q++ = 'v'; break;
-		case '\0': *q++ = '\\'; *q++ = '0'; break;
-
-		default:
-		    /* print in octal */
-		    snprintf(buf, 5, "\\%03o", (unsigned char) *p);
-		    for(j = 0; j < 4; j++) *q++ = buf[j];
-		    break;
-		}
+			/* ANSI Escapes */
+		    case '\a': *q++ = '\\'; *q++ = 'a'; break;
+		    case '\b': *q++ = '\\'; *q++ = 'b'; break;
+		    case '\f': *q++ = '\\'; *q++ = 'f'; break;
+		    case '\n': *q++ = '\\'; *q++ = 'n'; break;
+		    case '\r': *q++ = '\\'; *q++ = 'r'; break;
+		    case '\t': *q++ = '\\'; *q++ = 't'; break;
+		    case '\v': *q++ = '\\'; *q++ = 'v'; break;
+		    case '\0': *q++ = '\\'; *q++ = '0'; break; /* historical */
+
+		    default:
+			/* print in octal */
+			snprintf(buf, 5, "\\%03o", (unsigned char) *p);
+			for(j = 0; j < 4; j++) *q++ = buf[j];
+			break;
+		    }
 		p++;
 	    } else {  /* 8 bit char */
 #ifdef Win32 /* It seems Windows does not know what is printable! */
@@ -598,7 +598,7 @@ const char *EncodeString(SEXP s, int w, int quote, Rprt_adj justify)
 	}
 
 #ifdef Win32
-    if(WinUTF8out && ienc == CE_UTF8)  { memcpy(q, UTF8out, 3); q += 3; } 
+    if(WinUTF8out && ienc == CE_UTF8)  { memcpy(q, UTF8out, 3); q += 3; }
 #endif
     if(quote) *q++ = quote;
     if(b > 0 && justify != Rprt_adj_right) {
@@ -833,4 +833,3 @@ void attribute_hidden VectorIndex(int i, int w)
 /* print index label "[`i']" , using total width `w' (left filling blanks) */
     Rprintf("%*s[%ld]", w-IndexWidth(i)-2, "", i);
 }
-
diff --git a/src/main/scan.c b/src/main/scan.c
index ee7d436d475..aaaadf8aee8 100644
--- a/src/main/scan.c
+++ b/src/main/scan.c
@@ -80,13 +80,13 @@ typedef struct {
     char convbuf[100];
 } LocalData;
 
-static SEXP insertString(char *str, int len, LocalData *l)
+static SEXP insertString(char *str, LocalData *l)
 {
     if (!strIsASCII(str)) {
-	if (l->con->UTF8out || l->isUTF8) return mkCharLenCE(str, len, CE_UTF8);
-	else if (l->isLatin1) return mkCharLenCE(str, len, CE_LATIN1);
+        if (l->con->UTF8out || l->isUTF8) return mkCharCE(str, CE_UTF8);
+        else if (l->isLatin1) return mkCharCE(str, CE_LATIN1);
     }
-    return mkCharLen(str, len);
+    return mkChar(str);
 }
 
 static R_INLINE Rboolean Rspace(unsigned int c)
@@ -313,7 +313,7 @@ static void scan_cleanup(void *data)
  */
 static char *
 fillBuffer(SEXPTYPE type, int strip, int *bch, LocalData *d,
-	   R_StringBuffer *buffer, int *nbytes)
+	   R_StringBuffer *buffer)
 {
 /* The basic reader function, called from scanVector() and scanFrame().
    Reads into _buffer_	which later will be read out by extractItem().
@@ -448,7 +448,6 @@ fillBuffer(SEXPTYPE type, int strip, int *bch, LocalData *d,
     }
     *bufp = '\0';
     *bch = filled;
-    *nbytes = m;
     return buffer->data;
 }
 
@@ -475,7 +474,7 @@ static R_INLINE void expected(char *what, char *got, LocalData *d)
     error(_("scan() expected '%s', got '%s'"), what, got);
 }
 
-static void extractItem(char *buffer, SEXP ans, int i, int nbytes, LocalData *d)
+static void extractItem(char *buffer, SEXP ans, int i, LocalData *d)
 {
     char *endp;
     switch(TYPEOF(ans)) {
@@ -521,7 +520,7 @@ static void extractItem(char *buffer, SEXP ans, int i, int nbytes, LocalData *d)
 	if (isNAstring(buffer, 1, d))
 	    SET_STRING_ELT(ans, i, NA_STRING);
 	else
-	    SET_STRING_ELT(ans, i, insertString(buffer, nbytes, d));
+	    SET_STRING_ELT(ans, i, insertString(buffer, d));
 	break;
     case RAWSXP:
 	if (isNAstring(buffer, 0, d))
@@ -541,7 +540,7 @@ static SEXP scanVector(SEXPTYPE type, int maxitems, int maxlines,
 		       int flush, SEXP stripwhite, int blskip, LocalData *d)
 {
     SEXP ans, bns;
-    int blocksize, c, i, n, linesread, nprev,strip, bch, nbytes;
+    int blocksize, c, i, n, linesread, nprev,strip, bch;
     char *buffer;
     R_StringBuffer strBuf = {NULL, 0, MAXELTSIZE};
 
@@ -579,14 +578,14 @@ static SEXP scanVector(SEXPTYPE type, int maxitems, int maxlines,
 	    PROTECT(ans);
 	    copyVector(ans, bns);
 	}
-	buffer = fillBuffer(type, strip, &bch, d, &strBuf, &nbytes);
+	buffer = fillBuffer(type, strip, &bch, d, &strBuf);
 	if (nprev == n && strlen(buffer)==0 &&
 	    ((blskip && bch =='\n') || bch == R_EOF)) {
 	    if (d->ttyflag || bch == R_EOF)
 		break;
 	}
 	else {
-	    extractItem(buffer, ans, n, nbytes, d);
+	    extractItem(buffer, ans, n, d);
 	    if (++n == maxitems) {
 		if (d->ttyflag && bch != '\n') { /* MBCS-safe */
 		    while ((c = scanchar(FALSE, d)) != '\n')
@@ -652,7 +651,7 @@ static SEXP scanFrame(SEXP what, int maxitems, int maxlines, int flush,
 {
     SEXP ans, new, old, w;
     char *buffer = NULL;
-    int blksize, c, i, ii, j, n, nc, linesread, colsread, strip, bch, nbytes;
+    int blksize, c, i, ii, j, n, nc, linesread, colsread, strip, bch;
     int badline, nstring = 0;
     R_StringBuffer buf = {NULL, 0, MAXELTSIZE};
 
@@ -701,7 +700,7 @@ static SEXP scanFrame(SEXP what, int maxitems, int maxlines, int flush,
 		if (fill) {
 		    buffer[0] = '\0';
 		    for (ii = colsread; ii < nc; ii++) {
-			extractItem(buffer, VECTOR_ELT(ans, ii), n, nbytes, d);
+			extractItem(buffer, VECTOR_ELT(ans, ii), n, d);
 		    }
 		    n++;
 		    ii = 0;
@@ -730,8 +729,7 @@ static SEXP scanFrame(SEXP what, int maxitems, int maxlines, int flush,
 	    }
 	}
 
-	buffer = fillBuffer(TYPEOF(VECTOR_ELT(ans, ii)),
-			    strip, &bch, d, &buf, &nbytes);
+	buffer = fillBuffer(TYPEOF(VECTOR_ELT(ans, ii)), strip, &bch, d, &buf);
 	if (colsread == 0 &&
 	    strlen(buffer) == 0 &&
 	    ((blskip && bch =='\n') || bch == R_EOF)) {
@@ -739,7 +737,7 @@ static SEXP scanFrame(SEXP what, int maxitems, int maxlines, int flush,
 		break;
 	}
 	else {
-	    extractItem(buffer, VECTOR_ELT(ans, ii), n, nbytes, d);
+	    extractItem(buffer, VECTOR_ELT(ans, ii), n, d);
 	    ii++;
 	    colsread++;
 	    if (length(stripwhite) == length(what))
@@ -765,7 +763,7 @@ static SEXP scanFrame(SEXP what, int maxitems, int maxlines, int flush,
 	    warning(_("number of items read is not a multiple of the number of columns"));
 	buffer[0] = '\0';	/* this is an NA */
 	for (ii = colsread; ii < nc; ii++) {
-	    extractItem(buffer, VECTOR_ELT(ans, ii), n, nbytes, d);
+	    extractItem(buffer, VECTOR_ELT(ans, ii), n, d);
 	}
 	n++;
     }
diff --git a/src/main/serialize.c b/src/main/serialize.c
index 4082d268b72..faf7c3a30da 100644
--- a/src/main/serialize.c
+++ b/src/main/serialize.c
@@ -2066,11 +2066,8 @@ SEXP attribute_hidden R_unserialize(SEXP icon, SEXP fun)
     hook = fun != R_NilValue ? CallHook : NULL;
 
     if (TYPEOF(icon) == STRSXP && LENGTH(icon) > 0) {
-        struct membuf_st mbs;
-	void *data = (void *)CHAR(STRING_ELT(icon, 0)); /* FIXME, is this right? */
-	int length = LENGTH(STRING_ELT(icon, 0));
-	InitMemInPStream(&in, &mbs, data,  length, hook, fun);
-	return R_Unserialize(&in);
+	/* a string holding serialized data was the format in R < 2.4.0; removed in R 2.8.0 */
+	error("character vectors are no longer accepted by unserialize()");
     } else if (TYPEOF(icon) == RAWSXP) {
         struct membuf_st mbs;
 	void *data = RAW(icon);
diff --git a/tests/reg-tests-2.R b/tests/reg-tests-2.R
index 98aec7330d1..418c54626d8 100644
--- a/tests/reg-tests-2.R
+++ b/tests/reg-tests-2.R
@@ -485,7 +485,7 @@ rowsum(matrix(1:12, 3,4), c("Y","X","Y"))
 ## PR#1115 (saving strings with ascii=TRUE)
 x <- y <- unlist(as.list(
     parse(text=paste("\"\\",
-          as.character(structure(0:255,class="octmode")),
+          as.character(structure(1:255,class="octmode")),
              "\"",sep=""))))
 save(x, ascii=T, file=(fn <- tempfile()))
 load(fn)
@@ -1292,7 +1292,7 @@ readBin(zz, "integer", n=100, size = 1) # read as small integers
 seek(zz, 0, "start")
 readBin(zz, "character", 100)  # is confused by embedded nul.
 seek(zz, 0, "start")
-readChar(zz, length(xx)) # correct
+readChar(zz, length(xx)) # truncates at embedded nul
 seek(zz) # make sure current position is reported properly
 close(zz)
 unlink("testbin")
diff --git a/tests/reg-tests-2.Rout.save b/tests/reg-tests-2.Rout.save
index a3452a53fec..a1248ed3bda 100644
--- a/tests/reg-tests-2.Rout.save
+++ b/tests/reg-tests-2.Rout.save
@@ -1,5 +1,5 @@
 
-R version 2.7.0 Under development (unstable) (2008-03-21 r44828)
+R version 2.8.0 Under development (unstable) (2008-04-21 r45412)
 Copyright (C) 2008 The R Foundation for Statistical Computing
 ISBN 3-900051-07-0
 
@@ -1422,7 +1422,7 @@ Y    4   10   16   22
 > ## PR#1115 (saving strings with ascii=TRUE)
 > x <- y <- unlist(as.list(
 +     parse(text=paste("\"\\",
-+           as.character(structure(0:255,class="octmode")),
++           as.character(structure(1:255,class="octmode")),
 +              "\"",sep=""))))
 > save(x, ascii=T, file=(fn <- tempfile()))
 > load(fn)
@@ -4046,8 +4046,11 @@ In readBin(zz, "character", 100) :
   incomplete string at end of file has been discarded
 > seek(zz, 0, "start")
 [1] 18
-> readChar(zz, length(xx)) # correct
-[1] "A test string\0more"
+> readChar(zz, length(xx)) # truncates at embedded nul
+[1] "A test string"
+Warning message:
+In readChar(zz, length(xx)) :
+  truncating string with embedded nuls: 'A test string\0more'
 > seek(zz) # make sure current position is reported properly
 [1] 18
 > close(zz)
diff --git a/tests/reg-tests-3.R b/tests/reg-tests-3.R
index 2bc3bbf37fb..a4cd04a202d 100644
--- a/tests/reg-tests-3.R
+++ b/tests/reg-tests-3.R
@@ -84,14 +84,3 @@ if(require(MASS)) {
 print(1.001, digits=16)
 ## 2.4.1 gave  1.001000000000000
 ## 2.5.0 errs on the side of caution.
-
-
-## iconv to UCS-2
-x <- "A test string"
-y <- try(iconv(x, "", "UCS-2BE"))
-if(!inherits(y, "try-error")) {
-    print(y)
-    z <- iconv(y, "UCS-2BE", "")
-    stopifnot(identical(x, z))
-}
-## works in R >= 2.6.0
diff --git a/tests/reg-tests-3.Rout.save b/tests/reg-tests-3.Rout.save
index 490dceba33d..4d9fd9e2585 100644
--- a/tests/reg-tests-3.Rout.save
+++ b/tests/reg-tests-3.Rout.save
@@ -1,6 +1,6 @@
 
-R version 2.7.0 Under development (unstable) (2007-10-01 r43032)
-Copyright (C) 2007 The R Foundation for Statistical Computing
+R version 2.8.0 Under development (unstable) (2008-04-21 r45415)
+Copyright (C) 2008 The R Foundation for Statistical Computing
 ISBN 3-900051-07-0
 
 R is free software and comes with ABSOLUTELY NO WARRANTY.
@@ -608,15 +608,3 @@ climb   1 320.99
 > ## 2.4.1 gave  1.001000000000000
 > ## 2.5.0 errs on the side of caution.
 > 
-> 
-> ## iconv to UCS-2
-> x <- "A test string"
-> y <- try(iconv(x, "", "UCS-2BE"))
-> if(!inherits(y, "try-error")) {
-+     print(y)
-+     z <- iconv(y, "UCS-2BE", "")
-+     stopifnot(identical(x, z))
-+ }
-[1] "\0A\0 \0t\0e\0s\0t\0 \0s\0t\0r\0i\0n\0g"
-> ## works in R >= 2.6.0
->