handle BOMs in UCS-2LE encoding

git-svn-id: https://svn.r-project.org/R/trunk@34278 00db46b3-68df-0310-9c12-caf00c1e9a41
wch · May 11, 2005 · 5fb2926 · 5fb2926
1 parent 5f0f339
commit 5fb2926
Show file tree

Hide file tree

Showing 7 changed files with 49 additions and 14 deletions.
diff --git a/NEWS b/NEWS
@@ -219,6 +219,10 @@ BUG FIXES
     o	strwrap() now makes a reasonable job of text that is invalid in the
 	current locale.
 
+    o	Reading with encoding "UCS-2LE" will remove any Byte Order
+	Mark, as most implementations of iconv fail to handle BOMs
+	(which are present in 'Windows Unicode' files).
+
 
 
 		CHANGES IN R VERSION 2.1.0

diff --git a/src/library/base/man/connections.Rd b/src/library/base/man/connections.Rd
@@ -186,6 +186,20 @@ isIncomplete(con)
 
   Re-encoding only works for connections in text mode.
 
+  The encoding \code{"UCS-2LE"} is treated specially, as it is the
+  appropriate value for Windows \sQuote{Unicode} text files.  If the
+  first two bytes are the Byte Order Mark (bytes \code{0xFF 0xFE}) then these
+  are removed, as most implementations of \code{\link{iconv}} do not accept
+  BOMs.  Note that some implementations
+#ifdef windows
+  (including that used on Windows)
+#endif
+  will handle BOMs using encoding \code{"UCS2"} but many
+#ifdef unix
+  (including that in \code{glibc})
+#endif
+  will not.
+
   Exactly what happens when the requested translation cannot be done is
   in general undocumented.  Requesting a conversion that is not supported is
   an error, reported when the connection is opened.  On output the
@@ -384,7 +398,8 @@ close(con2)
 
 \dontrun{ ## examples of use of encodings
 cat(x, file = file("foo", "w", encoding="UTF-8"))
-}
-}
+# read a 'Windows Unicode' file including names
+A <- read.table(file("students", encoding="UCS-2LE"))
+}}
 \keyword{file}
 \keyword{connection}
diff --git a/src/library/utils/man/iconv.Rd b/src/library/utils/man/iconv.Rd
@@ -53,14 +53,9 @@ iconvlist()
 \note{
   Not all platforms support these functions.  See also
   \code{\link{capabilities}("iconv")}.
-
-#ifdef windows
-  The support DLL for these functions is not included in the miniR
-  distribution.
-#endif
 }
 \seealso{
-  \code{\link{localeToCharset}}, \code{\link{file}}.
+  \code{\link{localeToCharset}}, \code{\link{file}}. 
 }
 \examples{\dontrun{
 iconvlist()

diff --git a/src/main/connections.c b/src/main/connections.c
@@ -138,6 +138,9 @@ void set_iconv(Rconnection con)
 	/* initialize state, and prepare any initial bytes */
 	Riconv(tmp, NULL, NULL, &ob, &onb);
 	con->navail = 50-onb; con->inavail = 0;
+	/* libiconv can handle BOMs on Windows Unicode files, but
+	   glibc's iconv cannot. Aargh ... */
+	if(streql(con->encname, "UCS-2LE")) con->inavail = -2;
     }
     if(con->canwrite) {
 	size_t onb = 25;
@@ -229,14 +232,20 @@ int dummy_vfprintf(Rconnection con, const char *format, va_list ap)
 int dummy_fgetc(Rconnection con)
 {
     int c;
+    Rboolean checkBOM = FALSE;
 
     if(con->inconv) {
 	if(con->navail <= 0) {
 	    unsigned int i, inew = 0;
-	    char *p = con->iconvbuff + con->inavail, *ib, *ob;
+	    char *p, *ib, *ob;
 	    size_t inb, onb, res;
 
 	    if(con->EOF_signalled) return R_EOF;
+	    if(con->inavail == -2) {
+		con->inavail = 0;
+		checkBOM = TRUE;
+	    }
+	    p = con->iconvbuff + con->inavail;
 	    for(i = con->inavail; i < 25; i++) {
 		c = con->fgetc_internal(con);
 		if(c == R_EOF){ con->EOF_signalled = TRUE; break; }
@@ -245,6 +254,12 @@ int dummy_fgetc(Rconnection con)
 		inew++;
 	    }
 	    if(inew == 0) return R_EOF;
+	    if(checkBOM && con->inavail >= 2 &&
+	       (unsigned char) con->iconvbuff[0] == 255 &&
+	       (unsigned char) con->iconvbuff[1] == 254) {
+		con->inavail -= 2;
+		memmove(con->iconvbuff, con->iconvbuff+2, con->inavail);
+	    }
 	    ib = con->iconvbuff; inb = con->inavail;
 	    ob = con->oconvbuff; onb = 50;
 	    res = Riconv(con->inconv, &ib, &inb, &ob, &onb);

diff --git a/tests/WinUnicode.dat b/tests/WinUnicode.dat
diff --git a/tests/reg-IO2.R b/tests/reg-IO2.R
@@ -97,4 +97,6 @@ cat('%comment\n\n%another\n%\n%\n',
 read.table("test.dat", comment.char = "%")
 unlink("test.dat")
 
+## test on Windows Unicode file
+scan(file("WinUnicode.dat", encoding="UCS-2LE"), 0)
 ## end of tests
diff --git a/tests/reg-IO2.Rout.save b/tests/reg-IO2.Rout.save
@@ -1,14 +1,14 @@
 
-R : Copyright 2004, The R Foundation for Statistical Computing
-Version 2.0.0 Under development (unstable) (2004-05-23), ISBN 3-900051-00-3
+R : Copyright 2005, The R Foundation for Statistical Computing
+Version 2.2.0 Under development (unstable) (2005-05-11), ISBN 3-900051-07-0
 
 R is free software and comes with ABSOLUTELY NO WARRANTY.
 You are welcome to redistribute it under certain conditions.
 Type 'license()' or 'licence()' for distribution details.
 
 R is a collaborative project with many contributors.
 Type 'contributors()' for more information and
-'citation()' on how to cite R in publications.
+'citation()' on how to cite R or R packages in publications.
 
 Type 'demo()' for some demos, 'help()' for on-line help, or
 'help.start()' for a HTML browser interface to help.
@@ -61,7 +61,7 @@ Error in read.table("foo1") : no lines available in input
 > try(read.table("foo3", header=TRUE, col.names=letters[1:4]))
 Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,  : 
 	line 1 did not have 4 elements
-In addition: Warning message: 
+In addition: Warning message:
 header and 'col.names' are of different lengths in: read.table("foo3", header = TRUE, col.names = letters[1:4]) 
 > unlink("foo3")
 > 
@@ -71,7 +71,7 @@ header and 'col.names' are of different lengths in: read.table("foo3", header =
   head
 1    2
 3    4
-Warning message: 
+Warning message:
 incomplete final line found by readTableHeader on 'foo4' 
 > unlink("foo4")
 > 
@@ -199,5 +199,9 @@ logical(0)
 5     3           0.8           3
 > unlink("test.dat")
 > 
+> ## test on Windows Unicode file
+> scan(file("WinUnicode.dat", encoding="UCS-2LE"), 0)
+Read 8 items
+[1] 1 2 3 4 5 6 7 8
 > ## end of tests
 >