Skip to content

Commit

Permalink
Merge pull request #433 from jimhester/bugfix/issue309
Browse files Browse the repository at this point in the history
Support reading into long vectors
  • Loading branch information
jimhester committed Jun 15, 2016
2 parents 75b0b5f + 909d0a5 commit afb2a0d
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 10 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
@@ -1,5 +1,7 @@
# readr 0.2.2.9000

* Supports reading into long vectors (#309, @jimhester).

* Printing of double values now uses an
[implementation](https://github.com/juj/MathGeoLib/blob/master/src/Math/grisu3.c)
of the [grisu3
Expand Down
2 changes: 1 addition & 1 deletion src/SourceRaw.h
Expand Up @@ -14,7 +14,7 @@ class SourceRaw : public Source {
x_(x)
{
begin_ = (const char*) RAW(x);
end_ = (const char*) RAW(x) + Rf_length(x);
end_ = (const char*) RAW(x) + Rf_xlength(x);

// Skip lines, if needed
begin_ = skipLines(begin_, end_, skip, comment);
Expand Down
2 changes: 1 addition & 1 deletion src/SourceString.h
Expand Up @@ -15,7 +15,7 @@ class SourceString : public Source {
string_ = x[0];

begin_ = CHAR(string_);
end_ = begin_ + Rf_length(string_);
end_ = begin_ + Rf_xlength(string_);

// Skip lines, if needed
begin_ = skipLines(begin_, end_, skip, comment);
Expand Down
16 changes: 8 additions & 8 deletions src/read.cpp
Expand Up @@ -29,10 +29,10 @@ CharacterVector read_lines_(List sourceSpec, List locale_, int n_max = -1,
LocaleInfo locale(locale_);
Progress progressBar;

int n = (n_max < 0) ? 10000 : n_max;
R_len_t n = (n_max < 0) ? 10000 : n_max;
CharacterVector out(n);

int i = 0;
R_len_t i = 0;
for (Token t = tokenizer.nextToken(); t.type() != TOKEN_EOF; t = tokenizer.nextToken()) {
if (progress && (i + 1) % 25000 == 0)
progressBar.show(tokenizer.progress());
Expand All @@ -41,7 +41,7 @@ CharacterVector read_lines_(List sourceSpec, List locale_, int n_max = -1,
if (n_max < 0) {
// Estimate rows in full dataset
n = (i / tokenizer.progress().first) * 1.2;
out = Rf_lengthgets(out, n);
out = Rf_xlengthgets(out, n);
} else {
break;
}
Expand All @@ -54,7 +54,7 @@ CharacterVector read_lines_(List sourceSpec, List locale_, int n_max = -1,
}

if (i < n) {
out = Rf_lengthgets(out, i);
out = Rf_xlengthgets(out, i);
}

if (progress)
Expand All @@ -72,10 +72,10 @@ List read_lines_raw_(List sourceSpec, int n_max = -1, bool progress = false) {
tokenizer.tokenize(source->begin(), source->end());
Progress progressBar;

int n = (n_max < 0) ? 10000 : n_max;
R_len_t n = (n_max < 0) ? 10000 : n_max;
List out(n);

int i = 0;
R_len_t i = 0;
for (Token t = tokenizer.nextToken(); t.type() != TOKEN_EOF; t = tokenizer.nextToken()) {
if (progress && (i + 1) % 25000 == 0)
progressBar.show(tokenizer.progress());
Expand All @@ -84,7 +84,7 @@ List read_lines_raw_(List sourceSpec, int n_max = -1, bool progress = false) {
if (n_max < 0) {
// Estimate rows in full dataset
n = (i / tokenizer.progress().first) * 1.2;
out = Rf_lengthgets(out, n);
out = Rf_xlengthgets(out, n);
} else {
break;
}
Expand All @@ -97,7 +97,7 @@ List read_lines_raw_(List sourceSpec, int n_max = -1, bool progress = false) {
}

if (i < n) {
out = Rf_lengthgets(out, i);
out = Rf_xlengthgets(out, i);
}

if (progress)
Expand Down
15 changes: 15 additions & 0 deletions tests/testthat/test-read-lines.R
Expand Up @@ -8,3 +8,18 @@ test_that("read_lines respects encoding", {
test_that("read_lines returns an empty character vector on an empty file", {
expect_equal(read_lines("empty-file"), character())
})

# These tests are slow, so they are commented out
#test_that("long vectors are supported", {
#tmp <- tempfile(fileext = ".gz")
#on.exit(unlink(tmp))

#x <- rep(paste(rep("a", 2 ^ 16), collapse = ''), 2 ^ 15)
#con <- gzfile(tmp, open = "w", compression = 0)
#writeLines(x, con)
#close(con)

#expect_equal(length(read_lines(tmp)), 2^15)

#expect_equal(length(read_lines_raw(tmp)), 2^15)
#})

0 comments on commit afb2a0d

Please sign in to comment.