Skip to content

Commit

Permalink
Add trivial csv file scanner
Browse files Browse the repository at this point in the history
  • Loading branch information
dain committed Aug 14, 2012
1 parent 7bb225d commit 8272f12
Show file tree
Hide file tree
Showing 4 changed files with 171 additions and 0 deletions.
74 changes: 74 additions & 0 deletions src/main/java/com/facebook/presto/CsvFileScanner.java
@@ -0,0 +1,74 @@
package com.facebook.presto;

import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.base.Throwables;
import com.google.common.collect.AbstractIterator;
import com.google.common.collect.Iterables;
import com.google.common.io.InputSupplier;
import com.google.common.io.LineReader;

import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Iterator;

public class CsvFileScanner implements Iterable<ValueBlock>
{
private final InputSupplier<InputStreamReader> inputSupplier;
private final Splitter columnSplitter;
private final int columnIndex;

public CsvFileScanner(InputSupplier<InputStreamReader> inputSupplier, int columnIndex, char columnSeparator)
{
this.columnIndex = columnIndex;
Preconditions.checkNotNull(inputSupplier, "inputSupplier is null");
this.inputSupplier = inputSupplier;
columnSplitter = Splitter.on(columnSeparator);
}

@Override
public Iterator<ValueBlock> iterator()
{
return new ColumnIterator(inputSupplier, columnIndex, columnSplitter);
}

private static class ColumnIterator extends AbstractIterator<ValueBlock>
{
private long position;
private final LineReader reader;
private int columnIndex;
private Splitter columnSplitter;

public ColumnIterator(InputSupplier<InputStreamReader> inputSupplier, int columnIndex, Splitter columnSplitter)
{
try {
this.reader = new LineReader(inputSupplier.getInput());
}
catch (IOException e) {
throw Throwables.propagate(e);
}
this.columnIndex = columnIndex;
this.columnSplitter = columnSplitter;
}

@Override
protected ValueBlock computeNext()
{
String line;
try {
line = reader.readLine();
}
catch (IOException e) {
throw Throwables.propagate(e);
}
if (line == null) {
endOfData();
return null;
}
Iterable<String> split = columnSplitter.split(line);
String value = Iterables.get(split, columnIndex);
return new UncompressedValueBlock(position++, value);

}
}
}
46 changes: 46 additions & 0 deletions src/main/java/com/facebook/presto/PairsIterator.java
@@ -0,0 +1,46 @@
package com.facebook.presto;

import com.google.common.collect.AbstractIterator;
import com.google.common.collect.PeekingIterator;

import java.util.Iterator;

public class PairsIterator
extends AbstractIterator<Pair>
{
private final Iterator<ValueBlock> blockIterator;
private PeekingIterator<Pair> currentBlock;

public PairsIterator(Iterator<ValueBlock> blockIterator)
{
this.blockIterator = blockIterator;
}

@Override
protected Pair computeNext()
{
if (!advance()) {
endOfData();
return null;
}
return currentBlock.next();
}

private boolean advance()
{
// does current block iterator have more data?
if (currentBlock != null && currentBlock.hasNext()) {
return true;
}

// are there more blocks?
if (!blockIterator.hasNext()) {
return false;
}

// advance to next block and open an iterator
currentBlock = blockIterator.next().pairIterator();
return true;
}

}
47 changes: 47 additions & 0 deletions src/test/java/com/facebook/presto/CsvFileScannerTest.java
@@ -0,0 +1,47 @@
package com.facebook.presto;

import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableList;
import com.google.common.io.InputSupplier;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.io.InputStreamReader;

import static com.google.common.io.Resources.getResource;
import static com.google.common.io.Resources.newReaderSupplier;

public class CsvFileScannerTest
{
private final InputSupplier<InputStreamReader> inputSupplier = newReaderSupplier(getResource("data.csv"), Charsets.UTF_8);

@Test
public void testIterator()
throws Exception
{
CsvFileScanner firstColumn = new CsvFileScanner(inputSupplier, 0, ',');

Assert.assertEquals(ImmutableList.copyOf(new PairsIterator(firstColumn.iterator())),
ImmutableList.of(
new Pair(0, "0"),
new Pair(1, "1"),
new Pair(2, "2"),
new Pair(3, "3")));

CsvFileScanner secondColumn = new CsvFileScanner(inputSupplier, 1, ',');
Assert.assertEquals(ImmutableList.copyOf(new PairsIterator(secondColumn.iterator())),
ImmutableList.of(
new Pair(0, "apple"),
new Pair(1, "banana"),
new Pair(2, "cherry"),
new Pair(3, "date")));

CsvFileScanner thirdColumn = new CsvFileScanner(inputSupplier, 2, ',');
Assert.assertEquals(ImmutableList.copyOf(new PairsIterator(thirdColumn.iterator())),
ImmutableList.of(
new Pair(0, "alice"),
new Pair(1, "bob"),
new Pair(2, "charlie"),
new Pair(3, "dave")));
}
}
4 changes: 4 additions & 0 deletions src/test/resources/data.csv
@@ -0,0 +1,4 @@
0,apple,alice
1,banana,bob
2,cherry,charlie
3,date,dave

0 comments on commit 8272f12

Please sign in to comment.