Skip to content

Commit

Permalink
implemented BACK_TO_DELIMITER unescaped quote handling.
Browse files Browse the repository at this point in the history
Fixed github #271 & #259
  • Loading branch information
jbax committed Oct 11, 2018
1 parent edccaa9 commit 7aaf368
Show file tree
Hide file tree
Showing 10 changed files with 367 additions and 15 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.univocity</groupId>
<artifactId>univocity-parsers</artifactId>
<version>2.7.6</version>
<version>2.8.0-SNAPSHOT</version>
<name>univocity-parsers</name>
<packaging>jar</packaging>
<description>univocity's open source parsers for processing different text formats using a consistent API</description>
Expand Down
37 changes: 28 additions & 9 deletions src/main/java/com/univocity/parsers/common/AbstractParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,10 @@ public final void parse(Reader reader) {
processComment();
continue;
}
parseRecord();

if (output.pendingRecords.isEmpty()) {
parseRecord();
}

String[] row = output.rowParsed();
if (row != null) {
Expand All @@ -141,6 +144,9 @@ public final void parse(Reader reader) {
} catch (EOFException ex) {
try {
handleEOF();
while(!output.pendingRecords.isEmpty()) {
handleEOF();
}
} finally {
stopParsing();
}
Expand Down Expand Up @@ -190,16 +196,18 @@ private String[] handleEOF() {
String[] row = null;
try {
boolean consumeValueOnEOF = consumeValueOnEOF();
if (output.column != 0 || consumeValueOnEOF) {
if (output.column != 0 || (consumeValueOnEOF && !context.isStopped())) {
if (output.appender.length() > 0 || consumeValueOnEOF) {
output.valueParsed();
} else {
} else if (input.currentParsedContentLength() > 0){
output.emptyParsed();
}
row = output.rowParsed();
} else if (output.appender.length() > 0 || input.currentParsedContentLength() > 0){
} else if (output.appender.length() > 0 || input.currentParsedContentLength() > 0) {
output.valueParsed();
row = output.rowParsed();
} else if (!output.pendingRecords.isEmpty()){
row = output.pendingRecords.poll();
}
} catch (Throwable e) {
throw handleException(e);
Expand Down Expand Up @@ -288,7 +296,7 @@ private String getParsedContent(CharSequence tmp) {
}

private TextParsingException handleException(Throwable ex) {
if(context != null) {
if (context != null) {
context.stop();
}
if (ex instanceof DataProcessingException) {
Expand Down Expand Up @@ -418,6 +426,7 @@ private void stopParsing(Throwable error) {
*/
public final void stopParsing() {
try {
ch = '\0';
try {
if (context != null) {
context.stop();
Expand Down Expand Up @@ -557,7 +566,9 @@ public final String[] parseNext() {
processComment();
continue;
}
parseRecord();
if (output.pendingRecords.isEmpty()) {
parseRecord();
}
String[] row = output.rowParsed();
if (row != null) {
if (recordsToRead >= 0 && context.currentRecord() >= recordsToRead) {
Expand All @@ -575,11 +586,17 @@ public final String[] parseNext() {
return null;
}
}

if(output.column != 0){
return output.rowParsed();
}
stopParsing();
return null;
} catch (EOFException ex) {
String[] row = handleEOF();
stopParsing();
if(output.pendingRecords.isEmpty()) {
stopParsing();
}
return row;
} catch (NullPointerException ex) {
if (context == null) {
Expand Down Expand Up @@ -651,7 +668,9 @@ public final String[] parseLine(String line) {
processComment();
return null;
}
parseRecord();
if(output.pendingRecords.isEmpty()) {
parseRecord();
}
String[] row = output.rowParsed();
if (row != null) {
if (processor != NoopProcessor.instance) {
Expand Down Expand Up @@ -1210,7 +1229,7 @@ public final ParsingContext getContext() {
* @return the metadata of {@link Record}s generated with the current input.
*/
public final RecordMetaData getRecordMetadata() {
if(context == null){
if (context == null) {
throw new IllegalStateException("Record metadata not available. The parser has not been started.");
}
return context.recordMetaData();
Expand Down
4 changes: 4 additions & 0 deletions src/main/java/com/univocity/parsers/common/ParserOutput.java
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ public class ParserOutput {
private long currentRecord;

public boolean trim = false;
public final Deque<String[]> pendingRecords = new LinkedList<String[]>();

/**
* Headers parsed from the input when {@link CommonParserSettings#headerExtractionEnabled} is {@code true},
Expand Down Expand Up @@ -152,6 +153,9 @@ protected void initializeHeaders() {
* @return the sequence of parsed values in a record.
*/
public String[] rowParsed() {
if(!pendingRecords.isEmpty()){
return pendingRecords.poll();
}
// some values were parsed. Let's return them
if (column > 0) {
// identifies selected columns and headers (in the first non-empty row)
Expand Down
39 changes: 39 additions & 0 deletions src/main/java/com/univocity/parsers/common/input/CharAppender.java
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,37 @@ public interface CharAppender extends CharSequence {
*/
void append(char ch);

/**
 * Returns the position of the first occurrence of a given character in the appended content.
 * @param ch the character to look for
 * @param from the starting index from where the search will begin.
 * @return the position of the given character in the appended content, {@code -1} if not found
 */
int indexOf(char ch, int from);

/**
 * Returns the position of the first occurrence of any one of the given characters in the appended content.
 * @param chars the characters to look for
 * @param from the starting index from where the search will begin.
 * @return the position of the first match of any one of the given characters in the appended content, {@code -1} if none found
 */
int indexOfAny(char[] chars, int from);

/**
 * Returns a section of the appended content.
 * @param from the starting position in the buffer (inclusive)
 * @param length the number of characters to include in the result, counted from the given start position
 * @return a {@code String} with the section of characters accumulated by this appender.
 */
String substring(int from, int length);

/**
 * Removes a section from the appended content.
 * @param from the starting position in the buffer (inclusive)
 * @param length the number of characters to remove, counted from the given start position
 */
void remove(int from, int length);

/**
* Appends the given codepoint.
* @param ch the codepoint to append
Expand Down Expand Up @@ -232,4 +263,12 @@ public interface CharAppender extends CharSequence {
* @param count the number of characters to ignore
*/
void ignore(int count);

/**
 * Deletes a given number of characters from the end of the appended content.
 * This resets the internal whitespace count, if any. Invoke {@link #updateWhitespace()}
 * afterwards to recalculate the number of trailing whitespaces in the appended content.
 * @param count the number of characters to delete.
 */
void delete(int count);
}
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,49 @@ public void appendIgnoringWhitespace(char ch) {
}
}

/**
 * Scans the internal buffer for the first occurrence of a character.
 * The scan covers positions {@code from} (inclusive) up to
 * {@code index - whitespaceCount} (exclusive).
 *
 * @param ch the character to look for
 * @param from the position where the scan starts
 * @return the position of the first match, or {@code -1} if the character was not found
 */
@Override
public int indexOf(char ch, int from) {
	final int end = index - whitespaceCount;
	int pos = from;
	while (pos < end) {
		if (chars[pos] == ch) {
			return pos;
		}
		pos++;
	}
	return -1;
}

/**
 * Scans the internal buffer for the first position holding any one of the given characters.
 * The scan covers positions {@code from} (inclusive) up to
 * {@code index - whitespaceCount} (exclusive).
 *
 * @param chars the characters to look for
 * @param from the position where the scan starts
 * @return the position of the first match, or {@code -1} if none of the characters was found
 */
@Override
public int indexOfAny(char[] chars, int from) {
	final int end = index - whitespaceCount;
	int pos = from;
	while (pos < end) {
		// the parameter shadows the buffer field, hence the explicit "this."
		for (char candidate : chars) {
			if (this.chars[pos] == candidate) {
				return pos;
			}
		}
		pos++;
	}
	return -1;
}

/**
 * Copies a section of the internal buffer into a new {@code String}.
 * The range is passed straight to the {@code String} factory; this method does not
 * itself compare it against the current {@code index}.
 *
 * @param from the position of the first character to copy
 * @param length the number of characters to copy
 * @return a {@code String} holding {@code length} characters starting at {@code from}
 */
@Override
public String substring(int from, int length) {
	return String.valueOf(chars, from, length);
}

/**
 * Removes a section of the internal buffer by shifting the characters that follow it
 * to the left, then reduces {@code index} by the number of characters removed.
 * Does nothing when {@code length} is not positive.
 *
 * @param from the position of the first character to remove
 * @param length the number of characters to remove
 */
@Override
public void remove(int from, int length) {
	if (length <= 0) {
		return;
	}

	final int tailStart = from + length;
	// number of characters located after the removed section
	final int tailLength = index - tailStart;

	System.arraycopy(chars, tailStart, chars, from, tailLength);
	index -= length;
}

@Override
public void append(char ch) {
chars[index++] = ch;
Expand Down Expand Up @@ -294,4 +337,13 @@ public final String subSequence(int from, int to) {
public final void ignore(int count) {
whitespaceCount += count;
}

/**
 * Drops characters from the end of the buffer by moving {@code index} back,
 * never below zero, and resets {@code whitespaceCount} to zero.
 *
 * @param count the number of characters to drop from the end
 */
@Override
public void delete(int count) {
	final int remaining = index - count;
	index = remaining < 0 ? 0 : remaining;
	whitespaceCount = 0;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -273,4 +273,44 @@ public void append(Object obj) {
public void ignore(int count) {

}

/**
 * Does nothing: no search is performed and the arguments are ignored.
 *
 * @param ch ignored
 * @param from ignored
 * @return {@code -1} always
 */
@Override
public int indexOf(char ch, int from) {
return -1;
}

/**
 * Does nothing: the arguments are ignored.
 *
 * @param from ignored
 * @param length ignored
 * @return {@code null} always
 */
@Override
public String substring(int from, int length) {
return null;
}

/**
 * Does nothing: the arguments are ignored.
 *
 * @param from ignored
 * @param length ignored
 */
@Override
public void remove(int from, int length) {

}

/**
 * Does nothing: the argument is ignored.
 *
 * @param count ignored
 */
@Override
public void delete(int count) {

}

/**
 * Does nothing: no search is performed and the arguments are ignored.
 * The result is always "not found", the same value returned by
 * {@link #indexOf(char, int)} in this class. Returning {@code 0} here would report
 * a match at position 0, so a caller looping until {@code -1} would never stop.
 *
 * @param chars ignored
 * @param from ignored
 * @return {@code -1} always
 */
@Override
public int indexOfAny(char[] chars, int from) {
	return -1;
}
}
31 changes: 27 additions & 4 deletions src/main/java/com/univocity/parsers/csv/CsvParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ public final class CsvParser extends AbstractParser<CsvParserSettings> {
private final boolean ignoreLeadingWhitespace;
private boolean parseUnescapedQuotes;
private boolean parseUnescapedQuotesUntilDelimiter;
private boolean backToDelimiter;
private final boolean doNotEscapeUnquotedValues;
private final boolean keepEscape;
private final boolean keepQuotes;
Expand All @@ -57,6 +58,7 @@ public final class CsvParser extends AbstractParser<CsvParserSettings> {
private final String emptyValue;
private final boolean trimQuotedLeading;
private final boolean trimQuotedTrailing;
private char[] delimiters;

/**
* The CsvParser supports all settings provided by {@link CsvParserSettings}, and requires this configuration to be properly initialized.
Expand Down Expand Up @@ -95,7 +97,8 @@ public CsvParser(CsvParserSettings settings) {
quoteHandling = RAISE_ERROR;
}
} else {
parseUnescapedQuotesUntilDelimiter = quoteHandling == STOP_AT_DELIMITER || quoteHandling == SKIP_VALUE;
backToDelimiter = quoteHandling == BACK_TO_DELIMITER;
parseUnescapedQuotesUntilDelimiter = quoteHandling == STOP_AT_DELIMITER || quoteHandling == SKIP_VALUE || backToDelimiter;
parseUnescapedQuotes = quoteHandling != RAISE_ERROR;
}
}
Expand Down Expand Up @@ -142,7 +145,7 @@ protected final void parseRecord() {
}
continue;
}
} else if (len == -1 && input.skipQuotedString(quote, quoteEscape, delimiter, newLine)){
} else if (len == -1 && input.skipQuotedString(quote, quoteEscape, delimiter, newLine)) {
output.valueParsed();
try {
ch = input.nextChar();
Expand All @@ -165,7 +168,9 @@ protected final void parseRecord() {
output.trim = trimQuotedTrailing;
parseQuotedValue();
input.enableNormalizeLineEndings(true);
output.valueParsed();
if (!(unescaped && quoteHandling == BACK_TO_DELIMITER && output.appender.length() == 0)) {
output.valueParsed();
}
} else if (doNotEscapeUnquotedValues) {
String value = null;
int len = output.appender.length();
Expand Down Expand Up @@ -222,6 +227,7 @@ private void handleValueSkipping(boolean quoted) {

private void handleUnescapedQuoteInValue() {
switch (quoteHandling) {
case BACK_TO_DELIMITER:
case STOP_AT_CLOSING_QUOTE:
case STOP_AT_DELIMITER:
output.appender.append(quote);
Expand All @@ -237,6 +243,20 @@ private void handleUnescapedQuoteInValue() {
private boolean handleUnescapedQuote() {
unescaped = true;
switch (quoteHandling) {
case BACK_TO_DELIMITER:
int pos;
while ((pos = output.appender.indexOfAny(delimiters, 0)) != -1) {
String value = output.appender.substring(0, pos);
output.valueParsed(value);
if (output.appender.charAt(pos) == newLine) {
output.pendingRecords.add(output.rowParsed());
}
output.appender.remove(0, pos + 1);
}
output.appender.append(ch);
prev = '\0';
parseQuotedValue();
return true;
case STOP_AT_CLOSING_QUOTE:
case STOP_AT_DELIMITER:
output.appender.append(quote);
Expand Down Expand Up @@ -454,7 +474,9 @@ protected final boolean consumeValueOnEOF() {
}
}
}
return prev != '\0' && ch != delimiter && ch != newLine;
boolean out = prev != '\0' && ch != delimiter && ch != newLine;
ch = prev = '\0';
return out;
}

/**
Expand All @@ -468,5 +490,6 @@ public final void updateFormat(CsvFormat format) {
quoteEscape = format.getQuoteEscape();
escapeEscape = format.getCharToEscapeQuoteEscaping();
newLine = format.getNormalizedNewline();
delimiters = new char[]{delimiter, newLine};
}
}
Loading

0 comments on commit 7aaf368

Please sign in to comment.