Skip to content

Commit

Permalink
ref #79: Improve parsing of inline images; Handle the scenario where …
Browse files Browse the repository at this point in the history
…"EI" can be found inside the image data, right at the end of a line, and more binary data comes afterwards;

Uses the hasNoFollowingBinData() method used by PDFBox
  • Loading branch information
ediweissmann committed Jan 8, 2024
1 parent d22773e commit 7d6c68a
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 1 deletion.
91 changes: 90 additions & 1 deletion src/main/java/org/sejda/sambox/input/ContentStreamParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,11 @@
import org.sejda.sambox.cos.COSBase;
import org.sejda.sambox.cos.COSDictionary;
import org.sejda.sambox.cos.COSName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
Expand All @@ -50,6 +53,11 @@ public class ContentStreamParser extends SourceReader
private final ContentStreamCOSParser cosParser;
private final List<Object> tokens = new ArrayList<>();

private static final Logger LOG = LoggerFactory.getLogger(ContentStreamParser.class);

// number of bytes inspected after a candidate "EI" by hasNoFollowingBinData()
private static final int MAX_BIN_CHAR_TEST_LENGTH = 10;
// per-instance scratch buffer the look-ahead bytes are read into, allocated once and reused on every check
private final byte[] binCharTestArr = new byte[MAX_BIN_CHAR_TEST_LENGTH];

public ContentStreamParser(PDContentStream stream) throws IOException
{
this(SeekableSources.inMemorySeekableSourceFrom(stream.getContents()));
Expand Down Expand Up @@ -165,7 +173,7 @@ private boolean isEndOfImageFrom(long position) throws IOException
{
current = source().read();
// if not a EI we restore the position and go on
if (current == 'I' && (isEndOfImage() || isEOF(source().peek())))
if (current == 'I' && (isEndOfImage() || isEOF(source().peek())) && hasNoFollowingBinData())
{
return true;
}
Expand Down Expand Up @@ -219,6 +227,87 @@ private boolean isEndOfImage() throws IOException
}
}

/**
 * Heuristic check run after a candidate {@code EI} has been read: looks at up to
 * {@link #MAX_BIN_CHAR_TEST_LENGTH} following bytes and decides whether they look like regular
 * content stream text (no control characters, no bytes above 0x7f) that starts with a sequence
 * of 1-3 non-blank characters between blanks.
 * <p>
 * When a complete blank-delimited token is found in the inspected bytes, it must be one of the
 * operators {@code q}, {@code Q}, {@code EMC} or {@code S}, otherwise the candidate {@code EI}
 * is considered to be part of the image data.
 * <p>
 * The source position is always restored to the value it had on entry, also when reading
 * fails. A warning is logged when the check fails.
 *
 * @return <code>true</code> if next bytes are probably printable ASCII characters starting
 * with a PDF operator, otherwise <code>false</code>
 * @throws IOException if reading from or repositioning the underlying source fails
 */
private boolean hasNoFollowingBinData() throws IOException
{
    final long originalPosition = source().position();

    try
    {
        // as suggested in PDFBOX-1164
        final int readBytes = source().read(ByteBuffer.wrap(binCharTestArr));
        boolean noBinData = true;
        int startOpIdx = -1;
        int endOpIdx = -1;

        if (readBytes > 0)
        {
            for (int bIdx = 0; bIdx < readBytes; bIdx++)
            {
                final byte b = binCharTestArr[bIdx];
                // bytes are signed, so values above 0x7f are negative and caught by "b < 0x09"
                if (b != 0 && b < 0x09 || b > 0x0a && b < 0x20 && b != 0x0d)
                {
                    // control character or > 0x7f -> we have binary data
                    noBinData = false;
                    break;
                }

                final boolean blank = b == 0 || b == 9 || b == 0x20 || b == 0x0a || b == 0x0d;
                if (startOpIdx == -1 && !blank)
                {
                    // first non-blank byte: start of a potential PDF operator
                    startOpIdx = bIdx;
                }
                else if (startOpIdx != -1 && endOpIdx == -1 && blank)
                {
                    // first blank after the operator start: end (exclusive) of the operator
                    endOpIdx = bIdx;
                }
            }

            // PDFBOX-3742: just assuming that 1-3 non blanks is a PDF operator isn't enough
            if (endOpIdx != -1 && startOpIdx != -1)
            {
                // usually, the operator here is Q, sometimes EMC (PDFBOX-2376), S (PDFBOX-3784).
                // Every byte in [startOpIdx, endOpIdx) passed the binary check above, so it is
                // plain ASCII; decode explicitly instead of relying on the platform charset.
                final String operator = new String(binCharTestArr, startOpIdx,
                        endOpIdx - startOpIdx, java.nio.charset.StandardCharsets.US_ASCII);
                if (!"q".equals(operator) && !"Q".equals(operator) && !"EMC".equals(operator)
                        && !"S".equals(operator))
                {
                    noBinData = false;
                }
            }

            // only if not close to eof
            if (readBytes == MAX_BIN_CHAR_TEST_LENGTH)
            {
                // a PDF operator is 1-3 bytes long
                if (startOpIdx != -1 && endOpIdx == -1)
                {
                    // operator runs up to the end of the inspected window
                    endOpIdx = MAX_BIN_CHAR_TEST_LENGTH;
                }
                if (endOpIdx != -1 && startOpIdx != -1 && endOpIdx - startOpIdx > 3)
                {
                    noBinData = false;
                }
            }
        }

        if (!noBinData)
        {
            LOG.warn("ignoring 'EI' assumed to be in the middle of inline image at stream offset {}",
                    originalPosition);
        }

        return noBinData;
    }
    finally
    {
        // this is a pure look-ahead: always hand the source back where we found it
        source().position(originalPosition);
    }
}

@Override
public void close() throws IOException
{
Expand Down
11 changes: 11 additions & 0 deletions src/test/java/org/sejda/sambox/input/ContentStreamParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -147,4 +147,15 @@ public void nextInlineImageEISpace() throws IOException
operator = (Operator) victim.nextParsedToken();
assertEquals("q", operator.getName());
}

@Test
public void nextInlineImageEIInStreamWithoutSpace() throws IOException
{
    // the fixture has an "EI" inside the encoded image data, right at the end of a line,
    // with more image data on the following lines
    victim = new ContentStreamParser(inMemorySeekableSourceFrom(
            getClass().getResourceAsStream("/sambox/inline_image_EI_inside_stream.txt")));

    // first token: the inline image, whose data must not be cut at the embedded "EI"
    Operator inlineImage = (Operator) victim.nextParsedToken();
    assertEquals("BI", inlineImage.getName());
    assertEquals(768, inlineImage.getImageData().length);

    // second token: the operator following the real "EI"
    Operator afterImage = (Operator) victim.nextParsedToken();
    assertEquals("Q", afterImage.getName());
}
}
7 changes: 7 additions & 0 deletions src/test/resources/sambox/inline_image_EI_inside_stream.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
BI /W 1088 /H 1071 /BPC 1 /IM true /F [/A85 /Fl] ID
Gb"/l4d,,%%YVZ[+g[6&U\)s4UZj0f6CT_kKGBW;W9W#p=OsRY:=9umQMC(!+::Xb3"a3>7pLQ'47i38q=sk&?dFQ&,nh@b\&[[lFmPR3TDs;r[lh+Fa")%grH$cWnGD
>2,kKA"^HC"g)s/i03L2YW(nq_([2o4Ea,WgTd(o.PK=FtYHi0</fDD0R-8<[XZ6H5219#kH^lZc!F-sib.icBtUXo->b@7DO%Ii*bg9YWu>lHItO1,*skb7r]GE!hR#
/;L7n*SO]LH3S`^nFU04O9eG<*IjA:O^7uS/1h?579Y[,XKrg=$C*:7;u2":*K&-2Dh:_c!_*'XDmgH?IQrA[#f%_a=hm1h(l-u?>nPbVL'g14k_"f:PkSc.)dXYV&EI
u0S_Y(39[BBVu&Q&L'[%Q]TGPgUP+X%JL5@>>K-]%<k0l5fUYXK<ORot+"MW7Po=?(h]1c4a*A.Xq$IARX#Q-s)K`+K\C=`89YFm_Msl,\:?V23Y,8ha(nn>7;cbc13R
+I5bX&WQ(h_8T^%b*S9<YPE+OM$Q+;.*J`..']`Q[ir`F-tu0E:@#^ck4mu>2oE43kWhF;I?F$QKf3kbJ]auKcJXpE>s+BaP)?8A.E[J@u[87kT9OOokNR==6/ppg?^" %>pa.:
[J]dg_Qgd//:ks4f:mr,;b9nLWuG1'0S_[6F5TT.eF3N@m!A5B#+U+>I?F\`\5m/AFW8`*XE%:l#?<[N0\(tO<WpI6dV*)"F-pHoj'R,Poc4~> EI Q

0 comments on commit 7d6c68a

Please sign in to comment.