Added unit tests on the gzip parser.

yacy · Aug 22, 2017 · c6ae871 · c6ae871
1 parent 169ffdd
commit c6ae871
Show file tree

Hide file tree

Showing 5 changed files with 209 additions and 12 deletions.
diff --git a/source/net/yacy/document/parser/gzipParser.java b/source/net/yacy/document/parser/gzipParser.java
@@ -234,6 +234,9 @@ public Document[] parseWithLimits(final DigestURL location, final String mimeTyp
             Document[] docs = parseCompressedInputStream(location, charset, timezoneOffset, DEFAULT_DEPTH, zippedContent, maxLinks, maxBytes);
             if (docs != null) {
             	maindoc.addSubDocuments(docs);
+            	if(docs.length > 0 && docs[0].isPartiallyParsed()) {
+            		maindoc.setPartiallyParsed(true);
+            	}
             }
         } catch (final Exception e) {
             throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(),location);

diff --git a/test/java/net/yacy/document/parser/gzipParserTest.java b/test/java/net/yacy/document/parser/gzipParserTest.java
@@ -22,14 +22,20 @@
 
 package net.yacy.document.parser;
 
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertEquals;
 
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Collection;
 
 import org.junit.Test;
 
+import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.document.Document;
 import net.yacy.document.Parser.Failure;
@@ -45,32 +51,220 @@ public class gzipParserTest {
 
 	/**
 	 * Unit test for the gzipParser.parse() function with some small gz test files.
-	 * @throws Failure when a file could not be parsed
-	 * @throws InterruptedException when the test was interrupted before its termination
-	 * @throws IOException when a read/write error occurred
+	 * 
+	 * @throws Failure
+	 *             when a file could not be parsed
+	 * @throws InterruptedException
+	 *             when the test was interrupted before its termination
+	 * @throws IOException
+	 *             when a read/write error occurred
 	 */
 	@Test
 	public void testParse() throws Failure, InterruptedException, IOException {
-		final String[] fileNames = {
-				"umlaute_html_utf8.html.gz",
-				"umlaute_linux.txt.gz"
-		};
+		final String[] fileNames = { "umlaute_html_utf8.html.gz", "umlaute_linux.txt.gz" };
 		final File folder = new File("test" + File.separator + "parsertest" + File.separator);
 		gzipParser parser = new gzipParser();
-
+
+		for (String fileName : fileNames) {
+			FileInputStream inStream = new FileInputStream(new File(folder, fileName));
+			DigestURL location = new DigestURL("http://localhost/" + fileName);
+			try {
+				Document[] documents = parser.parse(location, "application/gzip", StandardCharsets.UTF_8.name(),
+						new VocabularyScraper(), 0, inStream);
+				assertNotNull("Parser result must not be null for file " + fileName, documents);
+				assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
+				assertTrue("Parsed text must contain test word with umlaut char" + fileName,
+						documents[0].getTextString().contains("Maßkrügen"));
+				Collection<AnchorURL> anchors = documents[0].getAnchors();
+				assertNotNull("Detected URLS must not be null for file " + fileName, anchors);
+				assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size());
+				assertTrue(anchors.iterator().next().toString().startsWith("http://localhost/umlaute_"));
+			} finally {
+				inStream.close();
+			}
+		}
+	}
+
+	/**
+	 * Integration test of the gzip parser with the tar parser on a test tgz archive.
+	 * 
+	 * @throws Failure
+	 *             when a file could not be parsed
+	 * @throws InterruptedException
+	 *             when the test was interrupted before its termination
+	 * @throws IOException
+	 *             when a read/write error occurred
+	 */
+	@Test
+	public void testParseTgz() throws Failure, InterruptedException, IOException {
+		final String fileName = "umlaute_html_xml_txt_gnu.tgz";
+		final File folder = new File("test" + File.separator + "parsertest" + File.separator);
+		gzipParser parser = new gzipParser();
+
+		FileInputStream inStream = new FileInputStream(new File(folder, fileName));
+		DigestURL location = new DigestURL("http://localhost/" + fileName);
+		try {
+			Document[] documents = parser.parse(location, "application/gzip", StandardCharsets.UTF_8.name(),
+					new VocabularyScraper(), 0, inStream);
+
+			assertNotNull("Parser result must not be null for file " + fileName, documents);
+
+			final String parsedText = documents[0].getTextString();
+			assertNotNull("Parsed text must not be empty for file " + fileName, parsedText);
+			assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+					parsedText.contains("Maßkrügen"));
+			assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML"));
+			assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML"));
+			assertTrue(parsedText.contains("URL reference in raw text file"));
+			assertTrue(parsedText.contains("UTF-8 encoded XML test file"));
+
+			final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
+			assertNotNull(detectedAnchors);
+			assertEquals("Parsed URLs must contains all URLs from each test file included in the archive", 5,
+					detectedAnchors.size());
+			assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")));
+			assertTrue(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/")));
+			assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html")));
+			assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html")));
+			assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt")));
+		} finally {
+			inStream.close();
+		}
+	}
+
+	/**
+	 * Unit test for the gzipParser.parseWithLimits() function with some small gz
+	 * test files whose content is within limits.
+	 * 
+	 * @throws Failure
+	 *             when a file could not be parsed
+	 * @throws InterruptedException
+	 *             when the test was interrupted before its termination
+	 * @throws IOException
+	 *             when a read/write error occurred
+	 */
+	@Test
+	public void testParseWithLimits() throws Failure, InterruptedException, IOException {
+		final String[] fileNames = { "umlaute_html_utf8.html.gz", "umlaute_linux.txt.gz" };
+		final File folder = new File("test" + File.separator + "parsertest" + File.separator);
+		gzipParser parser = new gzipParser();
+
 		for (String fileName : fileNames) {
 			FileInputStream inStream = new FileInputStream(new File(folder, fileName));
 			DigestURL location = new DigestURL("http://localhost/" + fileName);
 			try {
-				Document[] documents = parser.parse(location, "application/gzip", null, new VocabularyScraper(), 0,
-						inStream);
+				Document[] documents = parser.parseWithLimits(location, "application/gzip",
+						StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, 10000,
+						10000);
 				assertNotNull("Parser result must not be null for file " + fileName, documents);
 				assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
-				assertTrue("Parsed text must contain test word with umlaut char" + fileName, documents[0].getTextString().contains("Maßkrügen"));
+				assertTrue("Parsed text must contain test word with umlaut char" + fileName,
+						documents[0].getTextString().contains("Maßkrügen"));
+				Collection<AnchorURL> anchors = documents[0].getAnchors();
+				assertNotNull("Detected URLs must not be null for file " + fileName, anchors);
+				assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size());
+				assertTrue(anchors.iterator().next().toString().startsWith("http://localhost/umlaute_"));
+				assertFalse("Parse document must not be marked as partially parsed for file " + fileName,
+						documents[0].isPartiallyParsed());
 			} finally {
 				inStream.close();
 			}
 		}
+
+	}
+
+	/**
+	 * Unit test for gzipParser.parseWithLimits() when the maxLinks limit is exceeded.
+	 * 
+	 * @throws Failure
+	 *             when a file could not be parsed
+	 * @throws InterruptedException
+	 *             when the test was interrupted before its termination
+	 * @throws IOException
+	 *             when a read/write error occurred
+	 */
+	@Test
+	public void testParseWithLimitsLinksExceeded() throws Failure, InterruptedException, IOException {
+		final String[] fileNames = { "umlaute_html_utf8.html.gz", "umlaute_linux.txt.gz" };
+		final File folder = new File("test" + File.separator + "parsertest" + File.separator);
+		gzipParser parser = new gzipParser();
+
+		/* maxLinks limit exceeded */
+		for (String fileName : fileNames) {
+			FileInputStream inStream = new FileInputStream(new File(folder, fileName));
+			DigestURL location = new DigestURL("http://localhost/" + fileName);
+			try {
+				Document[] documents = parser.parseWithLimits(location, "application/gzip",
+						StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, 0, Long.MAX_VALUE);
+				assertNotNull("Parser result must not be null for file " + fileName, documents);
+				assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
+				assertTrue("Parsed text must contain test word with umlaut char" + fileName,
+						documents[0].getTextString().contains("Maßkrügen"));
+				Collection<AnchorURL> anchors = documents[0].getAnchors();
+				assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
+				assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
+						documents[0].isPartiallyParsed());
+			} finally {
+				inStream.close();
+			}
+		}
+	}
+
+	/**
+	 * Unit test for gzipParser.parseWithLimits() when the maxBytes limit is exceeded.
+	 * 
+	 * @throws Failure
+	 *             when a file could not be parsed
+	 * @throws InterruptedException
+	 *             when the test was interrupted before its termination
+	 * @throws IOException
+	 *             when a read/write error occurred
+	 */
+	@Test
+	public void testParseWithLimitsBytesExceeded() throws Failure, InterruptedException, IOException {
+		final String[] fileNames = { "umlaute_html_utf8.html.gz", "umlaute_linux.txt.gz" };
+		final File folder = new File("test" + File.separator + "parsertest" + File.separator);
+		gzipParser parser = new gzipParser();
+
+		String fileName = fileNames[0];
+		FileInputStream inStream = new FileInputStream(new File(folder, fileName));
+		DigestURL location = new DigestURL("http://localhost/" + fileName);
+		try {
+			/* The bytes limit is set so that the beginning of the text is parsed, but parsing stops before reaching the <a> tag */
+			final long maxBytes = 258;
+			Document[] documents = parser.parseWithLimits(location, "application/gzip", StandardCharsets.UTF_8.name(),
+					new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, maxBytes);
+			assertNotNull("Parser result must not be null for file " + fileName, documents);
+			assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
+			assertTrue("Parsed text must contain test word with umlaut char" + fileName,
+					documents[0].getTextString().contains("Maßkrügen"));
+			Collection<AnchorURL> anchors = documents[0].getAnchors();
+			assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
+			assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
+					documents[0].isPartiallyParsed());
+		} finally {
+			inStream.close();
+		}
+
+		fileName = fileNames[1];
+		inStream = new FileInputStream(new File(folder, fileName));
+		location = new DigestURL("http://localhost/" + fileName);
+		try {
+			/* The bytes limit is set so that the beginning of the text is parsed, but parsing stops before reaching the URL */
+			final long maxBytes = 65;
+			Document[] documents = parser.parseWithLimits(location, "application/gzip", StandardCharsets.UTF_8.name(),
+					new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, maxBytes);
+			assertNotNull("Parser result must not be null for file " + fileName, documents);
+			assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
+			assertTrue("Parsed text must contain test word with umlaut char" + fileName,
+					documents[0].getTextString().contains("Maßkrügen"));
+			Collection<AnchorURL> anchors = documents[0].getAnchors();
+			assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
+			assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
+					documents[0].isPartiallyParsed());
+		} finally {
+			inStream.close();
+		}
 	}
 
 }
diff --git a/test/parsertest/umlaute_html_utf8.html.gz b/test/parsertest/umlaute_html_utf8.html.gz
diff --git a/test/parsertest/umlaute_html_xml_txt_gnu.tgz b/test/parsertest/umlaute_html_xml_txt_gnu.tgz
diff --git a/test/parsertest/umlaute_linux.txt.gz b/test/parsertest/umlaute_linux.txt.gz