From 2aae5a5eaad142cf117a44cc1e1f39dbc1f776ca Mon Sep 17 00:00:00 2001 From: Danilo Burbano <37355249+danilojsl@users.noreply.github.com> Date: Thu, 29 May 2025 18:47:20 -0500 Subject: [PATCH 1/8] [SPARKNLP-1138] Adding basic chunking to partition (#14593) --- .../partition/partition_properties.py | 44 ++++++- .../partition/partition_transformer.py | 16 +-- .../partition/partition_transformer_test.py | 32 ++++- .../johnsnowlabs/partition/BasicChunker.scala | 117 ++++++++++++++++++ .../HasSemanticChunkerProperties.scala | 44 +++++++ .../johnsnowlabs/partition/Partition.scala | 13 +- .../partition/PartitionTransformer.scala | 42 +++++-- .../partition/SemanticChunker.scala | 67 ++++++++++ .../partition/util/PartitionHelper.scala | 5 +- .../com/johnsnowlabs/reader/HTMLReader.scala | 4 +- .../johnsnowlabs/reader/SparkNLPReader.scala | 3 +- .../com/johnsnowlabs/reader/TextReader.scala | 7 +- src/test/resources/reader/txt/long-text.txt | 1 + .../partition/PartitionChunkerTest.scala | 41 ++++++ .../partition/PartitionTransformerTest.scala | 47 ++++++- 15 files changed, 453 insertions(+), 30 deletions(-) create mode 100644 src/main/scala/com/johnsnowlabs/partition/BasicChunker.scala create mode 100644 src/main/scala/com/johnsnowlabs/partition/HasSemanticChunkerProperties.scala create mode 100644 src/main/scala/com/johnsnowlabs/partition/SemanticChunker.scala create mode 100644 src/test/resources/reader/txt/long-text.txt create mode 100644 src/test/scala/com/johnsnowlabs/partition/PartitionChunkerTest.scala diff --git a/python/sparknlp/partition/partition_properties.py b/python/sparknlp/partition/partition_properties.py index 3bea2e77610da7..7f72ec41486182 100644 --- a/python/sparknlp/partition/partition_properties.py +++ b/python/sparknlp/partition/partition_properties.py @@ -254,4 +254,46 @@ def setThreshold(self, value): return self._set(threshold=value) def getThreshold(self): - return self.getOrDefault(self.threshold) \ No newline at end of file + return self.getOrDefault(self.threshold) + +class HasSemanticChunkerProperties(Params): + + chunkingStrategy = Param( + Params._dummy(), + "chunkingStrategy", + "Set the chunking strategy", + typeConverter=TypeConverters.toString + ) + + def setChunkingStrategy(self, value): + return self._set(chunkingStrategy=value) + + maxCharacters = Param( + Params._dummy(), + "maxCharacters", + "Set the maximum number of characters", + typeConverter=TypeConverters.toInt + ) + + def setMaxCharacters(self, value): + return self._set(maxCharacters=value) + + newAfterNChars = Param( + Params._dummy(), + "newAfterNChars", + "Insert a new chunk after N characters", + typeConverter=TypeConverters.toInt + ) + + def setNewAfterNChars(self, value): + return self._set(newAfterNChars=value) + + overlap = Param( + Params._dummy(), + "overlap", + "Set the number of overlapping characters between chunks", + typeConverter=TypeConverters.toInt + ) + + def setOverlap(self, value): + return self._set(overlap=value) \ No newline at end of file diff --git a/python/sparknlp/partition/partition_transformer.py b/python/sparknlp/partition/partition_transformer.py index 0598c3aaa20af2..50e1e5edc94f31 100644 --- a/python/sparknlp/partition/partition_transformer.py +++ b/python/sparknlp/partition/partition_transformer.py @@ -15,13 +15,15 @@ from sparknlp.common import * from sparknlp.partition.partition_properties import * + class PartitionTransformer( AnnotatorModel, HasEmailReaderProperties, HasExcelReaderProperties, HasHTMLReaderProperties, HasPowerPointProperties, - 
HasTextReaderProperties + HasTextReaderProperties, + HasSemanticChunkerProperties ): """ The PartitionTransformer annotator allows you to use the Partition feature more smoothly @@ -162,10 +164,6 @@ def setIncludePageBreaks(self, value): def getIncludePageBreaks(self): return self.getOrDefault(self.includePageBreaks) - # def setHeaders(self, headers: Dict[str, str]): - # self._call_java("setHeadersPython", headers) - # return self - @keyword_only def __init__(self, classname="com.johnsnowlabs.partition.PartitionTransformer", java_model=None): @@ -192,5 +190,9 @@ def __init__(self, classname="com.johnsnowlabs.partition.PartitionTransformer", paragraphSplit=DOUBLE_PARAGRAPH_PATTERN, shortLineWordThreshold=5, maxLineCount=2000, - threshold=0.1 - ) + threshold=0.1, + chunkingStrategy="", + maxCharacters=100, + newAfterNChars=-1, + overlap=0 + ) \ No newline at end of file diff --git a/python/test/partition/partition_transformer_test.py b/python/test/partition/partition_transformer_test.py index decd1fcb176e16..283b6ace2f9c77 100644 --- a/python/test/partition/partition_transformer_test.py +++ b/python/test/partition/partition_transformer_test.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os import unittest import pytest @@ -80,4 +81,33 @@ def runTest(self): resultDf = pipelineModel.transform(self.testDataSet) resultDf.show(truncate=False) - self.assertTrue(resultDf.select("partition").count() > 0) \ No newline at end of file + self.assertTrue(resultDf.select("partition").count() > 0) + + +@pytest.mark.slow +class PartitionTransformerChunkTestSpec(unittest.TestCase): + + def setUp(self): + self.spark = SparkContextForTest.spark + self.content_path = f"file:///{os.getcwd()}/../src/test/resources/reader/txt/rag-example.txt" + self.testDataSet = self.spark.createDataFrame( + [("An example with DocumentAssembler annotator",)], + ["text"] + ) + self.emptyDataSet = self.spark.createDataFrame([], self.testDataSet.schema) + + def runTest(self): + partition = PartitionTransformer() \ + .setInputCols(["document"]) \ + .setContentPath(self.content_path) \ + .setOutputCol("partition") \ + .setChunkingStrategy("basic") \ + .setMaxCharacters(140) + + pipeline = Pipeline(stages=[partition]) + pipelineModel = pipeline.fit(self.emptyDataSet) + + resultDf = pipelineModel.transform(self.emptyDataSet) + resultDf.show(truncate=False) + + # self.assertTrue(resultDf.select("partition").count() >= 0) \ No newline at end of file diff --git a/src/main/scala/com/johnsnowlabs/partition/BasicChunker.scala b/src/main/scala/com/johnsnowlabs/partition/BasicChunker.scala new file mode 100644 index 00000000000000..80b6747cd5ac7d --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/partition/BasicChunker.scala @@ -0,0 +1,117 @@ +/* + * Copyright 2017-2025 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.johnsnowlabs.partition + +import com.johnsnowlabs.reader.HTMLElement + +import scala.collection.mutable + +case class Chunk(elements: List[HTMLElement]) { + def length: Int = elements.map(_.content.length).sum +} + +object BasicChunker { + + /** Splits a list of [[HTMLElement]]s into chunks constrained by a maximum number of characters. + * + * This method ensures that no chunk exceeds the specified `maxCharacters` limit. Optionally, a + * `newAfterNChars` parameter can be used to set a soft boundary for starting new chunks + * earlier, and `overlap` can be used to retain trailing characters from the previous chunk in + * the next one (when splitting long elements). + * + * @param elements + * The list of [[HTMLElement]]s to be chunked. + * @param maxCharacters + * The hard limit on the number of characters per chunk. + * @param newAfterNChars + * Optional soft limit for starting a new chunk before reaching `maxCharacters`. If set to + * -1, this soft limit is ignored. + * @param overlap + * Number of trailing characters to overlap between chunks when splitting long elements. This + * helps maintain context in downstream NLP tasks. + * @return + * A list of [[Chunk]] objects, each containing a group of elements whose combined content + * length does not exceed the specified limits. + */ + + def chunkBasic( + elements: List[HTMLElement], + maxCharacters: Int, + newAfterNChars: Int = -1, + overlap: Int = 0): List[Chunk] = { + val softLimit = if (newAfterNChars > 0) newAfterNChars else maxCharacters + var currentChunk = List.empty[HTMLElement] + var currentLength = 0 + val chunks = mutable.ListBuffer.empty[Chunk] + + def finalizeChunk(): Unit = { + if (currentChunk.nonEmpty) { + chunks += Chunk(currentChunk) + currentChunk = List.empty[HTMLElement] + currentLength = 0 + } + } + + for (element <- elements) { + val elLength = element.content.length + + if (elLength > maxCharacters) { + val splitElements = splitHTMLElement(element, maxCharacters, overlap) + for (splitEl <- splitElements) { + if (currentLength + splitEl.content.length > maxCharacters || currentLength >= softLimit) + finalizeChunk() + currentChunk :+= splitEl + currentLength += splitEl.content.length + } + } else if (currentLength + elLength > maxCharacters || currentLength >= softLimit) { + finalizeChunk() + currentChunk :+= element + currentLength += elLength + } else { + currentChunk :+= element + currentLength += elLength + } + } + + finalizeChunk() + chunks.toList + } + + private def splitHTMLElement( + element: HTMLElement, + maxLen: Int, + overlap: Int): List[HTMLElement] = { + val words = element.content.split(" ") + val buffer = mutable.ListBuffer.empty[HTMLElement] + var chunk = new StringBuilder + + for (word <- words) { + if (chunk.length + word.length + 1 > maxLen) { + val text = chunk.toString().trim + buffer += element.copy(content = text) + chunk = new StringBuilder + if (overlap > 0 && text.length >= overlap) + chunk.append(text.takeRight(overlap)).append(" ") + } + chunk.append(word).append(" ") + } + + if (chunk.nonEmpty) + buffer += element.copy(content = chunk.toString().trim) + + buffer.toList + } +} diff --git a/src/main/scala/com/johnsnowlabs/partition/HasSemanticChunkerProperties.scala b/src/main/scala/com/johnsnowlabs/partition/HasSemanticChunkerProperties.scala new file mode 100644 index 00000000000000..d9fe453725561f --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/partition/HasSemanticChunkerProperties.scala @@ -0,0 +1,44 @@ +/* + * Copyright 2017-2025 John Snow Labs + * + * 
Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.johnsnowlabs.partition + +import com.johnsnowlabs.nlp.ParamsAndFeaturesWritable +import org.apache.spark.ml.param.Param + +trait HasSemanticChunkerProperties extends ParamsAndFeaturesWritable { + + val chunkingStrategy = new Param[String](this, "chunkingStrategy", "Set the chunking strategy") + + def setChunkingStrategy(value: String): this.type = set(chunkingStrategy, value) + + val maxCharacters = + new Param[Int](this, "maxCharacters", "Set the maximum number of characters") + + def setMaxCharacters(value: Int): this.type = set(maxCharacters, value) + + val newAfterNChars = + new Param[Int](this, "newAfterNChars", "Insert a new chunk after N characters") + + def setNewAfterNChars(value: Int): this.type = set(newAfterNChars, value) + + val overlap = + new Param[Int](this, "overlap", "Set the number of overlapping characters between chunks") + + def setOverlap(value: Int): this.type = set(overlap, value) + + setDefault(chunkingStrategy -> "", maxCharacters -> 100, newAfterNChars -> -1, overlap -> 0) + +} diff --git a/src/main/scala/com/johnsnowlabs/partition/Partition.scala b/src/main/scala/com/johnsnowlabs/partition/Partition.scala index 1480d33a8c053d..4fbb44c284e4cb 100644 --- a/src/main/scala/com/johnsnowlabs/partition/Partition.scala +++ b/src/main/scala/com/johnsnowlabs/partition/Partition.scala @@ -144,7 +144,13 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap()) case None => getReaderByExtension(path, sparkNLPReader) } - reader(path) + val partitionResult = reader(path) + if (hasChunkerStrategy) { + val chunker = new SemanticChunker(params.asScala.toMap) + partitionResult.withColumn( + "chunks", + chunker.chunkUDF()(partitionResult(sparkNLPReader.getOutputColumn))) + } else partitionResult } def partitionStringContent( @@ -342,6 +348,11 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap()) .headOption } + private def hasChunkerStrategy: Boolean = { + Seq("chunking_strategy", "chunkingStrategy") + .exists(params.asScala.contains) + } + } object Partition { diff --git a/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala b/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala index 1dc2e48b8282ac..e3bd9d6b587c10 100644 --- a/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala +++ b/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala @@ -19,12 +19,12 @@ import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, DOCUMENT} import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, HasSimpleAnnotate} import com.johnsnowlabs.partition.util.PartitionHelper.{ datasetWithBinaryFile, - datasetWithTxtFile, + datasetWithTextFile, isStringContent } import com.johnsnowlabs.reader.util.HasPdfProperties import com.johnsnowlabs.reader.{HTMLElement, PdfToText} -import org.apache.spark.ml.{Pipeline, PipelineModel} +import org.apache.spark.ml.PipelineModel import org.apache.spark.ml.util.Identifiable import 
org.apache.spark.sql.functions.{col, explode, udf} import org.apache.spark.sql.types.{ArrayType, StringType, StructField, StructType} @@ -85,7 +85,8 @@ class PartitionTransformer(override val uid: String) with HasHTMLReaderProperties with HasPowerPointProperties with HasTextReaderProperties - with HasPdfProperties { + with HasPdfProperties + with HasSemanticChunkerProperties { def this() = this(Identifiable.randomUID("PartitionTransformer")) protected val logger: Logger = LoggerFactory.getLogger(getClass.getName) @@ -150,16 +151,20 @@ class PartitionTransformer(override val uid: String) "paragraphSplit" -> $(paragraphSplit), "shortLineWordThreshold" -> $(shortLineWordThreshold).toString, "maxLineCount" -> $(maxLineCount).toString, - "threshold" -> $(threshold).toString) + "threshold" -> $(threshold).toString, + "chunkingStrategy" -> $(chunkingStrategy), + "maxCharacters" -> $(maxCharacters).toString, + "newAfterNChars" -> $(newAfterNChars).toString, + "overlap" -> $(overlap).toString) val partitionInstance = new Partition(params.asJava) - partitionInstance.setOutputColumn($(inputCols).head) val inputColum = if (get(inputCols).isDefined) { $(inputCols).head } else { partitionInstance.getOutputColumn } - partitionInstance.setOutputColumn($(inputCols).head) + partitionInstance.setOutputColumn(inputColum) + val partitionDf = if (isStringContent($(contentType))) { val partitionUDF = udf((text: String) => partitionInstance.partitionStringContent(text, $(this.headers).asJava)) @@ -167,7 +172,7 @@ class PartitionTransformer(override val uid: String) schemaFieldOpt match { case Some(StructField(_, StringType, _, _)) => - val stringContentDF = datasetWithTxtFile(dataset.sparkSession, $(contentPath)) + val stringContentDF = datasetWithTextFile(dataset.sparkSession, $(contentPath)) stringContentDF .withColumn(inputColum, partitionUDF(col("content"))) @@ -192,12 +197,14 @@ class PartitionTransformer(override val uid: String) binaryContentDF.withColumn(inputColum, partitionUDF(col("content"))) } - val colName = findHTMLElementColumn(partitionDf).getOrElse { + val htmlElementColumns = findHTMLElementColumns(partitionDf) + + if (htmlElementColumns.isEmpty) { val schemaString = partitionDf.schema.treeString throw new IllegalArgumentException( s"""❌ No column of type Array[HTMLElement] was found in the DataFrame. 
| - |💡 Expected a column with schema matching: Array[HTMLElement] + |💡 Expected one or more columns with schema matching: Array[HTMLElement] | |🧪 DataFrame Schema: |$schemaString @@ -208,7 +215,12 @@ class PartitionTransformer(override val uid: String) | - metadata: Map[String, String] """.stripMargin) } - partitionDf.withColumn(getOutputCol, wrapColumnMetadata(convertToAnnotations(col(colName)))) + + // Transform each matching column + val transformedDf = htmlElementColumns.foldLeft(partitionDf) { (df, colName) => + df.withColumn(getOutputCol, wrapColumnMetadata(convertToAnnotations(col(colName)))) + } + transformedDf } private def convertToAnnotations = udf { elements: Seq[Row] => @@ -242,4 +254,14 @@ class PartitionTransformer(override val uid: String) .map(_.name) } + private def findHTMLElementColumns(dataFrame: DataFrame): Seq[String] = { + val htmlElementSchema = Encoders.product[HTMLElement].schema + + dataFrame.schema.fields.collect { + case StructField(name, ArrayType(structType: StructType, _), _, _) + if structType == htmlElementSchema => + name + } + } + } diff --git a/src/main/scala/com/johnsnowlabs/partition/SemanticChunker.scala b/src/main/scala/com/johnsnowlabs/partition/SemanticChunker.scala new file mode 100644 index 00000000000000..3125ff2c284437 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/partition/SemanticChunker.scala @@ -0,0 +1,67 @@ +/* + * Copyright 2017-2025 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.johnsnowlabs.partition + +import com.johnsnowlabs.partition.BasicChunker.chunkBasic +import com.johnsnowlabs.reader.HTMLElement +import com.johnsnowlabs.reader.util.PartitionOptions.{getDefaultInt, getDefaultString} +import org.apache.spark.sql.Row +import org.apache.spark.sql.expressions.UserDefinedFunction +import org.apache.spark.sql.functions.udf + +import scala.collection.mutable + +class SemanticChunker(chunkerOptions: Map[String, String]) extends Serializable { + + def chunkUDF(): UserDefinedFunction = { + udf((elements: Seq[Row]) => { + val htmlElements = elements.map { row => + val elementType = row.getAs[String]("elementType") + val content = row.getAs[String]("content") + val metadata = row.getAs[Map[String, String]]("metadata") + HTMLElement(elementType, content, mutable.Map.empty ++ metadata) + }.toList + + val chunks = getChunkerStrategy match { + case "basic" => chunkBasic(htmlElements, getMaxCharacters, getNewAfterNChars, getOverlap) + case _ => + throw new IllegalArgumentException(s"Unknown chunker strategy: $getChunkerStrategy") + } + + chunks.flatMap(_.elements) + }) + } + + private def getMaxCharacters: Int = { + getDefaultInt(chunkerOptions, Seq("maxCharacters", "max_characters"), default = 500) + } + + private def getNewAfterNChars: Int = { + getDefaultInt(chunkerOptions, Seq("newAfterNChars", "new_after_n_chars"), default = -1) + } + + private def getOverlap: Int = { + getDefaultInt(chunkerOptions, Seq("overlap", "overlap"), default = 0) + } + + private def getChunkerStrategy: String = { + getDefaultString( + chunkerOptions, + Seq("chunkingStrategy", "chunking_strategy"), + default = "none") + } + +} diff --git a/src/main/scala/com/johnsnowlabs/partition/util/PartitionHelper.scala b/src/main/scala/com/johnsnowlabs/partition/util/PartitionHelper.scala index a69b6f51dd51ba..2f7c959f86cf58 100644 --- a/src/main/scala/com/johnsnowlabs/partition/util/PartitionHelper.scala +++ b/src/main/scala/com/johnsnowlabs/partition/util/PartitionHelper.scala @@ -29,10 +29,11 @@ object PartitionHelper { byteArrayRDD.toDF("path", "content") } - def datasetWithTxtFile(sparkSession: SparkSession, contentPath: String): DataFrame = { + def datasetWithTextFile(sparkSession: SparkSession, contentPath: String): DataFrame = { import sparkSession.implicits._ val textFilesRDD = sparkSession.sparkContext.wholeTextFiles(contentPath) - textFilesRDD.toDF("path", "content") + textFilesRDD + .toDF("path", "content") } def isStringContent(contentType: String): Boolean = { diff --git a/src/main/scala/com/johnsnowlabs/reader/HTMLReader.scala b/src/main/scala/com/johnsnowlabs/reader/HTMLReader.scala index d205e3fca5dd1e..06a03003c4878a 100644 --- a/src/main/scala/com/johnsnowlabs/reader/HTMLReader.scala +++ b/src/main/scala/com/johnsnowlabs/reader/HTMLReader.scala @@ -17,7 +17,7 @@ package com.johnsnowlabs.reader import com.johnsnowlabs.nlp.util.io.ResourceHelper import com.johnsnowlabs.nlp.util.io.ResourceHelper.{isValidURL, validFile} -import com.johnsnowlabs.partition.util.PartitionHelper.datasetWithTxtFile +import com.johnsnowlabs.partition.util.PartitionHelper.datasetWithTextFile import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{col, udf} import org.jsoup.Jsoup @@ -109,7 +109,7 @@ class HTMLReader( ResourceHelper match { case _ if validFile(inputSource) && !inputSource.startsWith("http") => - val htmlDf = datasetWithTxtFile(spark, inputSource) + val htmlDf = datasetWithTextFile(spark, inputSource) .withColumn(outputColumn, parseHtmlUDF(col("content"))) if 
(storeContent) htmlDf.select("path", "content", outputColumn) else htmlDf.select("path", outputColumn) diff --git a/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala b/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala index a3af24454af074..a1637116cb7905 100644 --- a/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala +++ b/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala @@ -33,7 +33,8 @@ import scala.collection.JavaConverters._ class SparkNLPReader( params: java.util.Map[String, String] = new java.util.HashMap(), - headers: java.util.Map[String, String] = new java.util.HashMap()) { + headers: java.util.Map[String, String] = new java.util.HashMap()) + extends Serializable { /** Instantiates class to read HTML files. * diff --git a/src/main/scala/com/johnsnowlabs/reader/TextReader.scala b/src/main/scala/com/johnsnowlabs/reader/TextReader.scala index d69050ab112031..ea0598a05940da 100644 --- a/src/main/scala/com/johnsnowlabs/reader/TextReader.scala +++ b/src/main/scala/com/johnsnowlabs/reader/TextReader.scala @@ -20,7 +20,7 @@ import com.johnsnowlabs.nlp.annotators.cleaners.util.CleanerHelper.{ DOUBLE_PARAGRAPH_PATTERN } import com.johnsnowlabs.nlp.util.io.ResourceHelper -import com.johnsnowlabs.partition.util.PartitionHelper.datasetWithTxtFile +import com.johnsnowlabs.partition.util.PartitionHelper.datasetWithTextFile import com.johnsnowlabs.reader.util.TextParser import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{col, udf} @@ -112,8 +112,9 @@ class TextReader( */ def txt(filePath: String): DataFrame = { if (ResourceHelper.validFile(filePath)) { - val textDf = datasetWithTxtFile(spark, filePath) - .withColumn(outputColumn, parseTxtUDF(col("content"))) + import spark.implicits._ + val textDf = datasetWithTextFile(spark, filePath) + .withColumn(outputColumn, parseTxtUDF($"content")) if (storeContent) textDf.select("path", outputColumn, "content") else textDf.select("path", outputColumn) } else { diff --git a/src/test/resources/reader/txt/long-text.txt b/src/test/resources/reader/txt/long-text.txt new file mode 100644 index 00000000000000..cadaab9be2048e --- /dev/null +++ b/src/test/resources/reader/txt/long-text.txt @@ -0,0 +1 @@ +Ukrainian forces reportedly advanced in the western Donetsk-eastern Zaporizhia Oblast border area and in western Zaporizhia Oblast amid Ukrainian counteroffensive operations in southern and eastern Ukraine. 
Tavriisk Group of Forces Spokesperson Oleksandr Shtupun reported that Ukrainian forces are advancing in the directions of Novoprokopivka (13km south of Orikhiv), Mala Tokmachka (9km southeast of Orikhiv), and Ocheretuvate (25km southeast of Orikhiv) in western Zaporizhia Oblast.[1] Shtupun also stated that Ukrainian forces advanced near Urozhaine (9km south of Velyka Novosilka) and Robotyne (10km south of Orikhiv) and achieved unspecified successes near Staromayorske (9km south of Velyka Novosilka) in the Berdyansk direction (western Donetsk-eastern Zaporizhia Oblast border area) and in an unspecified location in the Melitopol direction (western Zaporizhia Oblast).[2] Ukrainian Eastern Group of Forces Spokesperson Ilya Yevlash stated that Ukrainian forces continued offensive operations in the Bakhmut direction.[3] \ No newline at end of file diff --git a/src/test/scala/com/johnsnowlabs/partition/PartitionChunkerTest.scala b/src/test/scala/com/johnsnowlabs/partition/PartitionChunkerTest.scala new file mode 100644 index 00000000000000..17457c86396921 --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/partition/PartitionChunkerTest.scala @@ -0,0 +1,41 @@ +/* + * Copyright 2017-2025 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.johnsnowlabs.partition + +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import com.johnsnowlabs.tags.FastTest +import org.apache.spark.sql.functions.explode +import org.scalatest.flatspec.AnyFlatSpec + +class PartitionChunkerTest extends AnyFlatSpec { + import ResourceHelper.spark.implicits._ + val txtDirectory = "src/test/resources/reader/txt" + + "Partition" should "perform basic chunk text" taggedAs FastTest in { + val partitionOptions = Map("contentType" -> "text/plain", "chunkingStrategy" -> "basic") + val textDf = Partition(partitionOptions).partition(s"$txtDirectory/long-text.txt") + textDf.show(truncate = false) + + val partitionDf = textDf.select(explode($"txt.content")) + partitionDf.show(truncate = false) + assert(partitionDf.count() == 1) + + val chunkDf = textDf.select(explode($"chunks.content")) + chunkDf.show(truncate = false) + assert(chunkDf.count() > 1) + } + +} diff --git a/src/test/scala/com/johnsnowlabs/partition/PartitionTransformerTest.scala b/src/test/scala/com/johnsnowlabs/partition/PartitionTransformerTest.scala index 474cf4c1b13430..47d6f1d0c8e70b 100644 --- a/src/test/scala/com/johnsnowlabs/partition/PartitionTransformerTest.scala +++ b/src/test/scala/com/johnsnowlabs/partition/PartitionTransformerTest.scala @@ -27,10 +27,11 @@ class PartitionTransformerTest extends AnyFlatSpec with SparkSessionTest { val wordDirectory = "src/test/resources/reader/doc" val emailDirectory = "src/test/resources/reader/email" val htmlDirectory = "src/test/resources/reader/html" + val txtDirectory = "src/test/resources/reader/txt" "PartitionTransformer" should "work in a RAG pipeline" taggedAs SlowTest in { val partition = new PartitionTransformer() - .setInputCols("doc") + .setInputCols("text") .setContentType("application/msword") .setContentPath(s"$wordDirectory/fake_table.docx") .setOutputCol("partition") @@ -46,7 +47,7 @@ class PartitionTransformerTest extends AnyFlatSpec with SparkSessionTest { val pipelineModel = pipeline.fit(emptyDataSet) val resultDf = pipelineModel.transform(emptyDataSet) - resultDf.select("doc", "partition", "translation").show(truncate = false) + resultDf.select("text", "partition", "translation").show(truncate = false) assert(resultDf.select("partition").count() > 0) } @@ -167,4 +168,46 @@ class PartitionTransformerTest extends AnyFlatSpec with SparkSessionTest { assert(resultDf.select("partition").count() > 0) } + it should "chunk semantically" taggedAs FastTest in { + val partition = new PartitionTransformer() + .setInputCols("text") + .setContentType("text/plain") + .setContentPath(s"$txtDirectory") + .setOutputCol("chunks") + .setChunkingStrategy("basic") + .setMaxCharacters(72) + + val pipeline = new Pipeline() + .setStages(Array(partition)) + + val pipelineModel = pipeline.fit(emptyDataSet) + val resultDf = pipelineModel.transform(emptyDataSet) + resultDf.show(truncate = false) + } + + it should "chunk semantically with document assembler" taggedAs FastTest in { + import spark.implicits._ + val testDataSet = Seq( + "Introduction: RAG stands for Retrieval-Augmented Generation." + + " Why RAG? It improves factual accuracy and adds fresh or private data to LLMs." + + " Chunking: Breaks documents into pieces so they can be embedded." + + " Semantic Chunking: Focus on respecting document structure like sections." 
+ + " Summary: RAG is powerful when paired with good chunking!").toDS + .toDF("text") + + val partition = new PartitionTransformer() + .setInputCols("document") + .setOutputCol("chunks") + .setChunkingStrategy("basic") + .setMaxCharacters(140) + + val pipeline = new Pipeline() + .setStages(Array(documentAssembler, partition)) + + val pipelineModel = pipeline.fit(emptyDataSet) + val resultDf = pipelineModel.transform(testDataSet) + resultDf.select("chunks").show(truncate = false) + + } + } From 3e0aa61482e1da1b67904b97b86ab3fe5f010434 Mon Sep 17 00:00:00 2001 From: Danilo Burbano <37355249+danilojsl@users.noreply.github.com> Date: Mon, 2 Jun 2025 15:49:42 -0500 Subject: [PATCH 2/8] [SPARKNLP-1163] Adding title chunking strategy (#14594) --- .../partition/partition_properties.py | 24 ++- .../partition/partition_transformer.py | 6 +- .../johnsnowlabs/partition/BasicChunker.scala | 4 - .../com/johnsnowlabs/partition/Chunk.scala | 7 + ...rties.scala => HasChunkerProperties.scala} | 24 ++- .../johnsnowlabs/partition/Partition.scala | 2 +- ...icChunker.scala => PartitionChunker.scala} | 28 +++- .../partition/PartitionTransformer.scala | 6 +- .../johnsnowlabs/partition/TitleChunker.scala | 151 ++++++++++++++++++ .../partition/PartitionChunkerTest.scala | 14 ++ .../partition/TitleChunkerTest.scala | 73 +++++++++ 11 files changed, 324 insertions(+), 15 deletions(-) create mode 100644 src/main/scala/com/johnsnowlabs/partition/Chunk.scala rename src/main/scala/com/johnsnowlabs/partition/{HasSemanticChunkerProperties.scala => HasChunkerProperties.scala} (68%) rename src/main/scala/com/johnsnowlabs/partition/{SemanticChunker.scala => PartitionChunker.scala} (72%) create mode 100644 src/main/scala/com/johnsnowlabs/partition/TitleChunker.scala create mode 100644 src/test/scala/com/johnsnowlabs/partition/TitleChunkerTest.scala diff --git a/python/sparknlp/partition/partition_properties.py b/python/sparknlp/partition/partition_properties.py index 7f72ec41486182..a13f9167eef668 100644 --- a/python/sparknlp/partition/partition_properties.py +++ b/python/sparknlp/partition/partition_properties.py @@ -256,7 +256,7 @@ def setThreshold(self, value): def getThreshold(self): return self.getOrDefault(self.threshold) -class HasSemanticChunkerProperties(Params): +class HasChunkerProperties(Params): chunkingStrategy = Param( Params._dummy(), @@ -296,4 +296,24 @@ def setNewAfterNChars(self, value): ) def setOverlap(self, value): - return self._set(overlap=value) \ No newline at end of file + return self._set(overlap=value) + + combineTextUnderNChars = Param( + Params._dummy(), + "combineTextUnderNChars", + "Threshold to merge adjacent small sections", + typeConverter=TypeConverters.toInt + ) + + def setCombineTextUnderNChars(self, value): + return self._set(combineTextUnderNChars=value) + + overlapAll = Param( + Params._dummy(), + "overlapAll", + "Apply overlap context between all sections, not just split chunks", + typeConverter=TypeConverters.toBoolean + ) + + def setOverlapAll(self, value): + return self._set(overlapAll=value) diff --git a/python/sparknlp/partition/partition_transformer.py b/python/sparknlp/partition/partition_transformer.py index 50e1e5edc94f31..a971bb44ae78b7 100644 --- a/python/sparknlp/partition/partition_transformer.py +++ b/python/sparknlp/partition/partition_transformer.py @@ -23,7 +23,7 @@ class PartitionTransformer( HasHTMLReaderProperties, HasPowerPointProperties, HasTextReaderProperties, - HasSemanticChunkerProperties + HasChunkerProperties ): """ The PartitionTransformer annotator allows 
you to use the Partition feature more smoothly @@ -194,5 +194,7 @@ def __init__(self, classname="com.johnsnowlabs.partition.PartitionTransformer", chunkingStrategy="", maxCharacters=100, newAfterNChars=-1, - overlap=0 + overlap=0, + combineTextUnderNChars=0, + overlapAll=False ) \ No newline at end of file diff --git a/src/main/scala/com/johnsnowlabs/partition/BasicChunker.scala b/src/main/scala/com/johnsnowlabs/partition/BasicChunker.scala index 80b6747cd5ac7d..881f07e8664985 100644 --- a/src/main/scala/com/johnsnowlabs/partition/BasicChunker.scala +++ b/src/main/scala/com/johnsnowlabs/partition/BasicChunker.scala @@ -19,10 +19,6 @@ import com.johnsnowlabs.reader.HTMLElement import scala.collection.mutable -case class Chunk(elements: List[HTMLElement]) { - def length: Int = elements.map(_.content.length).sum -} - object BasicChunker { /** Splits a list of [[HTMLElement]]s into chunks constrained by a maximum number of characters. diff --git a/src/main/scala/com/johnsnowlabs/partition/Chunk.scala b/src/main/scala/com/johnsnowlabs/partition/Chunk.scala new file mode 100644 index 00000000000000..04e6a2585378e1 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/partition/Chunk.scala @@ -0,0 +1,7 @@ +package com.johnsnowlabs.partition + +import com.johnsnowlabs.reader.HTMLElement + +case class Chunk(elements: List[HTMLElement]) { + def length: Int = elements.map(_.content.length).sum +} diff --git a/src/main/scala/com/johnsnowlabs/partition/HasSemanticChunkerProperties.scala b/src/main/scala/com/johnsnowlabs/partition/HasChunkerProperties.scala similarity index 68% rename from src/main/scala/com/johnsnowlabs/partition/HasSemanticChunkerProperties.scala rename to src/main/scala/com/johnsnowlabs/partition/HasChunkerProperties.scala index d9fe453725561f..82de0df0ca13d7 100644 --- a/src/main/scala/com/johnsnowlabs/partition/HasSemanticChunkerProperties.scala +++ b/src/main/scala/com/johnsnowlabs/partition/HasChunkerProperties.scala @@ -18,7 +18,7 @@ package com.johnsnowlabs.partition import com.johnsnowlabs.nlp.ParamsAndFeaturesWritable import org.apache.spark.ml.param.Param -trait HasSemanticChunkerProperties extends ParamsAndFeaturesWritable { +trait HasChunkerProperties extends ParamsAndFeaturesWritable { val chunkingStrategy = new Param[String](this, "chunkingStrategy", "Set the chunking strategy") @@ -39,6 +39,26 @@ trait HasSemanticChunkerProperties extends ParamsAndFeaturesWritable { def setOverlap(value: Int): this.type = set(overlap, value) - setDefault(chunkingStrategy -> "", maxCharacters -> 100, newAfterNChars -> -1, overlap -> 0) + val combineTextUnderNChars = + new Param[Int](this, "combineTextUnderNChars", "Threshold to merge adjacent small sections") + + def setComBineTextUnderNChars(value: Int): this.type = + set(combineTextUnderNChars, value) + + val overlapAll = + new Param[Boolean]( + this, + "overlapAll", + "Apply overlap context between all sections, not just split chunks") + + def setOverlapAll(value: Boolean): this.type = set(overlapAll, value) + + setDefault( + chunkingStrategy -> "", + maxCharacters -> 100, + newAfterNChars -> -1, + overlap -> 0, + combineTextUnderNChars -> 0, + overlapAll -> false) } diff --git a/src/main/scala/com/johnsnowlabs/partition/Partition.scala b/src/main/scala/com/johnsnowlabs/partition/Partition.scala index 4fbb44c284e4cb..a339fe9b258ee4 100644 --- a/src/main/scala/com/johnsnowlabs/partition/Partition.scala +++ b/src/main/scala/com/johnsnowlabs/partition/Partition.scala @@ -146,7 +146,7 @@ class Partition(params: java.util.Map[String, String] 
= new java.util.HashMap()) val partitionResult = reader(path) if (hasChunkerStrategy) { - val chunker = new SemanticChunker(params.asScala.toMap) + val chunker = new PartitionChunker(params.asScala.toMap) partitionResult.withColumn( "chunks", chunker.chunkUDF()(partitionResult(sparkNLPReader.getOutputColumn))) diff --git a/src/main/scala/com/johnsnowlabs/partition/SemanticChunker.scala b/src/main/scala/com/johnsnowlabs/partition/PartitionChunker.scala similarity index 72% rename from src/main/scala/com/johnsnowlabs/partition/SemanticChunker.scala rename to src/main/scala/com/johnsnowlabs/partition/PartitionChunker.scala index 3125ff2c284437..84187d40b60364 100644 --- a/src/main/scala/com/johnsnowlabs/partition/SemanticChunker.scala +++ b/src/main/scala/com/johnsnowlabs/partition/PartitionChunker.scala @@ -16,15 +16,20 @@ package com.johnsnowlabs.partition import com.johnsnowlabs.partition.BasicChunker.chunkBasic +import com.johnsnowlabs.partition.TitleChunker.chunkByTitle import com.johnsnowlabs.reader.HTMLElement -import com.johnsnowlabs.reader.util.PartitionOptions.{getDefaultInt, getDefaultString} +import com.johnsnowlabs.reader.util.PartitionOptions.{ + getDefaultBoolean, + getDefaultInt, + getDefaultString +} import org.apache.spark.sql.Row import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.functions.udf import scala.collection.mutable -class SemanticChunker(chunkerOptions: Map[String, String]) extends Serializable { +class PartitionChunker(chunkerOptions: Map[String, String]) extends Serializable { def chunkUDF(): UserDefinedFunction = { udf((elements: Seq[Row]) => { @@ -37,6 +42,14 @@ class SemanticChunker(chunkerOptions: Map[String, String]) extends Serializable val chunks = getChunkerStrategy match { case "basic" => chunkBasic(htmlElements, getMaxCharacters, getNewAfterNChars, getOverlap) + case "byTitle" | "by_title" => + chunkByTitle( + htmlElements, + getMaxCharacters, + getCombineTextUnderNChars, + getOverlap, + getNewAfterNChars, + getOverlapAll) case _ => throw new IllegalArgumentException(s"Unknown chunker strategy: $getChunkerStrategy") } @@ -64,4 +77,15 @@ class SemanticChunker(chunkerOptions: Map[String, String]) extends Serializable default = "none") } + private def getCombineTextUnderNChars: Int = { + getDefaultInt( + chunkerOptions, + Seq("combineTextUnderNChars", "combine_text_under_n_chars"), + default = 0) + } + + private def getOverlapAll: Boolean = { + getDefaultBoolean(chunkerOptions, Seq("overlapAll", "overlap_all"), default = false) + } + } diff --git a/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala b/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala index e3bd9d6b587c10..73d461c91aaafc 100644 --- a/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala +++ b/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala @@ -86,7 +86,7 @@ class PartitionTransformer(override val uid: String) with HasPowerPointProperties with HasTextReaderProperties with HasPdfProperties - with HasSemanticChunkerProperties { + with HasChunkerProperties { def this() = this(Identifiable.randomUID("PartitionTransformer")) protected val logger: Logger = LoggerFactory.getLogger(getClass.getName) @@ -155,7 +155,9 @@ class PartitionTransformer(override val uid: String) "chunkingStrategy" -> $(chunkingStrategy), "maxCharacters" -> $(maxCharacters).toString, "newAfterNChars" -> $(newAfterNChars).toString, - "overlap" -> $(overlap).toString) + "overlap" -> $(overlap).toString, + 
"combineTextUnderNChars" -> $(combineTextUnderNChars).toString, + "overlapAll" -> $(overlapAll).toString) val partitionInstance = new Partition(params.asJava) val inputColum = if (get(inputCols).isDefined) { diff --git a/src/main/scala/com/johnsnowlabs/partition/TitleChunker.scala b/src/main/scala/com/johnsnowlabs/partition/TitleChunker.scala new file mode 100644 index 00000000000000..3a03151fd6303a --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/partition/TitleChunker.scala @@ -0,0 +1,151 @@ +/* + * Copyright 2017-2025 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.johnsnowlabs.partition + +import com.johnsnowlabs.reader.{ElementType, HTMLElement} + +import scala.collection.mutable + +object TitleChunker { + + /** Splits a list of HTML elements into semantically grouped Chunks based on Title and Table + * markers. + * + * @param elements + * List of input HTML elements to chunk. + * @param maxCharacters + * Maximum length allowed per chunk. Longer sections are split. + * @param combineTextUnderNChars + * Threshold to merge adjacent small sections. + * @param overlap + * Number of characters to repeat between consecutive chunks. + * @param newAfterNChars + * Soft limit to trigger new section if length exceeded, even before maxCharacters. + * @param overlapAll + * Apply overlap context between all sections, not just split chunks. + * @return + * List of Chunks partitioned by title and content heuristics. 
+ */ + def chunkByTitle( + elements: List[HTMLElement], + maxCharacters: Int, + combineTextUnderNChars: Int = 0, + overlap: Int = 0, + newAfterNChars: Int = -1, + overlapAll: Boolean = false): List[Chunk] = { + + val softLimit = if (newAfterNChars <= 0) maxCharacters else newAfterNChars + val chunks = mutable.ListBuffer.empty[Chunk] + val sections = mutable.ListBuffer.empty[List[HTMLElement]] + var currentSection = List.empty[HTMLElement] + var currentLength = 0 + var currentPage = -1 + + for (element <- elements) { + val elementLength = element.content.length + val isTable = element.elementType == "Table" + val elementPage = element.metadata.getOrElse("pageNumber", "-1").toInt + + val pageChanged = currentPage != -1 && elementPage != currentPage + val softLimitExceeded = currentSection.length >= 2 && + (currentLength + elementLength > softLimit) + + if (isTable) { + if (currentSection.nonEmpty) sections += currentSection + sections += List(element) + currentSection = List.empty + currentLength = 0 + currentPage = -1 + } else if (pageChanged || softLimitExceeded) { + if (currentSection.nonEmpty) sections += currentSection + currentSection = List(element) + currentLength = elementLength + currentPage = elementPage + } else { + currentSection :+= element + currentLength += elementLength + currentPage = elementPage + } + } + if (currentSection.nonEmpty) sections += currentSection + + val mergedSections = sections.foldLeft(List.empty[List[HTMLElement]]) { (acc, section) => + val sectionLength = section.map(_.content.length).sum + val canMerge = combineTextUnderNChars > 0 && + sectionLength < combineTextUnderNChars && + acc.nonEmpty && + acc.last.exists(_.elementType != "Table") && + section.exists(_.elementType != "Table") + + if (canMerge) { + acc.init :+ (acc.last ++ section) + } else { + acc :+ section + } + } + + var lastNarrativeText = "" + for (section <- mergedSections) { + if (section.exists(_.elementType == "Table")) { + chunks += Chunk(section) + lastNarrativeText = "" + } else { + val sectionText = section.map(_.content).mkString(" ") + val content = + if (overlap > 0 && lastNarrativeText.nonEmpty && (overlapAll || sectionText.length > maxCharacters)) + lastNarrativeText.takeRight(overlap) + " " + sectionText + else sectionText + + val merged = HTMLElement(ElementType.NARRATIVE_TEXT, content.trim, section.head.metadata) + val split = if (content.length > maxCharacters) { + splitHTMLElement(merged, maxCharacters, overlap) + } else List(merged) + + chunks ++= split.map(e => Chunk(List(e))) + lastNarrativeText = sectionText + } + } + + chunks.toList + } + + private def splitHTMLElement( + element: HTMLElement, + maxLen: Int, + overlap: Int): List[HTMLElement] = { + + val words = element.content.split(" ") + val buffer = mutable.ListBuffer.empty[HTMLElement] + var chunk = new StringBuilder + + for (word <- words) { + if (chunk.length + word.length + 1 > maxLen) { + val text = chunk.toString().trim + buffer += element.copy(content = text) + chunk = new StringBuilder + if (overlap > 0 && text.length >= overlap) + chunk.append(text.takeRight(overlap)).append(" ") + } + chunk.append(word).append(" ") + } + + if (chunk.nonEmpty) + buffer += element.copy(content = chunk.toString().trim) + + buffer.toList + } + +} diff --git a/src/test/scala/com/johnsnowlabs/partition/PartitionChunkerTest.scala b/src/test/scala/com/johnsnowlabs/partition/PartitionChunkerTest.scala index 17457c86396921..4eabfff8b4304a 100644 --- a/src/test/scala/com/johnsnowlabs/partition/PartitionChunkerTest.scala +++ 
b/src/test/scala/com/johnsnowlabs/partition/PartitionChunkerTest.scala @@ -23,6 +23,7 @@ import org.scalatest.flatspec.AnyFlatSpec class PartitionChunkerTest extends AnyFlatSpec { import ResourceHelper.spark.implicits._ val txtDirectory = "src/test/resources/reader/txt" + val htmlDirectory = "src/test/resources/reader/html" "Partition" should "perform basic chunk text" taggedAs FastTest in { val partitionOptions = Map("contentType" -> "text/plain", "chunkingStrategy" -> "basic") @@ -38,4 +39,17 @@ class PartitionChunkerTest extends AnyFlatSpec { assert(chunkDf.count() > 1) } + it should "perform chunking by title" taggedAs FastTest in { + val partitionOptions = Map( + "contentType" -> "text/html", + "titleFontSize" -> "14", + "chunkingStrategy" -> "byTitle", + "combineTextUnderNChars" -> "50") + val textDf = Partition(partitionOptions).partition(s"$htmlDirectory/fake-html.html") + + val partitionDf = textDf.select(explode($"chunks.content")) + partitionDf.show(truncate = false) + assert(partitionDf.count() == 2) + } + } diff --git a/src/test/scala/com/johnsnowlabs/partition/TitleChunkerTest.scala b/src/test/scala/com/johnsnowlabs/partition/TitleChunkerTest.scala new file mode 100644 index 00000000000000..671b5403dcc43a --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/partition/TitleChunkerTest.scala @@ -0,0 +1,73 @@ +package com.johnsnowlabs.partition + +import com.johnsnowlabs.reader.HTMLElement +import org.scalatest.flatspec.AnyFlatSpec + +import scala.collection.mutable + +class TitleChunkerTest extends AnyFlatSpec { + + def element(et: String, text: String, page: Int = 1): HTMLElement = + HTMLElement(et, text, mutable.Map("pageNumber" -> page.toString)) + + "chunkByTitle" should "include titles in same chunk with following text" in { + val elements = List( + element("Title", "My First Heading"), + element("Title", "My Second Heading"), + element("NarrativeText", "My first paragraph. lorem ipsum dolor set amet."), + element("Title", "A Third Heading")) + + val result = TitleChunker.chunkByTitle(elements, maxCharacters = 1000) + + assert(result.length == 1) + val content = result.head.elements.head.content + assert(content.contains("My First Heading")) + assert(content.contains("My Second Heading")) + } + + it should "split on soft limit newAfterNChars" in { + val elements = List( + element("Title", "Heading"), + element("NarrativeText", "a " * 50), + element("NarrativeText", "b " * 50)) + + val result = TitleChunker.chunkByTitle(elements, maxCharacters = 1000, newAfterNChars = 100) + + assert(result.length == 2) + } + + it should "add overlap context when overlapAll is true" in { + val elements = List( + element("Title", "Intro"), + element("NarrativeText", "The cow jumped over the moon. " * 5), + element("Title", "Next Section"), + element("NarrativeText", "And the dish ran away with the spoon.")) + + val maxCharacters = 100 + val overlap = 10 + val result = TitleChunker.chunkByTitle( + elements, + maxCharacters = maxCharacters, + overlap = overlap, + overlapAll = true) + assert(result.length >= 2) + + val prevText = ("The cow jumped over the moon. 
" * 5).trim + val expectedOverlap = prevText.takeRight(overlap).trim + assert(result(1).elements.head.content.contains(expectedOverlap)) + } + + it should "chunk content correctly across page boundaries" in { + val elements = List( + element("Title", "Page 1 Heading"), + element("NarrativeText", "Text on page 1."), + element("Title", "Page 2 Heading", page = 2), + element("NarrativeText", "Text on page 2.", page = 2)) + + val result = TitleChunker.chunkByTitle(elements, maxCharacters = 1000) + assert(result.length == 2) + assert(result(0).elements.head.content.contains("Page 1 Heading")) + assert(result(1).elements.head.content.contains("Page 2 Heading")) + } + +} From bb306b97e6722783ec895c50309ad97a2c8917b6 Mon Sep 17 00:00:00 2001 From: Danilo Burbano <danilo@johnsnowlabs.com> Date: Fri, 6 Jun 2025 15:06:38 -0500 Subject: [PATCH 3/8] [SPARKNLP-1125] Adding partition with chunk demo notebook [skip test] --- ...parkNLP_Partition_with_Chunking_Demo.ipynb | 362 ++++++++++++++++++ 1 file changed, 362 insertions(+) create mode 100644 examples/python/data-preprocessing/SparkNLP_Partition_with_Chunking_Demo.ipynb diff --git a/examples/python/data-preprocessing/SparkNLP_Partition_with_Chunking_Demo.ipynb b/examples/python/data-preprocessing/SparkNLP_Partition_with_Chunking_Demo.ipynb new file mode 100644 index 00000000000000..8bb4a05faf6d46 --- /dev/null +++ b/examples/python/data-preprocessing/SparkNLP_Partition_with_Chunking_Demo.ipynb @@ -0,0 +1,362 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "tzcU5p2gdak9" + }, + "source": [ + "# Introducing Partition with Semantic Chunking SparkNLP\n", + "This notebook showcases the newly added `Partition` component in Spark NLP\n", + "providing a streamlined and user-friendly interface for interacting with Spark NLP readers" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RFOFhaEedalB" + }, + "source": [ + "## Setup and Initialization\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "Support for **Partitioning** files was introduced in Spark NLP 6.0.1 \n", + "\n", + "Chunking support was added in Spark NLP 6.0.3\n", + "Please make sure you have upgraded to the latest Spark NLP release.\n", + "\n", + "For local files example we will download different files from Spark NLP Github repo:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ATDLz3Gws5ob" + }, + "source": [ + "**Downloading Files**" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "g7PMCOJo0ZlU" + }, + "outputs": [], + "source": [ + "!mkdir txt-files\n", + "!mkdir html-files" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "AV-krG6Ps8pq", + "outputId": "ea4c2484-6e83-4a7a-a000-537f38189ed0" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-06-06 15:19:01-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1125-Implement-Chunking-Strategies/src/test/resources/reader/txt/long-text.txt\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 1032 (1.0K) [text/plain]\n", + "Saving to: ‘txt-files/long-text.txt’\n", + "\n", + "long-text.txt 100%[===================>] 1.01K --.-KB/s in 0s \n", + "\n", + "2025-06-06 15:19:01 (58.1 MB/s) - ‘txt-files/long-text.txt’ saved [1032/1032]\n", + "\n", + "--2025-06-06 15:19:01-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1125-Implement-Chunking-Strategies/src/test/resources/reader/html/fake-html.html\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 665 [text/plain]\n", + "Saving to: ‘html-files/fake-html.html’\n", + "\n", + "fake-html.html 100%[===================>] 665 --.-KB/s in 0s \n", + "\n", + "2025-06-06 15:19:02 (26.7 MB/s) - ‘html-files/fake-html.html’ saved [665/665]\n", + "\n" + ] + } + ], + "source": [ + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1125-Implement-Chunking-Strategies/src/test/resources/reader/txt/long-text.txt -P txt-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1125-Implement-Chunking-Strategies/src/test/resources/reader/html/fake-html.html -P html-files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EoFI66NAdalE" + }, + "source": [ + "## Partitioning Documents with Chunking\n", + "Use the `basic` chunking to segment data into coherent chunks based on character limits" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bAkMjJ1vdalE", + "outputId": "75831f62-c84a-4170-f87e-e70a6c1ef39d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n" + ] + } + ], + "source": [ + "from sparknlp.partition.partition import Partition\n", + "\n", + "partition_df = Partition(content_type = \"text/plain\", chunking_strategy = \"basic\").partition(\"./txt-files/long-text.txt\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "k6uvYxiVzGsG" + }, + "source": [ + "Output without `basic` chunk:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3L-Tp017qgqb", + "outputId": "98af5f84-5abc-4554-bab7-7dd9c5212612" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|col |\n", + "+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|Ukrainian forces reportedly advanced in the western Donetsk-eastern Zaporizhia Oblast border area and in western Zaporizhia Oblast amid Ukrainian counteroffensive operations in southern and eastern Ukraine. 
Tavriisk Group of Forces Spokesperson Oleksandr Shtupun reported that Ukrainian forces are advancing in the directions of Novoprokopivka (13km south of Orikhiv), Mala Tokmachka (9km southeast of Orikhiv), and Ocheretuvate (25km southeast of Orikhiv) in western Zaporizhia Oblast.[1] Shtupun also stated that Ukrainian forces advanced near Urozhaine (9km south of Velyka Novosilka) and Robotyne (10km south of Orikhiv) and achieved unspecified successes near Staromayorske (9km south of Velyka Novosilka) in the Berdyansk direction (western Donetsk-eastern Zaporizhia Oblast border area) and in an unspecified location in the Melitopol direction (western Zaporizhia Oblast).[2] Ukrainian Eastern Group of Forces Spokesperson Ilya Yevlash stated that Ukrainian forces continued offensive operations in the Bakhmut direction.[3]|\n", + "+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import explode, col\n", + "\n", + "result_df = partition_df.select(explode(col(\"txt.content\")))\n", + "result_df.show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EQJvQsnxzRg1" + }, + "source": [ + "Output with `basic` chunk:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VlhnXCV5qr4J", + "outputId": "cdaf98f1-3109-4770-adaf-f51c80a59ab9" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|col |\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|Ukrainian forces 
reportedly advanced in the western Donetsk-eastern Zaporizhia Oblast border area and in western Zaporizhia Oblast amid Ukrainian counteroffensive operations in southern and eastern Ukraine. Tavriisk Group of Forces Spokesperson Oleksandr Shtupun reported that Ukrainian forces are advancing in the directions of Novoprokopivka (13km south of Orikhiv), Mala Tokmachka (9km southeast of Orikhiv), and Ocheretuvate (25km southeast of Orikhiv) in western Zaporizhia Oblast.[1] Shtupun|\n", + "|also stated that Ukrainian forces advanced near Urozhaine (9km south of Velyka Novosilka) and Robotyne (10km south of Orikhiv) and achieved unspecified successes near Staromayorske (9km south of Velyka Novosilka) in the Berdyansk direction (western Donetsk-eastern Zaporizhia Oblast border area) and in an unspecified location in the Melitopol direction (western Zaporizhia Oblast).[2] Ukrainian Eastern Group of Forces Spokesperson Ilya Yevlash stated that Ukrainian forces continued offensive |\n", + "|operations in the Bakhmut direction.[3] |\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "result_df = partition_df.select(explode(col(\"chunks.content\")))\n", + "result_df.show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4YYTB7G6zbmN" + }, + "source": [ + "Use `by_title` chunking to group sections in documents with headings, tables, and mixed semantic elements" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PxTf0Ot23ZaO", + "outputId": "9b02a493-b4d0-41fc-c5ee-9ed8ab2de194" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n" + ] + } + ], + "source": [ + "partition_df = Partition(content_type = \"text/html\", chunking_strategy = \"by_title\", combineTextUnderNChars = 50).partition(\"./html-files/fake-html.html\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YXMf3cBfz_2-" + }, + "source": [ + "Output without `by_title` chunk:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "O-_R-W86sFo-", + "outputId": "6f07e491-c556-41af-89da-273e905d0e8b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------------------------------------------------------------------------------------------------------------------------------+\n", + "|col |\n", + "+----------------------------------------------------------------------------------------------------------------------------------+\n", + "|My First Heading |\n", + "|My Second Heading |\n", + "|My first paragraph. lorem ipsum dolor set amet. 
if the cow comes home under the sun how do you fault the cow for it's worn hooves?|\n", + "|A Third Heading |\n", + "|Column 1 Column 2 Row 1, Cell 1 Row 1, Cell 2 Row 2, Cell 1 Row 2, Cell 2 |\n", + "+----------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "result_df = partition_df.select(explode(col(\"html.content\")))\n", + "result_df.show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EhLOvpfe0JIe" + }, + "source": [ + "Output with `by_title` chunk:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WhSWaeYGrvP-", + "outputId": "8f5da326-029c-4ad6-c201-c5d2f2f8fa7d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|col |\n", + "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|My First Heading My Second Heading My first paragraph. lorem ipsum dolor set amet. if the cow comes home under the sun how do you fault the cow for it's worn hooves? A Third Heading|\n", + "|Column 1 Column 2 Row 1, Cell 1 Row 1, Cell 2 Row 2, Cell 1 Row 2, Cell 2 |\n", + "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "result_df = partition_df.select(explode(col(\"chunks.content\")))\n", + "result_df.show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BB2FEfegGuxl" + }, + "source": [ + "You can also use DFS file systems like:\n", + "- Databricks: `dbfs://`\n", + "- HDFS: `hdfs://`\n", + "- Microsoft Fabric OneLake: `abfss://`" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} From d63a1c1a554149ff66799b512cd22e96ba779120 Mon Sep 17 00:00:00 2001 From: Danilo Burbano <danilo@johnsnowlabs.com> Date: Fri, 6 Jun 2025 15:38:32 -0500 Subject: [PATCH 4/8] [SPARKNLP-1125] Fixs partition unit test in python --- python/test/partition/partition_transformer_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/test/partition/partition_transformer_test.py b/python/test/partition/partition_transformer_test.py index 283b6ace2f9c77..fbcf65b6f69b7c 100644 --- a/python/test/partition/partition_transformer_test.py +++ b/python/test/partition/partition_transformer_test.py @@ -89,7 +89,7 @@ class PartitionTransformerChunkTestSpec(unittest.TestCase): def setUp(self): self.spark = SparkContextForTest.spark - self.content_path = f"file:///{os.getcwd()}/../src/test/resources/reader/txt/rag-example.txt" + self.content_path = 
f"file:///{os.getcwd()}/../src/test/resources/reader/txt/long-text.txt" self.testDataSet = self.spark.createDataFrame( [("An example with DocumentAssembler annotator",)], ["text"] @@ -98,7 +98,7 @@ def setUp(self): def runTest(self): partition = PartitionTransformer() \ - .setInputCols(["document"]) \ + .setInputCols(["text"]) \ .setContentPath(self.content_path) \ .setOutputCol("partition") \ .setChunkingStrategy("basic") \ @@ -110,4 +110,4 @@ def runTest(self): resultDf = pipelineModel.transform(self.emptyDataSet) resultDf.show(truncate=False) - # self.assertTrue(resultDf.select("partition").count() >= 0) \ No newline at end of file + self.assertTrue(resultDf.select("partition").count() >= 0) \ No newline at end of file From 5ed5063a3805fa60b6ce0854494c49205d2777c5 Mon Sep 17 00:00:00 2001 From: Danilo Burbano <danilo@johnsnowlabs.com> Date: Fri, 6 Jun 2025 18:19:38 -0500 Subject: [PATCH 5/8] [SPARKNLP-1125] Adding notebook example with RAG showcase --- ...ionTransformer_and_Semantic_Chunking.ipynb | 622 ++++++++++++++++++ .../partition/partition_transformer_test.py | 2 +- 2 files changed, 623 insertions(+), 1 deletion(-) create mode 100644 examples/python/data-preprocessing/SparkNLP_RAG_Demo_with_PartitionTransformer_and_Semantic_Chunking.ipynb diff --git a/examples/python/data-preprocessing/SparkNLP_RAG_Demo_with_PartitionTransformer_and_Semantic_Chunking.ipynb b/examples/python/data-preprocessing/SparkNLP_RAG_Demo_with_PartitionTransformer_and_Semantic_Chunking.ipynb new file mode 100644 index 00000000000000..5155d7f55a49d5 --- /dev/null +++ b/examples/python/data-preprocessing/SparkNLP_RAG_Demo_with_PartitionTransformer_and_Semantic_Chunking.ipynb @@ -0,0 +1,622 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "tzcU5p2gdak9" + }, + "source": [ + "# Introducing Chunking in Partition Transformer in SparkNLP\n", + "This notebook demonstrates how to use **Spark NLP's PartitionTransformer** for\n", + " chunking of documents, enabling efficient text segmentation.\n", + "\n", + "We further showcase a practical application of this chunking strategy in the context of **Retrieval-Augmented Generation (RAG)**.\n", + "\n", + "We can use this powerful method to enhance the performance of large language models by supplying context-relevant information from a knowledge base." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3y_JC9AmJtYr" + }, + "source": [ + "Creating Files" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "bo7s-jZVrE7W" + }, + "outputs": [], + "source": [ + "!echo -e \"Introduction: RAG stands for Retrieval-Augmented Generation. Why RAG? It improves factual accuracy and adds fresh or private data to LLMs. Chunking: Breaks documents into pieces so they can be embedded. Semantic Chunking: Focus on respecting document structure like sections. Summary: RAG is powerful when paired with good chunking!\" > rag_intro.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "lkJ-P8-50Nhy" + }, + "outputs": [], + "source": [ + "!echo -e \"Tomatoes grow best in warm weather with plenty of sun. It's important to water them regularly and use nutrient-rich soil. 
They are typically planted after the last frost and harvested in late summer.\" > tomatoes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ay-nZLk_J0C4", + "outputId": "983de5e8-7ee8-434f-c4e2-7e742f97f189" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Introduction: RAG stands for Retrieval-Augmented Generation. Why RAG? It improves factual accuracy and adds fresh or private data to LLMs. Chunking: Breaks documents into pieces so they can be embedded. Semantic Chunking: Focus on respecting document structure like sections. Summary: RAG is powerful when paired with good chunking!\n" + ] + } + ], + "source": [ + "!cat rag_intro.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YmCMs_uU0Qkm", + "outputId": "55c22d57-c1ff-4628-b410-9ef322820dec" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tomatoes grow best in warm weather with plenty of sun. It's important to water them regularly and use nutrient-rich soil. They are typically planted after the last frost and harvested in late summer.\n" + ] + } + ], + "source": [ + "!cat tomatoes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "FpiTDDMx0Rx-" + }, + "outputs": [], + "source": [ + "!mkdir txt-data\n", + "!cp rag_intro.txt txt-data/rag_intro.txt\n", + "!cp tomatoes.txt txt-data/tomatoes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Only run this cell when you are using Spark NLP on Google Colab\n", + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import Spark NLP\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "import sparknlp\n", + "\n", + "spark = sparknlp.start()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EoFI66NAdalE" + }, + "source": [ + "## Partitioning Documents" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nluIcWMbM_rx" + }, + "source": [ + "Partition Transformer" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mWnypHRwXruC", + "outputId": "a2a8e50b-dcf2-423b-94fe-1c61fa7deda2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+--------------------+--------------------+\n", + "| path| content| text| chunks|\n", + "+--------------------+--------------------+--------------------+--------------------+\n", + "|file:/content/txt...|Tomatoes grow bes...|[{NarrativeText, ...|[{document, 0, 19...|\n", + "|file:/content/txt...|Introduction: RAG...|[{NarrativeText, ...|[{document, 0, 33...|\n", + "+--------------------+--------------------+--------------------+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.ml import Pipeline\n", + "from sparknlp.partition.partition_transformer import *\n", + "\n", + "empty_df = spark.createDataFrame([], \"string\").toDF(\"text\")\n", + "\n", + "partition_transformer = PartitionTransformer() \\\n", + " .setInputCols([\"text\"]) \\\n", + " .setContentType(\"text/plain\") \\\n", + " .setContentPath(\"./txt-data\") \\\n", + " 
.setOutputCol(\"chunks\") \\\n", + " .setChunkingStrategy(\"basic\") \\\n", + " .setMaxCharacters(140)\n", + "\n", + "pipeline = Pipeline(stages=[\n", + " partition_transformer\n", + "])\n", + "\n", + "pipeline_model = pipeline.fit(empty_df)\n", + "result_df = pipeline_model.transform(empty_df)\n", + "\n", + "result_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "EFMhyfnc_g1V", + "outputId": "57befaf7-91af-40b3-acca-9b67c623543a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|chunks |\n", + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{document, 0, 198, Tomatoes grow best in warm weather with plenty of sun. It's important to water them regularly and use nutrient-rich soil. They are typically planted after the last frost and harvested in late summer., {paragraph -> 0}, []}] |\n", + "|[{document, 0, 331, Introduction: RAG stands for Retrieval-Augmented Generation. Why RAG? It improves factual accuracy and adds fresh or private data to LLMs. Chunking: Breaks documents into pieces so they can be embedded. Semantic Chunking: Focus on respecting document structure like sections. 
Summary: RAG is powerful when paired with good chunking!, {paragraph -> 0}, []}]|\n", + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "result_df.select(\"chunks\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gBNYByJ5Bqq6" + }, + "source": [ + "RAG Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "W7LLHf_0BrtQ", + "outputId": "2e6e3577-044b-4c01-84a2-6d80d73e2a58" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "small_bert_L2_768 download started this may take some time.\n", + "Approximate size to download 135.3 MB\n", + "[OK!]\n" + ] + } + ], + "source": [ + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "from pyspark.ml import Pipeline\n", + "\n", + "tokenizer = Tokenizer() \\\n", + " .setInputCols([\"chunks\"]) \\\n", + " .setOutputCol(\"token\")\n", + "\n", + "bert_embeddings = BertEmbeddings.pretrained() \\\n", + " .setInputCols([\"chunks\", \"token\"]) \\\n", + " .setOutputCol(\"embeddings\")\n", + "\n", + "sentence_embeddings = SentenceEmbeddings() \\\n", + " .setInputCols([\"chunks\", \"embeddings\"]) \\\n", + " .setOutputCol(\"sentence_embeddings\") \\\n", + " .setPoolingStrategy(\"AVERAGE\")\n", + "\n", + "finisher = EmbeddingsFinisher().setInputCols([\"sentence_embeddings\"]).setOutputCols([\"finished_sentence_embeddings\"]).setOutputAsVector(True)\n", + "\n", + "rag_pipeline = Pipeline(stages=[\n", + " partition_transformer,\n", + " tokenizer,\n", + " bert_embeddings,\n", + " sentence_embeddings,\n", + " finisher\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sY3fW-93CL2J" + }, + "source": [ + "Embed a Knowledge Base" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "LR0E1EdjCEjS" + }, + "outputs": [], + "source": [ + "rag_model = rag_pipeline.fit(empty_df)\n", + "kb_df = rag_model.transform(empty_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fKfZCpqLl5WZ", + "outputId": "38e64d2b-ab95-4beb-e6fb-c4b0ce65d654" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------------+\n", + "| path| content| text| chunks| token| embeddings| sentence_embeddings|finished_sentence_embeddings|\n", + "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------------+\n", + "|file:/content/txt...|Tomatoes grow bes...|[{NarrativeText, ...|[{document, 0, 19...|[{token, 0, 7, To...|[{word_embeddings...|[{sentence_embedd...| [[0.6935687065124...|\n", + "|file:/content/txt...|Introduction: RAG...|[{NarrativeText, ...|[{document, 0, 33...|[{token, 0, 11, I...|[{word_embeddings...|[{sentence_embedd...| [[0.5774036645889...|\n", + 
"+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------------+\n", + "\n" + ] + } + ], + "source": [ + "kb_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-IhJqVfU2HJj", + "outputId": "9ac8bb5f-cc84-40fe-b181-95083baa3c25" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|chunks |\n", + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{document, 0, 198, Tomatoes grow best in warm weather with plenty of sun. It's important to water them regularly and use nutrient-rich soil. They are typically planted after the last frost and harvested in late summer., {paragraph -> 0}, []}] |\n", + "|[{document, 0, 331, Introduction: RAG stands for Retrieval-Augmented Generation. Why RAG? It improves factual accuracy and adds fresh or private data to LLMs. Chunking: Breaks documents into pieces so they can be embedded. Semantic Chunking: Focus on respecting document structure like sections. 
Summary: RAG is powerful when paired with good chunking!, {paragraph -> 0}, []}]|\n", + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "kb_df.select(\"chunks\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "t6d9Za6jbdqF" + }, + "source": [ + "Preparing the output of a Spark NLP RAG pipeline by aligning each chunk of text with its embedding vector," + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "id": "OZsD7pfZm0br" + }, + "outputs": [], + "source": [ + "from pyspark.sql.functions import posexplode, monotonically_increasing_id\n", + "from pyspark.ml.functions import vector_to_array\n", + "\n", + "kb_df = kb_df.withColumn(\"doc_id\", monotonically_increasing_id())\n", + "exploded_chunks = kb_df.selectExpr(\"doc_id\", \"chunks.result as chunks\") \\\n", + " .select(posexplode(\"chunks\").alias(\"pos\", \"chunk_text\"), \"doc_id\")\n", + "\n", + "exploded_vectors = kb_df.selectExpr(\"doc_id\", \"finished_sentence_embeddings as vectors\") \\\n", + " .select(posexplode(\"vectors\").alias(\"pos\", \"vector\"), \"doc_id\")\n", + "\n", + "aligned_df = exploded_chunks.join(exploded_vectors, on=[\"doc_id\", \"pos\"]).select(\"doc_id\", \"chunk_text\", \"vector\")\n", + "\n", + "aligned_df = aligned_df.withColumn(\"vector\", vector_to_array(\"vector\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "uMelNiiiHfrU", + "outputId": "aa123f33-2522-458a-905c-cd66266f25cf" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------+--------------------+--------------------+\n", + "|doc_id| chunk_text| vector|\n", + "+------+--------------------+--------------------+\n", + "| 0|Tomatoes grow bes...|[0.69356870651245...|\n", + "| 1|Introduction: RAG...|[0.57740366458892...|\n", + "+------+--------------------+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "aligned_df_clean = aligned_df.select(\"doc_id\", \"chunk_text\", \"vector\").cache()\n", + "aligned_df_clean.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UuyM3NdN4ttf" + }, + "source": [ + "Query Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "a1HVp-g34z6g", + "outputId": "b67ec752-65d8-497e-ea90-4eabd72eaadd" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "small_bert_L2_768 download started this may take some time.\n", + "Approximate size to download 135.3 MB\n", + "[OK!]\n" + ] + } + ], + "source": [ + "document_assembler = DocumentAssembler() \\\n", + " .setInputCol(\"text\") \\\n", + " .setOutputCol(\"document\")\n", + "\n", + "sentence_detector = SentenceDetector() \\\n", + " .setInputCols([\"document\"]) \\\n", + " .setOutputCol(\"sentence\")\n", + "\n", + "tokenizer = Tokenizer() \\\n", + " .setInputCols([\"sentence\"]) \\\n", + " .setOutputCol(\"token\")\n", + "\n", + "bert_embeddings = BertEmbeddings.pretrained() \\\n", + " .setInputCols([\"sentence\", \"token\"]) \\\n", + " 
.setOutputCol(\"embeddings\")\n", + "\n", + "sentence_embeddings = SentenceEmbeddings() \\\n", + " .setInputCols([\"sentence\", \"embeddings\"]) \\\n", + " .setOutputCol(\"sentence_embeddings\") \\\n", + " .setPoolingStrategy(\"AVERAGE\")\n", + "\n", + "query_pipeline = Pipeline(stages=[\n", + " document_assembler,\n", + " sentence_detector,\n", + " tokenizer,\n", + " bert_embeddings,\n", + " sentence_embeddings,\n", + " finisher\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "id": "Numk3cjdoRI3" + }, + "outputs": [], + "source": [ + "query = \"What is semantic chunking?\"\n", + "query_df = spark.createDataFrame([[query]]).toDF(\"text\")\n", + "query_model = query_pipeline.fit(query_df)\n", + "# query_model = rag_pipeline.fit(query_df)\n", + "query_result = query_model.transform(query_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Kv_mpg-n4cvi", + "outputId": "28f4bcfc-3292-4fba-f274-59090bf423ac" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------------+\n", + "| text| document| sentence| token| embeddings| sentence_embeddings|finished_sentence_embeddings|\n", + "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------------+\n", + "|What is semantic ...|[{document, 0, 25...|[{document, 0, 25...|[{token, 0, 3, Wh...|[{word_embeddings...|[{sentence_embedd...| [[0.3536282181739...|\n", + "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------------+\n", + "\n" + ] + } + ], + "source": [ + "query_result.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "id": "JqfkKYkXoYd8" + }, + "outputs": [], + "source": [ + "query_vector = query_result.select(\"finished_sentence_embeddings\").first()[0][0]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "id": "LvP5QoaSoEZv" + }, + "outputs": [], + "source": [ + "from pyspark.sql.functions import udf, col\n", + "from pyspark.sql.types import FloatType\n", + "import numpy as np\n", + "\n", + "def cosine_sim(vec1, vec2):\n", + " v1, v2 = np.array(vec1), np.array(vec2)\n", + " return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))\n", + "\n", + "# Register UDF\n", + "cosine_sim_udf = udf(lambda v: cosine_sim(v, query_vector), FloatType())\n", + "\n", + "# Add similarity score to each chunk\n", + "scored_chunks = aligned_df_clean.withColumn(\"similarity\", cosine_sim_udf(col(\"vector\"))) \\\n", + " .orderBy(col(\"similarity\").desc())" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "__Db-4tpJz6N", + "outputId": "55bf0969-b9fa-4fda-feea-6fd75a9e8804" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------+--------------------+--------------------+----------+\n", + "|doc_id| chunk_text| vector|similarity|\n", + "+------+--------------------+--------------------+----------+\n", + "| 1|Introduction: RAG...|[0.57740366458892...|0.61944675|\n", + "| 0|Tomatoes grow bes...|[0.69356870651245...| 0.2762234|\n", + 
"+------+--------------------+--------------------+----------+\n", + "\n" + ] + } + ], + "source": [ + "scored_chunks.show()" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/python/test/partition/partition_transformer_test.py b/python/test/partition/partition_transformer_test.py index fbcf65b6f69b7c..270486c561eece 100644 --- a/python/test/partition/partition_transformer_test.py +++ b/python/test/partition/partition_transformer_test.py @@ -84,7 +84,7 @@ def runTest(self): self.assertTrue(resultDf.select("partition").count() > 0) -@pytest.mark.slow +@pytest.mark.fast class PartitionTransformerChunkTestSpec(unittest.TestCase): def setUp(self): From 2b37fb06cf404e2ce66cdf702234c5fd8197b77f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Kuti-Kresz=C3=A1cs?= <47274810+thec0dewriter@users.noreply.github.com> Date: Thu, 29 May 2025 15:26:17 +0200 Subject: [PATCH 6/8] Update SparkNLP_PowerPoint_Reader_Demo.ipynb Fix reference copy pasted from Excel reader --- examples/python/reader/SparkNLP_PowerPoint_Reader_Demo.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/python/reader/SparkNLP_PowerPoint_Reader_Demo.ipynb b/examples/python/reader/SparkNLP_PowerPoint_Reader_Demo.ipynb index b70c0ac889c7b1..fcbe884c96baf9 100644 --- a/examples/python/reader/SparkNLP_PowerPoint_Reader_Demo.ipynb +++ b/examples/python/reader/SparkNLP_PowerPoint_Reader_Demo.ipynb @@ -109,7 +109,7 @@ }, "source": [ "## Parsing PowerPoint slides from Local Files\n", - "Use the `ppt()` method to parse Excel content from local directories." + "Use the `ppt()` method to parse PowerPoint content from local directories." 
] }, { From 6e782e1bb8453287afb044a5a65a2760f0ce4e0c Mon Sep 17 00:00:00 2001 From: Danilo Burbano <danilo@johnsnowlabs.com> Date: Mon, 9 Jun 2025 14:23:33 -0500 Subject: [PATCH 7/8] [SPARKNLP-1119] Adding XML reader --- python/sparknlp/reader/sparknlp_reader.py | 45 ++++++++ python/test/sparknlp_test.py | 16 ++- .../partition/HasXmlReaderProperties.scala | 38 +++++++ .../johnsnowlabs/partition/Partition.scala | 3 + .../partition/PartitionTransformer.scala | 5 +- .../johnsnowlabs/reader/SparkNLPReader.scala | 66 ++++++++++- .../com/johnsnowlabs/reader/XMLReader.scala | 103 ++++++++++++++++++ src/test/resources/reader/xml/multi-level.xml | 20 ++++ src/test/resources/reader/xml/test.xml | 14 +++ .../partition/PartitionTest.scala | 8 ++ .../johnsnowlabs/reader/XMLReaderTest.scala | 43 ++++++++ 11 files changed, 358 insertions(+), 3 deletions(-) create mode 100644 src/main/scala/com/johnsnowlabs/partition/HasXmlReaderProperties.scala create mode 100644 src/main/scala/com/johnsnowlabs/reader/XMLReader.scala create mode 100644 src/test/resources/reader/xml/multi-level.xml create mode 100644 src/test/resources/reader/xml/test.xml create mode 100644 src/test/scala/com/johnsnowlabs/reader/XMLReaderTest.scala diff --git a/python/sparknlp/reader/sparknlp_reader.py b/python/sparknlp/reader/sparknlp_reader.py index dfd865116f3821..86bf5781053050 100644 --- a/python/sparknlp/reader/sparknlp_reader.py +++ b/python/sparknlp/reader/sparknlp_reader.py @@ -322,4 +322,49 @@ def txt(self, docPath): if not isinstance(docPath, str): raise TypeError("docPath must be a string") jdf = self._java_obj.txt(docPath) + return self.getDataFrame(self.spark, jdf) + + def xml(self, docPath): + """Reads XML files and returns a Spark DataFrame. + + Parameters + ---------- + docPath : str + Path to an XML file or a directory containing XML files. + + Returns + ------- + pyspark.sql.DataFrame + A DataFrame containing parsed XML content. 
+ + Examples + -------- + >>> from sparknlp.reader import SparkNLPReader + >>> xml_df = SparkNLPReader(spark).xml("home/user/xml-directory") + + You can use SparkNLP for one line of code + + >>> import sparknlp + >>> xml_df = sparknlp.read().xml("home/user/xml-directory") + >>> xml_df.show(truncate=False) + +-----------------------------------------------------------+ + |xml | + +-----------------------------------------------------------+ + |[{Title, John Smith, {elementId -> ..., tag -> title}}] | + +-----------------------------------------------------------+ + + >>> xml_df.printSchema() + root + |-- path: string (nullable = true) + |-- xml: array (nullable = true) + | |-- element: struct (containsNull = true) + | | |-- elementType: string (nullable = true) + | | |-- content: string (nullable = true) + | | |-- metadata: map (nullable = true) + | | | |-- key: string + | | | |-- value: string (valueContainsNull = true) + """ + if not isinstance(docPath, str): + raise TypeError("docPath must be a string") + jdf = self._java_obj.xml(docPath) return self.getDataFrame(self.spark, jdf) \ No newline at end of file diff --git a/python/test/sparknlp_test.py b/python/test/sparknlp_test.py index 68ea10b36476bf..c2baa14fec213d 100644 --- a/python/test/sparknlp_test.py +++ b/python/test/sparknlp_test.py @@ -125,4 +125,18 @@ def runTest(self): txt_df = sparknlp.read().txt(self.txt_file) txt_df.show() - self.assertTrue(txt_df.select("txt").count() > 0) \ No newline at end of file + self.assertTrue(txt_df.select("txt").count() > 0) + + +@pytest.mark.fast +class SparkNLPTestXMLFilesSpec(unittest.TestCase): + + def setUp(self): + self.data = SparkContextForTest.data + self.xml_files = f"file:///{os.getcwd()}/../src/test/resources/reader/xml" + + def runTest(self): + xml_df = sparknlp.read().xml(self.xml_files) + xml_df.show() + + self.assertTrue(xml_df.select("xml").count() > 0) \ No newline at end of file diff --git a/src/main/scala/com/johnsnowlabs/partition/HasXmlReaderProperties.scala b/src/main/scala/com/johnsnowlabs/partition/HasXmlReaderProperties.scala new file mode 100644 index 00000000000000..4993bc65a8cd8b --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/partition/HasXmlReaderProperties.scala @@ -0,0 +1,38 @@ +/* + * Copyright 2017-2025 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.johnsnowlabs.partition + +import com.johnsnowlabs.nlp.ParamsAndFeaturesWritable +import org.apache.spark.ml.param.Param + +trait HasXmlReaderProperties extends ParamsAndFeaturesWritable { + + val xmlKeepTags = new Param[Boolean]( + this, + "xmlKeepTags", + "Whether to include XML tag names as metadata in the output.") + + def setXmlKeepTags(value: Boolean): this.type = set(xmlKeepTags, value) + + val onlyLeafNodes = new Param[Boolean]( + this, + "onlyLeafNodes", + "If true, only processes XML leaf nodes (no nested children).") + + def setOnlyLeafNodes(value: Boolean): this.type = set(onlyLeafNodes, value) + + setDefault(xmlKeepTags -> false, onlyLeafNodes -> true) +} diff --git a/src/main/scala/com/johnsnowlabs/partition/Partition.scala b/src/main/scala/com/johnsnowlabs/partition/Partition.scala index a339fe9b258ee4..2e6f69b8c5b4c4 100644 --- a/src/main/scala/com/johnsnowlabs/partition/Partition.scala +++ b/src/main/scala/com/johnsnowlabs/partition/Partition.scala @@ -188,6 +188,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap()) "application/vnd.openxmlformats-officedocument.presentationml.presentation" => sparkNLPReader.ppt case "application/pdf" => sparkNLPReader.pdf + case "application/xml" => sparkNLPReader.xml case _ => throw new IllegalArgumentException(s"Unsupported content type: $contentType") } } @@ -199,6 +200,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap()) case "text/plain" => sparkNLPReader.txtToHTMLElement case "text/html" => sparkNLPReader.htmlToHTMLElement case "url" => sparkNLPReader.urlToHTMLElement + case "application/xml" => sparkNLPReader.xmlToHTMLElement case _ => throw new IllegalArgumentException(s"Unsupported content type: $contentType") } } @@ -234,6 +236,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap()) case "xls" | "xlsx" => sparkNLPReader.xls case "ppt" | "pptx" => sparkNLPReader.ppt case "pdf" => sparkNLPReader.pdf + case "xml" => sparkNLPReader.xml case _ => throw new IllegalArgumentException(s"Unsupported file type: $extension") } } diff --git a/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala b/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala index 73d461c91aaafc..281af53931d72c 100644 --- a/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala +++ b/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala @@ -86,6 +86,7 @@ class PartitionTransformer(override val uid: String) with HasPowerPointProperties with HasTextReaderProperties with HasPdfProperties + with HasXmlReaderProperties with HasChunkerProperties { def this() = this(Identifiable.randomUID("PartitionTransformer")) @@ -157,7 +158,9 @@ class PartitionTransformer(override val uid: String) "newAfterNChars" -> $(newAfterNChars).toString, "overlap" -> $(overlap).toString, "combineTextUnderNChars" -> $(combineTextUnderNChars).toString, - "overlapAll" -> $(overlapAll).toString) + "overlapAll" -> $(overlapAll).toString, + "xmlKeepTags" -> $(xmlKeepTags).toString, + "onlyLeafNodes" -> $(onlyLeafNodes).toString) val partitionInstance = new Partition(params.asJava) val inputColum = if (get(inputCols).isDefined) { diff --git a/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala b/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala index a1637116cb7905..216492876cc718 100644 --- a/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala +++ b/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala 
@@ -296,7 +296,6 @@ class SparkNLPReader( * |-- width_dimension: integer (nullable = true) * |-- content: binary (nullable = true) * |-- exception: string (nullable = true) - * |-- pagenum: integer (nullable = true) * }}} * * @param params @@ -642,4 +641,69 @@ class SparkNLPReader( default = BLOCK_SPLIT_PATTERN) } + /** Instantiates class to read XML files. + * + * xmlPath: this is a path to a directory of XML files or a path to an XML file. E.g., + * "path/xml/files" + * + * ==Example== + * {{{ + * val xmlPath = "home/user/xml-directory" + * val sparkNLPReader = new SparkNLPReader() + * val xmlDf = sparkNLPReader.xml(xmlPath) + * }}} + * + * ==Example 2== + * You can use SparkNLP for one line of code + * {{{ + * val xmlDf = SparkNLP.read.xml(xmlPath) + * }}} + * + * {{{ + * xmlDf.select("xml").show(false) + * +------------------------------------------------------------------------------------------------------------------------+ + * |xml | + * +------------------------------------------------------------------------------------------------------------------------+ + * |[{Title, John Smith, {elementId -> ..., tag -> title}}, {UncategorizedText, Some content..., {elementId -> ...}}] | + * +------------------------------------------------------------------------------------------------------------------------+ + * + * xmlDf.printSchema() + * root + * |-- path: string (nullable = true) + * |-- xml: array (nullable = true) + * | |-- element: struct (containsNull = true) + * | | |-- elementType: string (nullable = true) + * | | |-- content: string (nullable = true) + * | | |-- metadata: map (nullable = true) + * | | | |-- key: string + * | | | |-- value: string (valueContainsNull = true) + * }}} + * + * @param xmlPath + * Path to the XML file or directory + * @return + * A DataFrame with parsed XML as structured elements + */ + + def xml(xmlPath: String): DataFrame = { + val xmlReader = new XMLReader(getStoreContent, getXmlKeepTags, getOnlyLeafNodes) + xmlReader.read(xmlPath) + } + + def xmlToHTMLElement(xml: String): Seq[HTMLElement] = { + val xmlReader = new XMLReader(getStoreContent, getXmlKeepTags, getOnlyLeafNodes) + xmlReader.parseXml(xml) + } + + private def getXmlKeepTags: Boolean = { + getDefaultBoolean(params.asScala.toMap, Seq("xmlKeepTags", "xml_keep_tags"), default = false) + } + + private def getOnlyLeafNodes: Boolean = { + getDefaultBoolean( + params.asScala.toMap, + Seq("onlyLeafNodes", "only_leaf_nodes"), + default = true) + } + } diff --git a/src/main/scala/com/johnsnowlabs/reader/XMLReader.scala b/src/main/scala/com/johnsnowlabs/reader/XMLReader.scala new file mode 100644 index 00000000000000..267ad10af760b2 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/reader/XMLReader.scala @@ -0,0 +1,103 @@ +/* + * Copyright 2017-2025 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.johnsnowlabs.reader + +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import com.johnsnowlabs.nlp.util.io.ResourceHelper.validFile +import com.johnsnowlabs.partition.util.PartitionHelper.datasetWithTextFile +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.{col, udf} + +import scala.collection.mutable +import scala.collection.mutable.ListBuffer +import scala.xml.{Elem, Node, XML} + +class XMLReader( + storeContent: Boolean = false, + xmlKeepTags: Boolean = false, + onlyLeafNodes: Boolean = true) + extends Serializable { + + private lazy val spark = ResourceHelper.spark + + private var outputColumn = "xml" + + def setOutputColumn(value: String): this.type = { + require(value.nonEmpty, "Output column name cannot be empty.") + outputColumn = value + this + } + + def read(inputSource: String): DataFrame = { + if (validFile(inputSource)) { + val xmlDf = datasetWithTextFile(spark, inputSource) + .withColumn(outputColumn, parseXmlUDF(col("content"))) + if (storeContent) xmlDf.select("path", "content", outputColumn) + else xmlDf.select("path", outputColumn) + } else throw new IllegalArgumentException(s"Invalid inputSource: $inputSource") + } + + private val parseXmlUDF = udf((xml: String) => { + parseXml(xml) + }) + + def parseXml(xmlString: String): List[HTMLElement] = { + val xml = XML.loadString(xmlString) + val elements = ListBuffer[HTMLElement]() + + def traverse(node: Node, parentId: Option[String]): Unit = { + node match { + case elem: Elem => + val tagName = elem.label.toLowerCase + val textContent = elem.text.trim + val elementId = hash(tagName + textContent) + + val isLeaf = !elem.child.exists(_.isInstanceOf[Elem]) + + if (!onlyLeafNodes || isLeaf) { + val elementType = tagName match { + case "title" | "author" => ElementType.TITLE + case _ => ElementType.UNCATEGORIZED_TEXT + } + + val metadata = mutable.Map[String, String]("elementId" -> elementId) + if (xmlKeepTags) metadata += ("tag" -> tagName) + parentId.foreach(id => metadata += ("parentId" -> id)) + + val content = if (isLeaf) textContent else "" + elements += HTMLElement(elementType, content, metadata) + } + + // Traverse children + elem.child.foreach(traverse(_, Some(elementId))) + + case _ => // Ignore other types + } + } + + traverse(xml, None) + elements.toList + } + + def hash(s: String): String = { + java.security.MessageDigest + .getInstance("MD5") + .digest(s.getBytes) + .map("%02x".format(_)) + .mkString + } + +} diff --git a/src/test/resources/reader/xml/multi-level.xml b/src/test/resources/reader/xml/multi-level.xml new file mode 100644 index 00000000000000..e14e5ad684be30 --- /dev/null +++ b/src/test/resources/reader/xml/multi-level.xml @@ -0,0 +1,20 @@ +<library> + <section name="Fiction"> + <shelf number="1"> + <book> + <title>The Alchemist</title> + <author>Paulo Coelho</author> + <year>1988</year> + </book> + </shelf> + </section> + <section name="Science"> + <shelf number="2"> + <book> + <title>A Brief History of Time</title> + <author>Stephen Hawking</author> + <year>1988</year> + </book> + </shelf> + </section> +</library> diff --git a/src/test/resources/reader/xml/test.xml b/src/test/resources/reader/xml/test.xml new file mode 100644 index 00000000000000..44bdab910b4c96 --- /dev/null +++ b/src/test/resources/reader/xml/test.xml @@ -0,0 +1,14 @@ +<bookstore> + <book category="children"> + <title lang="en">Harry Potter</title> + <author>J K. 
Rowling</author>
+        <year>2005</year>
+        <price>29.99</price>
+    </book>
+    <book category="web">
+        <title lang="en">Learning XML</title>
+        <author>Erik T. Ray</author>
+        <year>2003</year>
+        <price>39.95</price>
+    </book>
+</bookstore>
\ No newline at end of file
diff --git a/src/test/scala/com/johnsnowlabs/partition/PartitionTest.scala b/src/test/scala/com/johnsnowlabs/partition/PartitionTest.scala
index 9937b95f59e512..05c5916c843424 100644
--- a/src/test/scala/com/johnsnowlabs/partition/PartitionTest.scala
+++ b/src/test/scala/com/johnsnowlabs/partition/PartitionTest.scala
@@ -32,6 +32,7 @@ class PartitionTest extends AnyFlatSpec {
   val emailDirectory = "src/test/resources/reader/email"
   val htmlDirectory = "src/test/resources/reader/html"
   val pdfDirectory = "src/test/resources/reader/pdf"
+  val xmlDirectory = "src/test/resources/reader/xml"
 
   "Partition" should "work with text content_type" taggedAs FastTest in {
     val textDf = Partition(Map("content_type" -> "text/plain")).partition(txtDirectory)
@@ -181,4 +182,11 @@ class PartitionTest extends AnyFlatSpec {
     assert(elements == expectedElements)
   }
 
+  it should "work with XML content_type" taggedAs FastTest in {
+    val xmlDf = Partition(Map("content_type" -> "application/xml")).partition(xmlDirectory)
+    xmlDf.show()
+
+    assert(!xmlDf.select(col("xml")).isEmpty)
+  }
+
 }
diff --git a/src/test/scala/com/johnsnowlabs/reader/XMLReaderTest.scala b/src/test/scala/com/johnsnowlabs/reader/XMLReaderTest.scala
new file mode 100644
index 00000000000000..a75537803e61de
--- /dev/null
+++ b/src/test/scala/com/johnsnowlabs/reader/XMLReaderTest.scala
@@ -0,0 +1,43 @@
+package com.johnsnowlabs.reader
+
+import com.johnsnowlabs.tags.FastTest
+import org.apache.spark.sql.functions.{array_contains, col, explode, map_keys}
+import org.scalatest.flatspec.AnyFlatSpec
+
+class XMLReaderTest extends AnyFlatSpec {
+
+  val xmlFilesDirectory = "./src/test/resources/reader/xml/"
+
+  "XMLReader" should "read xml as dataframe" taggedAs FastTest in {
+    val xmlReader = new XMLReader()
+    val xmlDF = xmlReader.read(s"$xmlFilesDirectory/test.xml")
+    xmlDF.show(truncate = false)
+
+    assert(!xmlDF.select(col("xml").getItem(0)).isEmpty)
+    assert(!xmlDF.columns.contains("content"))
+  }
+
+  it should "include tags in the output" taggedAs FastTest in {
+    val xmlReader = new XMLReader(xmlKeepTags = true)
+    val xmlDF = xmlReader.read(s"$xmlFilesDirectory/multi-level.xml")
+    xmlDF.show(truncate = false)
+
+    val explodedDf = xmlDF.withColumn("xml_exploded", explode(col("xml")))
+    val tagsDf = explodedDf.filter(col("xml_exploded.metadata")("tag") =!= "")
+
+    assert(tagsDf.count() > 0)
+  }
+
+  it should "output all nodes" taggedAs FastTest in {
+    val xmlReader = new XMLReader(onlyLeafNodes = false)
+    val xmlDF = xmlReader.read(s"$xmlFilesDirectory/multi-level.xml")
+    xmlDF.show(truncate = false)
+    val explodedDf = xmlDF.withColumn("xml_exploded", explode(col("xml")))
+
+    val noParentIdCount = explodedDf
+      .filter(!array_contains(map_keys(col("xml_exploded.metadata")), "parentId"))
+
+    assert(noParentIdCount.count() > 0)
+  }
+
+}
From 9b45456af34f381999d3096ef9b380d26d2005ca Mon Sep 17 00:00:00 2001
From: Danilo Burbano <danilo@johnsnowlabs.com>
Date: Mon, 9 Jun 2025 18:16:00 -0500
Subject: [PATCH 8/8] [SPARKNLP-1119] Adding documentation for XML reader

[skip test]
---
 .../SparkNLP_Partition_Demo.ipynb             | 567 ++++++++++--------
 .../reader/SparkNLP_XML_Reader_Demo.ipynb     | 339 +++++++++++
 .../com/johnsnowlabs/reader/XMLReader.scala   |  47 ++
 3 files changed, 714 insertions(+), 239 deletions(-)
 create 
mode 100644 examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb diff --git a/examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb b/examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb index c5eecc20945f3f..659bdbc309f127 100644 --- a/examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb +++ b/examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb @@ -20,8 +20,30 @@ "## Setup and Initialization\n", "Let's keep in mind a few things before we start 😊\n", "\n", - "Support for **Partitioning** files was introduced in Spark NLP 6.0.1 Please make sure you have upgraded to the latest Spark NLP release.\n", - "\n", + "Support for **Partitioning** files was introduced in Spark NLP 6.0.1 Please make sure you have upgraded to the latest Spark NLP release." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "For local files example we will download different files from Spark NLP Github repo:" ] }, @@ -42,34 +64,34 @@ "base_uri": "https://localhost:8080/" }, "id": "bo7s-jZVrE7W", - "outputId": "e7234d36-765e-4a29-f922-02ceab1626dd" + "outputId": "b0e91448-3b2c-4dab-84c7-5e7d8bad0be5" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2025-05-26 23:11:05-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/html/example-10k.html\n", + "--2025-06-09 22:10:23-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/html/example-10k.html\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 2456707 (2.3M) [text/plain]\n", "Saving to: ‘html-files/example-10k.html’\n", "\n", - "example-10k.html 100%[===================>] 2.34M --.-KB/s in 0.05s \n", + "example-10k.html 100%[===================>] 2.34M --.-KB/s in 0.04s \n", "\n", - "2025-05-26 23:11:06 (45.1 MB/s) - ‘html-files/example-10k.html’ saved [2456707/2456707]\n", + "2025-06-09 22:10:23 (52.9 MB/s) - ‘html-files/example-10k.html’ saved [2456707/2456707]\n", "\n", - "--2025-05-26 23:11:06-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/html/fake-html.html\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "--2025-06-09 22:10:23-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/html/fake-html.html\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 665 [text/plain]\n", "Saving to: ‘html-files/fake-html.html’\n", "\n", "fake-html.html 100%[===================>] 665 --.-KB/s in 0s \n", "\n", - "2025-05-26 23:11:06 (30.2 MB/s) - ‘html-files/fake-html.html’ saved [665/665]\n", + "2025-06-09 22:10:24 (18.3 MB/s) - ‘html-files/fake-html.html’ saved [665/665]\n", "\n" ] } @@ -97,38 +119,36 @@ "base_uri": "https://localhost:8080/" }, "id": "ya8qZe00dalC", - "outputId": "ba520f44-c4b9-45b1-f03c-6a8e3a33320b" + "outputId": "9b4fbf52-9ecc-454b-bef1-0ce31dadb7c7" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2025-05-26 23:11:06-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/image_3_pages.pdf\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n", + "--2025-06-09 22:10:24-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/image_3_pages.pdf\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 15629 (15K) [application/octet-stream]\n", "Saving to: ‘pdf-files/image_3_pages.pdf’\n", "\n", - "\r", - "image_3_pages.pdf 0%[ ] 0 --.-KB/s \r", "image_3_pages.pdf 100%[===================>] 15.26K --.-KB/s in 0.001s \n", "\n", - "2025-05-26 23:11:06 (25.5 MB/s) - ‘pdf-files/image_3_pages.pdf’ saved [15629/15629]\n", + "2025-06-09 22:10:24 (24.3 MB/s) - ‘pdf-files/image_3_pages.pdf’ saved [15629/15629]\n", "\n", - "--2025-05-26 23:11:06-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/pdf-title.pdf\n", + "--2025-06-09 22:10:24-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/pdf-title.pdf\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 25803 (25K) [application/octet-stream]\n", "Saving to: ‘pdf-files/pdf-title.pdf’\n", "\n", - "pdf-title.pdf 100%[===================>] 25.20K --.-KB/s in 0s \n", + "pdf-title.pdf 100%[===================>] 25.20K --.-KB/s in 0.001s \n", "\n", - "2025-05-26 23:11:06 (58.5 MB/s) - ‘pdf-files/pdf-title.pdf’ saved [25803/25803]\n", + "2025-06-09 22:10:24 (21.2 MB/s) - ‘pdf-files/pdf-title.pdf’ saved [25803/25803]\n", "\n", - "--2025-05-26 23:11:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/text_3_pages.pdf\n", + "--2025-06-09 22:10:24-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/text_3_pages.pdf\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", @@ -137,7 +157,7 @@ "\n", "text_3_pages.pdf 100%[===================>] 9.26K --.-KB/s in 0s \n", "\n", - "2025-05-26 23:11:07 (79.2 MB/s) - ‘pdf-files/text_3_pages.pdf’ saved [9487/9487]\n", + "2025-06-09 22:10:24 (73.3 MB/s) - ‘pdf-files/text_3_pages.pdf’ saved [9487/9487]\n", "\n" ] } @@ -166,47 +186,45 @@ "base_uri": "https://localhost:8080/" }, "id": "zLLEUl3KpYZ6", - "outputId": "4346e6e1-18ec-47a8-92c0-c8bc588f3441" + "outputId": "407e9405-6cc9-4724-f576-f52c503cb52d" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2025-05-26 23:11:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/contains-pictures.docx\n", + "--2025-06-09 22:10:25-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/contains-pictures.docx\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 95087 (93K) [application/octet-stream]\n", "Saving to: ‘word-files/contains-pictures.docx’\n", "\n", - "\r", - "contains-pictures.d 0%[ ] 0 --.-KB/s \r", - "contains-pictures.d 100%[===================>] 92.86K --.-KB/s in 0.01s \n", + "contains-pictures.d 100%[===================>] 92.86K --.-KB/s in 0.02s \n", "\n", - "2025-05-26 23:11:07 (6.85 MB/s) - ‘word-files/contains-pictures.docx’ saved [95087/95087]\n", + "2025-06-09 22:10:25 (4.74 MB/s) - ‘word-files/contains-pictures.docx’ saved [95087/95087]\n", "\n", - "--2025-05-26 23:11:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/fake_table.docx\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "--2025-06-09 22:10:25-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/fake_table.docx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 12392 (12K) [application/octet-stream]\n", "Saving to: ‘word-files/fake_table.docx’\n", "\n", "fake_table.docx 100%[===================>] 12.10K --.-KB/s in 0.001s \n", "\n", - "2025-05-26 23:11:07 (17.7 MB/s) - ‘word-files/fake_table.docx’ saved [12392/12392]\n", + "2025-06-09 22:10:25 (18.9 MB/s) - ‘word-files/fake_table.docx’ saved [12392/12392]\n", "\n", - "--2025-05-26 23:11:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/page-breaks.docx\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "--2025-06-09 22:10:25-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/page-breaks.docx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 
185.199.110.133, 185.199.109.133, 185.199.108.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 14584 (14K) [application/octet-stream]\n", "Saving to: ‘word-files/page-breaks.docx’\n", "\n", "page-breaks.docx 100%[===================>] 14.24K --.-KB/s in 0.001s \n", "\n", - "2025-05-26 23:11:08 (22.4 MB/s) - ‘word-files/page-breaks.docx’ saved [14584/14584]\n", + "2025-06-09 22:10:25 (21.5 MB/s) - ‘word-files/page-breaks.docx’ saved [14584/14584]\n", "\n" ] } @@ -235,48 +253,58 @@ "base_uri": "https://localhost:8080/" }, "id": "G3-BCYP6qQ4x", - "outputId": "38489a6e-588d-4a1b-e319-0c7f66559ca0" + "outputId": "95c5a31d-eed9-47a1-bb55-0868daec7da7" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2025-05-26 23:11:08-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/vodafone.xlsx\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "--2025-06-09 22:10:26-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/vodafone.xlsx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 12541 (12K) [application/octet-stream]\n", "Saving to: ‘excel-files/vodafone.xlsx’\n", "\n", "\r", "vodafone.xlsx 0%[ ] 0 --.-KB/s \r", - "vodafone.xlsx 100%[===================>] 12.25K --.-KB/s in 0.001s \n", + "vodafone.xlsx 100%[===================>] 12.25K --.-KB/s in 0s \n", "\n", - "2025-05-26 23:11:08 (22.2 MB/s) - ‘excel-files/vodafone.xlsx’ saved [12541/12541]\n", + "2025-06-09 22:10:26 (30.4 MB/s) - ‘excel-files/vodafone.xlsx’ saved [12541/12541]\n", "\n", - "--2025-05-26 23:11:08-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/2023-half-year-analyses-by-segment.xlsx\n", + "--2025-06-09 22:10:26-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/2023-half-year-analyses-by-segment.xlsx\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 38442 (38K) [application/octet-stream]\n", "Saving to: ‘excel-files/2023-half-year-analyses-by-segment.xlsx’\n", "\n", - "2023-half-year-anal 100%[===================>] 37.54K --.-KB/s in 0.007s \n", + "2023-half-year-anal 100%[===================>] 37.54K --.-KB/s in 0.01s \n", "\n", - "2025-05-26 23:11:08 (5.37 MB/s) - ‘excel-files/2023-half-year-analyses-by-segment.xlsx’ saved [38442/38442]\n", + "2025-06-09 22:10:26 (3.43 MB/s) - ‘excel-files/2023-half-year-analyses-by-segment.xlsx’ saved [38442/38442]\n", "\n", - "--2025-05-26 23:11:08-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/page-break-example.xlsx\n", + "--2025-06-09 22:10:26-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/page-break-example.xlsx\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 404 Not Found\n", - "2025-05-26 23:11:09 ERROR 404: Not Found.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 10676 (10K) [application/octet-stream]\n", + "Saving to: ‘excel-files/page-break-example.xlsx’\n", + "\n", + "page-break-example. 100%[===================>] 10.43K --.-KB/s in 0s \n", "\n", - "--2025-05-26 23:11:09-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/xlsx-subtable-cases.xlsx\n", + "2025-06-09 22:10:26 (79.4 MB/s) - ‘excel-files/page-break-example.xlsx’ saved [10676/10676]\n", + "\n", + "--2025-06-09 22:10:26-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/xlsx-subtable-cases.xlsx\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 404 Not Found\n", - "2025-05-26 23:11:09 ERROR 404: Not Found.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 9210 (9.0K) [application/octet-stream]\n", + "Saving to: ‘excel-files/xlsx-subtable-cases.xlsx’\n", + "\n", + "xlsx-subtable-cases 100%[===================>] 8.99K --.-KB/s in 0s \n", + "\n", + "2025-06-09 22:10:26 (65.5 MB/s) - ‘excel-files/xlsx-subtable-cases.xlsx’ saved [9210/9210]\n", "\n" ] } @@ -289,17 +317,6 @@ "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/xlsx-subtable-cases.xlsx -P excel-files" ] }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "LcSYn6q7jW9-" - }, - "outputs": [], - "source": [ - "!cp drive/MyDrive/JSL/PageBreakExample.xlsx ./excel-files" - ] - }, { "cell_type": "markdown", "metadata": { @@ -317,42 +334,45 @@ "base_uri": "https://localhost:8080/" }, "id": "1jDRFmcHqpxn", - "outputId": "4d59c445-3764-41a8-c91b-9231d401eac6" + "outputId": "cd7e3c96-bb5f-49ab-f466-56ec6be20f75" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2025-05-26 23:11:09-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/fake-power-point.pptx\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 
185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "--2025-06-09 22:10:27-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/fake-power-point.pptx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 38412 (38K) [application/octet-stream]\n", "Saving to: ‘ppt-files/fake-power-point.pptx’\n", "\n", - "\r", - "fake-power-point.pp 0%[ ] 0 --.-KB/s \r", - "fake-power-point.pp 100%[===================>] 37.51K --.-KB/s in 0.007s \n", + "fake-power-point.pp 100%[===================>] 37.51K --.-KB/s in 0.01s \n", "\n", - "2025-05-26 23:11:10 (5.29 MB/s) - ‘ppt-files/fake-power-point.pptx’ saved [38412/38412]\n", + "2025-06-09 22:10:27 (3.41 MB/s) - ‘ppt-files/fake-power-point.pptx’ saved [38412/38412]\n", "\n", - "--2025-05-26 23:11:10-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/fake-power-point-table.pptx\n", + "--2025-06-09 22:10:27-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/fake-power-point-table.pptx\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 39894 (39K) [application/octet-stream]\n", "Saving to: ‘ppt-files/fake-power-point-table.pptx’\n", "\n", - "fake-power-point-ta 100%[===================>] 38.96K --.-KB/s in 0.006s \n", + "fake-power-point-ta 100%[===================>] 38.96K --.-KB/s in 0.008s \n", "\n", - "2025-05-26 23:11:10 (6.73 MB/s) - ‘ppt-files/fake-power-point-table.pptx’ saved [39894/39894]\n", + "2025-06-09 22:10:28 (4.93 MB/s) - ‘ppt-files/fake-power-point-table.pptx’ saved [39894/39894]\n", "\n", - "--2025-05-26 23:11:10-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/speaker-notes.pptx\n", + "--2025-06-09 22:10:28-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/speaker-notes.pptx\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 404 Not Found\n", - "2025-05-26 23:11:10 ERROR 404: Not Found.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 39414 (38K) [application/octet-stream]\n", + "Saving to: ‘ppt-files/speaker-notes.pptx’\n", + "\n", + "speaker-notes.pptx 100%[===================>] 38.49K --.-KB/s in 0.008s \n", + "\n", + "2025-06-09 22:10:28 (4.76 MB/s) - ‘ppt-files/speaker-notes.pptx’ saved [39414/39414]\n", "\n" ] } @@ -381,14 +401,14 @@ "base_uri": "https://localhost:8080/" }, "id": "yYMVpVQurk7G", - "outputId": "cedb0e39-f137-4759-a158-0b84ed31b282" + "outputId": "293a864a-2980-4502-c6dc-a1d3cee815ee" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2025-05-26 23:11:10-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/email-text-attachments.eml\n", + "--2025-06-09 22:10:28-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/email-text-attachments.eml\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", @@ -399,18 +419,18 @@ " email-tex 0%[ ] 0 --.-KB/s \r", "email-text-attachme 100%[===================>] 3.10K --.-KB/s in 0s \n", "\n", - "2025-05-26 23:11:11 (49.2 MB/s) - ‘email-files/email-text-attachments.eml’ saved [3175/3175]\n", + "2025-06-09 22:10:28 (21.2 MB/s) - ‘email-files/email-text-attachments.eml’ saved [3175/3175]\n", "\n", - "--2025-05-26 23:11:11-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/test-several-attachments.eml\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", + "--2025-06-09 22:10:28-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/test-several-attachments.eml\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 1324361 (1.3M) [text/plain]\n", "Saving to: ‘email-files/test-several-attachments.eml’\n", "\n", "test-several-attach 100%[===================>] 1.26M --.-KB/s in 0.04s \n", "\n", - "2025-05-26 23:11:11 (32.0 MB/s) - ‘email-files/test-several-attachments.eml’ saved [1324361/1324361]\n", + "2025-06-09 22:10:29 (30.2 MB/s) - ‘email-files/test-several-attachments.eml’ saved [1324361/1324361]\n", "\n" ] } @@ -438,14 +458,14 @@ "base_uri": "https://localhost:8080/" }, "id": "AV-krG6Ps8pq", - "outputId": "c407a77f-11d5-4a3c-85e0-4abffa48bd12" + "outputId": "bd7317e0-97d3-4f30-a800-6ffa8148f266" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2025-05-26 23:11:11-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/txt/simple-text.txt\n", + "--2025-06-09 22:10:29-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/txt/simple-text.txt\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... 
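Illustrative note (not part of this patch): the parameter table further down documents an `add_attachment_content` option for email files that none of these cells exercise. Assuming it is passed like the other snake_case `Partition` options, a minimal sketch would be:

```python
# Hedged sketch: 'add_attachment_content' and the import path are assumptions,
# not taken from the cells in this notebook.
from sparknlp.partition import Partition  # assumed import path

# Include the text of plain-text attachments alongside the email body elements
email_df = Partition(add_attachment_content=True) \
    .partition("./email-files/email-text-attachments.eml")
email_df.show(truncate=False)
```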
connected.\n", "HTTP request sent, awaiting response... 200 OK\n", @@ -456,7 +476,7 @@ "simple-text.txt 0%[ ] 0 --.-KB/s \r", "simple-text.txt 100%[===================>] 300 --.-KB/s in 0s \n", "\n", - "2025-05-26 23:11:11 (4.81 MB/s) - ‘txt-files/simple-text.txt’ saved [300/300]\n", + "2025-06-09 22:10:29 (3.39 MB/s) - ‘txt-files/simple-text.txt’ saved [300/300]\n", "\n" ] } @@ -466,6 +486,51 @@ "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/txt/simple-text.txt -P txt-files" ] }, + { + "cell_type": "markdown", + "metadata": { + "id": "QVq5C0Uqs4wU" + }, + "source": [ + "**Downloading XML files**" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Gip5P7Ess63U", + "outputId": "dde0fa15-2571-4b4a-ef73-517fe2b7a7a7" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-06-09 22:15:15-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1119-Implement-XML-Reader/src/test/resources/reader/xml/multi-level.xml\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 538 [text/plain]\n", + "Saving to: ‘xml-files/multi-level.xml’\n", + "\n", + "\r", + "multi-level.xml 0%[ ] 0 --.-KB/s \r", + "multi-level.xml 100%[===================>] 538 --.-KB/s in 0s \n", + "\n", + "2025-06-09 22:15:15 (21.2 MB/s) - ‘xml-files/multi-level.xml’ saved [538/538]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir xml-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xml/multi-level.xml -P xml-files" + ] + }, { "cell_type": "markdown", "metadata": { @@ -478,13 +543,13 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bAkMjJ1vdalE", - "outputId": "15401bcc-3cb2-474a-d771-0efed1eaf9cd" + "outputId": "582dcc26-76ea-4cac-c5f6-46e009b639f9" }, "outputs": [ { @@ -519,13 +584,13 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "VWbUgoVQrO8m", - "outputId": "36bbf310-7ee5-474a-93f2-4d940d3c0547" + "outputId": "56f4f9ce-41bb-48ba-b5db-7e1bde47d8d8" }, "outputs": [ { @@ -558,13 +623,13 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YFzeGJJ3ICVM", - "outputId": "01c349aa-16d2-4e0d-8a30-11399caf2ef2" + "outputId": "fc9bc68c-2b20-479e-8fe8-3e380877cebf" }, "outputs": [ { @@ -597,13 +662,13 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "y_xl0ahaJ0Hy", - "outputId": "6040b119-2eca-4c58-f51b-e20fbefeef8d" + "outputId": "327222b8-0c6b-4578-8fde-4f14f9835edc" }, "outputs": [ { @@ -636,13 +701,13 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4JnKvFe5KVDf", - "outputId": "d91d1ee5-d4a3-48a1-b40a-d5f6bf997025" + "outputId": "c9252fb7-3840-4c95-d461-a56eef9adaea" }, "outputs": [ { @@ -675,13 +740,13 @@ }, { "cell_type": 
"code", - "execution_count": 20, + "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_sldwjppKoPl", - "outputId": "467e9085-86dd-43df-f63b-a707b920d3b3" + "outputId": "0619383d-abf4-43a6-f63d-ad81897f8d9e" }, "outputs": [ { @@ -714,13 +779,13 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "GltbZAjmKwQs", - "outputId": "c3f18b1f-06df-4233-8874-e9702c465e69" + "outputId": "df9ae11b-0186-4e61-d6ff-9581c597ccd1" }, "outputs": [ { @@ -731,9 +796,9 @@ "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", "| path| modificationTime|length| text|height_dimension|width_dimension|content|exception|pagenum|\n", "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 9487| This is a page.\\n| 841| 595| NULL| NULL| 0|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 9487|This is another p...| 841| 595| NULL| NULL| 1|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 9487| Yet another page.\\n| 841| 595| NULL| NULL| 2|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 9487| This is a page.\\n| 841| 595| NULL| NULL| 0|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 9487|This is another p...| 841| 595| NULL| NULL| 1|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 9487| Yet another page.\\n| 841| 595| NULL| NULL| 2|\n", "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", "\n" ] @@ -798,7 +863,9 @@ "| `infer_table_structure` | Word, Excel, PowerPoint | Whether to generate an HTML table representation from structured table content. When enabled, a full `<table>` element is added alongside cell-level elements, based on row and column layout. |\n", "| `append_cells` | Excel | Whether to append all rows into a single content block instead of creating separate elements per row. |\n", "| `cell_separator` | Excel | String used to join cell values in a row when assembling textual output |\n", - "| `add_attachment_content` | Email | Whether to extract and include the textual content of plain-text attachments in the output |" + "| `add_attachment_content` | Email | Whether to extract and include the textual content of plain-text attachments in the output |\n", + "| `xml_keep_tags` | XML | Whether to retain original XML tag names and include them in the metadata for each extracted element |\n", + "| `only_leaf_nodes` | XML | If true, only the deepest elements are extracted. 
If false, all elements are extracted|" ] }, { @@ -812,13 +879,13 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "gDJyUi_9R4fr", - "outputId": "4aebe625-444d-4161-be23-512708ced1b5" + "outputId": "181d8e88-7a0b-4a6e-f497-7fd4add3726c" }, "outputs": [ { @@ -830,8 +897,8 @@ "| path| doc|\n", "+--------------------+--------------------+\n", "|file:/content/wor...|[{NarrativeText, ...|\n", - "|file:/content/wor...|[{Header, An inli...|\n", "|file:/content/wor...|[{Table, Header C...|\n", + "|file:/content/wor...|[{Header, An inli...|\n", "+--------------------+--------------------+\n", "\n" ] @@ -843,50 +910,23 @@ ] }, { - "cell_type": "code", - "execution_count": 23, + "cell_type": "markdown", "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3vz48AHQHyON", - "outputId": "f3ba8c4b-3bfc-453a-d8d4-f86a5fca0a1b" + "id": "F0lCz9OyPYYh" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Warning::Spark Session already created, some configs may not take.\n", - "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", - "| path| modificationTime|length| text|height_dimension|width_dimension|content|exception|pagenum|\n", - "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 25803|This is a Title \\...| 842| 596| NULL| NULL| 0|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 15629| \\n| 841| 595| NULL| NULL| 0|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 15629| \\n| 841| 595| NULL| NULL| 1|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 15629| \\n| 841| 595| NULL| NULL| 2|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 9487| This is a page.\\n| 841| 595| NULL| NULL| 0|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 9487|This is another p...| 841| 595| NULL| NULL| 1|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 9487| Yet another page.\\n| 841| 595| NULL| NULL| 2|\n", - "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", - "\n" - ] - } - ], "source": [ - "partition_df = Partition(content_type = \"application/pdf\").partition(\"./pdf-files\")\n", - "partition_df.show()" + "We can use the `store_content` option to include the raw file content in the output DataFrame as a separate 'content' column, alongside the structured output" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 26, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "S50lqIFskNO3", - "outputId": "e52f4cde-cfb9-4a55-d989-6e9fe40a0321" + "id": "qExdRJ2aPsYV", + "outputId": "9a033a02-4bae-4570-aaba-b81c23b8e0e1" }, "outputs": [ { @@ -894,38 +934,40 @@ "output_type": "stream", "text": [ "Warning::Spark Session already created, some configs may not take.\n", - 
"+-----------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|path |xls |\n", - "+-----------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|file:/content/excel-files/PageBreakExample.xlsx|[{Title, Date\\tFri Jul 19 00:00:00 UTC 2024, {location -> (0, 1), SheetName -> Sheet1}}, {Title, Assets\\t\\tDebts, {location -> (1, 4), SheetName -> Sheet1}}, {NarrativeText, Bank1\\t5865.43\\tCredit Card1\\t2000.0, {location -> (2, 5), SheetName -> Sheet1}}, {NarrativeText, Bank2\\t10140.19\\tCredit Card2\\t1500.0, {location -> (3, 5), SheetName -> Sheet1}}, {NarrativeText, Bank3\\t1200.0\\tCredit Card3\\t348.0, {location -> (4, 5), SheetName -> Sheet1}}, {Title, Bank4\\t1438.27\\tTotal\\tSUM(F3:F5), {location -> (5, 5), SheetName -> Sheet1}}, {Title, Total\\tSUM(B3:B6), {location -> (6, 1), SheetName -> Sheet1}}]|\n", - "+-----------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "+--------------------+--------------------+--------------------+\n", + "| path| doc| content|\n", + "+--------------------+--------------------+--------------------+\n", + "|file:/content/wor...|[{NarrativeText, ...|[50 4B 03 04 14 0...|\n", + "|file:/content/wor...|[{Table, Header C...|[50 4B 03 04 14 0...|\n", + "|file:/content/wor...|[{Header, An inli...|[50 4B 03 04 14 0...|\n", + "+--------------------+--------------------+--------------------+\n", "\n" ] } ], "source": [ - "partition_df = Partition(content_type = \"application/vnd.ms-excel\").partition(\"./excel-files/PageBreakExample.xlsx\")\n", - "partition_df.show(truncate=False)" + "partition_df = Partition(content_type = \"application/msword\", store_content = True).partition(\"./word-files\")\n", + "partition_df.show()" ] }, { "cell_type": "markdown", "metadata": { - "id": 
"F0lCz9OyPYYh" + "id": "E3bCFJZn8TS0" }, "source": [ - "We can use the `store_content` option to include the raw file content in the output DataFrame as a separate 'content' column, alongside the structured output" + "## Partitioning PDF Files" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "qExdRJ2aPsYV", - "outputId": "0284de34-ce6a-4d1e-91bc-268521111015" + "id": "3vz48AHQHyON", + "outputId": "19369e63-f963-4422-a791-57ea5394df1a" }, "outputs": [ { @@ -933,19 +975,23 @@ "output_type": "stream", "text": [ "Warning::Spark Session already created, some configs may not take.\n", - "+--------------------+--------------------+--------------------+\n", - "| path| doc| content|\n", - "+--------------------+--------------------+--------------------+\n", - "|file:/content/wor...|[{NarrativeText, ...|[50 4B 03 04 14 0...|\n", - "|file:/content/wor...|[{Header, An inli...|[50 4B 03 04 14 0...|\n", - "|file:/content/wor...|[{Table, Header C...|[50 4B 03 04 14 0...|\n", - "+--------------------+--------------------+--------------------+\n", + "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", + "| path| modificationTime|length| text|height_dimension|width_dimension|content|exception|pagenum|\n", + "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 25803|This is a Title \\...| 842| 596| NULL| NULL| 0|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 15629| \\n| 841| 595| NULL| NULL| 0|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 15629| \\n| 841| 595| NULL| NULL| 1|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 15629| \\n| 841| 595| NULL| NULL| 2|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 9487| This is a page.\\n| 841| 595| NULL| NULL| 0|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 9487|This is another p...| 841| 595| NULL| NULL| 1|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 9487| Yet another page.\\n| 841| 595| NULL| NULL| 2|\n", + "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", "\n" ] } ], "source": [ - "partition_df = Partition(content_type = \"application/msword\", store_content = True).partition(\"./word-files\")\n", + "partition_df = Partition(content_type = \"application/pdf\").partition(\"./pdf-files\")\n", "partition_df.show()" ] }, @@ -969,13 +1015,13 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_2J0zEmma8jm", - "outputId": "405391bf-60bf-4632-ef0e-e84496049c71" + "outputId": "90f668d7-03d9-496f-dc82-a620c59f9c08" }, "outputs": [ { @@ -1018,13 +1064,13 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4sY2ADN8dusy", - "outputId": "98af2c82-8a55-46ff-f631-7775431820cb" + "outputId": "8164237e-6835-404a-d7a7-b5ef0ef99c6d" }, "outputs": [ { @@ -1046,24 +1092,33 @@ "partition_df.show(truncate=False)" ] }, + { + "cell_type": "markdown", + "metadata": { + "id": "uMyqJX-K7dss" + }, + "source": [ + "## Partitioning MS Office documents" + ] + }, { "cell_type": "markdown", "metadata": { "id": "_9dDTCrpGdoN" }, "source": [ - "For Word documents, use 
`includePageBreaks` to preserve structural information like page boundaries, which are inserted as HTML tables in the output." + "For Excel documents, use `includePageBreaks` to preserve structural information like page boundaries, which are inserted as HTML tables in the output." ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7ICTZmLGk3Sa", - "outputId": "5e31a551-2746-4c45-b933-56f55e4866c9" + "outputId": "1796055a-808c-4eff-fc86-14e29cf9b53e" }, "outputs": [ { @@ -1087,13 +1142,13 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YId4UG1rOVQq", - "outputId": "7de8b4be-9936-4330-8a0f-019c3a55182a" + "outputId": "32827dea-d7b3-4137-abff-9e4502f8cd93" }, "outputs": [ { @@ -1118,38 +1173,21 @@ { "cell_type": "markdown", "metadata": { - "id": "jpRmFNPNNqkf" - }, - "source": [ - "When parsing plain text files, `group_broken_paragraphs` can be enabled to intelligently merge broken paragraphs by interpreting blank lines as true paragraph breaks." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "id": "HwnYBQ5l7rDM" + "id": "E8ockED4NxLi" }, - "outputs": [], "source": [ - "text = (\n", - " \"The big brown fox\\n\"\n", - " \"was walking down the lane.\\n\"\n", - " \"\\n\"\n", - " \"At the end of the lane,\\n\"\n", - " \"the fox met a bear.\"\n", - " )" + "For PowerPoint files, the `include_slide_notes` flag ensures that speaker notes from each slide are extracted and included in the output." ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 34, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "mutwZUFj720X", - "outputId": "87cd31c5-2f94-4777-9ea5-b6edf8277347" + "id": "fPCpk7RTGRjo", + "outputId": "a818ecd7-8580-4098-b30f-6e46b8ef6baa" }, "outputs": [ { @@ -1157,61 +1195,77 @@ "output_type": "stream", "text": [ "Warning::Spark Session already created, some configs may not take.\n", - "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|txt |\n", - "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|[{NarrativeText, The big brown fox was walking down the lane., {paragraph -> 0}}, {NarrativeText, At the end of the lane, the fox met a bear., {paragraph -> 0}}]|\n", - "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|path |ppt |\n", + "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + 
"|file:/content/ppt-files/speaker-notes.pptx|[{Title, Adding a Bullet Slide, {}}, {ListItem, • Find the bullet slide layout, {}}, {ListItem, – Use _TextFrame.text for first bullet, {}}, {ListItem, • Use _TextFrame.add_paragraph() for subsequent bullets, {}}, {NarrativeText, Here is a lot of text!, {}}, {NarrativeText, Here is some text in a text box!, {}}]|\n", + "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "\n" ] } ], "source": [ - "text_df = Partition(group_broken_paragraphs=True).partition_text(text = text)\n", - "text_df.show(truncate=False)" + "partition_df = Partition(include_slide_notes = True).partition(\"./ppt-files/speaker-notes.pptx\")\n", + "partition_df.show(truncate=False)" ] }, { "cell_type": "markdown", "metadata": { - "id": "E8ockED4NxLi" + "id": "qRfRSGvhN303" }, "source": [ - "For PowerPoint files, the `include_slide_notes` flag ensures that speaker notes from each slide are extracted and included in the output." + "In Excel files, enabling `infer_table_structure` allows Partition to generate an HTML representation of table structures, useful for downstream parsing or display." ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 35, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "xF8F-5CP3qWY", - "outputId": "71b5e0cb-b22a-4774-a7b6-83c4fd67fadb" + "id": "twLdjGxZWiOJ", + "outputId": "8adcaa80-b02c-4e8f-8205-20efa8c40b4b" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "fake-power-point.pptx fake-power-point-table.pptx\n" + "Warning::Spark Session already created, some configs may not take.\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|xls |\n", + 
"+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{Title, Date\\tFri Jul 19 00:00:00 UTC 2024, {location -> (0, 1), SheetName -> Sheet1}}, {Title, Assets\\t\\tDebts, {location -> (1, 4), SheetName -> Sheet1}}, {NarrativeText, Bank1\\t5865.43\\tCredit Card1\\t2000.0, {location -> (2, 5), SheetName -> Sheet1}}, {NarrativeText, Bank2\\t10140.19\\tCredit Card2\\t1500.0, {location -> (3, 5), SheetName -> Sheet1}}, {NarrativeText, Bank3\\t1200.0\\tCredit Card3\\t348.0, {location -> (4, 5), SheetName -> Sheet1}}, {Title, Bank4\\t1438.27\\tTotal\\tSUM(F3:F5), {location -> (5, 5), SheetName -> Sheet1}}, {Title, Total\\tSUM(B3:B6), {location -> (6, 1), SheetName -> Sheet1}}, {HTML, <table><tr><td>Date</td><td>Fri Jul 19 00:00:00 UTC 2024</td></tr><tr><td>Assets</td><td>Debts</td></tr><tr><td>Bank1</td><td>5865.43</td><td>Credit Card1</td><td>2000.0</td></tr><tr><td>Bank2</td><td>10140.19</td><td>Credit Card2</td><td>1500.0</td></tr><tr><td>Bank3</td><td>1200.0</td><td>Credit Card3</td><td>348.0</td></tr><tr><td>Bank4</td><td>1438.27</td><td>Total</td><td>SUM(F3:F5)</td></tr><tr><td>Total</td><td>SUM(B3:B6)</td></tr></table>, {SheetName -> Sheet1}}]|\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" ] } ], "source": [ - "!ls ppt-files" + "partition_df = Partition(infer_table_structure = True).partition(\"./excel-files/page-break-example.xlsx\")\n", + "partition_df.select(\"xls\").show(truncate=False)" + ] + }, + { + 
"cell_type": "markdown", + "metadata": { + "id": "8txswwbjN8Mg" + }, + "source": [ + "With Excel inputs, set `append_cells` to concatenate all cell values in a row into a single string instead of separating each cell individually." ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 36, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "fPCpk7RTGRjo", - "outputId": "74144c26-5060-4c99-f291-a097b838e774" + "id": "PQ4MpGw6xCko", + "outputId": "aaf807a7-27b9-40cc-8a75-58be077f8403" }, "outputs": [ { @@ -1219,38 +1273,64 @@ "output_type": "stream", "text": [ "Warning::Spark Session already created, some configs may not take.\n", - "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|path |ppt |\n", - "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|file:/content/ppt-files/speaker-notes.pptx|[{Title, Adding a Bullet Slide, {}}, {ListItem, • Find the bullet slide layout, {}}, {ListItem, – Use _TextFrame.text for first bullet, {}}, {ListItem, • Use _TextFrame.add_paragraph() for subsequent bullets, {}}, {NarrativeText, Here is a lot of text!, {}}, {NarrativeText, Here is some text in a text box!, {}}]|\n", - "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|xls |\n", + "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{NarrativeText, a\\tb\\nc\\td\\te\\n- f\\na\\nb\\tc\\nd\\te\\na\\nb\\nc\\td\\ne\\tf\\na\\tb\\nc\\td\\n2. 
e\\na\\tb\\nc\\td\\ne\\nf\\na\\nb\\tc\\nd\\te\\nf\\na\\nb\\nc\\td\\ne\\tf\\ng\\na\\nb\\tc\\nd\\te\\nf\\ng\\na\\nb\\nc\\td\\ne\\tf\\ng\\nh\\na\\tb\\tc\\na\\nb\\tc\\td\\na\\tb\\tc\\nd\\ne, {SheetName -> Sheet1}}]|\n", + "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "\n" ] } ], "source": [ - "partition_df = Partition(include_slide_notes = True).partition(\"./ppt-files/speaker-notes.pptx\")\n", - "partition_df.show(truncate=False)" + "partition_df = Partition(append_cells = True).partition(\"./excel-files/xlsx-subtable-cases.xlsx\")\n", + "partition_df.select(\"xls\").show(truncate=False)" ] }, { "cell_type": "markdown", "metadata": { - "id": "qRfRSGvhN303" + "id": "_GyL6D4N75i-" }, "source": [ - "In Excel files, enabling `infer_table_structure` allows Partition to generate an HTML representation of table structures, useful for downstream parsing or display." + "## Partitioning Text Files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jpRmFNPNNqkf" + }, + "source": [ + "When parsing plain text files, `group_broken_paragraphs` can be enabled to intelligently merge broken paragraphs by interpreting blank lines as true paragraph breaks." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "id": "HwnYBQ5l7rDM" + }, + "outputs": [], + "source": [ + "text = (\n", + " \"The big brown fox\\n\"\n", + " \"was walking down the lane.\\n\"\n", + " \"\\n\"\n", + " \"At the end of the lane,\\n\"\n", + " \"the fox met a bear.\"\n", + " )" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 32, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "twLdjGxZWiOJ", - "outputId": "ec340358-7279-4247-b27c-5a0a25f38ee6" + "id": "mutwZUFj720X", + "outputId": "8b4f474d-2f3f-4e81-cecf-5de420561124" }, "outputs": [ { @@ -1258,38 +1338,47 @@ "output_type": "stream", "text": [ "Warning::Spark Session already created, some configs may not take.\n", - "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|xls |\n", - 
"+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|[{Title, Date\\tFri Jul 19 00:00:00 UTC 2024, {location -> (0, 1), SheetName -> Sheet1}}, {Title, Assets\\t\\tDebts, {location -> (1, 4), SheetName -> Sheet1}}, {NarrativeText, Bank1\\t5865.43\\tCredit Card1\\t2000.0, {location -> (2, 5), SheetName -> Sheet1}}, {NarrativeText, Bank2\\t10140.19\\tCredit Card2\\t1500.0, {location -> (3, 5), SheetName -> Sheet1}}, {NarrativeText, Bank3\\t1200.0\\tCredit Card3\\t348.0, {location -> (4, 5), SheetName -> Sheet1}}, {Title, Bank4\\t1438.27\\tTotal\\tSUM(F3:F5), {location -> (5, 5), SheetName -> Sheet1}}, {Title, Total\\tSUM(B3:B6), {location -> (6, 1), SheetName -> Sheet1}}, {HTML, <table><tr><td>Date</td><td>Fri Jul 19 00:00:00 UTC 2024</td></tr><tr><td>Assets</td><td>Debts</td></tr><tr><td>Bank1</td><td>5865.43</td><td>Credit Card1</td><td>2000.0</td></tr><tr><td>Bank2</td><td>10140.19</td><td>Credit Card2</td><td>1500.0</td></tr><tr><td>Bank3</td><td>1200.0</td><td>Credit Card3</td><td>348.0</td></tr><tr><td>Bank4</td><td>1438.27</td><td>Total</td><td>SUM(F3:F5)</td></tr><tr><td>Total</td><td>SUM(B3:B6)</td></tr></table>, {SheetName -> Sheet1}}]|\n", - "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|txt |\n", + 
"+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{NarrativeText, The big brown fox was walking down the lane., {paragraph -> 0}}, {NarrativeText, At the end of the lane, the fox met a bear., {paragraph -> 0}}]|\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "\n" ] } ], "source": [ - "partition_df = Partition(infer_table_structure = True).partition(\"./excel-files/page-break-example.xlsx\")\n", - "partition_df.select(\"xls\").show(truncate=False)" + "text_df = Partition(group_broken_paragraphs=True).partition_text(text = text)\n", + "text_df.show(truncate=False)" ] }, { "cell_type": "markdown", "metadata": { - "id": "8txswwbjN8Mg" + "id": "epCp5DnQ8E7o" }, "source": [ - "With Excel inputs, set `append_cells` to concatenate all cell values in a row into a single string instead of separating each cell individually." + "## Partitioning XML Files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DWX0nkc4tM7J" + }, + "source": [ + "In Spark NLP 6.0.3 we added support for XML files" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 45, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "PQ4MpGw6xCko", - "outputId": "808783d2-f15b-45ae-90fb-a623243898f3" + "id": "AViMSzKQtP-o", + "outputId": "147a1ef9-3f14-4832-a050-e60c8ac9544b" }, "outputs": [ { @@ -1297,18 +1386,18 @@ "output_type": "stream", "text": [ "Warning::Spark Session already created, some configs may not take.\n", - "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|xls |\n", - "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|[{NarrativeText, a\\tb\\nc\\td\\te\\n- f\\na\\nb\\tc\\nd\\te\\na\\nb\\nc\\td\\ne\\tf\\na\\tb\\nc\\td\\n2. 
e\\na\\tb\\nc\\td\\ne\\nf\\na\\nb\\tc\\nd\\te\\nf\\na\\nb\\nc\\td\\ne\\tf\\ng\\na\\nb\\tc\\nd\\te\\nf\\ng\\na\\nb\\nc\\td\\ne\\tf\\ng\\nh\\na\\tb\\tc\\na\\nb\\tc\\td\\na\\tb\\tc\\nd\\ne, {SheetName -> Sheet1}}]|\n", - "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|xml |\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{Title, The Alchemist, {elementId -> 3f0b15f67f42de56d13e76244399ff1b, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> title}}, {Title, Paulo Coelho, {elementId -> c36286e42e975f08e839ed574509626c, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> author}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> year}}, {Title, A Brief History of Time, {elementId -> 1aa35512b27fd41a8f8f9cf58c10f46e, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> title}}, {Title, Stephen Hawking, {elementId -> 7877d555703011ffc6f0b9abbf1f8355, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> author}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> year}}]|\n", + 
"+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "\n" ] } ], "source": [ - "partition_df = Partition(append_cells = True).partition(\"./excel-files/xlsx-subtable-cases.xlsx\")\n", - "partition_df.select(\"xls\").show(truncate=False)" + "partition_df = Partition(xml_keep_tags = True).partition(\"./xml-files/multi-level.xml\")\n", + "partition_df.select(\"xml\").show(truncate=False)" ] } ], diff --git a/examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb b/examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb new file mode 100644 index 00000000000000..38b43aed37b95e --- /dev/null +++ b/examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb @@ -0,0 +1,339 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "tzcU5p2gdak9" + }, + "source": [ + "# Introducing XML reader in SparkNLP\n", + "This notebook showcases the newly added `sparknlp.read().xml()` method in Spark NLP that parses XML content from both local files and real-time URLs into a Spark DataFrame.\n", + "\n", + "**Key Features:**\n", + "- Ability to parse XML from local directories and URLs.\n", + "- Versatile support for varied data ingestion scenarios." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RFOFhaEedalB" + }, + "source": [ + "## Setup and Initialization\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "Support for reading xml files was introduced in Spark NLP 6.1.0. Please make sure you have upgraded to the latest Spark NLP release." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Y3hWfT5q-npM" + }, + "source": [ + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "u3ORYVyb-pRI" + }, + "outputs": [], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oIbFQyEo-tat" + }, + "source": [ + "For local files example we will download a couple of XML files from Spark NLP Github repo:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ya8qZe00dalC", + "outputId": "7d597910-9826-4472-9fdc-5b8ac398e6cf" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-06-09 21:43:40-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1119-Implement-XML-Reader/src/test/resources/reader/xml/multi-level.xml\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 
185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 538 [text/plain]\n", + "Saving to: ‘xml-files/multi-level.xml’\n", + "\n", + "\r", + "multi-level.xml 0%[ ] 0 --.-KB/s \r", + "multi-level.xml 100%[===================>] 538 --.-KB/s in 0s \n", + "\n", + "2025-06-09 21:43:40 (34.0 MB/s) - ‘xml-files/multi-level.xml’ saved [538/538]\n", + "\n", + "--2025-06-09 21:43:40-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1119-Implement-XML-Reader/src/test/resources/reader/xml/test.xml\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 382 [text/plain]\n", + "Saving to: ‘xml-files/test.xml’\n", + "\n", + "test.xml 100%[===================>] 382 --.-KB/s in 0s \n", + "\n", + "2025-06-09 21:43:40 (7.58 MB/s) - ‘xml-files/test.xml’ saved [382/382]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir xml-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xml/multi-level.xml -P xml-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xml/test.xml -P xml-files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EoFI66NAdalE" + }, + "source": [ + "## Parsing XML from Local Files\n", + "Use the `xml()` method to parse XML content from local directories." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bAkMjJ1vdalE", + "outputId": "0bba10be-75de-48de-9a06-d6197d35218f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + "+--------------------+--------------------+\n", + "| path| xml|\n", + "+--------------------+--------------------+\n", + "|file:/content/xml...|[{Title, Harry Po...|\n", + "|file:/content/xml...|[{Title, The Alch...|\n", + "+--------------------+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "xml_df = sparknlp.read().xml(\"./xml-files\")\n", + "\n", + "xml_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "oBj0cHPXSD1m", + "outputId": "00951736-40d4-4f9e-fe25-cc5117405269" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- path: string (nullable = true)\n", + " |-- xml: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- elementType: string (nullable = true)\n", + " | | |-- content: string (nullable = true)\n", + " | | |-- metadata: map (nullable = true)\n", + " | | | |-- key: string\n", + " | | | |-- value: string (valueContainsNull = true)\n", + "\n" + ] + } + ], + "source": [ + "xml_df.printSchema()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FrVKxdySz8pR" + }, + "source": [ + "### Configuration Parameters" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CC_klLwhV8um" + }, + "source": [ + "`xmlKeepTags`: When true, includes the tag name of 
each XML element in the metadata under the key `tag`." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "aNfN0fQC0Vzz", + "outputId": "ebdb1393-b91c-4c60-d7e7-b7ecc6465171" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|xml |\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{Title, Harry Potter, {elementId -> 42962e493b50acee6acdd7851128bbb3, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc, tag -> title}}, {Title, J K. Rowling, {elementId -> 28f300ecb3ddf2a297416caf0b936a15, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc, tag -> author}}, {UncategorizedText, 2005, {elementId -> 1486c560869e6720e2668f318be8c4b0, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc, tag -> year}}, {UncategorizedText, 29.99, {elementId -> 52f0aebb3d4d4d08290edd1b6016ec2a, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc, tag -> price}}, {Title, Learning XML, {elementId -> 26f1538c947d0c13d84679137dd718d6, parentId -> 249aff1b3e9835325b45e51cdfc4ad46, tag -> title}}, {Title, Erik T. 
Ray, {elementId -> 3b7e3c115d8f5d645d739fcf961ceef4, parentId -> 249aff1b3e9835325b45e51cdfc4ad46, tag -> author}}, {UncategorizedText, 2003, {elementId -> 98e22aa418bbc4eec79d7abf6d43ef71, parentId -> 249aff1b3e9835325b45e51cdfc4ad46, tag -> year}}, {UncategorizedText, 39.95, {elementId -> 2758d8ea75e72394c27bbe4b8feba4f7, parentId -> 249aff1b3e9835325b45e51cdfc4ad46, tag -> price}}]|\n", + "|[{Title, The Alchemist, {elementId -> 3f0b15f67f42de56d13e76244399ff1b, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> title}}, {Title, Paulo Coelho, {elementId -> c36286e42e975f08e839ed574509626c, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> author}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> year}}, {Title, A Brief History of Time, {elementId -> 1aa35512b27fd41a8f8f9cf58c10f46e, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> title}}, {Title, Stephen Hawking, {elementId -> 7877d555703011ffc6f0b9abbf1f8355, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> author}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> year}}] |\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "params = {\"xmlKeepTags\": \"true\"}\n", + "xml_df = sparknlp.read(params).xml(\"./xml-files\")\n", + "xml_df.select(\"xml\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "t06KtTItWQ4R" + }, + "source": [ + "`onlyLeafNodes`: When true, includes only leaf elements (i.e., elements with no child elements) in the output. When false, all elements (including containers) are included." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "jTM1btqNntUL", + "outputId": "f86a0b28-73ac-46d1-8d26-f920e2d935cd" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|xml |\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{UncategorizedText, , {elementId -> 931f811d0c9b488a01a7875f80992a62}}, {UncategorizedText, , {elementId -> 
1f610d9429ab17d0d7ab49ee3069b4fc, parentId -> 931f811d0c9b488a01a7875f80992a62}}, {Title, Harry Potter, {elementId -> 42962e493b50acee6acdd7851128bbb3, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {Title, J K. Rowling, {elementId -> 28f300ecb3ddf2a297416caf0b936a15, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, 2005, {elementId -> 1486c560869e6720e2668f318be8c4b0, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, 29.99, {elementId -> 52f0aebb3d4d4d08290edd1b6016ec2a, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, , {elementId -> 249aff1b3e9835325b45e51cdfc4ad46, parentId -> 931f811d0c9b488a01a7875f80992a62}}, {Title, Learning XML, {elementId -> 26f1538c947d0c13d84679137dd718d6, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {Title, Erik T. Ray, {elementId -> 3b7e3c115d8f5d645d739fcf961ceef4, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {UncategorizedText, 2003, {elementId -> 98e22aa418bbc4eec79d7abf6d43ef71, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {UncategorizedText, 39.95, {elementId -> 2758d8ea75e72394c27bbe4b8feba4f7, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}] |\n", + "|[{UncategorizedText, , {elementId -> 8f4f71ddf1b6429fbec582add2cb963f}}, {UncategorizedText, , {elementId -> d7416d9cac3ba3af57ef6b6b71d7841b, parentId -> 8f4f71ddf1b6429fbec582add2cb963f}}, {UncategorizedText, , {elementId -> b79ae4ca74ec00f63a00b6cd66acc1e0, parentId -> d7416d9cac3ba3af57ef6b6b71d7841b}}, {UncategorizedText, , {elementId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, parentId -> b79ae4ca74ec00f63a00b6cd66acc1e0}}, {Title, The Alchemist, {elementId -> 3f0b15f67f42de56d13e76244399ff1b, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {Title, Paulo Coelho, {elementId -> c36286e42e975f08e839ed574509626c, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {UncategorizedText, , {elementId -> 9ebecf846e7dea80c563ebcb2f7d4a9a, parentId -> 8f4f71ddf1b6429fbec582add2cb963f}}, {UncategorizedText, , {elementId -> 80472cd1880f453b8adecc61870748ba, parentId -> 9ebecf846e7dea80c563ebcb2f7d4a9a}}, {UncategorizedText, , {elementId -> 9708b29025b53d9f54c723ee005b647b, parentId -> 80472cd1880f453b8adecc61870748ba}}, {Title, A Brief History of Time, {elementId -> 1aa35512b27fd41a8f8f9cf58c10f46e, parentId -> 9708b29025b53d9f54c723ee005b647b}}, {Title, Stephen Hawking, {elementId -> 7877d555703011ffc6f0b9abbf1f8355, parentId -> 9708b29025b53d9f54c723ee005b647b}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 9708b29025b53d9f54c723ee005b647b}}]|\n", + 
"+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "params = {\"onlyLeafNodes\": \"false\"}\n", + "xml_df = sparknlp.read(params).xml(\"./xml-files\")\n", + "xml_df.select(\"xml\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "O8DePUq8nkYm" + }, + "source": [ + "You can access the raw content of the file using the `storeContent` parameter" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "E0S5aRb5WFLf", + "outputId": "5e624eeb-fbc1-47a4-ff21-aef410a10bb2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + 
"+---------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|path |content |xml |\n", + "+---------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|file:/content/xml-files/test.xml |<bookstore>\\n <book category=\"children\">\\n <title lang=\"en\">Harry Potter</title>\\n <author>J K. Rowling</author>\\n <year>2005</year>\\n <price>29.99</price>\\n </book>\\n <book category=\"web\">\\n <title lang=\"en\">Learning XML</title>\\n <author>Erik T. 
Ray</author>\\n <year>2003</year>\\n <price>39.95</price>\\n </book>\\n</bookstore> |[{Title, Harry Potter, {elementId -> 42962e493b50acee6acdd7851128bbb3, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {Title, J K. Rowling, {elementId -> 28f300ecb3ddf2a297416caf0b936a15, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, 2005, {elementId -> 1486c560869e6720e2668f318be8c4b0, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, 29.99, {elementId -> 52f0aebb3d4d4d08290edd1b6016ec2a, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {Title, Learning XML, {elementId -> 26f1538c947d0c13d84679137dd718d6, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {Title, Erik T. Ray, {elementId -> 3b7e3c115d8f5d645d739fcf961ceef4, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {UncategorizedText, 2003, {elementId -> 98e22aa418bbc4eec79d7abf6d43ef71, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {UncategorizedText, 39.95, {elementId -> 2758d8ea75e72394c27bbe4b8feba4f7, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}]|\n", + "|file:/content/xml-files/multi-level.xml|<library>\\n <section name=\"Fiction\">\\n <shelf number=\"1\">\\n <book>\\n <title>The Alchemist</title>\\n <author>Paulo Coelho</author>\\n <year>1988</year>\\n </book>\\n </shelf>\\n </section>\\n <section name=\"Science\">\\n <shelf number=\"2\">\\n <book>\\n <title>A Brief History of Time</title>\\n <author>Stephen Hawking</author>\\n <year>1988</year>\\n </book>\\n </shelf>\\n </section>\\n</library>\\n|[{Title, The Alchemist, {elementId -> 3f0b15f67f42de56d13e76244399ff1b, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {Title, Paulo Coelho, {elementId -> c36286e42e975f08e839ed574509626c, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {Title, A Brief History of Time, {elementId -> 1aa35512b27fd41a8f8f9cf58c10f46e, parentId -> 9708b29025b53d9f54c723ee005b647b}}, {Title, Stephen Hawking, {elementId -> 7877d555703011ffc6f0b9abbf1f8355, parentId -> 9708b29025b53d9f54c723ee005b647b}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 9708b29025b53d9f54c723ee005b647b}}] |\n", + 
"+---------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "params = {\"storeContent\": \"true\"}\n", + "xml_df = sparknlp.read(params).xml(\"./xml-files\")\n", + "xml_df.show(truncate=False)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/src/main/scala/com/johnsnowlabs/reader/XMLReader.scala b/src/main/scala/com/johnsnowlabs/reader/XMLReader.scala index 267ad10af760b2..fc777458dafb83 100644 --- a/src/main/scala/com/johnsnowlabs/reader/XMLReader.scala +++ b/src/main/scala/com/johnsnowlabs/reader/XMLReader.scala @@ -25,6 +25,53 @@ import scala.collection.mutable import scala.collection.mutable.ListBuffer import scala.xml.{Elem, Node, XML} +/** Class to parse and read XML files. + * + * @param storeContent + * Whether to include the raw XML content in the resulting DataFrame as a separate 'content' + * column. By default, this is false. + * + * @param xmlKeepTags + * Whether to retain original XML tag names and include them in the metadata for each extracted + * element. Useful for preserving structure. Default is false. + * + * @param onlyLeafNodes + * If true, only the deepest elements (those without child elements) are extracted. If false, + * all elements are extracted. Default is true. + * + * ==Input Format== + * Input must be a valid path to an XML file or a directory containing XML files. 
+ * + * ==Example== + * {{{ + * val xmlPath = "./data/sample.xml" + * val xmlReader = new XMLReader() + * val xmlDf = xmlReader.read(xmlPath) + * }}} + * + * {{{ + * xmlDf.show(truncate = false) + * +----------------------+--------------------------------------------------+ + * |path |xml | + * +----------------------+--------------------------------------------------+ + * |file:/data/sample.xml |[{Title, My Book, {tag -> title}}, ...] | + * +----------------------+--------------------------------------------------+ + * + * xmlDf.printSchema() + * root + * |-- path: string (nullable = true) + * |-- xml: array (nullable = true) + * | |-- element: struct (containsNull = true) + * | | |-- elementType: string (nullable = true) + * | | |-- content: string (nullable = true) + * | | |-- metadata: map (nullable = true) + * | | | |-- key: string + * | | | |-- value: string (valueContainsNull = true) + * }}} + * + * For more examples refer to: + * [[https://github.com/JohnSnowLabs/spark-nlp/examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb notebook]] + */ class XMLReader( storeContent: Boolean = false, xmlKeepTags: Boolean = false,