
Commit 6e782e1

[SPARKNLP-1119] Adding XML reader
1 parent 2b37fb0 commit 6e782e1

11 files changed: +358 -3 lines changed

python/sparknlp/reader/sparknlp_reader.py

Lines changed: 45 additions & 0 deletions
@@ -322,4 +322,49 @@ def txt(self, docPath):
         if not isinstance(docPath, str):
             raise TypeError("docPath must be a string")
         jdf = self._java_obj.txt(docPath)
+        return self.getDataFrame(self.spark, jdf)
+
+    def xml(self, docPath):
+        """Reads XML files and returns a Spark DataFrame.
+
+        Parameters
+        ----------
+        docPath : str
+            Path to an XML file or a directory containing XML files.
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+            A DataFrame containing parsed XML content.
+
+        Examples
+        --------
+        >>> from sparknlp.reader import SparkNLPReader
+        >>> xml_df = SparkNLPReader(spark).xml("home/user/xml-directory")
+
+        You can use SparkNLP in one line of code:
+
+        >>> import sparknlp
+        >>> xml_df = sparknlp.read().xml("home/user/xml-directory")
+        >>> xml_df.show(truncate=False)
+        +-----------------------------------------------------------+
+        |xml                                                        |
+        +-----------------------------------------------------------+
+        |[{Title, John Smith, {elementId -> ..., tag -> title}}]    |
+        +-----------------------------------------------------------+
+
+        >>> xml_df.printSchema()
+        root
+         |-- path: string (nullable = true)
+         |-- xml: array (nullable = true)
+         |    |-- element: struct (containsNull = true)
+         |    |    |-- elementType: string (nullable = true)
+         |    |    |-- content: string (nullable = true)
+         |    |    |-- metadata: map (nullable = true)
+         |    |    |    |-- key: string
+         |    |    |    |-- value: string (valueContainsNull = true)
+        """
+        if not isinstance(docPath, str):
+            raise TypeError("docPath must be a string")
+        jdf = self._java_obj.xml(docPath)
         return self.getDataFrame(self.spark, jdf)

python/test/sparknlp_test.py

Lines changed: 15 additions & 1 deletion
@@ -125,4 +125,18 @@ def runTest(self):
         txt_df = sparknlp.read().txt(self.txt_file)
         txt_df.show()
 
-        self.assertTrue(txt_df.select("txt").count() > 0)
+        self.assertTrue(txt_df.select("txt").count() > 0)
+
+
+@pytest.mark.fast
+class SparkNLPTestXMLFilesSpec(unittest.TestCase):
+
+    def setUp(self):
+        self.data = SparkContextForTest.data
+        self.xml_files = f"file:///{os.getcwd()}/../src/test/resources/reader/xml"
+
+    def runTest(self):
+        xml_df = sparknlp.read().xml(self.xml_files)
+        xml_df.show()
+
+        self.assertTrue(xml_df.select("xml").count() > 0)
src/main/scala/com/johnsnowlabs/partition/HasXmlReaderProperties.scala

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2017-2025 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.johnsnowlabs.partition
+
+import com.johnsnowlabs.nlp.ParamsAndFeaturesWritable
+import org.apache.spark.ml.param.Param
+
+trait HasXmlReaderProperties extends ParamsAndFeaturesWritable {
+
+  val xmlKeepTags = new Param[Boolean](
+    this,
+    "xmlKeepTags",
+    "Whether to include XML tag names as metadata in the output.")
+
+  def setXmlKeepTags(value: Boolean): this.type = set(xmlKeepTags, value)
+
+  val onlyLeafNodes = new Param[Boolean](
+    this,
+    "onlyLeafNodes",
+    "If true, only processes XML leaf nodes (no nested children).")
+
+  def setOnlyLeafNodes(value: Boolean): this.type = set(onlyLeafNodes, value)
+
+  setDefault(xmlKeepTags -> false, onlyLeafNodes -> true)
+}
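
Downstream, these two flags reach the reader as string-valued entries in the Partition params map (picked up by getXmlKeepTags and getOnlyLeafNodes in SparkNLPReader, further below). A minimal sketch of overriding the defaults that way, assuming Partition forwards its params to SparkNLPReader as it does for the other readers; the directory path reuses the test resources location and is otherwise illustrative:

import com.johnsnowlabs.partition.Partition

// Sketch only: keep tag names in metadata and also emit non-leaf nodes,
// overriding the defaults (xmlKeepTags = false, onlyLeafNodes = true).
val xmlOptions = Map(
  "content_type" -> "application/xml",
  "xmlKeepTags" -> "true",
  "onlyLeafNodes" -> "false")

val xmlDf = Partition(xmlOptions).partition("src/test/resources/reader/xml")
xmlDf.select("xml").show(truncate = false)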

src/main/scala/com/johnsnowlabs/partition/Partition.scala

Lines changed: 3 additions & 0 deletions
@@ -188,6 +188,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
       "application/vnd.openxmlformats-officedocument.presentationml.presentation" =>
         sparkNLPReader.ppt
       case "application/pdf" => sparkNLPReader.pdf
+      case "application/xml" => sparkNLPReader.xml
       case _ => throw new IllegalArgumentException(s"Unsupported content type: $contentType")
     }
   }
@@ -199,6 +200,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
       case "text/plain" => sparkNLPReader.txtToHTMLElement
       case "text/html" => sparkNLPReader.htmlToHTMLElement
       case "url" => sparkNLPReader.urlToHTMLElement
+      case "application/xml" => sparkNLPReader.xmlToHTMLElement
       case _ => throw new IllegalArgumentException(s"Unsupported content type: $contentType")
     }
   }
@@ -234,6 +236,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
       case "xls" | "xlsx" => sparkNLPReader.xls
       case "ppt" | "pptx" => sparkNLPReader.ppt
       case "pdf" => sparkNLPReader.pdf
+      case "xml" => sparkNLPReader.xml
       case _ => throw new IllegalArgumentException(s"Unsupported file type: $extension")
     }
   }
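
With these three cases in place, XML is reachable through the same entry points as the other formats: an explicit application/xml content type (for files or raw string content), or plain extension detection for .xml files. A small sketch of the extension-based path; the file name is hypothetical:

import com.johnsnowlabs.partition.Partition

// No content_type given: the ".xml" extension selects sparkNLPReader.xml,
// mirroring the new case in the extension match above.
val xmlDf = Partition(Map.empty[String, String])
  .partition("src/test/resources/reader/xml/books.xml")
xmlDf.show(truncate = false)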

src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala

Lines changed: 4 additions & 1 deletion
@@ -86,6 +86,7 @@ class PartitionTransformer(override val uid: String)
     with HasPowerPointProperties
     with HasTextReaderProperties
     with HasPdfProperties
+    with HasXmlReaderProperties
     with HasChunkerProperties {

   def this() = this(Identifiable.randomUID("PartitionTransformer"))
@@ -157,7 +158,9 @@ class PartitionTransformer(override val uid: String)
       "newAfterNChars" -> $(newAfterNChars).toString,
       "overlap" -> $(overlap).toString,
       "combineTextUnderNChars" -> $(combineTextUnderNChars).toString,
-      "overlapAll" -> $(overlapAll).toString)
+      "overlapAll" -> $(overlapAll).toString,
+      "xmlKeepTags" -> $(xmlKeepTags).toString,
+      "onlyLeafNodes" -> $(onlyLeafNodes).toString)
     val partitionInstance = new Partition(params.asJava)

     val inputColum = if (get(inputCols).isDefined) {
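
Because PartitionTransformer now mixes in HasXmlReaderProperties, the XML options can be set on the transformer itself and are forwarded to Partition through the params map built above. A minimal sketch showing only the setters introduced by this commit; the usual input/output column wiring is omitted:

import com.johnsnowlabs.partition.PartitionTransformer

val partitionXml = new PartitionTransformer()
  .setXmlKeepTags(true)    // forwarded as "xmlKeepTags"
  .setOnlyLeafNodes(false) // forwarded as "onlyLeafNodes"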

src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala

Lines changed: 65 additions & 1 deletion
@@ -296,7 +296,6 @@ class SparkNLPReader(
    * |-- width_dimension: integer (nullable = true)
    * |-- content: binary (nullable = true)
    * |-- exception: string (nullable = true)
-   * |-- pagenum: integer (nullable = true)
    * }}}
    *
    * @param params
@@ -642,4 +641,69 @@ class SparkNLPReader(
       default = BLOCK_SPLIT_PATTERN)
   }

+  /** Instantiates class to read XML files.
+    *
+    * xmlPath: this is a path to a directory of XML files or a path to an XML file. E.g.,
+    * "path/xml/files"
+    *
+    * ==Example==
+    * {{{
+    * val xmlPath = "home/user/xml-directory"
+    * val sparkNLPReader = new SparkNLPReader()
+    * val xmlDf = sparkNLPReader.xml(xmlPath)
+    * }}}
+    *
+    * ==Example 2==
+    * You can use SparkNLP in one line of code:
+    * {{{
+    * val xmlDf = SparkNLP.read.xml(xmlPath)
+    * }}}
+    *
+    * {{{
+    * xmlDf.select("xml").show(false)
+    * +------------------------------------------------------------------------------------------------------------------+
+    * |xml                                                                                                               |
+    * +------------------------------------------------------------------------------------------------------------------+
+    * |[{Title, John Smith, {elementId -> ..., tag -> title}}, {UncategorizedText, Some content..., {elementId -> ...}}] |
+    * +------------------------------------------------------------------------------------------------------------------+
+    *
+    * xmlDf.printSchema()
+    * root
+    *  |-- path: string (nullable = true)
+    *  |-- xml: array (nullable = true)
+    *  |    |-- element: struct (containsNull = true)
+    *  |    |    |-- elementType: string (nullable = true)
+    *  |    |    |-- content: string (nullable = true)
+    *  |    |    |-- metadata: map (nullable = true)
+    *  |    |    |    |-- key: string
+    *  |    |    |    |-- value: string (valueContainsNull = true)
+    * }}}
+    *
+    * @param xmlPath
+    *   Path to the XML file or directory
+    * @return
+    *   A DataFrame with parsed XML as structured elements
+    */
+  def xml(xmlPath: String): DataFrame = {
+    val xmlReader = new XMLReader(getStoreContent, getXmlKeepTags, getOnlyLeafNodes)
+    xmlReader.read(xmlPath)
+  }
+
+  def xmlToHTMLElement(xml: String): Seq[HTMLElement] = {
+    val xmlReader = new XMLReader(getStoreContent, getXmlKeepTags, getOnlyLeafNodes)
+    xmlReader.parseXml(xml)
+  }
+
+  private def getXmlKeepTags: Boolean = {
+    getDefaultBoolean(params.asScala.toMap, Seq("xmlKeepTags", "xml_keep_tags"), default = false)
+  }
+
+  private def getOnlyLeafNodes: Boolean = {
+    getDefaultBoolean(
+      params.asScala.toMap,
+      Seq("onlyLeafNodes", "only_leaf_nodes"),
+      default = true)
+  }
+
 }
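
Besides the file-based xml() entry point, xmlToHTMLElement parses an in-memory XML string straight into HTMLElement objects; this is the hook Partition uses for string content. A short sketch with an illustrative snippet, assuming the default reader settings:

import com.johnsnowlabs.reader.SparkNLPReader

val reader = new SparkNLPReader()
val elements = reader.xmlToHTMLElement(
  """<book>
    |  <title>The Alchemist</title>
    |  <author>Paulo Coelho</author>
    |</book>""".stripMargin)

// With the defaults (onlyLeafNodes = true, xmlKeepTags = false), the <book>
// wrapper is skipped and two Title elements are returned, each carrying an
// elementId and its parent's id in the metadata map.
elements.foreach(println)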
src/main/scala/com/johnsnowlabs/reader/XMLReader.scala

Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
+/*
+ * Copyright 2017-2025 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.johnsnowlabs.reader
+
+import com.johnsnowlabs.nlp.util.io.ResourceHelper
+import com.johnsnowlabs.nlp.util.io.ResourceHelper.validFile
+import com.johnsnowlabs.partition.util.PartitionHelper.datasetWithTextFile
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.functions.{col, udf}
+
+import scala.collection.mutable
+import scala.collection.mutable.ListBuffer
+import scala.xml.{Elem, Node, XML}
+
+class XMLReader(
+    storeContent: Boolean = false,
+    xmlKeepTags: Boolean = false,
+    onlyLeafNodes: Boolean = true)
+    extends Serializable {
+
+  private lazy val spark = ResourceHelper.spark
+
+  private var outputColumn = "xml"
+
+  def setOutputColumn(value: String): this.type = {
+    require(value.nonEmpty, "Output column name cannot be empty.")
+    outputColumn = value
+    this
+  }
+
+  def read(inputSource: String): DataFrame = {
+    if (validFile(inputSource)) {
+      val xmlDf = datasetWithTextFile(spark, inputSource)
+        .withColumn(outputColumn, parseXmlUDF(col("content")))
+      if (storeContent) xmlDf.select("path", "content", outputColumn)
+      else xmlDf.select("path", outputColumn)
+    } else throw new IllegalArgumentException(s"Invalid inputSource: $inputSource")
+  }
+
+  private val parseXmlUDF = udf((xml: String) => {
+    parseXml(xml)
+  })
+
+  def parseXml(xmlString: String): List[HTMLElement] = {
+    val xml = XML.loadString(xmlString)
+    val elements = ListBuffer[HTMLElement]()
+
+    def traverse(node: Node, parentId: Option[String]): Unit = {
+      node match {
+        case elem: Elem =>
+          val tagName = elem.label.toLowerCase
+          val textContent = elem.text.trim
+          val elementId = hash(tagName + textContent)
+
+          val isLeaf = !elem.child.exists(_.isInstanceOf[Elem])
+
+          if (!onlyLeafNodes || isLeaf) {
+            val elementType = tagName match {
+              case "title" | "author" => ElementType.TITLE
+              case _ => ElementType.UNCATEGORIZED_TEXT
+            }
+
+            val metadata = mutable.Map[String, String]("elementId" -> elementId)
+            if (xmlKeepTags) metadata += ("tag" -> tagName)
+            parentId.foreach(id => metadata += ("parentId" -> id))
+
+            val content = if (isLeaf) textContent else ""
+            elements += HTMLElement(elementType, content, metadata)
+          }
+
+          // Traverse children
+          elem.child.foreach(traverse(_, Some(elementId)))
+
+        case _ => // Ignore other types
+      }
+    }
+
+    traverse(xml, None)
+    elements.toList
+  }
+
+  def hash(s: String): String = {
+    java.security.MessageDigest
+      .getInstance("MD5")
+      .digest(s.getBytes)
+      .map("%02x".format(_))
+      .mkString
+  }
+
+}
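
The reader also works standalone, outside Partition and SparkNLPReader. A minimal sketch using the constructor arguments and setOutputColumn defined above; the directory path points at the test resources and is otherwise illustrative:

import com.johnsnowlabs.reader.XMLReader

val xmlReader = new XMLReader(storeContent = true, xmlKeepTags = true, onlyLeafNodes = false)
  .setOutputColumn("xml_elements")

// With storeContent = true the result keeps "path", "content" and the
// configured output column holding the parsed elements.
val df = xmlReader.read("src/test/resources/reader/xml")
df.printSchema()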
New XML test fixture under src/test/resources/reader/xml

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+<library>
+  <section name="Fiction">
+    <shelf number="1">
+      <book>
+        <title>The Alchemist</title>
+        <author>Paulo Coelho</author>
+        <year>1988</year>
+      </book>
+    </shelf>
+  </section>
+  <section name="Science">
+    <shelf number="2">
+      <book>
+        <title>A Brief History of Time</title>
+        <author>Stephen Hawking</author>
+        <year>1988</year>
+      </book>
+    </shelf>
+  </section>
+</library>
New XML test fixture under src/test/resources/reader/xml

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+<bookstore>
+  <book category="children">
+    <title lang="en">Harry Potter</title>
+    <author>J K. Rowling</author>
+    <year>2005</year>
+    <price>29.99</price>
+  </book>
+  <book category="web">
+    <title lang="en">Learning XML</title>
+    <author>Erik T. Ray</author>
+    <year>2003</year>
+    <price>39.95</price>
+  </book>
+</bookstore>
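
For orientation, this is roughly what XMLReader's defaults produce for fixtures like the two above: only leaf elements are emitted (onlyLeafNodes = true), title and author map to Title, everything else to UncategorizedText. A hedged sketch of that expectation against a one-book excerpt:

import com.johnsnowlabs.reader.XMLReader

// A one-book excerpt of the bookstore fixture above, inlined as a String.
val bookXml =
  """<bookstore>
    |  <book category="children">
    |    <title lang="en">Harry Potter</title>
    |    <author>J K. Rowling</author>
    |    <year>2005</year>
    |    <price>29.99</price>
    |  </book>
    |</bookstore>""".stripMargin

val elements = new XMLReader().parseXml(bookXml)

// With the defaults (onlyLeafNodes = true, xmlKeepTags = false) only the four
// leaf elements are emitted: title and author as Title, year and price as
// UncategorizedText, each with an elementId and its parent <book>'s id.
assert(elements.length == 4)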

src/test/scala/com/johnsnowlabs/partition/PartitionTest.scala

Lines changed: 8 additions & 0 deletions
@@ -32,6 +32,7 @@ class PartitionTest extends AnyFlatSpec {
   val emailDirectory = "src/test/resources/reader/email"
   val htmlDirectory = "src/test/resources/reader/html"
   val pdfDirectory = "src/test/resources/reader/pdf"
+  val xmlDirectory = "src/test/resources/reader/xml"

   "Partition" should "work with text content_type" taggedAs FastTest in {
     val textDf = Partition(Map("content_type" -> "text/plain")).partition(txtDirectory)
@@ -181,4 +182,11 @@ class PartitionTest extends AnyFlatSpec {
     assert(elements == expectedElements)
   }

+  it should "work with XML content_type" taggedAs FastTest in {
+    val xmlDf = Partition(Map("content_type" -> "application/xml")).partition(xmlDirectory)
+    xmlDf.show()
+
+    assert(!xmlDf.select(col("xml")).isEmpty)
+  }
+
 }
