From 9cac2bb608c65f7fec19927711aff6dd1c54005a Mon Sep 17 00:00:00 2001
From: Shujing Yang
Date: Fri, 8 Mar 2024 09:44:12 +0900
Subject: [PATCH] [SPARK-47309][SQL][XML] Fix schema inference issues in XML

### What changes were proposed in this pull request?

This PR fixes XML schema inference issues:

1. when there's an empty tag
2. when merging schema for NullType

### Why are the changes needed?

To fix two bugs in XML schema inference: an empty tag was inferred as `NullType` without consuming its end element from the parser, and the merged type of an already-seen field was not wrapped in an array type when the new type was `NullType`.

### Does this PR introduce _any_ user-facing change?

Yes

### How was this patch tested?

Unit tests. There's a follow-up [PR](https://github.com/apache/spark/pull/45411) that introduces comprehensive tests for schema inference.

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #45426 from shujingyang-db/fix-xml-schema-inference.

Authored-by: Shujing Yang
Signed-off-by: Hyukjin Kwon
---
 .../org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala
index be5a29d299a3..b9342c53d020 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala
@@ -195,7 +195,9 @@ class XmlInferSchema(options: XmlOptions, caseSensitive: Boolean)
 
   private def inferField(parser: XMLEventReader): DataType = {
     parser.peek match {
-      case _: EndElement => NullType
+      case _: EndElement =>
+        parser.nextEvent()
+        NullType
       case _: StartElement => inferObject(parser)
       case _: Characters =>
         val structType = inferObject(parser).asInstanceOf[StructType]
@@ -450,7 +452,7 @@ class XmlInferSchema(options: XmlOptions, caseSensitive: Boolean)
     oldTypeOpt match {
       // If the field name already exists,
       // merge the type and infer the combined field as an array type if necessary
-      case Some(oldType) if !oldType.isInstanceOf[ArrayType] && !newType.isInstanceOf[NullType] =>
+      case Some(oldType) if !oldType.isInstanceOf[ArrayType] =>
         ArrayType(compatibleType(caseSensitive, options.valueTag)(oldType, newType))
       case Some(oldType) =>
         compatibleType(caseSensitive, options.valueTag)(oldType, newType)