Skip to content

Adding Documentation for SparkNLP Readers and Partition class #14571

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 163 additions & 0 deletions python/sparknlp/partition/partition.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion python/sparknlp/reader/sparknlp_reader.py
Original file line number Diff line number Diff line change
@@ -259,7 +259,7 @@ def ppt(self, docPath):
Parameters
----------
docPath : str
Path to an excel document file.
Path to a PowerPoint document file.

Returns
-------
163 changes: 163 additions & 0 deletions src/main/scala/com/johnsnowlabs/partition/Partition.scala
Original file line number Diff line number Diff line change
@@ -21,8 +21,97 @@ import org.apache.spark.sql.DataFrame
import java.net.URL
import scala.collection.JavaConverters._

/** The Partition class provides a streamlined interface for interacting with Spark NLP readers.
* It allows you to extract content from various file formats while providing customization using
* keyword arguments. File types include Email, Excel, HTML, PPT, Text, Word documents.
*
 * The Partition class simplifies document handling by automatically detecting the document type
 * based on either the file extension or the provided contentType parameter. It then initializes
 * the appropriate document reader using the SparkNLPReader class, making it convenient to use.
*
* @param params
* Map of parameters with custom configurations.
* It includes the following parameters:
*
* - content_type (All): Override automatic file type detection.
* - store_content (All): Include raw file content in the output DataFrame as a separate
* 'content' column.
* - timeout (HTML): Timeout in seconds for fetching remote HTML content.
* - title_font_size (HTML, Excel): Minimum font size used to identify titles based on
* formatting.
* - include_page_breaks (Word, Excel): Whether to tag content with page break metadata.
* - group_broken_paragraphs (Text): Whether to merge broken lines into full paragraphs using
* heuristics.
* - title_length_size (Text): Max character length used to qualify text blocks as titles.
* - paragraph_split (Text): Regex to detect paragraph boundaries when grouping lines.
* - short_line_word_threshold (Text): Max word count for a line to be considered short.
* - threshold (Text): Ratio of empty lines used to switch between newline-based and paragraph
* grouping.
* - max_line_count (Text): Max lines evaluated when analyzing paragraph structure.
* - include_slide_notes (PowerPoint): Whether to include speaker notes from slides as
* narrative text.
* - infer_table_structure (Word, Excel, PowerPoint): Generate full HTML table structure from
* parsed table content.
* - append_cells (Excel): Append all rows into a single content block instead of individual
* elements.
* - cell_separator (Excel): String used to join cell values in a row for text output.
* - add_attachment_content (Email): Include text content of plain-text attachments in the
* output.
* - headers (HTML): This is used when a URL is provided, allowing you to set the necessary
* headers for the request.
*
* ==Example 1 (Reading Text Files)==
* {{{
* val txtDirectory = "/content/txtfiles/reader/txt"
* val textDf = Partition(Map("content_type" -> "text/plain")).partition(txtDirectory)
* textDf.show()
*
* +--------------------+--------------------+
* | path| txt|
* +--------------------+--------------------+
* |file:/content/txt...|[{Title, BIG DATA...|
* +--------------------+--------------------+
* }}}
*
* ==Example 2 (Reading Email Files)==
* {{{
* emailDirectory = "./email-files/test-several-attachments.eml"
* partitionDf = Partition(Map("content_type" -> "message/rfc822")).partition(emailDirectory)
* partitionDf.show()
* +--------------------+--------------------+
* | path| email|
* +--------------------+--------------------+
* |file:/content/ema...|[{Title, Test Sev...|
* +--------------------+--------------------+
* }}}
*
* ==Example 3 (Reading Webpages)==
* {{{
* val htmlDf = Partition().partition("https://www.wikipedia.org")
* htmlDf.show()
*
* +--------------------+--------------------+
* | url| html|
* +--------------------+--------------------+
* |https://www.wikip...|[{Title, Wikipedi...|
* +--------------------+--------------------+
*
* }}}
 *
*/

class Partition(params: java.util.Map[String, String] = new java.util.HashMap()) {

/** Takes a URL/file/directory path to read and parse its content.
*
* @param path
* Path to a file or local directory where all files are stored. Supports URLs and DFS file
* systems like databricks, HDFS and Microsoft Fabric OneLake.
* @param headers
* If the path is a URL it sets the necessary headers for the request.
* @return
* DataFrame with parsed file content.
*/
def partition(
path: String,
headers: java.util.Map[String, String] = new java.util.HashMap()): DataFrame = {
@@ -78,6 +167,41 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
}
}

/** Parses multiple URLs.
 *
 * @param urls
 *   list of URLs
* @param headers
* sets the necessary headers for the URL request.
* @return
* DataFrame with parsed url content.
*
* ==Example==
* {{{
* val htmlDf =
* Partition().partitionUrls(Array("https://www.wikipedia.org", "https://example.com/"))
* htmlDf.show()
*
* +--------------------+--------------------+
* | url| html|
* +--------------------+--------------------+
* |https://www.wikip...|[{Title, Wikipedi...|
* |https://example.com/|[{Title, Example ...|
* +--------------------+--------------------+
*
* htmlDf.printSchema()
* root
* |-- url: string (nullable = true)
* |-- html: array (nullable = true)
* | |-- element: struct (containsNull = true)
* | | |-- elementType: string (nullable = true)
* | | |-- content: string (nullable = true)
* | | |-- metadata: map (nullable = true)
* | | | |-- key: string
* | | | |-- value: string (valueContainsNull = true)
* }}}
*/

def partitionUrls(urls: Array[String], headers: Map[String, String] = Map.empty): DataFrame = {
if (urls.isEmpty) throw new IllegalArgumentException("URL array is empty")
val sparkNLPReader = new SparkNLPReader(params, headers.asJava)
@@ -90,6 +214,45 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
partitionUrls(urls.asScala.toArray, headers.asScala.toMap)
}

/** Parses and reads data from a string.
*
* @param text
* Text data in the form of a string.
* @return
* DataFrame with parsed text content.
*
* ==Example==
* {{{
* val content =
* """
* |The big brown fox
* |was walking down the lane.
* |
* |At the end of the lane,
* |the fox met a bear.
* |""".stripMargin
*
* val textDf = Partition(Map("groupBrokenParagraphs" -> "true")).partitionText(content)
* textDf.show()
*
* +-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
* |txt |
* +-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
* |[{NarrativeText, The big brown fox was walking down the lane., {paragraph -> 0}}, {NarrativeText, At the end of the lane, the fox met a bear., {paragraph -> 0}}]|
* +-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
*
* textDf.printSchema()
* root
* |-- txt: array (nullable = true)
* | |-- element: struct (containsNull = true)
* | | |-- elementType: string (nullable = true)
* | | |-- content: string (nullable = true)
* | | |-- metadata: map (nullable = true)
* | | | |-- key: string
* | | | |-- value: string (valueContainsNull = true)
*
* }}}
*/
def partitionText(text: String): DataFrame = {
val sparkNLPReader = new SparkNLPReader(params)
sparkNLPReader.txtContent(text)
Loading
Oops, something went wrong.