In [1]:
from sparknlp.annotator import *
from sparknlp.base import *

In [2]:
# List the working directory; the Spark session configured in this notebook
# loads 'sparknlp.jar' by relative path, so the jar should appear in this listing.
!ls


sparknlp.jar  TestHtmlParse.ipynb


In [3]:
from pyspark.sql import SparkSession

import os
# Start (or reuse) a local Spark session with four worker threads, the
# Spark NLP jar from the working directory on the classpath, and a 6500 MB
# driver heap.
spark = (
    SparkSession.builder
    .master("local[4]")
    .config("spark.jars", "sparknlp.jar")
    .config("spark.driver.memory", "6500M")
    .getOrCreate()
)

# Keep Spark's console logging at WARN and above.
spark.sparkContext.setLogLevel("WARN")

In [4]:
# Sample HTML page used as pipeline input: a <title> holding two sentences
# and a <body> with three paragraphs. Whitespace inside the literal is part
# of the data and is kept exactly as authored.
sample_html = """ <html>
              <head>
               <title>
                The title is good.
                Another sentence to detect.
               </title>
              </head>
              <body>
               <p class="title">
                <b>
                 The Dormouse's story title.
                </b>
               </p>
               <p class="story">
                Once upon a time there were three little sisters; and their names were
                <a class="sister" href="http://example.com/elsie" id="link1">
                 Elsie
                </a>
                ,
                <a class="sister" href="http://example.com/lacie" id="link2">
                 Lacie
                </a>
                and
                <a class="sister" href="http://example.com/tillie" id="link3">
                 Tillie
                </a>
                ; and they lived at the bottom of a well.
               </p>
               <p class="story">
                 OtherStory
               </p>
              </body>
             </html>"""

# Single-row DataFrame with columns "id" and "text", cached for reuse.
data = (
    spark.createDataFrame([[1, sample_html]])
    .toDF("id", "text")
    .cache()
)



In [5]:
# Stage 1: wrap the raw "text" column as a "document" annotation column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: HTML parsing restricted to the 'html.head.title' tag path.
# NOTE(review): judging by the displayed result, only the <title> text
# reaches the downstream stages -- confirm against the HtmlParser docs.
html_parser = (
    HtmlParser()
    .setInputCols("document")
    .setOutputCol("html")
    .setTag("html.head.title")
)

# Stage 3: sentence detection over the parsed HTML, with "%%" as a custom
# boundary and explicit split/min/max length settings.
sentence_detector = (
    SentenceDetector()
    .setInputCols(["html"])
    .setOutputCol("sentence")
    .setCustomBounds(["%%"])
    .setSplitLength(235)
    .setMinLength(4)
    .setMaxLength(50)
)

# Stage 4: tokenize each detected sentence.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Stage 5: flatten token and sentence annotations into plain string columns
# (not arrays); annotations are joined with '@' in the displayed result.
finisher = (
    Finisher()
    .setInputCols(["token", "sentence"])
    .setOutputCols(["token_views", "sentence_views"])
    .setOutputAsArray(False)
    .setAnnotationSplitSymbol("@")
    .setValueSplitSymbol("#")
)

# Assemble the stages in order, fit on the sample data, transform that same
# data, and display the two finished columns for the first row untruncated.
pipeline = Pipeline(
    stages=[
        document_assembler,
        html_parser,
        sentence_detector,
        tokenizer,
        finisher,
    ]
)
(
    pipeline.fit(data)
    .transform(data)
    .select("token_views", "sentence_views")
    .show(1, False)
)



+------------------------------------------------+----------------------------------------------+
|token_views                                     |sentence_views                                |
+------------------------------------------------+----------------------------------------------+
|The@title@is@good@.@Another@sentence@to@detect@.|The title is good.@Another sentence to detect.|
+------------------------------------------------+----------------------------------------------+

