![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-normalizer/document_normalizer_notebook.ipynb)


# Document Normalizer annotator notebook

# Set up Colab environment

In [0]:
# Only run this cell when you are using Spark NLP on Google Colab
# wget streams the setup script to stdout (-O -) and the pipe hands it directly to bash for execution.
# NOTE(review): the script is fetched over plain http and executed unverified — confirm this is acceptable.
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

In [0]:
!wget http://ckl-it.de/wp-content/uploads/2022/12/docs.zip
!unzip -f docs.zip

# Start Spark NLP session

In [0]:
# Spark NLP entry point plus the base and annotator building blocks
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *

# Start the Spark session through the Spark NLP helper
spark = sparknlp.start()

# Document Normalizer annotator overview

The DocumentNormalizer is an annotator that can be used after the DocumentAssembler to
normalize documents once they have been processed and indexed.

It takes as input annotated documents of type `Array[AnnotatorType]` (DOCUMENT) and gives
as output annotated documents of type `AnnotatorType.DOCUMENT`.

Parameters are:
- inputCol: input column name string which targets a column of type
  Array(AnnotatorType.DOCUMENT).
- outputCol: output column name string which targets a column of type
  AnnotatorType.DOCUMENT.
- action: action string to perform applying regex patterns, i.e. (clean | extract).
  Default is "clean".
- cleanupPatterns: normalization regex patterns whose matches will be removed from the
  document. Default is "<[^>]*>" (e.g., it removes all HTML tags).
- replacement: replacement string to apply when regexes match. Default is " ".
- lowercase: whether to convert strings to lowercase. Default is False.
- removalPolicy: policy used to remove the matched patterns from the text. Valid
  policy values are: "all", "pretty_all", "first", "pretty_first". Default is
  "pretty_all".
- encoding: file encoding to apply on normalized documents. Supported encodings are:
  UTF_8, UTF_16, US_ASCII, ISO-8859-1, UTF-16BE, UTF-16LE. Default is "UTF-8".

In [0]:
# First stage: turn the raw "text" column into a "document" annotation column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# Column wiring for the normalizer.
inpuColName = "document"
outputColName = "normalizedDocument"

# Normalizer settings, spelled out one by one so each parameter is visible.
action = "clean"
cleanUpPatterns = ["<[^>]*>"]  # regex matching anything between '<' and '>'
replacement = " "
removalPolicy = "pretty_all"
encoding = "UTF-8"

# Second stage: the DocumentNormalizer configured with the values above.
documentNormalizer = (
    DocumentNormalizer()
    .setInputCols(inpuColName)
    .setOutputCol(outputColName)
    .setAction(action)
    .setPatterns(cleanUpPatterns)
    .setReplacement(replacement)
    .setPolicy(removalPolicy)
    .setLowercase(True)
    .setEncoding(encoding)
)

# Data loading

In [0]:
# Directory containing the HTML sample documents.
path = "html-docs"

# Read every file under `path` as a (filename, content) pair, then keep only the
# content as the "text" column consumed by the DocumentAssembler.
# The directory comes from `path` rather than a duplicated literal, so changing
# the location only requires editing one line.
data = spark.sparkContext.wholeTextFiles(path)
df = data.toDF(schema=["filename", "text"]).select("text")

df.show()

# Example 1: remove all the tags from HTML text files

In [0]:
# With the data loaded, the documents are run through a pipeline whose normalizer
# sits right after the DocumentAssembler.
# Here the DataFrame holds HTML pages and the goal is to strip every tag from them.

documentAssembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# Regex matching anything enclosed in angle brackets.
cleanUpPatterns = ["<[^>]*>"]

documentNormalizer = (
    DocumentNormalizer()
    .setInputCols("document")
    .setOutputCol("normalizedDocument")
    .setAction("clean")
    .setPatterns(cleanUpPatterns)
    .setReplacement(" ")
    .setPolicy("pretty_all")
    .setLowercase(True)
)

sentenceDetector = (
    SentenceDetector()
    .setInputCols(["normalizedDocument"])
    .setOutputCol("sentence")
)

# The tokenizer is fitted up front, so the pipeline receives the fitted stage.
regexTokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
    .fit(df)
)

stages = [
    documentAssembler,
    documentNormalizer,
    sentenceDetector,
    regexTokenizer,
]
docPatternRemoverPipeline = Pipeline().setStages(stages)

ds = docPatternRemoverPipeline.fit(df).transform(df)

ds.select("normalizedDocument").show(10)

# Example 2: obfuscate PII such as emails in HTML content

In [0]:
# Obfuscate e-mail addresses: every match of the e-mail regex is replaced by a
# fixed placeholder string.
documentAssembler = DocumentAssembler() \
    .setInputCol('text') \
    .setOutputCol('document')

action = "clean"
# Shape matched: local-part(.more)*@(label.)+tld, where no piece contains '.', '@' or whitespace.
patterns = ["([^.@\\s]+)(\\.[^.@\\s]+)*@([^.@\\s]+\\.)+([^.@\\s]+)"]
replacement = "***OBFUSCATED PII***"

# Use the `action` and `patterns` defined in this cell. The previous version passed
# `cleanUpPatterns` (the HTML tag regex left over from Example 1), so tags were
# replaced and e-mail addresses were left untouched.
documentNormalizer = DocumentNormalizer() \
    .setInputCols("document") \
    .setOutputCol("normalizedDocument") \
    .setAction(action) \
    .setPatterns(patterns) \
    .setReplacement(replacement) \
    .setPolicy("pretty_all") \
    .setLowercase(True)

sentenceDetector = SentenceDetector() \
      .setInputCols(["normalizedDocument"]) \
      .setOutputCol("sentence")

regexTokenizer = Tokenizer() \
      .setInputCols(["sentence"]) \
      .setOutputCol("token") \
      .fit(df)

docPatternRemoverPipeline = \
  Pipeline() \
    .setStages([
        documentAssembler,
        documentNormalizer,
        sentenceDetector,
        regexTokenizer])

ds = docPatternRemoverPipeline.fit(df).transform(df)

# truncate=False so the placeholder is visible in the printed rows
ds.select("normalizedDocument").show(10, False)

# Example 3: obfuscate PII such as ages in HTML content

In [0]:
# Obfuscate ages: digits followed by "year", or digits preceded by "aged",
# are replaced by a fixed placeholder string.
documentAssembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

action = "clean"
patterns = [
    "\\d+(?=[\\s]?year)",  # number followed by optional whitespace and "year"
    "(aged)[\\s]?\\d+",    # "aged", optional whitespace, then a number
]
replacement = "***OBFUSCATED PII***"

documentNormalizer = (
    DocumentNormalizer()
    .setInputCols("document")
    .setOutputCol("normalizedDocument")
    .setAction(action)
    .setPatterns(patterns)
    .setReplacement(replacement)
    .setPolicy("pretty_all")
    .setLowercase(True)
)

sentenceDetector = (
    SentenceDetector()
    .setInputCols(["normalizedDocument"])
    .setOutputCol("sentence")
)

# Fitted ahead of time; the pipeline gets the fitted tokenizer as its last stage.
regexTokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
    .fit(df)
)

stages = [
    documentAssembler,
    documentNormalizer,
    sentenceDetector,
    regexTokenizer,
]
docPatternRemoverPipeline = Pipeline().setStages(stages)

ds = docPatternRemoverPipeline.fit(df).transform(df)

ds.select("normalizedDocument").show(10, False)

# Example 4: extract XML name tag contents

In [0]:
# Load the XML sample files: one (filename, content) pair per file,
# of which only the content is kept as the "text" column.
data = spark.sparkContext.wholeTextFiles("xml-docs")
df = (
    data
    .toDF(schema=["filename", "text"])
    .select("text")
)
df.show()

In [0]:
# Extract the contents of a given XML tag using the "extract" action.
documentAssembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

action = "extract"

# With this action the pattern list carries the tag name.
tag = "name"
patterns = [tag]

documentNormalizer = (
    DocumentNormalizer()
    .setInputCols("document")
    .setOutputCol("normalizedDocument")
    .setAction(action)
    .setPatterns(patterns)
    .setReplacement("")
    .setPolicy("pretty_all")
    .setLowercase(True)
)

sentenceDetector = (
    SentenceDetector()
    .setInputCols(["normalizedDocument"])
    .setOutputCol("sentence")
)

# Fitted ahead of time; the pipeline gets the fitted tokenizer as its last stage.
regexTokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
    .fit(df)
)

stages = [
    documentAssembler,
    documentNormalizer,
    sentenceDetector,
    regexTokenizer,
]
docPatternRemoverPipeline = Pipeline().setStages(stages)

ds = docPatternRemoverPipeline.fit(df).transform(df)

ds.select("normalizedDocument").show(10, False)

# Example 5: apply lookaround patterns

In [0]:
# Sample rows as (id, text) pairs: decimal numbers written with '.' or ',',
# some next to the unit "mg" and some not.
articles = [
    (1, "10.2"),
    (2, "9,53"),
    (3, "11.01 mg"),
    (4, "mg 11.01"),
    (5, "14,220"),
    (6, "Amoxiciline 4,5 mg for $10.35; Ibuprofen 5,5mg for $9.00."),
]

In [0]:
# Build a DataFrame with columns "id" and "text" from the sample rows,
# then print its schema and its full (untruncated) contents.
articles_cols = ["id", "text"]
df = spark.createDataFrame(articles, articles_cols)
df.printSchema()
df.show(truncate=False)

## Annotate replacing . with , using positive lookahead

In [0]:
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *

# Targeting text "11.01 mg", annotating it to "11,01 mg"

action = "lookaround"
# Raw string: the regex value is unchanged, but the backslashes are no longer
# read by Python as (invalid) string escapes such as "\d" and "\.".
# The capture group (\.) is the decimal point; (?= mg) requires " mg" to follow the number.
patterns = [r".*\d+(\.)\d+(?= mg).*"]
replacement = ","

document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

doc_norm = DocumentNormalizer() \
    .setInputCols(["document"]) \
    .setOutputCol("normalized") \
    .setAction(action) \
    .setPatterns(patterns) \
    .setReplacement(replacement)

pipeline = Pipeline(stages=[
    document_assembler,
    doc_norm
])

model = pipeline.fit(df)
# Show the original and normalized text side by side, untruncated
model.transform(df).select("text", "normalized").show(20, False)

## Annotate replacing . with , using positive lookbehind

In [0]:
# Targeting text "mg 11.01", annotating it to "mg 11,01"

action = "lookaround"
# Raw string: the regex value is unchanged, but the backslashes are no longer
# read by Python as (invalid) string escapes such as "\d" and "\.".
# (?<=mg ) requires "mg " right before the number; the capture group (\.) is the decimal point.
patterns = [r".*(?<=mg )\d+(\.)\d+.*"]
replacement = ","

document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

doc_norm = DocumentNormalizer() \
    .setInputCols(["document"]) \
    .setOutputCol("normalized") \
    .setAction(action) \
    .setPatterns(patterns) \
    .setReplacement(replacement)

pipeline = Pipeline(stages=[
    document_assembler,
    doc_norm
])

model = pipeline.fit(df)
# Show the original and normalized text side by side, untruncated
model.transform(df).select("text", "normalized").show(20, False)

## Annotate replacing , with . using iterative positive lookahead

In [0]:
# Targeting text: Amoxiciline 4,5 mg for $10.35; Ibuprofen 5,5mg for $9.00.
# annotating it to:
# Amoxiciline 4.5 mg for $10.35; Ibuprofen 5.5mg for $9.00

action = "lookaround"
# Raw string: the regex value is unchanged, but the backslashes are no longer
# read by Python as (invalid) string escapes such as "\d", "\," and "\s".
# The capture group (\,) is the decimal comma; (?=\s?mg) requires "mg" to follow,
# optionally separated by one whitespace character.
patterns = [r".*\d+(\,)\d+(?=\s?mg).*"]
replacement = "."

document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

doc_norm = DocumentNormalizer() \
    .setInputCols(["document"]) \
    .setOutputCol("normalized") \
    .setAction(action) \
    .setPatterns(patterns) \
    .setReplacement(replacement)

pipeline = Pipeline(stages=[
    document_assembler,
    doc_norm
])

model = pipeline.fit(df)
# Show the original and normalized text side by side, untruncated
model.transform(df).select("text", "normalized").show(20, False)