In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
import sparknlp

In [2]:
# Start the session through Spark NLP's helper; `spark` is reused by later
# cells (`spark.version`, `spark.createDataFrame`).
spark = sparknlp.start()

In [3]:
# Report the library versions this notebook was executed with.
version_report = (
    ("Spark NLP version: ", sparknlp.version()),
    ("Apache Spark version: ", spark.version),
)
for version_label, version_value in version_report:
    print(version_label, version_value)

Spark NLP version:  3.1.0
Apache Spark version:  3.1.1


In [4]:
# Bare expression so the notebook renders the session object created above.
spark

In [5]:
from sparknlp.pretrained import PretrainedPipeline

In [6]:
# Load the pretrained English 'recognize_entities_dl' pipeline
# (downloaded on first use, as the cell output below reports).
pretrained_pipeline = PretrainedPipeline(
    'recognize_entities_dl',
    lang='en',
)

recognize_entities_dl download started this may take some time.
Approx size to download 160.1 MB
[OK!]


In [7]:
# Sample input for the pretrained pipeline.
text = "The Mona Lisa is a 16th century oil painting created by Leonardo. It's held at the Louvre in Paris."

# annotate() yields a mapping indexed by output name; 'token' and 'ner' are
# read here and paired position by position.
result = pretrained_pipeline.annotate(text)
[(token, tag) for token, tag in zip(result['token'], result['ner'])]

[('The', 'O'),
 ('Mona', 'B-PER'),
 ('Lisa', 'I-PER'),
 ('is', 'O'),
 ('a', 'O'),
 ('16th', 'O'),
 ('century', 'O'),
 ('oil', 'O'),
 ('painting', 'O'),
 ('created', 'O'),
 ('by', 'O'),
 ('Leonardo', 'B-PER'),
 ('.', 'O'),
 ("It's", 'O'),
 ('held', 'O'),
 ('at', 'O'),
 ('the', 'O'),
 ('Louvre', 'B-LOC'),
 ('in', 'O'),
 ('Paris', 'B-LOC'),
 ('.', 'O')]

In [8]:
# Example texts; each one becomes a single-column row of the input DataFrame.
example_texts = [
    "Apple is looking at buying U.K. startup for $1 billion",
    "Elon Reeve Musk FRS is an entrepreneur and business magnate. He is the founder, CEO, and Chief Engineer at SpaceX; early stage investor, CEO, and Product Architect of Tesla, Inc.",
    "Timothy Donald Cook (born November 1, 1960) is an American business executive who has been the chief executive officer of Apple Inc. since 2011.",
    "Apache Spark is an open-source unified analytics engine for large-scale data processing.",
]

# Wrap every text in its own one-element row, then name the column 'text'.
data = spark.createDataFrame([[row_text] for row_text in example_texts]).toDF('text')

In [9]:
# Print the input rows; truncate=False keeps the long 'text' values intact.
data.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                                              |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Apple is looking at buying U.K. startup for $1 billion                                                                                                                            |
|Elon Reeve Musk FRS is an entrepreneur and business magnate. He is the founder, CEO, and Chief Engineer at SpaceX; early stage investor, CEO, and Product Architect of Tesla, Inc.|
|Timothy Donald Cook (born November 1, 1960) is an American business executive who has been the

In [10]:
# Entry stage: reads the raw 'text' column and writes 'document' annotations.
# NOTE(review): 'shrink' cleanup presumably collapses redundant whitespace —
# confirm against the DocumentAssembler documentation.
document = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
    .setCleanupMode('shrink')
)

In [11]:
# Sentence splitter: reads 'document', writes 'sentence'.
sentence = SentenceDetector()
sentence.setInputCols('document')
sentence.setOutputCol('sentence')
# Explode so each detected sentence becomes its own row (the later outputs
# show 6 rows for the 4 input texts). Kept as the last expression so the cell
# still echoes the annotator.
sentence.setExplodeSentences(True)

SentenceDetector_7ba33fa7c64f

In [12]:
# Tokenizer: reads 'sentence', writes 'token'.
tokenizer = (
    Tokenizer()
    .setInputCols('sentence')
    .setOutputCol('token')
)

In [15]:
# Spell checker using the default pretrained Norvig-Sweeting model (the
# download log below names it spellcheck_norvig): reads 'token', writes 'checked'.
checker = (
    NorvigSweetingModel.pretrained()
    .setInputCols(['token'])
    .setOutputCol('checked')
)

spellcheck_norvig download started this may take some time.
Approximate size to download 4.2 MB
[OK!]


In [16]:
# Word embeddings from the default pretrained model (the download log below
# reports glove_100d).
# Embed the spell-checked tokens ('checked'), not the raw 'token' column: the
# NER and chunk-converter stages both consume 'checked', so the embeddings
# must come from that same token column to stay aligned with what gets tagged.
# The spell checker runs before this stage in the pipeline, so 'checked' is
# available here.
embeddings = (
    WordEmbeddingsModel.pretrained()
    .setInputCols(['sentence', 'checked'])
    .setOutputCol('embeddings')
)

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [17]:
# NER tagger using the default pretrained model (the download log below
# reports ner_dl): reads 'sentence', 'checked' and 'embeddings', writes 'ner'.
ner = (
    NerDLModel.pretrained()
    .setInputCols(['sentence', 'checked', 'embeddings'])
    .setOutputCol('ner')
)

ner_dl download started this may take some time.
Approximate size to download 13.6 MB
[OK!]


In [18]:
# Chunk converter: reads 'sentence', 'checked' and 'ner', writes 'chunk'
# (the entity spans shown in the chunk output further down).
converter = (
    NerConverter()
    .setInputCols(['sentence', 'checked', 'ner'])
    .setOutputCol('chunk')
)

In [21]:
from pyspark.ml import Pipeline

In [26]:
# Assemble the stages in dependency order: each stage only reads columns
# produced by the stages listed before it.
pipeline = Pipeline(
    stages=[
        document,    # text      -> document
        sentence,    # document  -> sentence
        tokenizer,   # sentence  -> token
        checker,     # token     -> checked
        embeddings,  # word vectors
        ner,         # per-token entity tags
        converter,   # tags      -> entity chunks
    ]
)

In [27]:
# Fit the pipeline on the example DataFrame; the fitted `model` is what the
# later cells pass to transform() and LightPipeline.
model = pipeline.fit(data)

In [30]:
# Run every stage over `data`; `result` carries one column per stage output
# (see the column header printed by the next cell).
# NOTE(review): this rebinds `result`, which an earlier cell used for the
# annotate() dict — consider a distinct name.
result = model.transform(data)

In [31]:
# Preview the annotated DataFrame; with default arguments show() truncates
# long cell values, as the output below illustrates.
result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|             checked|          embeddings|                 ner|               chunk|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Apple is looking ...|[{document, 0, 53...|[{document, 0, 53...|[{token, 0, 4, Ap...|[{token, 0, 4, Ap...|[{word_embeddings...|[{named_entity, 0...|[{chunk, 0, 4, Ap...|
|Elon Reeve Musk F...|[{document, 0, 17...|[{document, 0, 59...|[{token, 0, 3, El...|[{token, 0, 3, El...|[{word_embeddings...|[{named_entity, 0...|[{chunk, 0, 18, E...|
|Elon Reeve Musk F...|[{document, 0, 17...|[{document, 61, 1...|[{token, 61, 62, ...|[{token, 61, 62, ...|[{word_embeddings...|[{named_entity, 6...|[{

In [32]:
# Show the full spell-checked token lists, one row per sentence.
(
    result
    .select('checked.result')
    .show(truncate=False)
)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                                             |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[Apple, is, looking, at, buying, U.K, ., startup, for, $1, billion]                                                                                                                |
|[Elon, Reeve, Musk, FRS, is, an, entrepreneur, and, business, magnate, .]                                                                                                          |
|[He, is, the, founder, ,, CEO, ,, and, Chief, Engineer, at, SpaceX, ;]                   

In [41]:
# The same four example texts as plain strings, for side-by-side reading
# with the offsets printed in the following cells.
sent = [
    "Apple is looking at buying U.K. startup for $1 billion",
    "Elon Reeve Musk FRS is an entrepreneur and business magnate. He is the founder, CEO, and Chief Engineer at SpaceX; early stage investor, CEO, and Product Architect of Tesla, Inc.",
    "Timothy Donald Cook (born November 1, 1960) is an American business executive who has been the chief executive officer of Apple Inc. since 2011.",
    "Apache Spark is an open-source unified analytics engine for large-scale data processing."
]
# Emit every text followed by one blank line.
print("\n\n".join(sent), end="\n\n")

Apple is looking at buying U.K. startup for $1 billion

Elon Reeve Musk FRS is an entrepreneur and business magnate. He is the founder, CEO, and Chief Engineer at SpaceX; early stage investor, CEO, and Product Architect of Tesla, Inc.

Timothy Donald Cook (born November 1, 1960) is an American business executive who has been the chief executive officer of Apple Inc. since 2011.

Apache Spark is an open-source unified analytics engine for large-scale data processing.



In [43]:
# Per-token NER tags with the begin/end offsets of each token.
result.select(
    'ner.result',
    'ner.begin',
    'ner.end',
).show()

+--------------------+--------------------+--------------------+
|              result|               begin|                 end|
+--------------------+--------------------+--------------------+
|[B-ORG, O, O, O, ...|[0, 6, 9, 17, 20,...|[4, 7, 15, 18, 25...|
|[B-ORG, I-ORG, I-...|[0, 5, 11, 16, 20...|[3, 9, 14, 18, 21...|
|[O, O, O, O, O, O...|[61, 64, 67, 71, ...|[62, 65, 69, 77, ...|
|[O, O, O, O, O, O...|[115, 121, 127, 1...|[119, 125, 134, 1...|
|[B-PER, I-PER, I-...|[0, 8, 15, 20, 21...|[6, 13, 18, 20, 2...|
|[B-ORG, I-ORG, O,...|[0, 7, 13, 16, 19...|[5, 11, 14, 17, 2...|
+--------------------+--------------------+--------------------+



In [44]:
# Extracted entity chunks with their begin/end offsets, untruncated.
result.select(
    'chunk.result',
    'chunk.begin',
    'chunk.end',
).show(truncate=False)

+------------------------------------------+------------+-------------+
|result                                    |begin       |end          |
+------------------------------------------+------------+-------------+
|[Apple, U.K]                              |[0, 27]     |[4, 29]      |
|[Elon Reeve Musk FRS]                     |[0]         |[18]         |
|[SpaceX]                                  |[107]       |[112]        |
|[Architect of Tesla, Inc]                 |[154]       |[176]        |
|[Timothy Donald Cook, American, Apple Inc]|[0, 50, 122]|[18, 57, 130]|
|[Apache Spark]                            |[0]         |[11]         |
+------------------------------------------+------------+-------------+



# LightPipeline

In [45]:
# Wrap the fitted model in a LightPipeline so plain Python strings can be
# annotated directly (see the annotate() call in the next cell).
light = LightPipeline(model)

In [46]:
# Annotate a single string; the displayed dict has one list of results per
# pipeline output column.
light.annotate('Christiano Ronaldo is a part of the Portugal football team')

{'chunk': ['Christiano Ronaldo', 'Portugal'],
 'checked': ['Christiano',
  'Ronaldo',
  'is',
  'a',
  'part',
  'of',
  'the',
  'Portugal',
  'football',
  'team'],
 'document': ['Christiano Ronaldo is a part of the Portugal football team'],
 'token': ['Christiano',
  'Ronaldo',
  'is',
  'a',
  'part',
  'of',
  'the',
  'Portugal',
  'football',
  'team'],
 'ner': ['B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O'],
 'embeddings': ['Christiano',
  'Ronaldo',
  'is',
  'a',
  'part',
  'of',
  'the',
  'Portugal',
  'football',
  'team'],
 'sentence': ['Christiano Ronaldo is a part of the Portugal football team']}