In [1]:
from pyspark.sql import SparkSession
import sparknlp
import os
import numpy as np
import pandas as pd
#import databricks.koalas as ks

In [2]:
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = "all"

# Hand-configured local Spark session (4 worker threads) that pulls in the
# Spark NLP 3.0.1 jar (Scala 2.12 build) through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("Spark NLP")
    .master("local[4]")
    .config("spark.driver.memory", "16G")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.0.1")
    .getOrCreate()
)

# 1. Initiate a Spark session with default settings

In [3]:
# Obtain the Spark session through Spark NLP's own start() helper and rebind `spark` to it.
# NOTE(review): the previous cell already built a session by hand; presumably start()
# returns that same session via getOrCreate rather than a fresh one — confirm.
spark = sparknlp.start()

In [4]:
# Print the Spark NLP library version and the Apache Spark version of the active session.
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  3.0.1
Apache Spark version:  3.1.1


In [5]:
from sparknlp.pretrained import PretrainedPipeline

# Load the pretrained English entity-recognition pipeline.
# ("recognize_entities_dl" is an alternative pretrained pipeline name that can
# be passed here instead.)
pipeline = PretrainedPipeline('onto_recognize_entities_sm', lang='en')

onto_recognize_entities_sm download started this may take some time.
Approx size to download 160.1 MB
[OK!]


In [6]:
# Run the entity-recognition pipeline on a single sentence, then print the
# per-token tags ('ner') followed by the extracted entity strings ('entities').
ner_sample = 'Google has announced the release of a beta version of the popular TensorFlow machine learning library.'
result = pipeline.annotate(ner_sample)
print(result['ner'])
print(result['entities'])

['B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PRODUCT', 'O', 'O', 'O']
['Google', 'TensorFlow']


In [7]:
# Rebind `pipeline` to the pretrained English sentiment-analysis pipeline
# (this replaces the entity-recognition pipeline loaded earlier).
pipeline = PretrainedPipeline('analyze_sentiment', lang='en')

analyze_sentiment download started this may take some time.
Approx size to download 4.9 MB
[OK!]


In [8]:
# Score a short review with the sentiment pipeline. The misspelling "awoid" is
# deliberate: the 'checked' output lists the tokens after spell checking.
sentiment_sample = 'This is a very boring movie. I recommend others to awoid this movie is not good..'
result = pipeline.annotate(sentiment_sample)
print(result['sentiment'])
print(result['checked'])

['negative', 'negative', 'negative']
['This', 'is', 'a', 'very', 'boring', 'movie', '.', 'I', 'recommend', 'others', 'to', 'avoid', 'this', 'movie', 'is', 'not', 'good', '.', '.']


#### The word "awoid" has been corrected to "avoid" by the spell checker inside this pipeline.

# 2. Document Assembler

In [9]:
from sparknlp.base import *
import sparknlp

In [10]:
# NOTE(review): `spark` was already assigned from sparknlp.start() earlier in this
# notebook, so this second call looks redundant — confirm whether it is kept only
# so that this section can be run on its own.
spark = sparknlp.start() # start spark session

In [11]:
# Print the Spark NLP library version and the Apache Spark version of the active session
# (same check as at the start of the notebook).
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  3.0.1
Apache Spark version:  3.1.1


In [12]:
# Two sample rows; each inner list is one row holding a single value.
sen = [
    ['Hello, this is an example sentence'],
    ['And this is a second sentence.'],
]

# `spark` is the session returned by sparknlp.start() in an earlier cell.
# Build a one-column DataFrame named `text` and preview it.
spark_df = spark.createDataFrame(data=sen, schema=['text'])
spark_df.show()

+--------------------+
|                text|
+--------------------+
|Hello, this is an...|
|And this is a sec...|
+--------------------+



In [13]:
# Show the schema of the input DataFrame (a single string column, `text`).
spark_df.printSchema()

root
 |-- text: string (nullable = true)



In [14]:
# Configure a DocumentAssembler that reads the raw `text` column and writes
# its annotations to a new `document` column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# Apply it to the sample DataFrame and preview the added column.
doc_df = documentAssembler.transform(spark_df)
doc_df.show()


+--------------------+--------------------+
|                text|            document|
+--------------------+--------------------+
|Hello, this is an...|[{document, 0, 33...|
|And this is a sec...|[{document, 0, 29...|
+--------------------+--------------------+



In [15]:
# Inspect the schema after the DocumentAssembler step: `document` is an array of
# structs with annotatorType, begin, end, result, metadata and embeddings fields.
doc_df.printSchema()

root
 |-- text: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)



In [16]:
# Fetch only the `result` field of the `document` annotations for the first two rows.
doc_df.select('document.result').take(2)

[Row(result=['Hello, this is an example sentence']),
 Row(result=['And this is a second sentence.'])]

## ------------------- 

In [17]:
import pyspark.sql.functions as F

# Flatten the `document` array into one row per annotation, then expand the
# annotation struct so that each of its fields becomes its own column.
(
    doc_df
    .withColumn("tmp", F.explode("document"))
    .select('tmp.*')
    .show()
)

+-------------+-----+---+--------------------+---------------+----------+
|annotatorType|begin|end|              result|       metadata|embeddings|
+-------------+-----+---+--------------------+---------------+----------+
|     document|    0| 33|Hello, this is an...|{sentence -> 0}|        []|
|     document|    0| 29|And this is a sec...|{sentence -> 0}|        []|
+-------------+-----+---+--------------------+---------------+----------+

