In [1]:
import json
import pandas as pd
import numpy as np

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

# Create the session object via Spark NLP's start() helper; later cells use
# `spark` for `spark.version` and `spark.createDataFrame(...)`.
spark = sparknlp.start()

In [2]:
# Report the library versions this notebook was run against.
for label, version in (
    ("Spark NLP version: ", sparknlp.version()),
    ("Apache Spark version: ", spark.version),
):
    print(label, version)

Spark NLP version:  3.0.1
Apache Spark version:  3.1.1


In [3]:
from pyspark.ml import Pipeline

# Stage 1: read the raw 'text' column and emit a 'document' column.
document_assembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# Stage 2: derive 'sentences' from 'document'.
sentenceDetector = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentences')
)

# Stage 3: tokenize the detected sentences into 'token'.
tokenizer = (
    Tokenizer()
    .setInputCols(['sentences'])
    .setOutputCol('token')
)

# Stage 4: normalize tokens into 'normal'.
normalizer = (
    Normalizer()
    .setInputCols(['token'])
    .setOutputCol('normal')
)

# Stage 5: look up word embeddings for the normalized tokens.
# pretrained() is called without arguments, so the library's default model is
# fetched (the recorded cell output shows glove_100d being downloaded).
# NOTE(review): this stage reads 'document' while the tokens were produced
# from 'sentences' -- confirm that pairing is intentional.
word_embeddings = (
    WordEmbeddingsModel.pretrained()
    .setInputCols(['document', 'normal'])
    .setOutputCol('embeddings')
)

# Assemble the stages in execution order; nothing runs until fit()/transform().
nlpPipeline = Pipeline(
    stages=[
        document_assembler,
        sentenceDetector,
        tokenizer,
        normalizer,
        word_embeddings,
    ]
)


glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [None]:
# Two single-column rows used as toy input for the pipeline.
sen = [
    ['Hello, this is an example sentence'],
    ['And this is a second sentence.'],
]
# `spark` is the session returned by sparknlp.start() in the first cell.
spark_df = spark.createDataFrame(sen, schema=['text'])
spark_df.show()

In [None]:
%time
pipelineModel = nlpPipeline.fit(spark_df)
result = pipelineModel.transform(spark_df)
result.show()

## 2. LightPipeline vs Spark ML pipeline

In [None]:
from pyspark.sql import Row

# Single test sentence, wrapped in a one-row DataFrame for the Spark ML path.
txt = "How did serfdom develop in and then leave Russia ?"
# The Row field is called `txt`, but the column-name list passed as the schema
# is ["text"], which is the column the pipeline's DocumentAssembler reads.
rows = [Row(txt=txt)]
line_df = spark.createDataFrame(rows, ["text"])
line_df.show()

In [None]:
# Time the regular Spark ML path: transform the one-row DataFrame and collect
# the output to the driver.
# NOTE: this rebinds `result`, which an earlier cell assigned to the
# transformed DataFrame, to the value returned by collect().
%time result = pipelineModel.transform(line_df).collect()

In [None]:
# NOTE(review): likely redundant -- the first cell already does
# `from sparknlp.base import *`; confirm LightPipeline is exported there.
from sparknlp.base import LightPipeline
# Wrap the fitted pipeline model in a LightPipeline for comparison with the
# DataFrame-based transform timed above.
# NOTE(review): parse_embeddings=True presumably controls how embedding values
# are returned -- confirm against the Spark NLP docs.
lightModel = LightPipeline(pipelineModel, parse_embeddings=True)
# Time annotate() on the same sentence used for the Spark ML timing; the
# trailing ';' suppresses display of the returned value.
%time lightModel.annotate("How did serfdom develop in and then leave Russia ?");