In [1]:
# Install PySpark and Spark NLP
!pip install -q spark-nlp==4.3.2

In [2]:
import sparknlp
from sparknlp.base import DocumentAssembler

# Let Spark NLP start the SparkSession
spark = sparknlp.start()

data = [
    (1, "I love working with SparkNLP."),
    (2, "Today is sunny.")
]

# Create a DataFrame
columns = ["id", "text"]
df = spark.createDataFrame(data, columns)

documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

result = documentAssembler.transform(df)

result.show(truncate=False)

+---+-----------------------------+-----------------------------------------------------------------------+
|id |text                         |document                                                               |
+---+-----------------------------+-----------------------------------------------------------------------+
|1  |I love working with SparkNLP.|[{document, 0, 28, I love working with SparkNLP., {sentence -> 0}, []}]|
|2  |Today is sunny.              |[{document, 0, 14, Today is sunny., {sentence -> 0}, []}]              |
+---+-----------------------------+-----------------------------------------------------------------------+



In [3]:
import sparknlp
from sparknlp.base import DocumentAssembler

# Let Spark NLP start the SparkSession
spark = sparknlp.start()

data = [
    (1, "I love working with SparkNLP."),
    (2, "Today is sunny.")
]

# Create a DataFrame
columns = ["id", "text"]
df = spark.createDataFrame(data, columns)

# Create a DocumentAssembler object, note the parentheses after DocumentAssembler
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

result = documentAssembler.transform(df)

result.show(truncate=False)

+---+-----------------------------+-----------------------------------------------------------------------+
|id |text                         |document                                                               |
+---+-----------------------------+-----------------------------------------------------------------------+
|1  |I love working with SparkNLP.|[{document, 0, 28, I love working with SparkNLP., {sentence -> 0}, []}]|
|2  |Today is sunny.              |[{document, 0, 14, Today is sunny., {sentence -> 0}, []}]              |
+---+-----------------------------+-----------------------------------------------------------------------+



In [4]:
document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("processed_text")

In [5]:

result = document_assembler.transform(df)

result.show(truncate=False)

+---+-----------------------------+-----------------------------------------------------------------------+
|id |text                         |processed_text                                                         |
+---+-----------------------------+-----------------------------------------------------------------------+
|1  |I love working with SparkNLP.|[{document, 0, 28, I love working with SparkNLP., {sentence -> 0}, []}]|
|2  |Today is sunny.              |[{document, 0, 14, Today is sunny., {sentence -> 0}, []}]              |
+---+-----------------------------+-----------------------------------------------------------------------+



In [6]:
from sparknlp.annotator import Tokenizer
from pyspark.ml import Pipeline

document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")

pipeline = Pipeline(stages=[document_assembler, tokenizer])
model = pipeline.fit(df)
result = model.transform(df)

result.select("token.result").show(truncate=False)


+-------------------------------------+
|result                               |
+-------------------------------------+
|[I, love, working, with, SparkNLP, .]|
|[Today, is, sunny, .]                |
+-------------------------------------+



Named Entity Recognition with BERT


In [7]:

example_df = spark.createDataFrame([["Microsoft founder Bill Gates plans to build a new factory in Germany."]]).toDF("text")

example_df = pipeline.fit(example_df).transform(example_df)

In [8]:
from sparknlp.annotator import Tokenizer, BertForTokenClassification
import pyspark.sql.functions as F
bert_tagger = BertForTokenClassification.pretrained("bert_base_token_classifier_conll03", "en") \
        .setInputCols(['document', 'token']) \
        .setOutputCol('ner')\
        .setMaxSentenceLength(512)\
        .setCaseSensitive(True)

bert_base_token_classifier_conll03 download started this may take some time.
Approximate size to download 385.4 MB
[OK!]


In [9]:
result = bert_tagger.transform(example_df)
result.show()

+--------------------+--------------------+--------------------+--------------------+
|                text|            document|               token|                 ner|
+--------------------+--------------------+--------------------+--------------------+
|Microsoft founder...|[{document, 0, 68...|[{token, 0, 8, Mi...|[{named_entity, 0...|
+--------------------+--------------------+--------------------+--------------------+



In [10]:
import sparknlp
print("Spark NLP Version:", sparknlp.version())
print("Spark Version:", spark.version)

Spark NLP Version: 4.3.2
Spark Version: 3.2.4


In [11]:
result.select(F.posexplode("token.result").alias("pos", "token"), "ner") \
    .select(F.col("token"), F.col("ner").getItem(F.col("pos")).alias("ner_label")) \
    .show(50, truncate=False)



+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|token    |ner_label                                                                                                                                                                                                                                                                    |
+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Microsoft|{named_entity, 0, 8, B-ORG, {B-LOC -> 6.298694E-4, I-ORG -> 2.1694727E-4, I-MISC -> 1.0996349E-4, I-LOC -> 1.5734222E-5, I-PER -> 6.565089E-5, 

In [12]:
result.printSchema()  # Check the schema of the DataFrame
result.select("ner.result").show(truncate=False)  # Inspect the 'ner.result' column

root
 |-- text: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- token: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 

In [13]:
bert_tagger.extractParamMap()

{Param(parent='BERT_FOR_TOKEN_CLASSIFICATION_675a6a750b89', name='batchSize', doc='Size of every batch'): 8,
 Param(parent='BERT_FOR_TOKEN_CLASSIFICATION_675a6a750b89', name='engine', doc='Deep Learning engine used for this model'): 'tensorflow',
 Param(parent='BERT_FOR_TOKEN_CLASSIFICATION_675a6a750b89', name='lazyAnnotator', doc='Whether this AnnotatorModel acts as lazy in RecursivePipelines'): False,
 Param(parent='BERT_FOR_TOKEN_CLASSIFICATION_675a6a750b89', name='maxSentenceLength', doc='Max sentence length to process'): 512,
 Param(parent='BERT_FOR_TOKEN_CLASSIFICATION_675a6a750b89', name='caseSensitive', doc='whether to ignore case in tokens for embeddings matching'): True,
 Param(parent='BERT_FOR_TOKEN_CLASSIFICATION_675a6a750b89', name='inputCols', doc='previous annotations columns, if renamed'): ['document',
  'token'],
 Param(parent='BERT_FOR_TOKEN_CLASSIFICATION_675a6a750b89', name='outputCol', doc='output annotation column. can be left default.'): 'ner'}

In [14]:
from sparknlp.pretrained import PretrainedPipeline

# Load pre-trained NER pipeline
pipeline = PretrainedPipeline("recognize_entities_dl", lang="en")

# Sample text
text = "IBM, which has an office in Germany, is a leader in AI and NLP."

# Annotate the text
result = pipeline.annotate(text)

# Print the results
print(result['entities'])


recognize_entities_dl download started this may take some time.
Approx size to download 160.1 MB
[OK!]
['IBM', 'Germany', 'AI', 'NLP']


Text Classification

In [15]:
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, BertForSequenceClassification, AlbertForSequenceClassification, DistilBertForSequenceClassification, SentenceDetector
from pyspark.ml import Pipeline
import pyspark.sql.functions as F

In [16]:
document_assembler = DocumentAssembler() \
        .setInputCol('text') \
        .setOutputCol('document')

tokenizer = Tokenizer() \
        .setInputCols(['document']) \
        .setOutputCol('token')

pipeline = Pipeline(stages=[document_assembler,
                            tokenizer])


In [17]:
example_df = spark.createDataFrame([["The movie was brilliant."]]).toDF("text")

example_df = pipeline.fit(example_df).transform(example_df)

In [18]:
example_df.show()

+--------------------+--------------------+--------------------+
|                text|            document|               token|
+--------------------+--------------------+--------------------+
|The movie was bri...|[{document, 0, 23...|[{token, 0, 2, Th...|
+--------------------+--------------------+--------------------+



In [19]:
bert_cls = BertForSequenceClassification.pretrained("bert_classifier_fabriceyhc_base_uncased_imdb", "en") \
        .setInputCols(['document', 'token']) \
        .setOutputCol('class')

bert_classifier_fabriceyhc_base_uncased_imdb download started this may take some time.
[OK!]


In [20]:
!pip install pyspark==3.2.4



In [21]:
import sparknlp
print(sparknlp.version())

4.3.2


In [22]:
import pyspark
print(pyspark.__version__)

3.2.4


In [23]:
result = bert_cls.transform(example_df)
result.show()

+--------------------+--------------------+--------------------+--------------------+
|                text|            document|               token|               class|
+--------------------+--------------------+--------------------+--------------------+
|The movie was bri...|[{document, 0, 23...|[{token, 0, 2, Th...|[{category, 0, 23...|
+--------------------+--------------------+--------------------+--------------------+



In [24]:
result.select("class.result").show(truncate=False)

+------+
|result|
+------+
|[pos] |
+------+



In [25]:
result.select("class").show(truncate=False)

+------------------------------------------------------------------------------------+
|class                                                                               |
+------------------------------------------------------------------------------------+
|[{category, 0, 23, pos, {sentence -> 0, neg -> 3.6695242E-4, pos -> 0.9996331}, []}]|
+------------------------------------------------------------------------------------+



In [26]:
bert_cls.extractParamMap()

{Param(parent='BERT_FOR_SEQUENCE_CLASSIFICATION_4fcc53fde2bc', name='activation', doc='Whether to calculate logits via Softmax or Sigmoid. Default is Softmax'): 'softmax',
 Param(parent='BERT_FOR_SEQUENCE_CLASSIFICATION_4fcc53fde2bc', name='batchSize', doc='Size of every batch'): 8,
 Param(parent='BERT_FOR_SEQUENCE_CLASSIFICATION_4fcc53fde2bc', name='coalesceSentences', doc="Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences."): False,
 Param(parent='BERT_FOR_SEQUENCE_CLASSIFICATION_4fcc53fde2bc', name='engine', doc='Deep Learning engine used for this model'): 'tensorflow',
 Param(parent='BERT_FOR_SEQUENCE_CLASSIFICATION_4fcc53fde2bc', name='lazyAnnotator', doc='Whether this AnnotatorModel acts as lazy in RecursivePipelines'): False,
 Param(parent='BERT_FOR_SEQUENCE_CLASSIFICATION_4fcc53fde2bc', name='maxSentenceLength', doc='Max sentence length to process'): 256,
 Param(parent='BERT_FOR_SEQUENCE_CLAS

In [27]:
bert_cls.getCaseSensitive()

False

Sentiment Analysis

In [28]:
# Import the required modules and classes
from sparknlp.base import DocumentAssembler, Pipeline, Finisher
from sparknlp.annotator import (
    Tokenizer,
    Normalizer,
    ViveknSentimentModel
)
import pyspark.sql.functions as F

# Step 1: Transforms raw texts to `document` annotation
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Step 2: Tokenization
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")

# Step 3: Normalizer
normalizer= Normalizer().setInputCols("token").setOutputCol("normal")

# Step 4: Sentiment Detection
vivekn= (
    ViveknSentimentModel.pretrained('sentiment_vivekn')
    .setInputCols(["document", "normal"])
    .setOutputCol("result_sentiment")
    .setFeatureLimit(4)
)

# Step 5: Finisher
finisher= (
    Finisher()
    .setInputCols(["result_sentiment"]).setOutputCols("final_sentiment")
)

# Define the pipeline
pipeline = Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        normalizer,
        vivekn,
        finisher
    ]
)

sentiment_vivekn download started this may take some time.
Approximate size to download 873.6 KB
[OK!]


In [29]:
# Define dataframe with sample texts
data = spark.createDataFrame([
    ["I dont recommend this movie"],
    ["Dont waste your time!!!"]
]).toDF("text")

model = pipeline.fit(data)

result = model.transform(data)

# Display both the sample text and the predicted sentiment
result.select("text", "final_sentiment").show(truncate=False)

+---------------------------+---------------+
|text                       |final_sentiment|
+---------------------------+---------------+
|I dont recommend this movie|[negative]     |
|Dont waste your time!!!    |[negative]     |
+---------------------------+---------------+



Normalization and Lemmatization

In [30]:
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import Tokenizer, Normalizer, LemmatizerModel
from pyspark.ml import Pipeline

data = spark.createDataFrame([["The birds are flying and they are beautiful."]]).toDF("text")

document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
normalizer = Normalizer().setInputCols(["token"]).setOutputCol("normalized")
lemmatizer = LemmatizerModel.pretrained().setInputCols(["normalized"]).setOutputCol("lemma")
finisher = Finisher().setInputCols(["lemma"]).setOutputCols(["lemmatized_text"])

pipeline = Pipeline(stages=[document_assembler, tokenizer, normalizer, lemmatizer, finisher])
model = pipeline.fit(data)
result = model.transform(data)

result.select("text", "lemmatized_text").show(truncate=False)


lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
+--------------------------------------------+----------------------------------------------+
|text                                        |lemmatized_text                               |
+--------------------------------------------+----------------------------------------------+
|The birds are flying and they are beautiful.|[The, bird, be, fly, and, they, be, beautiful]|
+--------------------------------------------+----------------------------------------------+



LLMs

In [31]:
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import GPT2Transformer
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Spark NLP Text Generation") \
    .getOrCreate()

# Sample data
data = spark.createDataFrame([["The future of artificial intelligence is"]]).toDF("text")

# Document Assembler
document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

# GPT2 Text Generator
gpt2 = GPT2Transformer.pretrained() \
    .setInputCols(["document"]) \
    .setOutputCol("generated_text") \
    .setMaxOutputLength(50)  # Set the maximum length of the generated text

# Pipeline
pipeline = Pipeline(stages=[document_assembler, gpt2])

# Fit and transform the pipeline
model = pipeline.fit(data)
result = model.transform(data)

# Show the results
result.select("text", "generated_text.result").show(truncate=False)


gpt2 download started this may take some time.
Approximate size to download 442.7 MB
[OK!]
+----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                    |result                                                                                                                                                                                                                                   |
+----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|The future of artificial intelligence is|[ The future of artificial intellige