In [1]:
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp

In [2]:
# Create the Spark session through Spark NLP's start() helper; the session
# object is kept in `spark` for the cells that follow.
spark = sparknlp.start()

In [3]:
# Load the pretrained English 'explain_document_dl' pipeline. The recorded
# output below shows it being downloaded (approx. 169.4 MB).
pipeline = PretrainedPipeline('explain_document_dl', lang='en')

explain_document_dl download started this may take some time.
Approx size to download 169.4 MB
[OK!]


In [4]:
# Two-sentence sample passage; it keeps the leading and trailing newline of
# the original block-quoted form.
text = (
    "\n"
    "The Mona Lisa is a 16th century oil painting created by Leonardo.\n"
    "It's held at the Louvre in Paris.\n"
)

In [5]:
# Run the pretrained pipeline on the sample passage. The result is indexed by
# annotation name (e.g. result['entities'] in a later cell).
result = pipeline.annotate(text)

In [6]:
# List the annotation names available in the result.
list(result)

['entities',
 'stem',
 'checked',
 'lemma',
 'document',
 'pos',
 'token',
 'ner',
 'embeddings',
 'sentence']

In [8]:
# Show what the pipeline stored under the 'entities' key for the sample passage.
result['entities']

['Mona Lisa', 'Leonardo', 'Louvre', 'Paris']

In [14]:
# Create the download/extraction directory from Python instead of through the
# IPython `mkdir` automagic alias, so the cell does not depend on automagic
# being enabled or on a POSIX shell. exist_ok=True keeps the cell safe to
# re-run (same effect as `mkdir -p`).
import os

os.makedirs('data', exist_ok=True)

In [16]:
! wget https://archive.ics.uci.edu/ml/machine-learning-databases/20newsgroups-mld/mini_newsgroups.tar.gz

--2022-03-29 23:05:00--  https://archive.ics.uci.edu/ml/machine-learning-databases/20newsgroups-mld/mini_newsgroups.tar.gz
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1860687 (1,8M) [application/x-httpd-php]
Saving to: ‘mini_newsgroups.tar.gz’


2022-03-29 23:05:03 (1,23 MB/s) - ‘mini_newsgroups.tar.gz’ saved [1860687/1860687]



In [17]:
! tar xzf mini_newsgroups.tar.gz -C ./data/

In [4]:
import os

from pyspark.sql.types import *
from pyspark.ml import Pipeline

import sparknlp
from sparknlp import DocumentAssembler, Finisher

# Create the Spark session (via Spark NLP's helper) used by the data-loading
# and pipeline cells below.
spark = sparknlp.start()



In [5]:
# Directory holding the sci.space posts inside the extracted archive.
space_path = os.path.join('data', 'mini_newsgroups', 'sci.space')

# Both columns are strings: the file path and the text read from that file.
schema = StructType(
    [StructField(column, StringType()) for column in ('path', 'text')]
)

# Read every file under space_path and expose the result as a persisted
# DataFrame with the (path, text) schema.
texts = spark.createDataFrame(
    spark.sparkContext.wholeTextFiles(space_path),
    schema=schema,
).persist()

In [6]:
# Preview the loaded posts; long values are shown truncated (see the '...' in
# the recorded output).
texts.show()

+--------------------+--------------------+
|                path|                text|
+--------------------+--------------------+
|file:/home/yogend...|Xref: cantaloupe....|
|file:/home/yogend...|Newsgroups: sci.s...|
|file:/home/yogend...|Path: cantaloupe....|
+--------------------+--------------------+



In [7]:
## excerpt from mini newsgroups modified for examples
example = '''
Nick's right about this.  It's always easier to obtian forgiveness than
permission.  Not many poeple remember that Britan's Kng George III
expressly forbade his american subjects to cross the alleghany/appalachian
mountains.  Said subjects basically said, "Stop us if you can."  He
couldn't.
'''

In [8]:
# Wrap the excerpt in a one-row DataFrame that uses the (path, text) schema,
# with '.' as the path value.
# NOTE(review): this rebinds `example` from str to DataFrame, so the cell is
# not idempotent -- re-running it without first re-running the cell that
# defines the string would pass a DataFrame as the text value.
example = spark.createDataFrame([('.', example)], schema=schema).persist()

In [9]:
# Display up to 5 rows of the example DataFrame (the recorded output shows its
# single row).
example.show(5)

+----+--------------------+
|path|                text|
+----+--------------------+
|   .|\nNick's right ab...|
+----+--------------------+



In [10]:
from pyspark.ml.feature import RegexTokenizer

# Whitespace tokenizer: the pattern describes the gaps between tokens
# (gaps=True) and the original casing is kept.
ws_tokenizer = RegexTokenizer(
    inputCol='text',
    outputCol='ws_tokens',
    pattern='\\s+',
    gaps=True,
    toLowercase=False,
)

# Tokenize the example and unpack its first row into the raw text and the
# token list.
text, tokens = (
    ws_tokenizer.transform(example)
    .select("text", "ws_tokens")
    .first()
)

In [11]:
# Print the raw text column selected alongside the whitespace tokens.
print(text)


Nick's right about this.  It's always easier to obtian forgiveness than
permission.  Not many poeple remember that Britan's Kng George III
expressly forbade his american subjects to cross the alleghany/appalachian
mountains.  Said subjects basically said, "Stop us if you can."  He
couldn't.



In [12]:
# Boundary tokenizer: splits on runs of whitespace or at word boundaries
# (gaps=True, so the pattern describes separators); casing is preserved.
b_tokenizer = RegexTokenizer(
    inputCol='text',
    outputCol='b_tokens',
    pattern='\\s+|\\b',
    gaps=True,
    toLowercase=False,
)

# Tokenize the example and unpack its first row into the raw text and the
# token list.
text, tokens = (
    b_tokenizer.transform(example)
    .select("text", "b_tokens")
    .first()
)

In [13]:
# Print the raw text column selected alongside the boundary tokens.
print(text)


Nick's right about this.  It's always easier to obtian forgiveness than
permission.  Not many poeple remember that Britan's Kng George III
expressly forbade his american subjects to cross the alleghany/appalachian
mountains.  Said subjects basically said, "Stop us if you can."  He
couldn't.



In [14]:
# Print the tokens from the 'b_tokens' column; punctuation comes out as
# separate tokens (see the recorded output).
print(tokens)

['Nick', "'", 's', 'right', 'about', 'this', '.', 'It', "'", 's', 'always', 'easier', 'to', 'obtian', 'forgiveness', 'than', 'permission', '.', 'Not', 'many', 'poeple', 'remember', 'that', 'Britan', "'", 's', 'Kng', 'George', 'III', 'expressly', 'forbade', 'his', 'american', 'subjects', 'to', 'cross', 'the', 'alleghany', '/', 'appalachian', 'mountains', '.', 'Said', 'subjects', 'basically', 'said', ',', '"', 'Stop', 'us', 'if', 'you', 'can', '."', 'He', 'couldn', "'", 't', '.']


In [15]:
from sparknlp.annotator import Tokenizer

# Stage 1: reads the 'text' column, writes 'doc'.
assembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('doc')
)
# Stage 2: reads 'doc', writes token annotations.
tokenizer = (
    Tokenizer()
    .setInputCols(['doc'])
    .setOutputCol('tokens_annotations')
)
# Stage 3: converts the token annotations into an array column named 'tokens'.
finisher = (
    Finisher()
    .setInputCols(['tokens_annotations'])
    .setOutputCols(['tokens'])
    .setOutputAsArray(True)
)
pipeline = Pipeline().setStages([assembler, tokenizer, finisher])

# Fit on the newsgroup posts, apply to the example, and unpack the first row.
text, tokens = (
    pipeline.fit(texts)
    .transform(example)
    .select("text", "tokens")
    .first()
)

In [16]:
# Print the raw text column returned by the Spark NLP tokenizer pipeline.
print(text)


Nick's right about this.  It's always easier to obtian forgiveness than
permission.  Not many poeple remember that Britan's Kng George III
expressly forbade his american subjects to cross the alleghany/appalachian
mountains.  Said subjects basically said, "Stop us if you can."  He
couldn't.



In [17]:
## excerpt from mini newsgroups modified for examples
# The raw excerpt is kept under its own name so that `example` is always a
# DataFrame (the misspellings are presumably deliberate -- the excerpt is
# "modified for examples").
example_text = '''
Nick's right about this.  It's always easier to obtian forgiveness than
permission.  Not many poeple remember that Britan's Kng George III
expressly forbade his american subjects to cross the alleghany/appalachian
mountains.  Said subjects basically said, "Stop us if you can."  He
couldn't.
'''

# The pipeline cells below call `.transform(example)`, which needs a DataFrame.
# Previously this cell left `example` bound to the plain string, so those
# calls could not work. Rebuild the one-row (path, text) DataFrame here,
# using '.' as the path value.
example = spark.createDataFrame([('.', example_text)], schema=schema).persist()

In [18]:
from sparknlp.annotator import Stemmer, Lemmatizer, LemmatizerModel

# Reads the 'text' column, writes 'doc'.
assembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('doc')
)
# Reads 'doc', writes token annotations.
tokenizer = (
    Tokenizer()
    .setInputCols(['doc'])
    .setOutputCol('tokens_annotations')
)
# Stems are derived from the token annotations.
stemmer = (
    Stemmer()
    .setInputCols(['tokens_annotations'])
    .setOutputCol('stems_annotations')
)
# pretrained() fetches an already-built lemmatizer model (the log below shows
# 'lemma_antbnc' being requested); nothing is trained from a user dictionary
# in this notebook.
# NOTE(review): the recorded run of this cell failed inside pretrained() with
# a json4s NoClassDefFoundError -- possibly a Spark / Spark NLP version
# mismatch; confirm the installed versions are compatible.
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(['tokens_annotations'])
    .setOutputCol('lemma_annotations')
)
# Expose stems and lemmas as array columns.
finisher = (
    Finisher()
    .setInputCols(['stems_annotations', 'lemma_annotations'])
    .setOutputCols(['stems', 'lemmas'])
    .setOutputAsArray(True)
)

pipeline = Pipeline().setStages(
    [assembler, tokenizer, stemmer, lemmatizer, finisher]
)
# Fit on the newsgroup posts, apply to the example, and unpack the first row.
text, stems, lemmas = (
    pipeline.fit(texts)
    .transform(example)
    .select("text", "stems", "lemmas")
    .first()
)

lemma_antbnc download started this may take some time.


Py4JJavaError: An error occurred while calling z:com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.getDownloadSize.
: java.lang.NoClassDefFoundError: org/json4s/package$MappingException
	at org.json4s.ext.EnumNameSerializer.deserialize(EnumSerializer.scala:53)
	at org.json4s.Formats$$anonfun$customDeserializer$1.applyOrElse(Formats.scala:66)
	at org.json4s.Formats$$anonfun$customDeserializer$1.applyOrElse(Formats.scala:66)
	at scala.collection.TraversableOnce.collectFirst(TraversableOnce.scala:180)
	at scala.collection.TraversableOnce.collectFirst$(TraversableOnce.scala:167)
	at scala.collection.AbstractTraversable.collectFirst(Traversable.scala:108)
	at org.json4s.Formats$.customDeserializer(Formats.scala:66)
	at org.json4s.Extraction$.customOrElse(Extraction.scala:775)
	at org.json4s.Extraction$.extract(Extraction.scala:454)
	at org.json4s.Extraction$.extract(Extraction.scala:56)
	at org.json4s.ExtractableJsonAstNode.extract(ExtractableJsonAstNode.scala:22)
	at com.johnsnowlabs.util.JsonParser$.parseObject(JsonParser.scala:28)
	at com.johnsnowlabs.nlp.pretrained.ResourceMetadata$.parseJson(ResourceMetadata.scala:109)
	at com.johnsnowlabs.nlp.pretrained.ResourceMetadata$$anonfun$readResources$1.applyOrElse(ResourceMetadata.scala:138)
	at com.johnsnowlabs.nlp.pretrained.ResourceMetadata$$anonfun$readResources$1.applyOrElse(ResourceMetadata.scala:137)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at scala.collection.Iterator$$anon$13.next(Iterator.scala:593)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ListBuffer.$plus$plus$eq(ListBuffer.scala:184)
	at scala.collection.mutable.ListBuffer.$plus$plus$eq(ListBuffer.scala:47)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at scala.collection.AbstractIterator.to(Iterator.scala:1431)
	at scala.collection.TraversableOnce.toList(TraversableOnce.scala:350)
	at scala.collection.TraversableOnce.toList$(TraversableOnce.scala:350)
	at scala.collection.AbstractIterator.toList(Iterator.scala:1431)
	at com.johnsnowlabs.nlp.pretrained.ResourceMetadata$.readResources(ResourceMetadata.scala:137)
	at com.johnsnowlabs.nlp.pretrained.ResourceMetadata$.readResources(ResourceMetadata.scala:132)
	at com.johnsnowlabs.client.aws.AWSGateway.getMetadata(AWSGateway.scala:78)
	at com.johnsnowlabs.nlp.pretrained.S3ResourceDownloader.downloadMetadataIfNeed(S3ResourceDownloader.scala:62)
	at com.johnsnowlabs.nlp.pretrained.S3ResourceDownloader.resolveLink(S3ResourceDownloader.scala:68)
	at com.johnsnowlabs.nlp.pretrained.S3ResourceDownloader.getDownloadSize(S3ResourceDownloader.scala:145)
	at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.getDownloadSize(ResourceDownloader.scala:445)
	at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader$.getDownloadSize(ResourceDownloader.scala:585)
	at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.getDownloadSize(ResourceDownloader.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: org.json4s.package$MappingException
	at java.net.URLClassLoader.findClass(URLClassLoader.java:387)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
	... 51 more


In [19]:
from pyspark.ml.feature import CountVectorizer
# Import the Spark NLP annotators used here explicitly rather than relying on
# a star import from an earlier cell. `Normalizer` in particular shares its
# name with pyspark.ml.feature.Normalizer, so the explicit import makes clear
# that the Spark NLP annotator is meant.
from sparknlp.annotator import (
    LemmatizerModel,
    Normalizer,
    NorvigSweetingModel,
    Tokenizer,
)

# Reads the 'text' column, writes 'doc'.
assembler = DocumentAssembler()\
    .setInputCol('text')\
    .setOutputCol('doc')
# Reads 'doc', writes token annotations.
tokenizer = Tokenizer()\
    .setInputCols(['doc'])\
    .setOutputCol('tokens_annotations')
# Pretrained Norvig-Sweeting spell checker applied to the tokens (the log
# below shows 'spellcheck_norvig' being requested).
norvig_pretrained = NorvigSweetingModel.pretrained()\
    .setInputCols(['tokens_annotations'])\
    .setOutputCol('norvig_annotations')
# Pretrained lemmatizer applied to the spell-checked tokens.
lemmatizer = LemmatizerModel.pretrained()\
    .setInputCols(['norvig_annotations'])\
    .setOutputCol('lemma_annotations')
# Normalizes the lemmas, lower-casing them.
normalizer = Normalizer()\
    .setInputCols(['lemma_annotations'])\
    .setOutputCol('normtoken_annotations')\
    .setLowercase(True)
# Expose the normalized tokens as an array column for CountVectorizer.
finisher = Finisher()\
    .setInputCols(['normtoken_annotations'])\
    .setOutputCols(['normtokens'])\
    .setOutputAsArray(True)

# Text-processing stages grouped as one pipeline ...
sparknlp_pipeline = Pipeline().setStages([
    assembler, tokenizer, norvig_pretrained,
    lemmatizer, normalizer, finisher
])

# ... followed by a bag-of-words vectorizer over the normalized tokens.
count_vectorizer = CountVectorizer()\
    .setInputCol('normtokens')\
    .setOutputCol('bows')

pipeline = Pipeline().setStages([sparknlp_pipeline, count_vectorizer])
# Fit on the newsgroup posts, then apply the fitted model to the example and
# unpack its first row.
model = pipeline.fit(texts)
processed = model.transform(example)
text, normtokens, bow = processed\
    .select("text", "normtokens", 'bows').first()

spellcheck_norvig download started this may take some time.


Py4JJavaError: An error occurred while calling z:com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.getDownloadSize.
: java.lang.NoClassDefFoundError: org/json4s/package$MappingException
	at org.json4s.ext.EnumNameSerializer.deserialize(EnumSerializer.scala:53)
	at org.json4s.Formats$$anonfun$customDeserializer$1.applyOrElse(Formats.scala:66)
	at org.json4s.Formats$$anonfun$customDeserializer$1.applyOrElse(Formats.scala:66)
	at scala.collection.TraversableOnce.collectFirst(TraversableOnce.scala:180)
	at scala.collection.TraversableOnce.collectFirst$(TraversableOnce.scala:167)
	at scala.collection.AbstractTraversable.collectFirst(Traversable.scala:108)
	at org.json4s.Formats$.customDeserializer(Formats.scala:66)
	at org.json4s.Extraction$.customOrElse(Extraction.scala:775)
	at org.json4s.Extraction$.extract(Extraction.scala:454)
	at org.json4s.Extraction$.extract(Extraction.scala:56)
	at org.json4s.ExtractableJsonAstNode.extract(ExtractableJsonAstNode.scala:22)
	at com.johnsnowlabs.util.JsonParser$.parseObject(JsonParser.scala:28)
	at com.johnsnowlabs.nlp.pretrained.ResourceMetadata$.parseJson(ResourceMetadata.scala:109)
	at com.johnsnowlabs.nlp.pretrained.ResourceMetadata$$anonfun$readResources$1.applyOrElse(ResourceMetadata.scala:138)
	at com.johnsnowlabs.nlp.pretrained.ResourceMetadata$$anonfun$readResources$1.applyOrElse(ResourceMetadata.scala:137)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at scala.collection.Iterator$$anon$13.next(Iterator.scala:593)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ListBuffer.$plus$plus$eq(ListBuffer.scala:184)
	at scala.collection.mutable.ListBuffer.$plus$plus$eq(ListBuffer.scala:47)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at scala.collection.AbstractIterator.to(Iterator.scala:1431)
	at scala.collection.TraversableOnce.toList(TraversableOnce.scala:350)
	at scala.collection.TraversableOnce.toList$(TraversableOnce.scala:350)
	at scala.collection.AbstractIterator.toList(Iterator.scala:1431)
	at com.johnsnowlabs.nlp.pretrained.ResourceMetadata$.readResources(ResourceMetadata.scala:137)
	at com.johnsnowlabs.nlp.pretrained.ResourceMetadata$.readResources(ResourceMetadata.scala:132)
	at com.johnsnowlabs.client.aws.AWSGateway.getMetadata(AWSGateway.scala:78)
	at com.johnsnowlabs.nlp.pretrained.S3ResourceDownloader.downloadMetadataIfNeed(S3ResourceDownloader.scala:62)
	at com.johnsnowlabs.nlp.pretrained.S3ResourceDownloader.resolveLink(S3ResourceDownloader.scala:68)
	at com.johnsnowlabs.nlp.pretrained.S3ResourceDownloader.getDownloadSize(S3ResourceDownloader.scala:145)
	at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.getDownloadSize(ResourceDownloader.scala:445)
	at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader$.getDownloadSize(ResourceDownloader.scala:585)
	at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.getDownloadSize(ResourceDownloader.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: org.json4s.package$MappingException
	at java.net.URLClassLoader.findClass(URLClassLoader.java:387)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
	... 51 more
