## spark nlp test

### This test script instantiates a Spark NLP session, then uses a Spark NLP pipeline to preprocess text data. Finally, a pretrained POS tagger is used to produce part-of-speech tags for the tokens. 

In [1]:
# Instantiate spark NLP session configurations
from pyspark.sql import SparkSession

# Build (or reuse) the session on master "local[4]"; the Spark NLP package is
# requested through the "spark.jars.packages" setting.
spark = (
    SparkSession.builder
    .appName("ner")
    .master("local[4]")
    .config("spark.driver.memory", "4G")
    .config("spark.driver.maxResultSize", "2G")
    .config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.0.1")
    .config("spark.kryoserializer.buffer.max", "500m")
    .getOrCreate()
)

In [2]:
# Point SPARK_HOME at the local Spark installation that is expected to hold the
# appropriate .jar files.
# NOTE(review): this cell comes after the SparkSession was already created in the
# first cell; presumably a session that is already running will not pick up this
# value -- confirm, and consider moving this above the session creation.
import os
os.environ.update(SPARK_HOME="/opt/spark/")

In [4]:
# Examine the jar packages requested for this session, then show the context summary.
# Displaying the SparkContext alone does not answer the "which jars?" question, so the
# configured "spark.jars.packages" value is printed first (None if it was never set).
print(spark.sparkContext.getConf().get("spark.jars.packages"))
spark.sparkContext

In [5]:
##### Pipeline using PerceptronModel (pretrained) Part of Speech Tagger and spark nlp preprocessing components. 


from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

#this should correspond to the fast part of speech model, (pos_fast_en_1.8.0_2.4_15454346533742.zip)
from sparknlp.annotator import PerceptronModel

# Three small sample rows; the last one is comma-separated with no spaces.
sentenceDataFrame = spark.createDataFrame(
    data=[
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat"),
    ],
    schema=["id", "text"],
)


# Raw "text" column -> document annotations (output column left at its default).
document_assembler = DocumentAssembler().setInputCol("text")

# Document annotations -> sentence annotations.
sentence_detector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Sentence annotations -> token annotations.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Pretrained word embeddings; input columns are not set here and stay at their defaults.
wordEmbeddings = WordEmbeddingsModel.pretrained().setOutputCol("word_embeddings")

# Pretrained part-of-speech tagger (model is downloaded directly), fed by sentences and tokens.
pos = (
    PerceptronModel.pretrained()
    .setInputCols(["sentence", "token"])
    .setOutputCol("pos")
)
    
# Import Pipeline explicitly: nothing visible in this notebook imports it by name, so the
# cell previously depended on a wildcard import (or leftover kernel state) to supply it.
from pyspark.ml import Pipeline

# Chain the preprocessing stages, the POS tagger and the word embeddings into one pipeline.
advancedPipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, pos, wordEmbeddings])

# Fit on the sample frame, then annotate that same frame and preview the result.
output = advancedPipeline.fit(sentenceDataFrame).transform(sentenceDataFrame)
output.show()


+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| id|                text|            document|            sentence|               token|                 pos|     word_embeddings|
+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  0|Hi I heard about ...|[[document, 0, 21...|[[document, 0, 21...|[[token, 0, 1, Hi...|[[pos, 0, 1, NNP,...|[[word_embeddings...|
|  1|I wish Java could...|[[document, 0, 33...|[[document, 0, 33...|[[token, 0, 0, I,...|[[pos, 0, 0, PRP,...|[[word_embeddings...|
|  2|Logistic,regressi...|[[document, 0, 34...|[[document, 0, 34...|[[token, 0, 34, L...|[[pos, 0, 34, NNP...|[[word_embeddings...|
+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+



In [6]:
# Sanity check: the "pos" column does appear to contain part-of-speech annotations.
# Expected output for the first row:
# [[pos, 0, 1, NNP, [word -> Hi], [], []], [pos, 3, 3, PRP, [word -> I], [], []], [pos, 5, 9, VBD, [word -> heard], [], []], [pos, 11, 15, IN, [word -> about], [], []], [pos, 17, 21, NNP, [word -> Spark], [], []]]
output.select("pos").show(n=1, truncate=False)



+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|pos                                                                                                                                                                                                                |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[[pos, 0, 1, NNP, [word -> Hi], [], []], [pos, 3, 3, PRP, [word -> I], [], []], [pos, 5, 9, VBD, [word -> heard], [], []], [pos, 11, 15, IN, [word -> about], [], []], [pos, 17, 21, NNP, [word -> Spark], [], []]]|
+-----------------------------------------------------------------------------------------------------------------------------------------------