## NLP with Spark

### Install the necessary library

In [None]:
!pip install spark-nlp==1.7.3

### Import basic libraries

In [2]:
import pandas as pd
# Show up to 800 characters per cell so long text columns are not truncated
# when DataFrames are rendered. The fully qualified option name is used
# because abbreviated names are resolved by pattern matching and become
# ambiguous if another option containing "max_colwidth" is ever added.
pd.set_option('display.max_colwidth', 800)

### Create Spark session with NLP library

In [None]:
# Build (or reuse) the Spark session. spark.jars.packages tells Spark which
# extra package must be made available to this session -- here Spark NLP.
# NOTE: keep the version in this coordinate in sync with the pip-installed
# spark-nlp Python package.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .config("spark.jars.packages", "JohnSnowLabs:spark-nlp:1.8.2")
    .getOrCreate()
)
spark  # last expression of the cell, so the notebook renders the session

### Read the Reddit JSON files

In [None]:
# Glob pattern: every .json file directly under ./data is read into a single
# DataFrame.
dataPath = "./data/*.json"
df = spark.read.json(dataPath)
# Sanity check: print the number of rows loaded, then the DataFrame's schema.
print(df.count())
df.printSchema()

In [None]:
# Nested fields to pull out of the "data" struct of each record.
title = "data.title"
author = "data.author"

# Keep just those two fields and discard rows where either one is null.
# NOTE(review): the variable name is misspelled ("Tilte"), but later cells
# refer to it by this exact name, so it is left unchanged here.
dfAuthorTilte = df.select([title, author]).na.drop()

# Preview the first five rows as a pandas DataFrame.
dfAuthorTilte.limit(5).toPandas()

### NLP portion starts here

#### import basic pipeline and annotate

In [None]:
# NOTE(review): this import uses the JVM-style package path; confirm that the
# installed spark-nlp Python package exposes BasicPipeline under this name.
from com.johnsnowlabs.nlp.pretrained.pipeline.en import BasicPipeline as bp
# Run the pretrained basic pipeline over the "title" column. The cells below
# read the "text" and "pos" columns from the result.
dfAnnotated = bp.annotate(dfAuthorTilte, "title")
# Print the schema to see which annotation columns were produced.
dfAnnotated.printSchema()

#### work with the two important columns

In [None]:
# Narrow the annotated DataFrame to the text column and the pos annotation
# column.
dfPos = dfAnnotated.select(["text", "pos"])

# Preview the first five rows as a pandas DataFrame.
dfPos.limit(5).toPandas()

#### get the data: pull the `metadata` and `result` fields out of `pos`

In [None]:
# Alongside text and the full pos column, also surface the metadata and
# result fields nested inside pos as columns of their own.
dfPos = dfAnnotated.select(
    "text",
    "pos",
    "pos.metadata",
    "pos.result",
)

# Preview the first five rows as a pandas DataFrame.
dfPos.limit(5).toPandas()

#### use the explode function to turn an array into rows

In [None]:
# explode() lives in pyspark.sql.functions. Import the module as F in this
# cell, because no other cell of the notebook binds the name F.
from pyspark.sql import functions as F

# explode() emits one output row per element of the pos array, so each row of
# dfPos holds a single pos annotation.
dfPos = dfAnnotated.select(F.explode("pos").alias("pos"))

# Keep only annotations tagged NNP or NNPS (the proper-noun tags of the Penn
# Treebank tag set), capped at 100 rows.
nnpFilter = "pos.result = 'NNP' or pos.result = 'NNPS' "
dfNNP = dfPos.where(nnpFilter).limit(100)

# Show the filtered rows as a pandas DataFrame.
dfNNP.toPandas()

#### keep only the required content: each word and its tag

In [None]:
# Flatten each pos annotation into two plain columns: the word, looked up
# under the 'word' key of the metadata, and the tag taken from result.
dfWordTag = dfNNP.selectExpr(
    "pos.metadata['word'] as word",
    "pos.result as tag",
)

# Show up to 100 rows as a pandas DataFrame.
dfWordTag.limit(100).toPandas()

In [None]:
from pyspark.sql.functions import desc

# Count how often each word occurs in dfWordTag and print the counts with the
# most frequent words first. dfWordTag derives from dfNNP, which was capped
# at 100 rows, so the counts cover that sample only.
(
    dfWordTag
    .groupBy("word")
    .count()
    .orderBy(desc("count"))
    .show()
)