# Preprocessing Text Data with PySpark

## Installation

In [1]:
# import os
# # Install java
# ! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
# ! java -version

# # Install pyspark
# ! pip install --ignore-installed pyspark==3.5

# # Install Spark NLP
# ! pip install --ignore-installed spark-nlp==5.3.3

# # Colab setup script for Spark NLP (fetched over https because it is piped straight into bash)
# !wget https://setup.johnsnowlabs.com/colab.sh -O - | bash

In [2]:
import logging
import sparknlp
from sparknlp.annotator import Tokenizer, Normalizer, LemmatizerModel, StopWordsCleaner, Stemmer, NorvigSweetingApproach, NGramGenerator
from sparknlp.base import DocumentAssembler, Finisher
from pyspark import SparkContext as sc
from pyspark.sql.types import *
from pyspark.ml import Pipeline
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available locally before it is read later on.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/john/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import os

# Quiet the Python-side py4j logger. The bare `logging.getLogger` reference that
# used to be here only evaluated the function object and configured nothing.
# NOTE(review): py4j is presumably the logger that was meant to be silenced — confirm.
logging.getLogger("py4j").setLevel(logging.ERROR)

# TF_CPP_MIN_LOG_LEVEL is read from the process environment, so assigning a
# plain Python variable has no effect on native logging. The variable is kept
# for backward compatibility and exported as the string form the environment
# expects. This cell runs before the Spark session is started further down.
TF_CPP_MIN_LOG_LEVEL = 3
os.environ["TF_CPP_MIN_LOG_LEVEL"] = str(TF_CPP_MIN_LOG_LEVEL)

<function logging.getLogger(name=None)>

In [4]:
# Name of the input column holding the raw text fed to the pipeline.
TEXT_COL = "content"

# CSV file the reviews are read from.
SOURCE = "data/02_exp/review.csv"

# Parquet location the transformed DataFrame is written to and later re-read from.
DESTINATION = "data/02_exp/reviews_transformed.parquet"

In [5]:
# Start the Spark session through Spark NLP, then limit Spark's own logging to errors.
spark = sparknlp.start(real_time_output=False, output_level=3)
spark.sparkContext.setLogLevel("ERROR")

# Report the library versions in use.
for label, version in (
    ("Spark NLP version: ", sparknlp.version()),
    ("Apache Spark version: ", spark.version),
):
    print(label, version)

your 131072x1 screen size is bogus. expect trouble
24/05/10 01:30:55 WARN Utils: Your hostname, Leviathan resolves to a loopback address: 127.0.1.1; using 172.29.219.103 instead (on interface eth0)
24/05/10 01:30:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/john/anaconda3/envs/appvocai-discover/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/john/.ivy2/cache
The jars for the packages stored in: /home/john/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a5ac0578-8502-4ebd-b33f-83521ed95f15;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;5.3.3 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-s3;1.12.500 in central
	found com.amazonaws#aws-java-sdk-kms;1.12.500 in central
	found com.amazonaws#aws-java-sdk-core;1.12.500 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.15 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found software.amazon.ion#ion-java;1.0.2 in central
	found joda-time#joda-time;2.8.1 in central
	found com.amazonaws#jmespath-java;1.12.500 in central
	f

Spark NLP version:  5.3.3
Apache Spark version:  3.5.0


## Data

In [6]:
# Load the review CSV, taking column names from its first row.
# No schema is supplied or inferred, so every column is read as a string.
df = spark.read.option("header", True).csv(SOURCE)

## Spark Pipeline


### Document Assembler

In [7]:
# Entry stage: turns the raw text column into the 'document' annotation column.
document = (
    DocumentAssembler()
    .setInputCol(TEXT_COL)
    .setOutputCol('document')
)

### Tokenizer

In [8]:
# Tokenizes the 'document' annotations into the 'tokenized' column.
tokenizer = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('tokenized')
)

### Normalizer

In [9]:
# Normalizes the tokens (lowercasing enabled) into the 'normalized' column.
normalizer = (
    Normalizer()
    .setInputCols(['tokenized'])
    .setOutputCol('normalized')
    .setLowercase(True)
)

### Stop Word Cleaner

In [10]:
# English stop-word list taken from the NLTK corpus.
eng_stopwords = stopwords.words('english')

# Drops those stop words from the normalized tokens.
# NOTE(review): this stage runs after the Normalizer; if normalization strips
# apostrophes, entries such as "don't" in the NLTK list may never match — confirm.
stopword_cleaner = (
    StopWordsCleaner()
    .setInputCols(['normalized'])
    .setOutputCol("sans_stopwords")
    .setStopWords(eng_stopwords)
)

### Spell Check

In [11]:
# Norvig-Sweeting spell checker over the stop-word-free tokens, using the
# reference word list as its dictionary. As an "Approach" it is trained when
# the pipeline is fitted.
spellchecker = (
    NorvigSweetingApproach()
    .setInputCols(["sans_stopwords"])
    .setOutputCol("spellchecked")
    .setDictionary("data/04_ref/wordlist.txt")
)

### Lemmatizer

In [12]:
# Default pretrained lemmatizer (downloaded on first use); reads the
# spell-checked tokens and writes the lemmas to the 'output' column.
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(['spellchecked'])
    .setOutputCol('output')
)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ | ]lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
Download done! Loading the resource.
[OK!]


### NGram Generator

In [38]:
# Builds 2-grams from the lemmatized tokens, joining the parts with '_'.
# Cumulative mode is off, so only n-grams of exactly size N are produced.
bigrammer = (
    NGramGenerator()
    .setInputCols(['output'])
    .setOutputCol('bigrams')
    .setN(2)
    .setEnableCumulative(False)
    .setDelimiter('_')
)

In [39]:
# Builds 3-grams from the lemmatized tokens, joining the parts with '_'.
# Cumulative mode is off, so only n-grams of exactly size N are produced.
trigrammer = (
    NGramGenerator()
    .setInputCols(['output'])
    .setOutputCol('trigrams')
    .setN(3)
    .setEnableCumulative(False)
    .setDelimiter('_')
)

### Finisher

In [40]:
# Finisher that would expose the 'output' annotations as a plain 'result' column.
# NOTE(review): this stage is defined but is not among the stages passed to the
# Pipeline in this notebook — confirm whether that is intentional.
finisher = (
    Finisher()
    .setInputCols("output")
    .setOutputCols("result")
)

### Pipeline

In [41]:
# Ordered annotation stages; each stage consumes the output column of an earlier one.
stages = [
    document,
    tokenizer,
    normalizer,
    stopword_cleaner,
    spellchecker,
    lemmatizer,
    bigrammer,
    trigrammer,
]
pipeline = Pipeline(stages=stages)

## Fit the Model

In [42]:
# Fit the pipeline on the reviews, then annotate the same DataFrame with the fitted model.
pipeline_model = pipeline.fit(df)
df2 = pipeline_model.transform(df)


                                                                                

## Save Results

In [43]:
# Persist the annotated DataFrame as Parquet, replacing any previous output.
df2.write.mode("overwrite").parquet(DESTINATION)


                                                                                

In [44]:
# Show the schema of the transformed DataFrame: the original columns followed
# by the annotation columns added by the pipeline stages.
df2.printSchema()

root
 |-- id: string (nullable = true)
 |-- app_id: string (nullable = true)
 |-- app_name: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- category: string (nullable = true)
 |-- author: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- title: string (nullable = true)
 |-- content: string (nullable = true)
 |-- vote_sum: string (nullable = true)
 |-- vote_count: string (nullable = true)
 |-- date: string (nullable = true)
 |-- extracted: string (nullable = true)
 |-- review_length: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- emb

In [45]:
# Preview the original (non-annotation) columns of the first five transformed rows.
preview_cols = [
    "id", "app_id", "app_name", "category_id", "category", "author",
    "rating", "title", "vote_sum", "vote_count", "review_length", "date",
]
df2.select(preview_cols).limit(5).show(truncate=False)

+----------+----------+-----------------------------+-----------+----------------+-----------------+------+---------------+--------+----------+-------------+-------------------+
|id        |app_id    |app_name                     |category_id|category        |author           |rating|title          |vote_sum|vote_count|review_length|date               |
+----------+----------+-----------------------------+-----------+----------------+-----------------+------+---------------+--------+----------+-------------+-------------------+
|53730432  |292987597 |White Noise Lite             |6013       |Health & Fitness|♥musiclova302♥   |5.0   |Sounds real    |1       |1         |29           |2009-05-03 19:42:03|
|5568556126|1300396049|Cubatel                      |6000       |Business        |addemiiya        |5.0   |Lo mejor       |0       |0         |5            |2020-02-23 05:59:00|
|833432102 |577232024 |Lumosity: Brain Training     |6017       |Education       |Braveheart04     |4.0   |Goo

In [46]:
# Re-read the saved Parquet output; note this rebinds df2 to the on-disk copy.
df2 = spark.read.format("parquet").load(DESTINATION)


In [47]:
# Same column preview as before, run against the current df2 to check the round trip.
reloaded_preview_cols = (
    "id", "app_id", "app_name", "category_id", "category", "author",
    "rating", "title", "vote_sum", "vote_count", "review_length", "date",
)
df2.select(*reloaded_preview_cols).limit(5).show(truncate=False)

+----------+----------+-----------------------------+-----------+----------------+-----------------+------+---------------+--------+----------+-------------+-------------------+
|id        |app_id    |app_name                     |category_id|category        |author           |rating|title          |vote_sum|vote_count|review_length|date               |
+----------+----------+-----------------------------+-----------+----------------+-----------------+------+---------------+--------+----------+-------------+-------------------+
|53730432  |292987597 |White Noise Lite             |6013       |Health & Fitness|♥musiclova302♥   |5.0   |Sounds real    |1       |1         |29           |2009-05-03 19:42:03|
|5568556126|1300396049|Cubatel                      |6000       |Business        |addemiiya        |5.0   |Lo mejor       |0       |0         |5            |2020-02-23 05:59:00|
|833432102 |577232024 |Lumosity: Brain Training     |6017       |Education       |Braveheart04     |4.0   |Goo

In [53]:
# Show the generated n-gram strings for the first five rows, bigrams then trigrams.
for ngram_field in ("bigrams.result", "trigrams.result"):
    df2.select(ngram_field).limit(5).show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                                                                                                                                                                                                                                                                                                                     