In [1]:
from configspark import create_session, read_data
import pyspark.sql.functions as f 
import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from pyspark.ml.feature import HashingTF, IDF, StringIndexer, SQLTransformer, IndexToString
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

In [None]:
spark = create_session()

In [3]:
df = read_data(spark)

In [4]:
train_ratio = 0.7
test_ratio = 0.15
validation_ratio = 0.15

# Split the data using randomSplit()
train_data, test_data, validation_data = df.randomSplit([train_ratio, test_ratio, validation_ratio], seed=45)


In [5]:
train_data

DataFrame[review: string, rating: string]

In [6]:
# Preprocess the data to concatenate feature columns into one column called text
# featureConcat = FeatureConcatenator(outputCols = ["text"], inputCols = [target_col])

# Prepares data into a format that is processable by Spark NLP. This is the entry point for every Spark NLP pipeline. 
# The DocumentAssembler can read either a String column or an Array[String]
documentAssembler = DocumentAssembler().setInputCol("review").setOutputCol("document")

# Tokenizes raw text in document type columns into TokenizedSentence
tokenizer = Tokenizer().setInputCols("document").setOutputCol("token")

# Annotator that cleans out tokens.
# Remove white space
normalizer = Normalizer().setInputCols("token").setOutputCol("normalized").setLowercase(True).setCleanupPatterns(["[^\w\s]"])

# Remove years (integers starting with 19XX or 20XX)
removeYear = Normalizer().setInputCols(["normalized"]).setOutputCol("remove_year").setCleanupPatterns(["(?:(?:19|20)\d\d)"])

# Find lemmas out of words with the objective of returning a base dictionary word
lemmatizer = LemmatizerModel.pretrained().setInputCols("remove_year").setOutputCol("lemmatized")

# A feature transformer that converts the input array of strings (annotatorType TOKEN) into an array of n-grams (annotatorType CHUNK). 
#  Null values in the input array are ignored. It returns an array of n-grams where each n-gram is represented by a space-separated string of words.
ngrammer = NGramGenerator().setInputCols(['lemmatized']).setOutputCol('ngrams').setN(3).setEnableCumulative(True).setDelimiter('_')

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ | ]lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
Download done! Loading the resource.
[OK!]


In [7]:
confidence_pipeline = Pipeline().setStages([
                                    documentAssembler,
                                    tokenizer,
                                    normalizer,
                                    removeYear,
                                    lemmatizer,
                                    ngrammer,
                                    ])

In [8]:
new_df = confidence_pipeline.fit(train_data)

In [9]:
new = new_df.transform(train_data)

In [10]:
new.show()

+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|              review|rating|            document|               token|          normalized|         remove_year|          lemmatized|              ngrams|
+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|"5 Stars - ""Very...|   5.0|[{document, 0, 25...|[{token, 0, 0, ",...|[{token, 1, 1, 5,...|[{token, 1, 1, 5,...|[{token, 1, 1, 5,...|[{chunk, 1, 1, 5,...|
|"ALMOST everythin...|   3.0|[{document, 0, 32...|[{token, 0, 0, ",...|[{token, 1, 6, al...|[{token, 1, 6, al...|[{token, 1, 6, al...|[{chunk, 1, 6, al...|
|"After a few week...|   4.0|[{document, 0, 57...|[{token, 0, 0, ",...|[{token, 1, 5, af...|[{token, 1, 5, af...|[{token, 1, 5, af...|[{chunk, 1, 5, af...|
|"After my old Ham...|   1.0|[{document, 0, 43...|[{token, 0, 0,

                                                                                

In [11]:
new.select("lemmatized").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [12]:
new.select("ngrams").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------