In [1]:
from configspark import create_session, read_data
import pyspark.sql.functions as f 
import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from pyspark.ml.feature import HashingTF, IDF, StringIndexer, SQLTransformer, IndexToString
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

## Goal
* The objective is to classify reviews based on their ratings. To be honest, you should probably run a chi-squared test first to see which words are unique to each class,
  but that doesn't stop the fun: this project is meant to show how to pipeline the data and do some basic cleaning, not to get the best model.
* The notebooks are separated into parts because I want to examine what happens inside the pipelines.

##### Configspark
* I was being lazy: the Spark config function and the data-reading function both live in `configspark.py`. At least I added a schema.

In [None]:
# Create the session object via the project helper imported from configspark.py.
spark = create_session()

In [3]:
# Load the dataset through the configspark.py helper, using the session created above.
# The splits derived from `df` display as DataFrame[review: string, rating: string].
df = read_data(spark)

##### Split the data to train and validate 
* When I fit a pipeline, I like to transform a different dataset than the one it was fitted on.

In [4]:
# Share of rows assigned to each split: 70% train, 15% test, 15% validation.
train_ratio, test_ratio, validation_ratio = 0.7, 0.15, 0.15

# Randomly partition `df` into the three splits; the fixed seed keeps the
# assignment repeatable between runs.
train_data, test_data, validation_data = df.randomSplit(
    weights=[train_ratio, test_ratio, validation_ratio],
    seed=45,
)


In [5]:
# Display the DataFrame repr to check the column names and types of the training split.
# Note that `rating` is a string column at this point.
train_data

DataFrame[review: string, rating: string]

#### Pipelining
* As the name says, a pipeline is a chain: the output of each transformation feeds into the next one, and the order of execution is maintained.

In [6]:
# Preprocess the data to concatenate feature columns into one column called text
# (currently disabled)
# featureConcat = FeatureConcatenator(outputCols = ["text"], inputCols = [target_col])

# Prepares data into a format that is processable by Spark NLP. This is the entry point for every Spark NLP pipeline.
# The DocumentAssembler can read either a String column or an Array[String]
documentAssembler = (
    DocumentAssembler()
    .setInputCol("review")
    .setOutputCol("document")
)

# Tokenizes raw text in document type columns into TokenizedSentence
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Annotator that cleans out tokens.
# Lowercases every token and strips all characters that are neither word
# characters nor whitespace (i.e. punctuation and symbols).
# The pattern is a raw string so that \w and \s reach the regex engine as-is
# instead of being parsed by Python as (invalid) string escape sequences.
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
    .setLowercase(True)
    .setCleanupPatterns([r"[^\w\s]"])
)

# Remove years (four-digit sequences starting with 19 or 20).
# NOTE: the pattern is not anchored, so a matching digit run is also removed
# when it appears inside a longer token.
removeYear = (
    Normalizer()
    .setInputCols(["normalized"])
    .setOutputCol("remove_year")
    .setCleanupPatterns([r"(?:(?:19|20)\d\d)"])
)

# Find lemmas out of words with the objective of returning a base dictionary word.
# The model name and language are spelled out (these are the ones the bare
# pretrained() call downloads) so the notebook does not silently pick up a
# different default model later.
lemmatizer = (
    LemmatizerModel.pretrained("lemma_antbnc", lang="en")
    .setInputCols(["remove_year"])
    .setOutputCol("lemmatized")
)

# A feature transformer that converts the input array of strings (annotatorType TOKEN) into an array of n-grams (annotatorType CHUNK).
# With n=3 and cumulative mode enabled, 1-grams, 2-grams and 3-grams are all emitted;
# the words of each n-gram are joined with "_" (not the default space).
ngrammer = (
    NGramGenerator()
    .setInputCols(["lemmatized"])
    .setOutputCol("ngrams")
    .setN(3)
    .setEnableCumulative(True)
    .setDelimiter("_")
)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ | ]lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ / ]Download done! Loading the resource.
[OK!]


In [7]:
# Chain the Spark NLP stages in execution order: each stage consumes the
# output column produced by the stage before it.
training_pipeline = Pipeline(
    stages=[
        documentAssembler,
        tokenizer,
        normalizer,
        removeYear,
        lemmatizer,
        ngrammer,
    ]
)

In [8]:
# Fit the preprocessing pipeline on the training split; the result is the
# fitted pipeline used for transforming data below.
tokenization = training_pipeline.fit(train_data)

In [9]:
# Apply the fitted pipeline to the training split, adding the columns produced
# by each stage (e.g. "lemmatized" and "ngrams", inspected in the next cells).
tokenized_df = tokenization.transform(train_data)

In [10]:
# Peek at the first 10 rows of lemmatizer output; each cell is a list of
# annotation structs, cut off after 100 characters.
tokenized_df.select("lemmatized").show(n=10, truncate=100)

+----------------------------------------------------------------------------------------------------+
|                                                                                          lemmatized|
+----------------------------------------------------------------------------------------------------+
|[{token, 1, 1, 5, {sentence -> 0}, []}, {token, 3, 7, star, {sentence -> 0}, []}, {token, 13, 16,...|
|[{token, 1, 6, almost, {sentence -> 0}, []}, {token, 8, 17, everything, {sentence -> 0}, []}, {to...|
|[{token, 1, 5, after, {sentence -> 0}, []}, {token, 7, 7, a, {sentence -> 0}, []}, {token, 9, 11,...|
|[{token, 1, 5, after, {sentence -> 0}, []}, {token, 7, 8, i, {sentence -> 0}, []}, {token, 10, 12...|
|[{token, 1, 5, after, {sentence -> 0}, []}, {token, 7, 11, year, {sentence -> 0}, []}, {token, 13...|
|[{token, 1, 2, as, {sentence -> 0}, []}, {token, 4, 4, a, {sentence -> 0}, []}, {token, 6, 11, co...|
|[{token, 1, 2, at, {sentence -> 0}, []}, {token, 4, 8, first, {sentence 

In [11]:
# Peek at the first 10 rows of generated n-grams; each cell is a list of
# chunk annotation structs, cut off after 100 characters.
tokenized_df.select("ngrams").show(n=10, truncate=100)

+----------------------------------------------------------------------------------------------------+
|                                                                                              ngrams|
+----------------------------------------------------------------------------------------------------+
|[{chunk, 1, 1, 5, {sentence -> 0, chunk -> 0}, []}, {chunk, 3, 7, star, {sentence -> 0, chunk -> ...|
|[{chunk, 1, 6, almost, {sentence -> 0, chunk -> 0}, []}, {chunk, 8, 17, everything, {sentence -> ...|
|[{chunk, 1, 5, after, {sentence -> 0, chunk -> 0}, []}, {chunk, 7, 7, a, {sentence -> 0, chunk ->...|
|[{chunk, 1, 5, after, {sentence -> 0, chunk -> 0}, []}, {chunk, 7, 8, i, {sentence -> 0, chunk ->...|
|[{chunk, 1, 5, after, {sentence -> 0, chunk -> 0}, []}, {chunk, 7, 11, year, {sentence -> 0, chun...|
|[{chunk, 1, 2, as, {sentence -> 0, chunk -> 0}, []}, {chunk, 4, 4, a, {sentence -> 0, chunk -> 1}...|
|[{chunk, 1, 2, at, {sentence -> 0, chunk -> 0}, []}, {chunk, 4, 8, first