In [1]:
#
# Start SparkNLP
#
from sparknlp.annotator import * 
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline 
from pyspark.ml import Pipeline
import sparknlp  
import nltk
from pyspark.sql.functions import *
from nltk.corpus import stopwords
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import IDF, CountVectorizer, HashingTF

import tensorflow as tf
import numpy as np
import os

%matplotlib inline
import matplotlib.pyplot as plt

# Start the Spark session through Spark NLP; later cells read the CSV files via this `spark` handle.
spark = sparknlp.start()

In [12]:
# Number of features (vocabulary size).
# Passed as `vocabSize` to the CountVectorizer stage of the pipeline.
VOCAB_SIZE = 300

In [17]:
#download train and test data
#data is taken from https://towardsdatascience.com/text-classification-in-spark-nlp-with-bert-and-universal-sentence-encoders-e644d618ca32

! wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_train.csv
! wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_test.csv

--2020-05-04 14:46:45--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 24032125 (23M) [text/plain]
Saving to: ‘news_category_train.csv.1’


2020-05-04 14:47:00 (1,60 MB/s) - ‘news_category_train.csv.1’ saved [24032125/24032125]

--2020-05-04 14:47:00--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_test.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request

In [19]:
# Read both CSV files into Spark DataFrames; the first row of each file holds the column names.
train_data = spark.read.csv("news_category_train.csv", header=True)
test_data = spark.read.csv("news_category_test.csv", header=True)

In [20]:
# Get English stop words from nltk.
# The stopwords corpus is not installed with the nltk package itself; on a fresh
# environment the lookup raises LookupError, so download it on demand and retry.
try:
    eng_stopwords = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    eng_stopwords = stopwords.words('english')

In [27]:
# Pipeline components
#
# Features are generated by tokenizing the text into single words and bigrams, taking their
# lemmas, removing stop words, keeping the most frequently occurring terms (present in at
# least 10 documents) and weighting them by TF-IDF.
#
# The final annotation is in the 'features' column, which is going to be used to train the
# dummy TF model. It is a sparse vector of size up to VOCAB_SIZE and is the input to the model.
doccumentAssembler = (
    DocumentAssembler()
    .setInputCol("description")
    .setOutputCol("document")
    .setCleanupMode("shrink")
)
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentences")
)
tokenizer = (
    Tokenizer()
    .setInputCols(["sentences"])
    .setOutputCol("token")
)
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normal")
    .setLowercase(True)
)
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(["normal"])
    .setOutputCol("lemma")
)
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(["lemma"])
    .setOutputCol("clean_lemma")
    .setCaseSensitive(False)
    .setStopWords(eng_stopwords)
)
# n=2 with cumulative mode enabled, so the output holds unigrams as well as bigrams.
ngramGenerator = (
    NGramGenerator()
    .setInputCols(["clean_lemma"])
    .setOutputCol("ngrams")
    .setN(2)
    .setEnableCumulative(True)
    .setDelimiter("_")
)
# Annotation columns are kept (cleanAnnotations=False); the vectorizer below reads 'finished_ngrams'.
ngramsFinisher = (
    Finisher()
    .setInputCols(["ngrams"])
    .setCleanAnnotations(False)
)
vectorizer = CountVectorizer(
    inputCol="finished_ngrams",
    outputCol="rawFeatures",
    vocabSize=VOCAB_SIZE,
    minDF=10.0,
)
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)


lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [28]:
# Build the pipeline: Spark NLP annotators first, then the Spark ML TF-IDF stages.
pipeline_stages = [
    doccumentAssembler,
    sentenceDetector,
    tokenizer,
    normalizer,
    lemmatizer,
    stopwords_cleaner,
    ngramGenerator,
    ngramsFinisher,
    vectorizer,
    idf,
]
nlpPipeline = Pipeline(stages=pipeline_stages)

In [29]:
# Fit the pipeline on the training set only, then apply the fitted model to both splits.

m = nlpPipeline.fit(train_data)
p_train_data, p_test_data = m.transform(train_data), m.transform(test_data)


In [32]:
# Save the processed data so that we can continue with Scala.
# The format is stated explicitly: a bare .save() writes whatever `spark.sql.sources.default`
# is set to, which would not necessarily match the '.parquet' names used here.
# mode("overwrite") replaces the output of any previous run.
p_train_data.write.mode("overwrite").parquet("train.processed.parquet")
p_test_data.write.mode("overwrite").parquet("test.processed.parquet")