In [1]:
! pip install -q pyspark spark-nlp

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.7/486.7 kB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
import os
import sys

import sparknlp

from sparknlp.base import *
from sparknlp.common import *
from sparknlp.annotator import *
from sparknlp.pretrained import *

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

import pandas as pd

spark = sparknlp.start(gpu = True)

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

spark

Spark NLP version:  4.4.1
Apache Spark version:  3.4.0


In [3]:
! wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_train.csv
! wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_test.csv

--2023-05-01 21:54:20--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 24032125 (23M) [text/plain]
Saving to: ‘news_category_train.csv’


2023-05-01 21:54:20 (161 MB/s) - ‘news_category_train.csv’ saved [24032125/24032125]

--2023-05-01 21:54:22--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_test.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP re

In [4]:
trainDataset = spark.read.option("header", True).csv("news_category_train.csv")
trainDataset.show(10, truncate=50)

+--------+--------------------------------------------------+
|category|                                       description|
+--------+--------------------------------------------------+
|Business| Short sellers, Wall Street's dwindling band of...|
|Business| Private investment firm Carlyle Group, which h...|
|Business| Soaring crude prices plus worries about the ec...|
|Business| Authorities have halted oil export flows from ...|
|Business| Tearaway world oil prices, toppling records an...|
|Business| Stocks ended slightly higher on Friday but sta...|
|Business| Assets of the nation's retail money market mut...|
|Business| Retail sales bounced back a bit in July, and n...|
|Business|" After earning a PH.D. in Sociology, Danny Baz...|
|Business| Short sellers, Wall Street's dwindling  band o...|
+--------+--------------------------------------------------+
only showing top 10 rows



In [5]:
from pyspark.sql.functions import col

trainDataset.groupBy("category").count().orderBy(col("count").desc()).show()

+--------+-----+
|category|count|
+--------+-----+
|   World|30000|
|Sci/Tech|30000|
|  Sports|30000|
|Business|30000|
+--------+-----+



In [6]:
testDataset = spark.read.option("header", True).csv("news_category_test.csv")
testDataset.groupBy("category").count().orderBy(col("count").desc()).show()

+--------+-----+
|category|count|
+--------+-----+
|   World| 1900|
|Sci/Tech| 1900|
|  Sports| 1900|
|Business| 1900|
+--------+-----+



In [7]:
#Question 2a)

In [8]:
document = DocumentAssembler().setInputCol("description").setOutputCol("document")

bert_embed = BertSentenceEmbeddings.pretrained('sent_small_bert_L8_512').setInputCols(["document"]).setOutputCol("sentence_embeddings")

classifierDL = ClassifierDLApproach().setInputCols(["sentence_embeddings"]).setOutputCol("prediction").setOutputCol("class").setLabelColumn("category").setMaxEpochs(5).setEnableOutputLogs(True)

bert_clfpipeline = Pipeline(stages = [document, bert_embed, classifierDL])

sent_small_bert_L8_512 download started this may take some time.
Approximate size to download 149.1 MB
[OK!]


In [9]:
bert_pipelinemodel = bert_clfpipeline.fit(trainDataset)

In [10]:
pred = bert_pipelinemodel.transform(testDataset)

In [11]:
from sklearn.metrics import classification_report, accuracy_score

df = bert_pipelinemodel.transform(testDataset).select('category', 'description', 'class.result').toPandas()
df['result'] = df['result'].apply(lambda x: x[0])

print(classification_report(df.category, df.result))
print(accuracy_score(df.category, df.result))

              precision    recall  f1-score   support

    Business       0.86      0.83      0.85      1900
    Sci/Tech       0.84      0.89      0.86      1900
      Sports       0.95      0.97      0.96      1900
       World       0.92      0.87      0.89      1900

    accuracy                           0.89      7600
   macro avg       0.89      0.89      0.89      7600
weighted avg       0.89      0.89      0.89      7600

0.8906578947368421


In [12]:
#Question 2b) Lemmatization and Stopwords

In [49]:
document = DocumentAssembler().setInputCol("description").setOutputCol("document")

tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")

normalizer = Normalizer().setInputCols(["token"]).setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner().setInputCols(["normalized"]).setOutputCol("cleanTokens").setCaseSensitive(False)

lemmatizer = LemmatizerModel.pretrained("lemma_antbnc").setInputCols(["cleanTokens"]).setOutputCol("lemma")

word_embed = BertEmbeddings.pretrained('bert_base_cased', 'en').setInputCols(["document", "lemma"]).setOutputCol("embeddings").setCaseSensitive(False)

embeddingsSentence = SentenceEmbeddings().setInputCols(["document", "embeddings"]).setOutputCol("sentence_embeddings").setPoolingStrategy("AVERAGE")

classsifier = ClassifierDLApproach().setInputCols(["sentence_embeddings"]).setOutputCol("class").setLabelColumn("category").setMaxEpochs(5).setEnableOutputLogs(True)

bert_Pipeline = Pipeline(stages = [document, tokenizer, normalizer, stopwords_cleaner, lemmatizer, word_embed, embeddingsSentence, classsifier])

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


In [50]:
bert_pipelinemodel = bert_Pipeline.fit(trainDataset)

In [51]:
from sklearn.metrics import classification_report, accuracy_score

df = bert_pipelinemodel.transform(testDataset).select('category', 'description', 'class.result').toPandas()
df['result'] = df['result'].apply(lambda x: x[0])

print(classification_report(df.category, df.result))
print(accuracy_score(df.category, df.result))

              precision    recall  f1-score   support

    Business       0.84      0.79      0.81      1900
    Sci/Tech       0.80      0.86      0.83      1900
      Sports       0.92      0.96      0.94      1900
       World       0.89      0.85      0.87      1900

    accuracy                           0.86      7600
   macro avg       0.86      0.86      0.86      7600
weighted avg       0.86      0.86      0.86      7600

0.8640789473684211


In [16]:
#2b lemmatization

In [31]:
document = DocumentAssembler().setInputCol("description").setOutputCol("document")

tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")

normalizer = Normalizer().setInputCols(["token"]).setOutputCol("normalized")

lemmatizer = LemmatizerModel.pretrained("lemma_antbnc").setInputCols(["normalized"]).setOutputCol("lemma")

word_embed = BertEmbeddings.pretrained('bert_base_cased', 'en').setInputCols(["document", "lemma"]).setOutputCol("embeddings").setCaseSensitive(False)

embeddingsSentence = SentenceEmbeddings().setInputCols(["document", "embeddings"]).setOutputCol("sentence_embeddings").setPoolingStrategy("AVERAGE")

classsifier = ClassifierDLApproach().setInputCols(["sentence_embeddings"]).setOutputCol("class").setLabelColumn("category").setMaxEpochs(5).setEnableOutputLogs(True)

bert_pipelineLemma = Pipeline(stages = [document, tokenizer, normalizer, lemmatizer, word_embed, embeddingsSentence, classsifier])

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


In [32]:
bert_pipelinemodelLemma = bert_pipelineLemma.fit(trainDataset)

In [33]:
from sklearn.metrics import classification_report, accuracy_score

df = bert_pipelinemodelLemma.transform(testDataset).select('category', 'description', 'class.result').toPandas()
df['result'] = df['result'].apply(lambda x: x[0])

print(classification_report(df.category, df.result))
print(accuracy_score(df.category, df.result))

              precision    recall  f1-score   support

    Business       0.82      0.83      0.83      1900
    Sci/Tech       0.86      0.82      0.84      1900
      Sports       0.92      0.97      0.94      1900
       World       0.88      0.86      0.87      1900

    accuracy                           0.87      7600
   macro avg       0.87      0.87      0.87      7600
weighted avg       0.87      0.87      0.87      7600

0.8714473684210526


In [20]:
#2b stopwords

In [46]:
document = DocumentAssembler().setInputCol("description").setOutputCol("document")

tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")

normalizer = Normalizer().setInputCols(["token"]).setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner().setInputCols(["normalized"]).setOutputCol("cleanTokens").setCaseSensitive(False)

word_embed = BertEmbeddings.pretrained('bert_base_cased', 'en').setInputCols(["document", "cleanTokens"]).setOutputCol("embeddings").setCaseSensitive(False)

embeddingsSentence = SentenceEmbeddings().setInputCols(["document", "embeddings"]).setOutputCol("sentence_embeddings").setPoolingStrategy("AVERAGE")

classsifier = ClassifierDLApproach().setInputCols(["sentence_embeddings"]).setOutputCol("class").setLabelColumn("category").setMaxEpochs(5).setEnableOutputLogs(True)

bert_pipelineSW = Pipeline(stages = [document, tokenizer, normalizer, stopwords_cleaner, word_embed, embeddingsSentence, classsifier])

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


In [47]:
bert_pipelinemodelSW = bert_pipelineSW.fit(trainDataset)

In [48]:
from sklearn.metrics import classification_report, accuracy_score

df = bert_pipelinemodelSW.transform(testDataset).select('category', 'description', 'class.result').toPandas()
df['result'] = df['result'].apply(lambda x: x[0])

print(classification_report(df.category, df.result))
print(accuracy_score(df.category, df.result))

              precision    recall  f1-score   support

    Business       0.81      0.84      0.82      1900
    Sci/Tech       0.87      0.82      0.84      1900
      Sports       0.94      0.94      0.94      1900
       World       0.86      0.88      0.87      1900

    accuracy                           0.87      7600
   macro avg       0.87      0.87      0.87      7600
weighted avg       0.87      0.87      0.87      7600

0.8698684210526316


In [24]:
#Question 2c)

In [43]:
document = DocumentAssembler().setInputCol("description").setOutputCol("document")

tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")

normalizer = Normalizer().setInputCols(["token"]).setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner().setInputCols(["normalized"]).setOutputCol("cleanTokens").setCaseSensitive(False)

lemmatizer = LemmatizerModel.pretrained("lemma_antbnc").setInputCols(["cleanTokens"]).setOutputCol("lemma")

word_embed = RoBertaEmbeddings.pretrained('roberta_base', 'en').setInputCols(["document", "lemma"]).setOutputCol("embeddings").setCaseSensitive(False)

embeddingsSentence = SentenceEmbeddings().setInputCols(["document", "embeddings"]).setOutputCol("sentence_embeddings").setPoolingStrategy("AVERAGE")

classsifier = ClassifierDLApproach().setInputCols(["sentence_embeddings"]).setOutputCol("class").setLabelColumn("category").setMaxEpochs(5).setEnableOutputLogs(True)

roberta_pipeline = Pipeline(stages = [document, tokenizer, normalizer, stopwords_cleaner, lemmatizer, word_embed, embeddingsSentence, classsifier])

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
roberta_base download started this may take some time.
Approximate size to download 284.8 MB
[OK!]


In [44]:
roberta_pipelinemodel = roberta_pipeline.fit(trainDataset)

In [45]:
from sklearn.metrics import classification_report, accuracy_score

df = roberta_pipelinemodel.transform(testDataset).select('category', 'description', 'class.result').toPandas()
df['result'] = df['result'].apply(lambda x: x[0])

print(classification_report(df.category, df.result))
print(accuracy_score(df.category, df.result))

              precision    recall  f1-score   support

    Business       0.79      0.86      0.83      1900
    Sci/Tech       0.87      0.81      0.84      1900
      Sports       0.93      0.94      0.94      1900
       World       0.89      0.85      0.87      1900

    accuracy                           0.87      7600
   macro avg       0.87      0.87      0.87      7600
weighted avg       0.87      0.87      0.87      7600

0.8675
