In [1]:
# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install nltk

import nltk
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName("sms-spam-classifier")
    .config("spark.driver.memory", "512m")
    .config("spark.driver.cores", "1")
    .config("spark.executor.memory", "512m")
    .config("spark.executor.cores", "1")
    .config("spark.sql.shuffle.partitions", "2")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000/")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain")
    .getOrCreate()
)

In [3]:
df = spark.read.parquet("s3a://delta/")
df.show()

+-----+--------------------+--------------------+-------+
|label|                 sms|                 raw|is_spam|
+-----+--------------------+--------------------+-------+
|  ham|Dear i have reach...|Dear i have reach...|  false|
|  ham|Fighting with the...|Fighting with the...|  false|
|  ham|When can ü come out?|When can ü come out?|  false|
|  ham|Check with nuerol...|Check with nuerol...|  false|
|  ham|Lolnice. I went f...|Lolnice. I went f...|  false|
| spam|+123 Congratulati...|+123 Congratulati...|   true|
|  ham|No it's waiting i...|No it's waiting i...|  false|
|  ham|Maybe westshore o...|Maybe westshore o...|  false|
|  ham|You should know n...|You should know n...|  false|
|  ham|What's the signif...|What's the signif...|  false|
|  ham|Your opinion abou...|Your opinion abou...|  false|
|  ham|8 at the latest, ...|8 at the latest, ...|  false|
|  ham|Prabha..i'm soryd...|Prabha..i'm soryd...|  false|
|  ham|Lol ok your forgi...|Lol ok your forgi...|  false|
|  ham|No..jst

In [4]:
from pyspark.ml.feature import (
    CountVectorizer, StringIndexer, VectorAssembler, Tokenizer, RegexTokenizer, StopWordsRemover)
from nltk.corpus import stopwords

stages = []
regexTokenizer = RegexTokenizer(inputCol="sms", outputCol="tokens", pattern="\\W+")
stages += [regexTokenizer]

STOPWORDS = stopwords.words('english') + ['u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure']
remover = StopWordsRemover(stopWords=STOPWORDS, inputCol="tokens", outputCol="real_tokens")
stages += [remover]

cv = CountVectorizer(inputCol="real_tokens", outputCol="token_features", minDF=2.0)#, vocabSize=3, minDF=2.0
stages += [cv]

indexer = StringIndexer(inputCol="label", outputCol="label_num")
stages += [indexer]

vecAssembler = VectorAssembler(inputCols=['token_features'], outputCol="features")
stages += [vecAssembler]

for stage in stages:
    print(stage)

RegexTokenizer_444d3b3b8292
StopWordsRemover_379771c19215
CountVectorizer_dffef28e6f15
StringIndexer_e58b30b79d21
VectorAssembler_d13ab4b981f2


In [5]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=stages)
data = pipeline.fit(df).transform(df)

In [6]:
train, test = data.randomSplit([0.75, 0.25], seed = 42)

train_s  = train.where('is_spam').count()
train_ns = train.where('not is_spam').count()
test_s   = test.where('is_spam').count()
test_ns  = test.where('not is_spam').count()
print(f'''
      Train spam:  {train_s}
      Train ham:   {train_ns}
      Train ratio: {train_s / train_ns} (spam/ham)
      
      Test spam:  {test_s}
      Test ham:   {test_ns}
      Test ratio: {test_s / test_ns} (spam/ham)
      ''')


      Train spam:  578
      Train ham:   3624
      Train ratio: 0.15949227373068434 (spam/ham)
      
      Test spam:  169
      Test ham:   1203
      Test ratio: 0.1404821280133001 (spam/ham)
      


In [7]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1.0, modelType="multinomial", labelCol="label_num")
model = nb.fit(train)

In [8]:
predictions = model.transform(test)
predictions.select("label_num", "prediction", "probability").show()

+---------+----------+-------------------------------------------+
|label_num|prediction|probability                                |
+---------+----------+-------------------------------------------+
|0.0      |0.0       |[0.9926156838978694,0.0073843161021305125] |
|0.0      |0.0       |[0.9999997660093859,2.3399061404023128E-7] |
|0.0      |0.0       |[0.6681870392142397,0.3318129607857604]    |
|0.0      |0.0       |[0.9994436806706122,5.563193293877113E-4]  |
|0.0      |0.0       |[0.9907012513195488,0.009298748680451299]  |
|0.0      |0.0       |[0.9990242555780583,9.757444219416205E-4]  |
|0.0      |0.0       |[0.9813950690835235,0.018604930916476584]  |
|0.0      |0.0       |[0.945806700713427,0.054193299286572905]   |
|0.0      |0.0       |[0.9990964676377087,9.03532362291324E-4]   |
|0.0      |0.0       |[0.9999773905606338,2.2609439366254404E-5] |
|0.0      |0.0       |[0.9999678701542742,3.212984572574789E-5]  |
|0.0      |0.0       |[0.999999997591384,2.4086159175915283E-9

In [9]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="label_num")
accuracy = evaluator.evaluate(predictions)
print ("Test Area Under ROC: ", accuracy)

Test Area Under ROC:  0.9760141067449719


In [10]:
print("END EXECUTION HERE")
1 / 0

END EXECUTION HERE


ZeroDivisionError: division by zero