In [1]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName("sms-spam-classifier")
    .config("spark.driver.memory", "512m")
    .config("spark.driver.cores", "1")
    .config("spark.executor.memory", "512m")
    .config("spark.executor.cores", "1")
    .config("spark.sql.shuffle.partitions", "2")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000/")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain")
    .getOrCreate()
)

In [3]:
df = spark.read.parquet("s3a://delta/")
df.show()

+-----+--------------------+--------------------+-------+
|label|                 sms|                 raw|is_spam|
+-----+--------------------+--------------------+-------+
|  ham|Dear i have reach...|Dear i have reach...|  false|
|  ham|Fighting with the...|Fighting with the...|  false|
|  ham|When can ü come out?|When can ü come out?|  false|
|  ham|Check with nuerol...|Check with nuerol...|  false|
|  ham|Lolnice. I went f...|Lolnice. I went f...|  false|
| spam|+123 Congratulati...|+123 Congratulati...|   true|
|  ham|No it's waiting i...|No it's waiting i...|  false|
|  ham|Maybe westshore o...|Maybe westshore o...|  false|
|  ham|You should know n...|You should know n...|  false|
|  ham|What's the signif...|What's the signif...|  false|
|  ham|Your opinion abou...|Your opinion abou...|  false|
|  ham|8 at the latest, ...|8 at the latest, ...|  false|
|  ham|Prabha..i'm soryd...|Prabha..i'm soryd...|  false|
|  ham|Lol ok your forgi...|Lol ok your forgi...|  false|
|  ham|No..jst

In [4]:
import mlflow
from datetime import datetime

mlflow.set_tracking_uri("http://mlflow:5000")
mlflow.pyspark.ml.autolog()
mlflow.start_run()

<ActiveRun: >

In [5]:
from pyspark.ml.feature import (
    CountVectorizer, StringIndexer, VectorAssembler, Tokenizer, RegexTokenizer, StopWordsRemover)
from nltk.corpus import stopwords

stages = []
regexTokenizer = RegexTokenizer(inputCol="sms", outputCol="tokens", pattern="\\W+")
stages += [regexTokenizer]

STOPWORDS = stopwords.words('english') + ['u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure']
remover = StopWordsRemover(stopWords=STOPWORDS, inputCol="tokens", outputCol="real_tokens")
stages += [remover]

cv = CountVectorizer(inputCol="real_tokens", outputCol="token_features", minDF=2.0)#, vocabSize=3, minDF=2.0
stages += [cv]

indexer = StringIndexer(inputCol="label", outputCol="label_num")
stages += [indexer]

vecAssembler = VectorAssembler(inputCols=['token_features'], outputCol="features")
stages += [vecAssembler]

for stage in stages:
    print(stage)

RegexTokenizer_09eb1382bc2e
StopWordsRemover_8f6f93784e8f
CountVectorizer_bb9990a3ed93
StringIndexer_38f62958e333
VectorAssembler_ca52d7dda6c0


In [6]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=stages)
data = pipeline.fit(df).transform(df)



In [7]:
train, test = data.randomSplit([0.75, 0.25], seed = 42)

train_s  = train.where('is_spam').count()
train_ns = train.where('not is_spam').count()
test_s   = test.where('is_spam').count()
test_ns  = test.where('not is_spam').count()
print(f'''
      Train spam:  {train_s}
      Train ham:   {train_ns}
      Train ratio: {train_s / train_ns} (spam/ham)
      
      Test spam:  {test_s}
      Test ham:   {test_ns}
      Test ratio: {test_s / test_ns} (spam/ham)
      ''')


      Train spam:  302
      Train ham:   1951
      Train ratio: 0.15479241414659148 (spam/ham)
      
      Test spam:  107
      Test ham:   640
      Test ratio: 0.1671875 (spam/ham)
      


In [8]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1.0, modelType="multinomial", labelCol="label_num")
model = nb.fit(train)



In [9]:
predictions = model.transform(test)
predictions.select("label_num", "prediction", "probability").show()

+---------+----------+--------------------+
|label_num|prediction|         probability|
+---------+----------+--------------------+
|      0.0|       0.0|[0.99999931023026...|
|      0.0|       0.0|[0.61250491536961...|
|      0.0|       0.0|[0.59263392506081...|
|      0.0|       0.0|[0.99315906235259...|
|      0.0|       0.0|[0.99999962018803...|
|      0.0|       0.0|[0.92158442655367...|
|      0.0|       0.0|[0.99896842750763...|
|      0.0|       0.0|[0.99987070398322...|
|      0.0|       0.0|[0.99999999232327...|
|      0.0|       0.0|[0.99999990872023...|
|      0.0|       0.0|[0.99855559704832...|
|      0.0|       0.0|[0.99719387987603...|
|      0.0|       0.0|[0.99816190309014...|
|      0.0|       0.0|[0.99977386895803...|
|      0.0|       0.0|[0.99592511130266...|
|      0.0|       0.0|[0.99647928138122...|
|      0.0|       0.0|[0.85335475912843...|
|      0.0|       0.0|[0.99168697462218...|
|      0.0|       0.0|[0.99993816408669...|
|      0.0|       0.0|[0.9998393

In [10]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="label_num")
accuracy = evaluator.evaluate(predictions)
print ("Test Area Under ROC: ", accuracy)

Test Area Under ROC:  0.9626022196261682


In [11]:
model_info = mlflow.spark.log_model(model, "naive-bayes")
print(model_info)

<mlflow.models.model.ModelInfo object at 0xffff638eb0a0>


In [12]:
mlflow.end_run()