# Create SPARK session

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('spamFilter_NBModel').getOrCreate()

In [2]:
sc = spark.sparkContext

# Import necessary modules

In [3]:
    import numpy as np
    import pandas as pd


# Load CSV file

In [4]:
df = spark.read.format("csv").load("spam_train.csv")

In [5]:
df.show(5)

+---+--------------------+----+----+----+
|_c0|                 _c1| _c2| _c3| _c4|
+---+--------------------+----+----+----+
|ham|Pass dis to all u...|null|null|null|
|ham|Sorry, I can't he...|null|null|null|
|ham|Die... I accident...|null|null|null|
|ham|�� called dad ore...|null|null|null|
|ham|Yes.mum lookin st...|null|null|null|
+---+--------------------+----+----+----+
only showing top 5 rows



In [6]:
df.count()

4574

# Preprocessing the data - remove unwanted columns and rename first two columns

In [7]:
df.columns

['_c0', '_c1', '_c2', '_c3', '_c4']

# Remove extra columns

In [8]:
df = df.drop("_c2","_c3","_c4")

In [9]:
df.columns

['_c0', '_c1']

In [10]:
df = df.withColumnRenamed("_c0", "Label_Desc")

In [11]:
df.columns

['Label_Desc', '_c1']

In [12]:
df = df.withColumnRenamed("_c1","Text")

In [13]:
df.show(5)

+----------+--------------------+
|Label_Desc|                Text|
+----------+--------------------+
|       ham|Pass dis to all u...|
|       ham|Sorry, I can't he...|
|       ham|Die... I accident...|
|       ham|�� called dad ore...|
|       ham|Yes.mum lookin st...|
+----------+--------------------+
only showing top 5 rows



# Remove Null rows

In [14]:
df = df.dropna()
df.count()

4573

In [15]:
df.count()

4573

# Add Labels

In [16]:
from pyspark.sql.functions import expr, col, column

In [17]:
df = df.withColumn("Label", expr("Label_Desc == 'spam'"))

In [18]:
df = df.selectExpr("Text","cast(Label as double) as label")

In [1]:
df.show()

NameError: name 'df' is not defined

# Tokening 'Text' column

In [20]:
from pyspark.ml.feature import Tokenizer

In [21]:
df_tkn = Tokenizer().setInputCol("Text").setOutputCol("TextTokenized").transform(df.select("Text", "label"))

In [22]:
df_tkn.show(5)

+--------------------+-----+--------------------+
|                Text|label|       TextTokenized|
+--------------------+-----+--------------------+
|Pass dis to all u...|  0.0|[pass, dis, to, a...|
|Sorry, I can't he...|  0.0|[sorry,, i, can't...|
|Die... I accident...|  0.0|[die..., i, accid...|
|�� called dad ore...|  0.0|[��, called, dad,...|
|Yes.mum lookin st...|  0.0|[yes.mum, lookin,...|
+--------------------+-----+--------------------+
only showing top 5 rows



# Filtering stopwords

In [23]:
from pyspark.ml.feature import StopWordsRemover

In [24]:
engStopWords = StopWordsRemover.loadDefaultStopWords("english")

In [25]:
df_filtered = StopWordsRemover().setStopWords(engStopWords).setInputCol("TextTokenized").setOutputCol("TextFiltered")

In [26]:
df_filtered = df_filtered.transform(df_tkn)

In [27]:
df_filtered.show(5)

+--------------------+-----+--------------------+--------------------+
|                Text|label|       TextTokenized|        TextFiltered|
+--------------------+-----+--------------------+--------------------+
|Pass dis to all u...|  0.0|[pass, dis, to, a...|[pass, dis, ur, c...|
|Sorry, I can't he...|  0.0|[sorry,, i, can't...|[sorry,, help, th...|
|Die... I accident...|  0.0|[die..., i, accid...|[die..., accident...|
|�� called dad ore...|  0.0|[��, called, dad,...|[��, called, dad,...|
|Yes.mum lookin st...|  0.0|[yes.mum, lookin,...|[yes.mum, lookin,...|
+--------------------+-----+--------------------+--------------------+
only showing top 5 rows



# Using CountVectorizer to get numerical representation of strings

In [28]:
from pyspark.ml.feature import CountVectorizer

In [29]:
cv = CountVectorizer()\
.setInputCol("TextFiltered")\
.setOutputCol("TextCV")\
.setVocabSize(500)\
.setMinTF(1)\
.setMinDF(2)

In [30]:
print(cv)

CountVectorizer_7813ac27ab9d


In [31]:
num_rows = df_filtered.count()
print(num_rows);

4573


In [32]:
df_cv = cv.fit(df_filtered)

In [33]:
df_cv = df_cv.transform(df_filtered)

In [35]:
df_cv.show(5)

+--------------------+-----+--------------------+--------------------+--------------------+
|                Text|label|       TextTokenized|        TextFiltered|              TextCV|
+--------------------+-----+--------------------+--------------------+--------------------+
|Pass dis to all u...|  0.0|[pass, dis, to, a...|[pass, dis, ur, c...|(500,[0,4,7,26,29...|
|Sorry, I can't he...|  0.0|[sorry,, i, can't...|[sorry,, help, th...|(500,[117,225],[1...|
|Die... I accident...|  0.0|[die..., i, accid...|[die..., accident...|(500,[2,69,122,21...|
|�� called dad ore...|  0.0|[��, called, dad,...|[��, called, dad,...|(500,[129,231,349...|
|Yes.mum lookin st...|  0.0|[yes.mum, lookin,...|[yes.mum, lookin,...|         (500,[],[])|
+--------------------+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [36]:
df_input = df_cv.selectExpr("TextCV as features", "label" )

In [37]:
df_input.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(500,[0,4,7,26,29...|  0.0|
|(500,[117,225],[1...|  0.0|
|(500,[2,69,122,21...|  0.0|
|(500,[129,231,349...|  0.0|
|         (500,[],[])|  0.0|
+--------------------+-----+
only showing top 5 rows



# NaiveBayes Model fitting

In [38]:
from pyspark.ml.classification import NaiveBayes,NaiveBayesModel

In [39]:
nb = NaiveBayes()

In [40]:
print (nb.explainParams())

featuresCol: features column name. (default: features)
labelCol: label column name. (default: label)
modelType: The model type which is a string (case-sensitive). Supported options: multinomial (default) and bernoulli. (default: multinomial)
predictionCol: prediction column name. (default: prediction)
probabilityCol: Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities. (default: probability)
rawPredictionCol: raw prediction (a.k.a. confidence) column name. (default: rawPrediction)
smoothing: The smoothing parameter, should be >= 0, default is 1.0 (default: 1.0)
thresholds: Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values > 0, excepting that at most one value may be 0. The class with largest value p/t is predicted, where p is 

In [41]:
trainedModel = nb.fit(df_input)
trainedModel.save("./nbModel")

Py4JJavaError: An error occurred while calling o216.save.
: java.io.IOException: Path ./nbModel already exists. To overwrite it, please use write.overwrite().save(path) for Scala and use write().overwrite().save(path) for Java and Python.
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:702)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:179)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


# Evaluation Matrix

In [42]:
#trainedModel=NaiveBayesModel.load("./nbModel")

In [43]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics

In [44]:
output = trainedModel.transform(df_input)\
.select("prediction", "label")\
.rdd.map(lambda x: (float(x[0]), float(x[1])))

In [45]:
metrics = BinaryClassificationMetrics(output)

In [46]:
print (metrics.areaUnderPR)

0.8704465814770835


In [47]:
print (metrics.areaUnderROC)

0.9387165003962046


In [48]:
print ("Receiver Operating Characteristic")
#metrics.roc.toDF().show()

Receiver Operating Characteristic


In [51]:
output = trainedModel.transform(df_input).show()
#\
#.select("prediction", "label")

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(500,[0,4,7,26,29...|  0.0|[-126.11207801974...|[0.99999984400693...|       0.0|
|(500,[117,225],[1...|  0.0|[-12.722487096796...|[0.97849002091688...|       0.0|
|(500,[2,69,122,21...|  0.0|[-28.364668860146...|[0.99757378447684...|       0.0|
|(500,[129,231,349...|  0.0|[-19.290936361914...|[0.99958564280319...|       0.0|
|         (500,[],[])|  0.0|[-0.1466379872982...|[0.86360655737704...|       0.0|
|(500,[22,146,162]...|  0.0|[-18.892182688394...|[0.89426178844061...|       0.0|
|(500,[40,153],[1....|  0.0|[-11.456820723465...|[0.99602992668883...|       0.0|
|(500,[42,63,495],...|  0.0|[-18.663762079979...|[0.93917936379729...|       0.0|
|         (500,[],[])|  0.0|[-0.1466379872982...|[0.86360655737704...|       0.0|
|(500,[8,17,67,8