
# Email Spam Classification Pipeline
## 1. Read labeled email data

1 = ham 0 = spam


In [1]:
val spamData = spark.read.format("jdbc").options(Map("driver" -> "com.ibm.db2.jcc.DB2Driver",
      "url" -> "jdbc:db2://9.125.72.72:430/LOCDB11", "user" -> "TUSER01",
      "password" -> "C6DESHOP", "dbtable" -> "MLZ.SPAMEMAIL")).load()   
spamData.cache()
spamData.printSchema()
spamData.show(5)
spamData.count()

root
 |-- email_id: long (nullable = true)
 |-- text: string (nullable = true)
 |-- label: integer (nullable = true)

+--------+--------------------+-----+
|email_id|                text|label|
+--------+--------------------+-----+
|       1|One of a kind Mon...|    0|
|      10|Re: What to choos...|    1|
|     100|Strictly Private....|    0|
|    1000|Re: Flash is open...|    1|
|    1001|Re: Alsa/Redhat 8...|    1|
+--------+--------------------+-----+
only showing top 5 rows



2500

In [2]:
// Split data into training (80%) and test (20%)
val Array(trainDF,testDF) = spamData.randomSplit(Array(0.8, 0.2))
println ("The number of training data is ",trainDF.count())
println ("The number of test data is ",testDF.count())

(The number of training data is ,2002)
(The number of test data is ,498)



## 2. Create a Spark ML pipeline consisting of:

    Tokenizer - extract tokens from raw text
    Count vectorizer - convert tokens to term-frequency vectors
    IDF - normalize term-frequency vectors using TF-IDF
    Logistic Regression for binary classification



In [3]:
//import libraries
import org.apache.spark.ml.feature.{RegexTokenizer,CountVectorizer,IDF}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{CrossValidator,ParamGridBuilder}
import com.ibm.analytics.ngp.pipeline._

In [5]:
val tokenizer=new RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("[^a-zA-Z_0-9]+")
val cv=new CountVectorizer().setInputCol("words").setOutputCol("tf")
val idf=new IDF().setInputCol("tf").setOutputCol("features")
val lr=new LogisticRegression().setMaxIter(150)
val pipeline=new IBMSparkPipeline().setStages(Array(tokenizer,cv,idf,lr))


## 3. Use K-fold Cross Validation for Model Selection for Pipeline

In [6]:
val auc_eval=new BinaryClassificationEvaluator()
val grid= new ParamGridBuilder().addGrid(lr.regParam,Array(1e-3,1e-2)).
                                 addGrid(lr.elasticNetParam,Array(0.25,0.0)).
                                 addGrid(cv.vocabSize,Array(10000,50000)).
                                 addGrid(idf.minDocFreq,Array(0,3)).build()
val cross_val=new CrossValidator().setEstimator(pipeline).setEvaluator(auc_eval).setEstimatorParamMaps(grid).setNumFolds(3) 

In [9]:
val pipeline_model=cross_val.fit(trainDF)
val testResult=pipeline_model.transform(testDF)

In [8]:
import com.ibm.analytics.ngp.util.Connections
val model = pipeline.fit(trainDF)
Connections.setEnvironment("dev")
Connections.setMetaServiceHost("http://9.30.109.52:12501") 

model.save("tuser01/SpamIn52")

## 4. Evaluate the trained model and draw the ROC curve

In [13]:
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
val testingResultScores = testResult.select(col("prediction"),col("label").cast(DoubleType)).rdd.map(r => (r(0).asInstanceOf[Double], r(1) .asInstanceOf[Double]))
val bc = new BinaryClassificationMetrics(testingResultScores)

In [14]:
val rocDF = spark.createDataFrame(bc.roc).
                    withColumnRenamed("_1", "FPR").
                    withColumnRenamed("_2", "TPR")
rocDF.show()

+-------------------+----------------+
|                FPR|             TPR|
+-------------------+----------------+
|                0.0|             0.0|
|0.05517241379310345|0.97953216374269|
|                1.0|             1.0|
|                1.0|             1.0|
+-------------------+----------------+



In [15]:
%AddJar -magic https://brunelvis.org/jar/spark-kernel-brunel-all-2.3.jar -f

Starting download from https://brunelvis.org/jar/spark-kernel-brunel-all-2.3.jar
Finished download of spark-kernel-brunel-all-2.3.jar


In [16]:
%%brunel data('rocDF') x(FPR) y(TPR) line tooltip(#all) axes(x:'False Positive Rate':grid, y:'True Positive Rate':grid)


## 5. Predict data with the trained model

In [17]:
// example pipeline output
spamData.show(5)
pipeline_model.transform(spamData).select("text", "words", "features", "label", "prediction").show(5)

+--------+--------------------+-----+
|email_id|                text|label|
+--------+--------------------+-----+
|       1|One of a kind Mon...|    0|
|      10|Re: What to choos...|    1|
|     100|Strictly Private....|    0|
|    1000|Re: Flash is open...|    1|
|    1001|Re: Alsa/Redhat 8...|    1|
+--------+--------------------+-----+
only showing top 5 rows

+--------------------+--------------------+--------------------+-----+----------+
|                text|               words|            features|label|prediction|
+--------------------+--------------------+--------------------+-----+----------+
|One of a kind Mon...|[one, of, a, kind...|(10000,[0,1,2,3,4...|    0|       0.0|
|Re: What to choos...|[re, what, to, ch...|(10000,[0,1,2,3,4...|    1|       1.0|
|Strictly Private....|[strictly, privat...|(10000,[0,1,2,3,4...|    0|       0.0|
|Re: Flash is open...|[re, flash, is, o...|(10000,[0,1,2,3,4...|    1|       1.0|
|Re: Alsa/Redhat 8...|[re, alsa, redhat...|(10000,[0,1,2,3,