
# Email Spam Classification Pipeline
## 1. Read labeled email data

1 = ham 0 = spam


In [2]:

from pyspark.sql import SparkSession

# @hidden_cell
# This function is used to setup the access of Spark to your Object Storage. The definition contains your credentials.
# You might want to remove those credentials before you share your notebook.
def set_hadoop_config_with_credentials_64e9c4bd3e4148978db0a312dfcc0a93(name):
    """This function sets the Hadoop configuration so it is possible to
    access data from Bluemix Object Storage using Spark"""

    prefix = 'fs.swift.service.' + name
    hconf = sc._jsc.hadoopConfiguration()
    hconf.set(prefix + '.auth.url', 'https://identity.open.softlayer.com'+'/v3/auth/tokens')
    hconf.set(prefix + '.auth.endpoint.prefix', 'endpoints')
    hconf.set(prefix + '.tenant', '419cd8dece644c82af5a615b62af38e1')
    hconf.set(prefix + '.username', 'babc87c7f7b5482c83c36c78831298be')
    hconf.set(prefix + '.password', 'qilA8zqp0/F?C,!!')
    hconf.setInt(prefix + '.http.port', 8080)
    hconf.set(prefix + '.region', 'dallas')
    hconf.setBoolean(prefix + '.public', False)

# you can choose any name
name = 'keystone'
set_hadoop_config_with_credentials_64e9c4bd3e4148978db0a312dfcc0a93(name)

spark = SparkSession.builder.getOrCreate()

# Please read the documentation of PySpark to learn more about the possibilities to load data files.
# PySpark documentation: https://spark.apache.org/docs/2.0.1/api/python/pyspark.sql.html#pyspark.sql.SparkSession
# The SparkSession object is already initalized for you.
# The following variable contains the path to your file on your Object Storage.
path_1 = "swift://SpamDetection." + name + "/part-r-00000-939c4239-aeb8-44a6-9c29-90d7ab74de65.snappy.parquet"

# Please read the documentation of PySpark to learn more about the possibilities to load data files.
# PySpark documentation: https://spark.apache.org/docs/2.0.1/api/python/pyspark.sql.html#pyspark.sql.SparkSession
# The SparkSession object is already initalized for you.
# The following variable contains the path to your file on your Object Storage.
path_2 = "swift://SpamDetection." + name + "/part-r-00001-939c4239-aeb8-44a6-9c29-90d7ab74de65.snappy.parquet"

In [19]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import *
data=spark.read.parquet("swift://SpamDetection.keystone/")
data.cache()
data.printSchema()
spamData = data.select(col("email_id"), col("text"), col("label").cast(DoubleType()))
spamData.cache()
spamData.printSchema()
spamData.show(5)
spamData.count()

root
 |-- email_id: long (nullable = true)
 |-- text: string (nullable = true)
 |-- label: integer (nullable = true)

root
 |-- email_id: long (nullable = true)
 |-- text: string (nullable = true)
 |-- label: double (nullable = true)

+--------+--------------------+-----+
|email_id|                text|label|
+--------+--------------------+-----+
|       1|One of a kind Mon...|  0.0|
|      10|Re: What to choos...|  1.0|
|     100|Strictly Private....|  0.0|
|    1000|Re: Flash is open...|  1.0|
|    1001|Re: Alsa/Redhat 8...|  1.0|
+--------+--------------------+-----+
only showing top 5 rows



2500

In [20]:
# Split data into training (80%) and test (20%)
trainDF, testDF = spamData.randomSplit([0.8, 0.2])
#print "The number of training data is ",trainDF.count()
#print "The number of test data is ",spamData.count()

In [4]:
print"Training data: %d" % spamData.count()

Training data: 2500



## 2. Create a Spark ML pipeline consisting of:

    Tokenizer - extract tokens from raw text
    Count vectorizer - convert tokens to term-frequency vectors
    IDF - normalize term-frequency vectors using TF-IDF
    Logistic Regression for binary classification



In [5]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import RegexTokenizer,CountVectorizer,IDF
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
from pyspark.ml import Pipeline

In [6]:
tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="[^a-zA-Z_0-9]+")
cv = CountVectorizer(inputCol="words", outputCol="tf")
idf = IDF(inputCol="tf", outputCol="features")
lr = LogisticRegression(maxIter=150)
pipeline = Pipeline(stages=[tokenizer, cv, idf, lr])

## 3. Use K-fold Cross Validation for Model Selection for Pipeline

In [7]:
auc_eval = BinaryClassificationEvaluator()
grid = ParamGridBuilder() \
    .addGrid(lr.regParam, [1e-3, 1e-2]) \
    .addGrid(lr.elasticNetParam, [0.25, 0.0]) \
    .addGrid(cv.vocabSize, [10000, 50000]) \
    .addGrid(idf.minDocFreq, [0, 3]) \
    .build()
cross_val = CrossValidator(estimator=pipeline, evaluator=auc_eval, estimatorParamMaps=grid, numFolds=3)

In [12]:
pipeline_model = cross_val.fit(trainDF)
testResult=pipeline_model.transform(testDF)

In [13]:
scores = zip(grid, pipeline_model.avgMetrics)
scores.sort(key=lambda x: x[1], reverse=True)
print "Cross-validation scores:"
for s in scores:
    p = s[0]
    print "regParam: %s; elasticNet: %s, vocabSize: %s, minDocFreq: %s - ROC score: %s" % \
        (p[lr.regParam], p[lr.elasticNetParam], p[cv.vocabSize], p[idf.minDocFreq], s[1])

Cross-validation scores:
regParam: 0.01; elasticNet: 0.0, vocabSize: 50000, minDocFreq: 3 - ROC score: 0.992134839512
regParam: 0.01; elasticNet: 0.25, vocabSize: 50000, minDocFreq: 0 - ROC score: 0.992075087481
regParam: 0.01; elasticNet: 0.0, vocabSize: 10000, minDocFreq: 3 - ROC score: 0.99196725177
regParam: 0.01; elasticNet: 0.0, vocabSize: 10000, minDocFreq: 0 - ROC score: 0.991959774715
regParam: 0.01; elasticNet: 0.25, vocabSize: 50000, minDocFreq: 3 - ROC score: 0.991774806979
regParam: 0.01; elasticNet: 0.25, vocabSize: 10000, minDocFreq: 3 - ROC score: 0.991656833128
regParam: 0.01; elasticNet: 0.25, vocabSize: 10000, minDocFreq: 0 - ROC score: 0.99156355421
regParam: 0.001; elasticNet: 0.25, vocabSize: 50000, minDocFreq: 0 - ROC score: 0.991513830707
regParam: 0.001; elasticNet: 0.0, vocabSize: 50000, minDocFreq: 3 - ROC score: 0.991034904831
regParam: 0.001; elasticNet: 0.0, vocabSize: 10000, minDocFreq: 3 - ROC score: 0.990587910962
regParam: 0.001; elasticNet: 0.0, vocab

## 4. Evaluate the trained model and draw the ROC curve

In [14]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve
testDF=testResult.select("prediction","label").toPandas()
a=np.array(testDF.label)
b=np.array(testDF.prediction)
fpr, tpr, thresholds = roc_curve(a, b, pos_label=1)
data={'FPR':fpr,'TPR':tpr}
rocPD=pd.DataFrame(data)

In [15]:
import brunel
%brunel data('rocPD') x(FPR) y(TPR) line tooltip(#all) axes(x:'False Positive Rate':grid, y:'True Positive Rate':grid)

<IPython.core.display.Javascript object>


## 5. Predict data with the trained model

In [16]:
# example pipeline output
spamData.show(5)
pipeline_model.transform(spamData).select("text", "words", "features", "label", "prediction").show(5)

+--------+--------------------+-----+
|email_id|                text|label|
+--------+--------------------+-----+
|       1|One of a kind Mon...|  0.0|
|      10|Re: What to choos...|  1.0|
|     100|Strictly Private....|  0.0|
|    1000|Re: Flash is open...|  1.0|
|    1001|Re: Alsa/Redhat 8...|  1.0|
+--------+--------------------+-----+
only showing top 5 rows

+--------------------+--------------------+--------------------+-----+----------+
|                text|               words|            features|label|prediction|
+--------------------+--------------------+--------------------+-----+----------+
|One of a kind Mon...|[one, of, a, kind...|(50000,[0,1,2,3,5...|  0.0|       0.0|
|Re: What to choos...|[re, what, to, ch...|(50000,[0,1,2,3,5...|  1.0|       1.0|
|Strictly Private....|[strictly, privat...|(50000,[0,1,2,3,5...|  0.0|       0.0|
|Re: Flash is open...|[re, flash, is, o...|(50000,[0,1,2,3,5...|  1.0|       1.0|
|Re: Alsa/Redhat 8...|[re, alsa, redhat...|(50000,[0,1,2,3,