# Imports

In [1]:


from pyspark import SparkContext, SparkConf
from pyspark.sql.session import SparkSession

In [2]:
app_name="Case Study 1: SpamDetection"

conf = SparkConf().setAppName(app_name)
sc = SparkContext(conf = conf)
spark = SparkSession(sc)

In [3]:
sc.applicationId

'local-1572360162142'

In [4]:
log4jLogger = sc._jvm.org.apache.log4j
LOGGER = log4jLogger.LogManager.getLogger(__name__)
LOGGER.info("pyspark script logger initialized")

# Extract words from the SMS message

In [5]:
def get_hdfs_filepath(file_name, on_cloud=True):
    # path to folder containing this code
    prefix = '/data/spark/8_cs1_dataset/'
    if on_cloud:
        bucket  = os.environ['BUCKET']
        file_path = bucket + prefix + file_name
    else:
        file_path = '/Users/val' + prefix + file_name

    return file_path

In [6]:
LOG = get_hdfs_filepath('SMSSpamCollection')


In [7]:
print(LOG)

gs://drive3/data/spark/8_cs1_dataset/SMSSpamCollection


In [8]:
import pyspark.sql.functions as F

In [9]:
raw = spark.read.option("delimiter","\t").csv(LOG).toDF("spam","message")
raw.show(2)

+----+--------------------+
|spam|             message|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
+----+--------------------+
only showing top 2 rows



In [10]:
# Extract word
from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer().setInputCol("message").setOutputCol("words")
transformed = tokenizer.transform(raw)
transformed.show(1)

+----+--------------------+--------------------+
|spam|             message|               words|
+----+--------------------+--------------------+
| ham|Go until jurong p...|[go, until, juron...|
+----+--------------------+--------------------+
only showing top 1 row



# Remove stop words

In [11]:
# Remove stopwords
from pyspark.ml.feature import StopWordsRemover
remover = StopWordsRemover().setInputCol("words").setOutputCol("filtered")
cleaned = remover.transform(transformed)
cleaned.show(1)

+----+--------------------+--------------------+--------------------+
|spam|             message|               words|            filtered|
+----+--------------------+--------------------+--------------------+
| ham|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|
+----+--------------------+--------------------+--------------------+
only showing top 1 row



# Modify the stop words to include your custom words such as ‘-‘

In [12]:
# custom stopwords
stopwords = StopWordsRemover().getStopWords() + ["-"]
remover = StopWordsRemover().setStopWords(stopwords).setInputCol("words").setOutputCol("filtered")
cleaned = remover.transform(transformed)

# Create the features from SMS message using CountVectorizer

In [13]:
# Generate features
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol("features").fit(cleaned)
featured = cvmodel.transform(cleaned)

In [50]:
featured.limit(5).show(1)#toPandas()

+----+--------------------+--------------------+--------------------+--------------------+
|spam|             message|               words|            filtered|            features|
+----+--------------------+--------------------+--------------------+--------------------+
| ham|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|(13422,[7,11,31,6...|
+----+--------------------+--------------------+--------------------+--------------------+
only showing top 1 row



In [14]:
# convert to binary label
from pyspark.ml.feature import OneHotEncoder, StringIndexer
indexer = StringIndexer().setInputCol("spam").setOutputCol("label").fit(featured)
indexed = indexer.transform(featured)

In [36]:
import pandas as pd
def set_pandas_options() -> None:
    pd.options.display.max_columns = 1000
    pd.options.display.max_rows = 1000
    pd.options.display.max_colwidth = 1000
    pd.options.display.width = None

set_pandas_options()

In [37]:
indexed.limit(1).toPandas()

Unnamed: 0,spam,message,words,filtered,features,label
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...","[go, until, jurong, point,, crazy.., available, only, in, bugis, n, great, world, la, e, buffet..., cine, there, got, amore, wat...]","[go, jurong, point,, crazy.., available, bugis, n, great, world, la, e, buffet..., cine, got, amore, wat...]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...)",0.0


# Split the data into train and test - decide on a strategy

In [26]:
# Split to train and test
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
training, test = indexed.randomSplit([0.7, 0.3], seed = 12345)
training.limit(5).toPandas()

Unnamed: 0,spam,message,words,filtered,features,label
0,ham,&lt;#&gt; in mca. But not conform.,"[, &lt;#&gt;, , in, mca., but, not, conform.]","[, &lt;#&gt;, , mca., conform.]","(0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",0.0
1,ham,&lt;#&gt; mins but i had to stop somewhere f...,"[, &lt;#&gt;, , mins, but, i, had, to, stop, s...","[, &lt;#&gt;, , mins, stop, somewhere, first.]","(0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",0.0
2,ham,&lt;DECIMAL&gt; m but its not a common car he...,"[, &lt;decimal&gt;, m, but, its, not, a, commo...","[, &lt;decimal&gt;, m, common, car, better, bu...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
3,ham,and picking them up from various points,"[, and, , picking, them, up, from, various, po...","[, , picking, various, points]","(0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
4,ham,"said kiss, kiss, i can't do the sound effects...","[, said, kiss,, kiss,, i, can't, do, the, soun...","[, said, kiss,, kiss,, sound, effects!, gorgeo...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0


# Use logistic regression and check the accuracy

In [16]:
# Logistic regression
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
lrModel = lr.fit(training)
predictions = lrModel.transform(test)
predictions.select("features", "label", "prediction").show(2)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("prediction").setMetricName("areaUnderROC")
accuracy = evaluator.evaluate(predictions)
print ("Accuracy", accuracy)

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|(13422,[3,11,159,...|  0.0|       0.0|
|(13422,[3,12,77,8...|  0.0|       0.0|
+--------------------+-----+----------+
only showing top 2 rows

Accuracy 0.5


# Try to use a Random Forest classifier and see if it increases the accuracy

In [17]:
# Random Forest
from pyspark.ml.classification import RandomForestClassificationModel, RandomForestClassifier
rf = RandomForestClassifier().setLabelCol("label").setFeaturesCol("features").setNumTrees(10)
model = rf.fit(training)
predictions = model.transform(test)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("prediction").setMetricName("areaUnderROC")
accuracy = evaluator.evaluate(predictions)
print ("Accuracy", accuracy)

Accuracy 0.5092165898617511


# Introduce bi-gram and tri-gram and note the change in accuracy

In [18]:
from pyspark.ml.feature import NGram
ngram = NGram().setN(2).setInputCol("filtered").setOutputCol("ngrams")
ngramDataFrame = ngram.transform(cleaned)
ngramDataFrame.select("ngrams").show(2, False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|ngrams                                                                                                                                                                                       |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[go jurong, jurong point,, point, crazy.., crazy.. available, available bugis, bugis n, n great, great world, world la, la e, e buffet..., buffet... cine, cine got, got amore, amore wat...]|
|[ok lar..., lar... joking, joking wif, wif u, u oni...]                                                                                                                                      |
+---------------------------------------

# Decide on a strategy and generate a data pipeline

In [19]:
from pyspark.ml import Pipeline, PipelineModel
tokenizer = Tokenizer().setInputCol("message").setOutputCol("words")

stopwords = StopWordsRemover().getStopWords()+ ["-"]
remover = StopWordsRemover().setStopWords(stopwords).setInputCol("words").setOutputCol("filtered")
cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol("features")
indexer = StringIndexer().setInputCol("spam").setOutputCol("label")
lr = LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)

pipeline = Pipeline().setStages([tokenizer, remover, cvmodel, indexer, lr])
model = pipeline.fit(raw)
model.write().overwrite().save("models/spam_model4.4")

In [20]:
pipeline = PipelineModel.load("models/spam_model4.4")