In [1]:
import pyspark.sql.functions as F

In [2]:
# Create (or reuse) the Spark session used by every later cell.
# SparkSession is imported explicitly so the cell also works on a plain
# Python kernel; the visible notebook never imports it before this point.
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("SpamDetection Notebook").getOrCreate()

In [3]:
# Load the tab-delimited SMS collection and name its two columns
# "spam" (the class label text) and "message" (the SMS body).
raw = (
    spark.read
    .option("delimiter", "\t")
    .csv("use_cases/SMSSpamCollection")
    .toDF("spam", "message")
)
raw.show(2)

+----+--------------------+
|spam|             message|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
+----+--------------------+
only showing top 2 rows



In [4]:
# Tokenize: turn each "message" string into a "words" array column.
from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol="message", outputCol="words")
transformed = tokenizer.transform(raw)
transformed.show(1)

+----+--------------------+--------------------+
|spam|             message|               words|
+----+--------------------+--------------------+
| ham|Go until jurong p...|[go, until, juron...|
+----+--------------------+--------------------+
only showing top 1 row



In [5]:
# Drop stop words from "words" (using the remover's default list) into "filtered".
from pyspark.ml.feature import StopWordsRemover
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
cleaned = remover.transform(transformed)
cleaned.show(1)

+----+--------------------+--------------------+--------------------+
|spam|             message|               words|            filtered|
+----+--------------------+--------------------+--------------------+
| ham|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|
+----+--------------------+--------------------+--------------------+
only showing top 1 row



In [6]:
# Custom stop words: the remover's default list plus a bare "-" token.
# This rebuilds `remover` and `cleaned`, replacing the default-list versions.
stopwords = list(StopWordsRemover().getStopWords())
stopwords.append("-")
remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stopwords)
cleaned = remover.transform(transformed)

In [7]:
# Build count features: fit a vocabulary on the "filtered" tokens, then
# encode every message as a vector in the "features" column.
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
cvmodel = CountVectorizer(inputCol="filtered", outputCol="features").fit(cleaned)
featured = cvmodel.transform(cleaned)

In [8]:
# Map the string class in "spam" to a numeric "label" column.
from pyspark.ml.feature import OneHotEncoder, StringIndexer
indexer = StringIndexer(inputCol="spam", outputCol="label").fit(featured)
indexed = indexer.transform(featured)

In [9]:
# Hold out data for evaluation: 70% training / 30% test, with a fixed seed
# so the split is repeatable.
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
training, test = indexed.randomSplit(
    weights=[0.7, 0.3],
    seed=12345,
)
training.show(1)

+----+--------------------+--------------------+--------------------+--------------------+-----+
|spam|             message|               words|            filtered|            features|label|
+----+--------------------+--------------------+--------------------+--------------------+-----+
| ham| &lt;#&gt;  in mc...|[, &lt;#&gt;, , i...|[, &lt;#&gt;, , m...|(13457,[3,7,5193,...|  0.0|
+----+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 1 row



In [10]:
# Logistic regression: fit on the training split, score the test split,
# and report the area under the ROC curve.
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
lrModel = lr.fit(training)
predictions = lrModel.transform(test)
predictions.select("features", "label", "prediction").show(2)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
# areaUnderROC has to be computed from the model's continuous scores
# ("rawPrediction"). Feeding it the hard 0/1 "prediction" column collapses
# the ROC curve to a single operating point and understates the metric.
evaluator = (
    BinaryClassificationEvaluator()
    .setLabelCol("label")
    .setRawPredictionCol("rawPrediction")
    .setMetricName("areaUnderROC")
)
# The variable name is kept for compatibility with any later use, but the
# value is the area under the ROC curve, not classification accuracy.
accuracy = evaluator.evaluate(predictions)
# NOTE(review): the recorded run printed 0.5. If the score is still 0.5 after
# this fix, the regularisation (regParam=0.3, elasticNetParam=0.8) may be
# shrinking every coefficient to zero -- confirm by inspecting
# lrModel.coefficients.
print("Area under ROC: %s" % accuracy)

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|(13457,[3,12,168,...|  0.0|       0.0|
|(13457,[3,13,80,8...|  0.0|       0.0|
+--------------------+-----+----------+
only showing top 2 rows

('Accuracy', 0.5)


In [None]:
# Random forest: fit a 10-tree forest on the training split, score the test
# split, and report the area under the ROC curve.
from pyspark.ml.classification import RandomForestClassificationModel, RandomForestClassifier
rf = (
    RandomForestClassifier()
    .setLabelCol("label")
    .setFeaturesCol("features")
    .setNumTrees(10)
    .setSeed(12345)  # same seed as the train/test split, so reruns are repeatable
)
model = rf.fit(training)
predictions = model.transform(test)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
# areaUnderROC has to be computed from the model's continuous scores
# ("rawPrediction"). Feeding it the hard 0/1 "prediction" column collapses
# the ROC curve to a single operating point and understates the metric.
evaluator = (
    BinaryClassificationEvaluator()
    .setLabelCol("label")
    .setRawPredictionCol("rawPrediction")
    .setMetricName("areaUnderROC")
)
# The variable name is kept for compatibility with any later use, but the
# value is the area under the ROC curve, not classification accuracy.
accuracy = evaluator.evaluate(predictions)
print("Area under ROC: %s" % accuracy)

In [22]:
# Derive bigrams (n=2) from the stop-word-filtered tokens and print the
# first two rows without truncating the column.
from pyspark.ml.feature import NGram
ngram = NGram(n=2, inputCol="filtered", outputCol="ngrams")
ngramDataFrame = ngram.transform(cleaned)
ngramDataFrame.select("ngrams").show(2, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|ngrams                                                                                                                                                                                       |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[go jurong, jurong point,, point, crazy.., crazy.. available, available bugis, bugis n, n great, great world, world la, la e, e buffet..., buffet... cine, cine got, got amore, amore wat...]|
|[ok lar..., lar... joking, joking wif, wif u, u oni...]                                                                                                                                      |
+---------------------------------------

In [27]:
# Assemble the whole flow (tokenize -> remove stop words -> count features ->
# index label -> logistic regression) as one Pipeline, fit it on the raw
# data, and persist the fitted model to disk.
from pyspark.ml import Pipeline, PipelineModel

tokenizer = Tokenizer(inputCol="message", outputCol="words")

# Default stop words plus a bare "-" token.
stopwords = list(StopWordsRemover().getStopWords())
stopwords.append("-")
remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stopwords)

# Unlike the earlier cells, these are unfitted estimators; Pipeline.fit trains them.
cvmodel = CountVectorizer(inputCol="filtered", outputCol="features")
indexer = StringIndexer(inputCol="spam", outputCol="label")
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

pipeline = Pipeline(stages=[tokenizer, remover, cvmodel, indexer, lr])
model = pipeline.fit(raw)

# Replace any model previously saved at this path.
model.write().overwrite().save("use_cases/spam_model4.4")

In [29]:
# Reload the fitted pipeline saved above. Note that `pipeline` now refers to
# a PipelineModel rather than the unfitted Pipeline built earlier.
pipeline = PipelineModel.read().load("use_cases/spam_model4.4")

In [None]:
# Append the latest predictions to a MySQL table over JDBC.
# The previous version of this cell was Scala, which cannot run in this
# Python notebook; this is the PySpark equivalent.
import os

prop = {
    "driver": "com.mysql.jdbc.Driver",
    "url": "jdbc:mysql://mysqldb.edu.cloudlab.com/use_cases",
    # Credentials can be overridden through the environment. The fallbacks are
    # the lab defaults this cell used before.
    # NOTE(review): prefer removing the fallbacks and keeping credentials out
    # of the notebook altogether.
    "user": os.environ.get("MYSQL_USER", "labuser"),
    "password": os.environ.get("MYSQL_PASSWORD", "edureka"),
}

# Write only scalar columns. The token arrays ("words", "filtered") and the
# vector columns ("features", "rawPrediction", "probability") have no JDBC
# type mapping, so dropping "features" alone would still make the write fail.
jdbc_ready = predictions.select("spam", "message", "label", "prediction")

# NOTE(review): "bike_sharing" looks like a table name carried over from a
# different use case -- confirm the intended target table before running.
jdbc_ready.write.mode("append").jdbc(prop["url"], "bike_sharing", properties=prop)