In [1]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
import logging
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml import Pipeline

In [2]:
conf=SparkConf().setAppName('spam')
sc=SparkContext(conf=conf)
sql=SQLContext(sc)

In [3]:
logger=logging.getLogger(__name__)
logger.setLevel(logging.INFO)
formatter=logging.Formatter('%(asctime)s:%(created)f:%(filename)s:%(message)s:%(message)s')
file_handler=logging.FileHandler('spam.log')
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)

In [4]:
fi=sql.read.options(header=False,inferschema=True,sep="\t").csv('hdfs://nameservice1/user/edureka_396003/SMSSpamCollection')

In [5]:
fit=fi.withColumnRenamed('_c0','status').withColumnRenamed('_c1','message')

In [6]:
fit.registerTempTable('fita')

In [8]:
fita = sql.sql('select case when status = "ham" then 1.0  else 0 end as label, message from fita')
fita.show(5, truncate = True)

+-----+--------------------+
|label|             message|
+-----+--------------------+
|  1.0|Go until jurong p...|
|  1.0|Ok lar... Joking ...|
|  0.0|Free entry in 2 a...|
|  1.0|U dun say so earl...|
|  1.0|Nah I don't think...|
+-----+--------------------+
only showing top 5 rows



In [17]:
token=Tokenizer(inputCol="message",outputCol="Words")
w=token.transform(fita)
w.show(10)

+-----+--------------------+--------------------+
|label|             message|               Words|
+-----+--------------------+--------------------+
|  1.0|Go until jurong p...|[go, until, juron...|
|  1.0|Ok lar... Joking ...|[ok, lar..., joki...|
|  0.0|Free entry in 2 a...|[free, entry, in,...|
|  1.0|U dun say so earl...|[u, dun, say, so,...|
|  1.0|Nah I don't think...|[nah, i, don't, t...|
|  0.0|FreeMsg Hey there...|[freemsg, hey, th...|
|  1.0|Even my brother i...|[even, my, brothe...|
|  1.0|As per your reque...|[as, per, your, r...|
|  0.0|WINNER!! As a val...|[winner!!, as, a,...|
|  0.0|Had your mobile 1...|[had, your, mobil...|
+-----+--------------------+--------------------+
only showing top 10 rows



In [20]:
from pyspark.ml.feature import StopWordsRemover
remover=StopWordsRemover().setInputCol('Words').setOutputCol('filtered')
cleaned=remover.transform(w)
cleaned.show()

+-----+--------------------+--------------------+--------------------+
|label|             message|               Words|            filtered|
+-----+--------------------+--------------------+--------------------+
|  1.0|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|
|  1.0|Ok lar... Joking ...|[ok, lar..., joki...|[ok, lar..., joki...|
|  0.0|Free entry in 2 a...|[free, entry, in,...|[free, entry, 2, ...|
|  1.0|U dun say so earl...|[u, dun, say, so,...|[u, dun, say, ear...|
|  1.0|Nah I don't think...|[nah, i, don't, t...|[nah, don't, thin...|
|  0.0|FreeMsg Hey there...|[freemsg, hey, th...|[freemsg, hey, da...|
|  1.0|Even my brother i...|[even, my, brothe...|[even, brother, l...|
|  1.0|As per your reque...|[as, per, your, r...|[per, request, 'm...|
|  0.0|WINNER!! As a val...|[winner!!, as, a,...|[winner!!, valued...|
|  0.0|Had your mobile 1...|[had, your, mobil...|[mobile, 11, mont...|
|  1.0|I'm gonna be home...|[i'm, gonna, be, ...|[i'm, gonna, home...|
|  0.0

In [21]:
stopwords=StopWordsRemover().getStopWords() + ['-','.']
remover=StopWordsRemover().setStopWords(stopwords).setInputCol('Words').setOutputCol('filtered_word')
cleaned_custom=remover.transform(cleaned)
cleaned_custom.show()

+-----+--------------------+--------------------+--------------------+--------------------+
|label|             message|               Words|            filtered|       filtered_word|
+-----+--------------------+--------------------+--------------------+--------------------+
|  1.0|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|[go, jurong, poin...|
|  1.0|Ok lar... Joking ...|[ok, lar..., joki...|[ok, lar..., joki...|[ok, lar..., joki...|
|  0.0|Free entry in 2 a...|[free, entry, in,...|[free, entry, 2, ...|[free, entry, 2, ...|
|  1.0|U dun say so earl...|[u, dun, say, so,...|[u, dun, say, ear...|[u, dun, say, ear...|
|  1.0|Nah I don't think...|[nah, i, don't, t...|[nah, don't, thin...|[nah, don't, thin...|
|  0.0|FreeMsg Hey there...|[freemsg, hey, th...|[freemsg, hey, da...|[freemsg, hey, da...|
|  1.0|Even my brother i...|[even, my, brothe...|[even, brother, l...|[even, brother, l...|
|  1.0|As per your reque...|[as, per, your, r...|[per, request, 'm...|[per, requ

In [11]:
count=CountVectorizer(inputCol='Words',outputCol='features').fit(w).transform(w)
count.show(4)

+-----+--------------------+--------------------+--------------------+
|label|             message|               Words|            features|
+-----+--------------------+--------------------+--------------------+
|  1.0|Go until jurong p...|[go, until, juron...|(13587,[8,42,52,6...|
|  1.0|Ok lar... Joking ...|[ok, lar..., joki...|(13587,[5,75,411,...|
|  0.0|Free entry in 2 a...|[free, entry, in,...|(13587,[0,3,8,20,...|
|  1.0|U dun say so earl...|[u, dun, say, so,...|(13587,[5,22,60,1...|
+-----+--------------------+--------------------+--------------------+
only showing top 4 rows



In [12]:
idf=IDF(inputCol="features",outputCol="idf_features").fit(count).transform(count)

In [13]:
seed=0
test,train=idf.randomSplit([0.8,0.2],seed)

In [15]:
train.show(10,truncate=True)

+-----+--------------------+--------------------+--------------------+--------------------+
|label|             message|               Words|            features|        idf_features|
+-----+--------------------+--------------------+--------------------+--------------------+
|  0.0|(Bank of Granite ...|[(bank, of, grani...|(13587,[3,7,10,12...|(13587,[3,7,10,12...|
|  0.0|+123 Congratulati...|[+123, congratula...|(13587,[0,4,5,8,1...|(13587,[0,4,5,8,1...|
|  0.0|+449071512431 URG...|[+449071512431, u...|(13587,[0,4,7,14,...|(13587,[0,4,7,14,...|
|  0.0|3. You have recei...|[3., you, have, r...|(13587,[2,11,14,9...|(13587,[2,11,14,9...|
|  0.0|44 7732584351, Do...|[44, 7732584351,,...|(13587,[0,2,3,15,...|(13587,[0,2,3,15,...|
|  0.0|4mths half price ...|[4mths, half, pri...|(13587,[0,11,15,1...|(13587,[0,11,15,1...|
|  0.0|4mths half price ...|[4mths, half, pri...|(13587,[0,11,15,1...|(13587,[0,11,15,1...|
|  0.0|500 free text msg...|[500, free, text,...|(13587,[0,6,11,35...|(13587,[0,

# Logistic regression

In [12]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [24]:
lr=LogisticRegression(labelCol="label",featuresCol="idf_features")
model=lr.fit(train)
predict=model.transform(test)

In [27]:
eval=MulticlassClassificationEvaluator(labelCol="label",predictionCol="prediction",metricName="accuracy").evaluate(predict)

In [28]:
print("Accuracy of Logistic Regression is %g" %eval)

0.949438202247


In [30]:
predict.select('prediction','label','idf_features').show(40)

+----------+-----+--------------------+
|prediction|label|        idf_features|
+----------+-----+--------------------+
|       1.0|  0.0|(13587,[0,4,11,12...|
|       0.0|  0.0|(13587,[4,10,20,5...|
|       0.0|  0.0|(13587,[0,4,5,8,1...|
|       0.0|  0.0|(13587,[0,2,7,24,...|
|       0.0|  0.0|(13587,[0,2,3,10,...|
|       1.0|  0.0|(13587,[353,387,8...|
|       0.0|  0.0|(13587,[0,3,4,7,1...|
|       1.0|  0.0|(13587,[0,3,10,16...|
|       0.0|  0.0|(13587,[224,665,7...|
|       0.0|  0.0|(13587,[0,6,24,28...|
|       1.0|  0.0|(13587,[0,5,12,20...|
|       1.0|  0.0|(13587,[0,4,5,6,1...|
|       1.0|  0.0|(13587,[0,4,5,6,1...|
|       0.0|  0.0|(13587,[0,5,10,16...|
|       1.0|  0.0|(13587,[5831,5893...|
|       1.0|  0.0|(13587,[0,4,5,6,1...|
|       1.0|  0.0|(13587,[0,8,10,16...|
|       1.0|  0.0|(13587,[0,10,11,1...|
|       1.0|  0.0|(13587,[0,10,11,5...|
|       0.0|  0.0|(13587,[0,2,3,14,...|
|       0.0|  0.0|(13587,[0,11,15,1...|
|       0.0|  0.0|(13587,[0,3,15,34...|


# Decision Tree Classifier

In [13]:
from pyspark.ml.classification import DecisionTreeClassifier
lr=DecisionTreeClassifier(labelCol='label',featuresCol='idf_features')
model=lr.fit(train)
predict=model.transform(test)
eval=MulticlassClassificationEvaluator(labelCol='label',predictionCol='prediction',metricName='accuracy').evaluate(predict)

In [33]:
print("Accuracy of Decision Tree is %g" %eval)

0.938876404494


# RandomforestClassifier

In [17]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [19]:
rf=RandomForestClassifier(labelCol='label',featuresCol='idf_features')
model=rf.fit(train)
predict=model.transform(test)
eval=MulticlassClassificationEvaluator(labelCol='label',predictionCol='prediction',metricName='accuracy').evaluate(predict)


In [21]:
print("Accuracy of Random Forest is %g" %eval)

Accuracy of Random Forest is 0.864719


In [22]:
from pyspark.ml.feature import NGram
cleaned_custom.show(2)

+-----+--------------------+--------------------+--------------------+--------------------+
|label|             message|               Words|            filtered|       filtered_word|
+-----+--------------------+--------------------+--------------------+--------------------+
|  1.0|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|[go, jurong, poin...|
|  1.0|Ok lar... Joking ...|[ok, lar..., joki...|[ok, lar..., joki...|[ok, lar..., joki...|
+-----+--------------------+--------------------+--------------------+--------------------+
only showing top 2 rows



In [23]:
ngram = NGram(n=2, inputCol="filtered_word", outputCol="ngrams")
ngramDataFrame = ngram.transform(cleaned_custom)
ngramDataFrame.show(2)

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|label|             message|               Words|            filtered|       filtered_word|              ngrams|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|  1.0|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|[go, jurong, poin...|[go jurong, juron...|
|  1.0|Ok lar... Joking ...|[ok, lar..., joki...|[ok, lar..., joki...|[ok, lar..., joki...|[ok lar..., lar.....|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 2 rows

