In [2]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
import logging
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml import Pipeline

In [3]:
conf=SparkConf().setAppName('spam')
sc=SparkContext(conf=conf)
sql=SQLContext(sc)

In [3]:
logger=logging.getLogger(__name__)
logger.setLevel(logging.INFO)
formatter=logging.Formatter('%(asctime)s:%(created)f:%(filename)s:%(message)s:%(message)s')
file_handler=logging.FileHandler('spam.log')
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)

In [4]:
fi=sql.read.options(header=False,inferschema=True,sep="\t").csv('hdfs://nameservice1/user/edureka_37986/SMSSpamCollection')

In [5]:
fit=fi.withColumnRenamed('_c0','status').withColumnRenamed('_c1','message')

In [6]:
fit.registerTempTable('fita')

In [7]:
fita = sql.sql('select case when status = "ham" then 1.0  else 0 end as label, message from fita')
#fita.show(5, truncate = True)

In [8]:
token=Tokenizer(inputCol="message",outputCol="Words")
w=token.transform(fita)
#w.show(10)

In [9]:
count=CountVectorizer(inputCol='Words',outputCol='features').fit(w).transform(w)
#count.show(4)

In [9]:
idf=IDF(inputCol="features",outputCol="idf_features").fit(count).transform(count)

In [10]:
seed=0
test,train=idf.randomSplit([0.8,0.2],seed)

In [1]:
train.show(10,truncate=False)

NameError: name 'train' is not defined

# Logistic regression

In [12]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [24]:
lr=LogisticRegression(labelCol="label",featuresCol="idf_features")
model=lr.fit(train)
predict=model.transform(test)

In [27]:
eval=MulticlassClassificationEvaluator(labelCol="label",predictionCol="prediction",metricName="accuracy").evaluate(predict)

In [28]:
print("Accuracy of Logistic Regression is %g" %eval)

0.949438202247


In [30]:
predict.select('prediction','label','idf_features').show(40)

+----------+-----+--------------------+
|prediction|label|        idf_features|
+----------+-----+--------------------+
|       1.0|  0.0|(13587,[0,4,11,12...|
|       0.0|  0.0|(13587,[4,10,20,5...|
|       0.0|  0.0|(13587,[0,4,5,8,1...|
|       0.0|  0.0|(13587,[0,2,7,24,...|
|       0.0|  0.0|(13587,[0,2,3,10,...|
|       1.0|  0.0|(13587,[353,387,8...|
|       0.0|  0.0|(13587,[0,3,4,7,1...|
|       1.0|  0.0|(13587,[0,3,10,16...|
|       0.0|  0.0|(13587,[224,665,7...|
|       0.0|  0.0|(13587,[0,6,24,28...|
|       1.0|  0.0|(13587,[0,5,12,20...|
|       1.0|  0.0|(13587,[0,4,5,6,1...|
|       1.0|  0.0|(13587,[0,4,5,6,1...|
|       0.0|  0.0|(13587,[0,5,10,16...|
|       1.0|  0.0|(13587,[5831,5893...|
|       1.0|  0.0|(13587,[0,4,5,6,1...|
|       1.0|  0.0|(13587,[0,8,10,16...|
|       1.0|  0.0|(13587,[0,10,11,1...|
|       1.0|  0.0|(13587,[0,10,11,5...|
|       0.0|  0.0|(13587,[0,2,3,14,...|
|       0.0|  0.0|(13587,[0,11,15,1...|
|       0.0|  0.0|(13587,[0,3,15,34...|


# Decision Tree Classifier

In [13]:
from pyspark.ml.classification import DecisionTreeClassifier
lr=DecisionTreeClassifier(labelCol='label',featuresCol='idf_features')
model=lr.fit(train)
predict=model.transform(test)
eval=MulticlassClassificationEvaluator(labelCol='label',predictionCol='prediction',metricName='accuracy').evaluate(predict)

In [33]:
print("Accuracy of Decision Tree is %g" %eval)

0.938876404494


# RandomforestClassifier

In [17]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [19]:
rf=RandomForestClassifier(labelCol='label',featuresCol='idf_features')
model=rf.fit(train)
predict=model.transform(test)
eval=MulticlassClassificationEvaluator(labelCol='label',predictionCol='prediction',metricName='accuracy').evaluate(predict)


In [21]:
print("Accuracy of Random Forest is %g" %eval)

Accuracy of Random Forest is 0.864719
