In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
spark2 = SparkSession.builder.appName('ml').getOrCreate()
#Create a Spark Session
SpSession = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("ml") \
    .config("spark.executor.memory", "0.1g") \
    .config("spark.cores.max","2") \
    .config("spark.sql.warehouse.dir", "/home/sushant/Projects/Spark_Project/temp")\
    .getOrCreate()
    
#Get the Spark Context from Spark Session    
SpContext = SpSession.sparkContext

In [2]:
"""--------------------------------------------------------------------------
Load Data
-------------------------------------------------------------------------"""
#Load the CSV file into a RDD
smsData = SpContext.textFile("SMSSpamCollection.csv",2)
smsData.cache()
smsData.collect()

"""--------------------------------------------------------------------------
Prepare data for ML
-------------------------------------------------------------------------"""

def TransformToVector(inputStr):
    attList=inputStr.split(",")
    smsType= 0.0 if attList[0] == "ham" else 1.0
    return [smsType, attList[1]]

smsXformed=smsData.map(TransformToVector)

smsDf= SpSession.createDataFrame(smsXformed,
                          ["label","message"])
smsDf.cache()
smsDf.select("label","message").show()


+-----+--------------------+
|label|             message|
+-----+--------------------+
|  0.0|Go until jurong p...|
|  0.0|Ok lar... Joking ...|
|  0.0|U dun say so earl...|
|  0.0|Nah I don't think...|
|  0.0|Even my brother i...|
|  0.0|As per your reque...|
|  0.0|I'm gonna be home...|
|  0.0|I've been searchi...|
|  0.0|I HAVE A DATE ON ...|
|  0.0|Oh k...i'm watchi...|
|  0.0|Eh u remember how...|
|  0.0|Fine if thats th...|
|  0.0|Is that seriously...|
|  0.0|I‘m going to try ...|
|  0.0|So ü pay first la...|
|  0.0|Aft i finish my l...|
|  0.0|Ffffffffff. Alrig...|
|  0.0|Just forced mysel...|
|  0.0|Lol your always s...|
|  0.0|Did you catch the...|
+-----+--------------------+
only showing top 20 rows



In [4]:
"""--------------------------------------------------------------------------
Perform Machine Learning
-------------------------------------------------------------------------"""
#Split training and testing
(trainingData, testData) = smsDf.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()
testData.collect()

#Setup pipeline
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.feature import IDF
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#Split into words and then build TF-IDF
tokenizer = Tokenizer(inputCol="message", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), \
        outputCol="tempfeatures")
idf=IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
nbClassifier=NaiveBayes()

pipeline = Pipeline(stages=[tokenizer, hashingTF, \
                idf, nbClassifier])

#Build a model with a pipeline
nbModel=pipeline.fit(trainingData)
#Predict on test data
prediction=nbModel.transform(testData)

#Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="label",metricName="accuracy")
evaluator.evaluate(prediction)

#Draw confusion matrics
prediction.groupBy("label","prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|   51|
|  0.0|       1.0|    4|
|  1.0|       0.0|    1|
|  0.0|       0.0|   50|
+-----+----------+-----+

