In [2]:
import pyspark.sql.functions as F

In [3]:
#Spark session is being created

spark = SparkSession.builder.appName("SpamDetection Notebook").getOrCreate()

In [4]:
#Reading the file from the hdfs wherein the file has the data not comma separated but the tab separated
raw = spark.read.option("delimiter","\t").csv("/user/edureka_960126/SMSSpamCollection").toDF("spam","message")
raw.show(2)

+----+--------------------+
|spam|             message|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
+----+--------------------+
only showing top 2 rows



In [5]:
# Extract word
#Tokenization is the process of taking text (such as a sentence) and breaking it into individual terms (usually words). 
#A simple Tokenizer class provides this functionality.
#Note that all resulting words are the small letter words

from pyspark.ml.feature import Tokenizer

#Here the tokeniser take the input column and the output column
tokenizer = Tokenizer().setInputCol("message").setOutputCol("words")

#Transform the Data so that new column is created over the raw dataframe
transformed = tokenizer.transform(raw)
transformed.show(1)

+----+--------------------+--------------------+
|spam|             message|               words|
+----+--------------------+--------------------+
| ham|Go until jurong p...|[go, until, juron...|
+----+--------------------+--------------------+
only showing top 1 row



In [6]:
# Remove stopwords
#Stop words are words which should be excluded from the input, typically because the words appear frequently 
#and don’t carry as much meaning

from pyspark.ml.feature import StopWordsRemover
remover = StopWordsRemover().setInputCol("words").setOutputCol("filtered")
cleaned = remover.transform(transformed)
cleaned.show(1)

+----+--------------------+--------------------+--------------------+
|spam|             message|               words|            filtered|
+----+--------------------+--------------------+--------------------+
| ham|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|
+----+--------------------+--------------------+--------------------+
only showing top 1 row



In [7]:
# custom stopwords
#so here we are in custom manner removing the stop words whrein we have added the - as a stop word
stopwords = StopWordsRemover().getStopWords() + ["-"]
remover = StopWordsRemover().setStopWords(stopwords).setInputCol("words").setOutputCol("filtered")
cleaned = remover.transform(transformed)
cleaned.show(1)

+----+--------------------+--------------------+--------------------+
|spam|             message|               words|            filtered|
+----+--------------------+--------------------+--------------------+
| ham|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|
+----+--------------------+--------------------+--------------------+
only showing top 1 row



In [8]:
# Generate features
#CountVectorizer and CountVectorizerModel aim to help convert a collection of text documents to vectors of token counts.
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol("features").fit(cleaned)
featured = cvmodel.transform(cleaned)
print(featured.toPandas())

      spam                                            message  \
0      ham  Go until jurong point, crazy.. Available only ...   
1      ham                      Ok lar... Joking wif u oni...   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...   
3      ham  U dun say so early hor... U c already then say...   
4      ham  Nah I don't think he goes to usf, he lives aro...   
5     spam  FreeMsg Hey there darling it's been 3 week's n...   
6      ham  Even my brother is not like to speak with me. ...   
7      ham  As per your request 'Melle Melle (Oru Minnamin...   
8     spam  WINNER!! As a valued network customer you have...   
9     spam  Had your mobile 11 months or more? U R entitle...   
10     ham  I'm gonna be home soon and i don't want to tal...   
11    spam  SIX chances to win CASH! From 100 to 20,000 po...   
12    spam  URGENT! You have won a 1 week FREE membership ...   
13     ham  I've been searching for the right words to tha...   
14     ham               

In [9]:
# convert to binary label
#Here we are just doing the indexing that is actually converting the spam column into 0 and 1 which are labels for ham and spam
from pyspark.ml.feature import OneHotEncoder, StringIndexer
indexer = StringIndexer().setInputCol("spam").setOutputCol("label").fit(featured)
indexed = indexer.transform(featured)

In [10]:
# Split to train and test
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
training, test = indexed.randomSplit([0.7, 0.3], seed = 12345)
training.show(1)

+----+--------------------+--------------------+--------------------+--------------------+-----+
|spam|             message|               words|            filtered|            features|label|
+----+--------------------+--------------------+--------------------+--------------------+-----+
| ham| &lt;#&gt;  in mc...|[, &lt;#&gt;, , i...|[, &lt;#&gt;, , m...|(13457,[3,7,5193,...|  0.0|
+----+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 1 row



In [11]:
# Logistic regression
#so here Logistic Regression is used for predicting whether it is a spam or not
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
lrModel = lr.fit(training)
predictions = lrModel.transform(test)
predictions.select("features", "label", "prediction").show(2)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("prediction").setMetricName("areaUnderROC")
accuracy = evaluator.evaluate(predictions)
print ("Accuracy", accuracy)

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|(13457,[3,12,168,...|  0.0|       0.0|
|(13457,[3,13,80,8...|  0.0|       0.0|
+--------------------+-----+----------+
only showing top 2 rows

('Accuracy', 0.5)


In [12]:
# Random Forest
#so here Random Forest Regression is used for predicting whether it is a spam or not
from pyspark.ml.classification import RandomForestClassificationModel, RandomForestClassifier
rf = RandomForestClassifier().setLabelCol("label").setFeaturesCol("features").setNumTrees(10)
model = rf.fit(training)
predictions = model.transform(test)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("prediction").setMetricName("areaUnderROC")
accuracy = evaluator.evaluate(predictions)
print ("Accuracy", accuracy)

('Accuracy', 0.5046082949308756)


In [13]:
from pyspark.ml.feature import NGram
ngram = NGram().setN(2).setInputCol("filtered").setOutputCol("ngrams")
ngramDataFrame = ngram.transform(cleaned)
ngramDataFrame.select("ngrams").show(2, False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|ngrams                                                                                                                                                                                       |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[go jurong, jurong point,, point, crazy.., crazy.. available, available bugis, bugis n, n great, great world, world la, la e, e buffet..., buffet... cine, cine got, got amore, amore wat...]|
|[ok lar..., lar... joking, joking wif, wif u, u oni...]                                                                                                                                      |
+---------------------------------------

In [14]:
#We have to use the ngram colums instead of filtered to check whether the accuracy is improved or not
#Feature detection using the ngrams column
cv_ngram_model = CountVectorizer().setInputCol("ngrams").setOutputCol("features").fit(ngramDataFrame)
featured = cv_ngram_model.transform(ngramDataFrame)

#Indexing of the strings in the column
indexer = StringIndexer().setInputCol("spam").setOutputCol("label").fit(featured)
indexed = indexer.transform(featured)

#Splitting of the Data between the training and the test Data
training, test = indexed.randomSplit([0.7, 0.3], seed = 12345)
training.show(1)

# Logistic regression
#so here Logistic Regression is used for predicting whether it is a spam or not
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
lrModel = lr.fit(training)
predictions = lrModel.transform(test)
predictions.select("features", "label", "prediction").show(2)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("prediction").setMetricName("areaUnderROC")
accuracy = evaluator.evaluate(predictions)
print ("Accuracy", accuracy)





+----+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|spam|             message|               words|            filtered|              ngrams|            features|label|
+----+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
| ham| &lt;#&gt;  in mc...|[, &lt;#&gt;, , i...|[, &lt;#&gt;, , m...|[ &lt;#&gt;, &lt;...|(37490,[0,1,11013...|  0.0|
+----+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 1 row

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|(37490,[6607,7274...|  0.0|       0.0|
|(37490,[10,6515,6...|  0.0|       0.0|
+--------------------+-----+----------+
only showing top 2 rows

('Accuracy', 0.5)


In [15]:
#Tri-gram algorithm working

from pyspark.ml.feature import NGram
ngram = NGram().setN(3).setInputCol("filtered").setOutputCol("ngrams")
ngramDataFrame = ngram.transform(cleaned)
ngramDataFrame.select("ngrams").show(2, False)

#We have to use the ngram colums instead of filtered to check whether the accuracy is improved or not
#Feature detection using the ngrams column
cv_ngram_model = CountVectorizer().setInputCol("ngrams").setOutputCol("features").fit(ngramDataFrame)
featured = cv_ngram_model.transform(ngramDataFrame)

#Indexing of the strings in the column
indexer = StringIndexer().setInputCol("spam").setOutputCol("label").fit(featured)
indexed = indexer.transform(featured)

#Splitting of the Data between the training and the test Data
training, test = indexed.randomSplit([0.7, 0.3], seed = 12345)
training.show(1)

# Logistic regression
#so here Logistic Regression is used for predicting whether it is a spam or not
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
lrModel = lr.fit(training)
predictions = lrModel.transform(test)
predictions.select("features", "label", "prediction").show(2)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("prediction").setMetricName("areaUnderROC")
accuracy = evaluator.evaluate(predictions)
print ("Accuracy", accuracy)


+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|ngrams                                                                                                                                                                                                                                                           |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[go jurong point,, jurong point, crazy.., point, crazy.. available, crazy.. available bugis, available bugis n, bugis n great, n great world, great world la, world la e, la e buffet..., e buffet... cine, buffet... cine 

In [18]:
from pyspark.ml import Pipeline, PipelineModel

tokenizer = Tokenizer().setInputCol("message").setOutputCol("words")

stopwords = StopWordsRemover().getStopWords()+ ["-"]
remover = StopWordsRemover().setStopWords(stopwords).setInputCol("words").setOutputCol("filtered")
cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol("features")
indexer = StringIndexer().setInputCol("spam").setOutputCol("label")
lr = LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
pipeline = Pipeline().setStages([tokenizer, remover, cvmodel, indexer, lr])
model = pipeline.fit(raw)
model.write().overwrite().save("/user/edureka_960126/spam_model")

In [19]:
pipeline = PipelineModel.load("/user/edureka_960126/spam_model")
pipeline

PipelineModel_4c8bb940fdc2767d6890

In [25]:
#Testing of the stored ML model on the HDFS

#predictions =pipeline.transform(raw)
#predictions.select("label","prediction").show(5)

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
+-----+----------+
only showing top 5 rows



In [None]:
#Pipeline Model has already been created will be used to work on the the streaming data sent to the flume
#streaming data will be retreived by the Spark Streaming to predict the result

from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import PipelineModel

from pyspark import SparkContext

from pyspark.streaming import StreamingContext

from pyspark.streaming.flume import FlumeUtils

import json

from pyspark.sql import Row


#Model stored in the hdfs to be used for the machine learning application
#is saved as variable pipeline


#Schema is being set for the dataframe creation out of the records
cSchema = StructType([StructField("spam", StringType(),nullable=True),StructField("message", StringType(),nullable=True)])




#Process for working out the prediction
def process(rdd):
    
        # Get the singleton instance of SparkSession
        #spark = getSparkSessionInstance(rdd.context.getConf())

        #Conversion to the Data Frame to work on the dataframe for working out the predictions
        df = spark.createDataFrame(rdd,schema=cSchema)

        #Now we work on the dataframe for producing the predictions
        cols = df.columns

        #Getting the output after passing the data frame through vector assembler to produce the dataframe with vectorised features
        df = pipeline.transform(df)
        
        df.select("label","prediction").show()
        

#starting of the Spark streaming Context
ssc= StreamingContext(spark.sparkContext, 15)

#Flume stream is generated after the spark streaming receiver is connected to custom spark streaming sink created at the host with a given port
flumeStream = FlumeUtils.createPollingStream(ssc, [('ip-20-0-41-164.ec2.internal' , 9090)])

#Flume_microbatches count
flumeStream.count().pprint()


#RDD[Strings]

#Here we get the strings of the json format input data 
lines = flumeStream.map(lambda x: x[1])

#RDD of Dicts or JSON objects by extracting the json objects from the string 
records_dict=lines.map(lambda x: json.loads(x))

#Rows RDD rows rdd is created here
rows_rdd=records_dict.map(lambda res: Row(res['spam'],res['message']))

rows_rdd.foreachRDD(process)

ssc.start()             # Start the computation
ssc.awaitTermination()  # Wait for the computation to terminate







-------------------------------------------
Time: 2020-07-21 04:14:15
-------------------------------------------

+-----+----------+
|label|prediction|
+-----+----------+
+-----+----------+

-------------------------------------------
Time: 2020-07-21 04:14:30
-------------------------------------------

+-----+----------+
|label|prediction|
+-----+----------+
+-----+----------+

-------------------------------------------
Time: 2020-07-21 04:14:45
-------------------------------------------

+-----+----------+
|label|prediction|
+-----+----------+
+-----+----------+

-------------------------------------------
Time: 2020-07-21 04:15:00
-------------------------------------------

+-----+----------+
|label|prediction|
+-----+----------+
+-----+----------+

-------------------------------------------
Time: 2020-07-21 04:15:15
-------------------------------------------

+-----+----------+
|label|prediction|
+-----+----------+
+-----+----------+

----------------------------------------

-------------------------------------------
Time: 2020-07-21 04:25:00
-------------------------------------------

+-----+----------+
|label|prediction|
+-----+----------+
+-----+----------+

-------------------------------------------
Time: 2020-07-21 04:25:15
-------------------------------------------

+-----+----------+
|label|prediction|
+-----+----------+
+-----+----------+

-------------------------------------------
Time: 2020-07-21 04:25:30
-------------------------------------------

+-----+----------+
|label|prediction|
+-----+----------+
+-----+----------+

-------------------------------------------
Time: 2020-07-21 04:25:45
-------------------------------------------

+-----+----------+
|label|prediction|
+-----+----------+
+-----+----------+

-------------------------------------------
Time: 2020-07-21 04:26:00
-------------------------------------------

+-----+----------+
|label|prediction|
+-----+----------+
+-----+----------+

----------------------------------------