## NLP

For this code along we will build a spam filter! We'll use the various NLP tools we learned about as well as a new classifier, Naive Bayes.

We'll use a classic dataset for this - UCI Repository SMS Spam Detection:[https://archive.ics.uci.edu/dataset/228/sms+spam+collection]

In [0]:
from pyspark.sql import SparkSession

In [0]:
spark = SparkSession.builder.appName('spam filter').getOrCreate()

In [0]:
data = spark.read.csv('dbfs:/FileStore/SMSSpamCollection.txt', inferSchema=True, sep='\\t')

In [0]:
data.show(5)

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
+----+--------------------+
only showing top 5 rows



In [0]:
# renamed columns
data = data.withColumnRenamed('_c0', 'class').withColumnRenamed('_c1', 'text')
data.show()

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
| spam|FreeMsg Hey there...|
|  ham|Even my brother i...|
|  ham|As per your reque...|
| spam|WINNER!! As a val...|
| spam|Had your mobile 1...|
|  ham|I'm gonna be home...|
| spam|SIX chances to wi...|
| spam|URGENT! You have ...|
|  ham|I've been searchi...|
|  ham|I HAVE A DATE ON ...|
| spam|XXXMobileMovieClu...|
|  ham|Oh k...i'm watchi...|
|  ham|Eh u remember how...|
|  ham|Fine if thats th...|
| spam|England v Macedon...|
+-----+--------------------+
only showing top 20 rows



## Clean and Prepare data

In [0]:
from pyspark.sql.functions import length

In [0]:
data = data.withColumn('length', length(data['text']))
data.show()

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
|  ham|Nah I don't think...|    61|
| spam|FreeMsg Hey there...|   147|
|  ham|Even my brother i...|    77|
|  ham|As per your reque...|   160|
| spam|WINNER!! As a val...|   157|
| spam|Had your mobile 1...|   154|
|  ham|I'm gonna be home...|   109|
| spam|SIX chances to wi...|   136|
| spam|URGENT! You have ...|   155|
|  ham|I've been searchi...|   196|
|  ham|I HAVE A DATE ON ...|    35|
| spam|XXXMobileMovieClu...|   149|
|  ham|Oh k...i'm watchi...|    26|
|  ham|Eh u remember how...|    81|
|  ham|Fine if thats th...|    56|
| spam|England v Macedon...|   155|
+-----+--------------------+------+
only showing top 20 rows



In [0]:
data.groupBy('class').mean().show()

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham| 71.4545266210897|
| spam|138.6706827309237|
+-----+-----------------+



## Feature Transformations

In [0]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer

In [0]:
tokenizer = Tokenizer(inputCol='text', outputCol='token_text')
stopword = StopWordsRemover(inputCol='token_text', outputCol='stop_text')
count_vec = CountVectorizer(inputCol='stop_text', outputCol='c_vec')
idf = IDF(inputCol='c_vec', outputCol='tf_idf')
hand_spam_count = StringIndexer(inputCol='class', outputCol='label')

In [0]:
from pyspark.ml.feature import VectorAssembler

In [0]:
clean_up = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features')

## The Model
we'll use Naive Bayes

In [0]:
from pyspark.ml.classification import NaiveBayes

In [0]:
nb = NaiveBayes()

## Pipeline

In [0]:
from pyspark.ml import Pipeline

In [0]:
pipe = Pipeline(stages=[hand_spam_count, tokenizer, stopword, count_vec, idf, clean_up])

In [0]:
cleaner = pipe.fit(data)

In [0]:
clean_data = cleaner.transform(data)

In [0]:
clean_data.limit(5).display()

class,text,length,label,token_text,stop_text,c_vec,tf_idf,features
ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",111,0.0,"List(go, until, jurong, point,, crazy.., available, only, in, bugis, n, great, world, la, e, buffet..., cine, there, got, amore, wat...)","List(go, jurong, point,, crazy.., available, bugis, n, great, world, la, e, buffet..., cine, got, amore, wat...)","Map(vectorType -> sparse, length -> 13423, indices -> List(7, 11, 31, 61, 71, 334, 627, 709, 1408, 1692, 4099, 6774, 7212, 8464, 11103, 12554), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 13423, indices -> List(7, 11, 31, 61, 71, 334, 627, 709, 1408, 1692, 4099, 6774, 7212, 8464, 11103, 12554), values -> List(3.1126188501633374, 3.2055125970560336, 3.822026551595063, 4.207206988531722, 4.32198250312415, 5.407171771460119, 5.917997395226109, 6.141140946540319, 6.680137447273006, 6.8342881271002645, 7.52743530766021, 7.9329004157683745, 7.9329004157683745, 7.9329004157683745, 7.9329004157683745, 7.9329004157683745))","Map(vectorType -> sparse, length -> 13424, indices -> List(7, 11, 31, 61, 71, 334, 627, 709, 1408, 1692, 4099, 6774, 7212, 8464, 11103, 12554, 13423), values -> List(3.1126188501633374, 3.2055125970560336, 3.822026551595063, 4.207206988531722, 4.32198250312415, 5.407171771460119, 5.917997395226109, 6.141140946540319, 6.680137447273006, 6.8342881271002645, 7.52743530766021, 7.9329004157683745, 7.9329004157683745, 7.9329004157683745, 7.9329004157683745, 7.9329004157683745, 111.0))"
ham,Ok lar... Joking wif u oni...,29,0.0,"List(ok, lar..., joking, wif, u, oni...)","List(ok, lar..., joking, wif, u, oni...)","Map(vectorType -> sparse, length -> 13423, indices -> List(0, 24, 301, 461, 2580, 4136), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 13423, indices -> List(0, 24, 301, 461, 2580, 4136), values -> List(2.016698353160939, 3.5761915890787823, 5.330210730323991, 5.7356758384321544, 7.239753235208429, 7.52743530766021))","Map(vectorType -> sparse, length -> 13424, indices -> List(0, 24, 301, 461, 2580, 4136, 13423), values -> List(2.016698353160939, 3.5761915890787823, 5.330210730323991, 5.7356758384321544, 7.239753235208429, 7.52743530766021, 29.0))"
spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,155,1.0,"List(free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005., text, fa, to, 87121, to, receive, entry, question(std, txt, rate)t&c's, apply, 08452810075over18's)","List(free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005., text, fa, 87121, receive, entry, question(std, txt, rate)t&c's, apply, 08452810075over18's)","Map(vectorType -> sparse, length -> 13423, indices -> List(2, 13, 19, 30, 89, 154, 197, 307, 462, 474, 633, 860, 956, 1999, 2015, 2361, 3015, 3279, 3616, 4917, 5200), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 13423, indices -> List(2, 13, 19, 30, 89, 154, 197, 307, 462, 474, 633, 860, 956, 1999, 2015, 2361, 3015, 3279, 3616, 4917, 5200), values -> List(2.704469176684504, 3.332742771603827, 3.5634525633013525, 3.6702205387270586, 4.421354976937353, 4.841857962410058, 5.099687071712158, 11.070010285940008, 5.681608617161879, 5.7356758384321544, 5.917997395226109, 6.228152323529949, 6.323462503334274, 7.016609683894219, 7.016609683894219, 15.05487061532042, 7.239753235208429, 7.239753235208429, 7.52743530766021, 7.52743530766021, 7.52743530766021))","Map(vectorType -> sparse, length -> 13424, indices -> List(2, 13, 19, 30, 89, 154, 197, 307, 462, 474, 633, 860, 956, 1999, 2015, 2361, 3015, 3279, 3616, 4917, 5200, 13423), values -> List(2.704469176684504, 3.332742771603827, 3.5634525633013525, 3.6702205387270586, 4.421354976937353, 4.841857962410058, 5.099687071712158, 11.070010285940008, 5.681608617161879, 5.7356758384321544, 5.917997395226109, 6.228152323529949, 6.323462503334274, 7.016609683894219, 7.016609683894219, 15.05487061532042, 7.239753235208429, 7.239753235208429, 7.52743530766021, 7.52743530766021, 7.52743530766021, 155.0))"
ham,U dun say so early hor... U c already then say...,49,0.0,"List(u, dun, say, so, early, hor..., u, c, already, then, say...)","List(u, dun, say, early, hor..., u, c, already, say...)","Map(vectorType -> sparse, length -> 13423, indices -> List(0, 70, 80, 128, 147, 328, 2900, 4111), values -> List(2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 13423, indices -> List(0, 70, 80, 128, 147, 328, 2900, 4111), values -> List(4.033396706321878, 4.256599743861298, 4.32198250312415, 4.674803877746892, 4.734227298217693, 5.407171771460119, 7.239753235208429, 7.52743530766021))","Map(vectorType -> sparse, length -> 13424, indices -> List(0, 70, 80, 128, 147, 328, 2900, 4111, 13423), values -> List(4.033396706321878, 4.256599743861298, 4.32198250312415, 4.674803877746892, 4.734227298217693, 5.407171771460119, 7.239753235208429, 7.52743530766021, 49.0))"
ham,"Nah I don't think he goes to usf, he lives around here though",61,0.0,"List(nah, i, don't, think, he, goes, to, usf,, he, lives, around, here, though)","List(nah, think, goes, usf,, lives, around, though)","Map(vectorType -> sparse, length -> 13423, indices -> List(36, 134, 315, 550, 1328, 3615, 4091), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 13423, indices -> List(36, 134, 315, 550, 1328, 3615, 4091), values -> List(3.7977338590260183, 4.7140245909001735, 5.367951058306837, 5.792834252272104, 6.546606054648484, 7.52743530766021, 7.52743530766021))","Map(vectorType -> sparse, length -> 13424, indices -> List(36, 134, 315, 550, 1328, 3615, 4091, 13423), values -> List(3.7977338590260183, 4.7140245909001735, 5.367951058306837, 5.792834252272104, 6.546606054648484, 7.52743530766021, 7.52743530766021, 61.0))"


## Training and Evaluation

In [0]:
clean_data = clean_data.select(['label','features'])

In [0]:
clean_data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(13424,[7,11,31,6...|
|  0.0|(13424,[0,24,301,...|
|  1.0|(13424,[2,13,19,3...|
|  0.0|(13424,[0,70,80,1...|
|  0.0|(13424,[36,134,31...|
|  1.0|(13424,[10,60,140...|
|  0.0|(13424,[10,53,102...|
|  0.0|(13424,[127,185,4...|
|  1.0|(13424,[1,47,121,...|
|  1.0|(13424,[0,1,13,27...|
|  0.0|(13424,[18,43,117...|
|  1.0|(13424,[8,16,37,8...|
|  1.0|(13424,[13,30,47,...|
|  0.0|(13424,[39,95,221...|
|  0.0|(13424,[555,1797,...|
|  1.0|(13424,[30,109,11...|
|  0.0|(13424,[82,214,44...|
|  0.0|(13424,[0,2,49,13...|
|  0.0|(13424,[0,74,105,...|
|  1.0|(13424,[4,30,33,5...|
+-----+--------------------+
only showing top 20 rows



In [0]:
# split data train test

train, test = clean_data.randomSplit([0.7, 0.3])

In [0]:
spam_predictor = nb.fit(train)

In [0]:
data.printSchema()

root
 |-- class: string (nullable = true)
 |-- text: string (nullable = true)
 |-- length: integer (nullable = true)



In [0]:
# prediction
test_result = spam_predictor.transform(test)

In [0]:
test_result.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13424,[0,1,5,20,...|[-798.75772628022...|[1.0,2.0848726779...|       0.0|
|  0.0|(13424,[0,1,7,15,...|[-679.96411268062...|[1.0,3.8134798377...|       0.0|
|  0.0|(13424,[0,1,14,79...|[-692.64914096939...|[1.0,1.9122545879...|       0.0|
|  0.0|(13424,[0,1,15,20...|[-674.57212009205...|[1.0,9.5677752860...|       0.0|
|  0.0|(13424,[0,1,18,20...|[-834.01738369970...|[1.0,4.0939551167...|       0.0|
|  0.0|(13424,[0,1,27,35...|[-1474.6167901680...|[0.99999999999996...|       0.0|
|  0.0|(13424,[0,1,150,1...|[-250.22119936598...|[0.99622470512885...|       0.0|
|  0.0|(13424,[0,1,500,5...|[-320.47080158600...|[0.99999999999715...|       0.0|
|  0.0|(13424,[0,2,3,4,6...|[-1277.3025291496...|[1.0,9.5509141395...|       0.0|
|  0.0|(13424,[0

In [0]:
## Evaluation 

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_result)
print('Acc : '+ str(acc))

Acc : 0.9220797233141105


## Good Job..!!