In [1]:
#To make our NLP Spam Filter, we are going to:
#Setup Spark Session
#1. Import Data
#2. NLP: Stop Words, Tokenize, TF,IDF
#3. Transform 
#4. Model
#5. Evaluate


In [2]:
import findspark
findspark.init('/home/ubuntu/spark-2.2.0-bin-hadoop2.7')

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('nlpProj').getOrCreate()

ImportError: No module named findspark

# 1. Import Data

In [3]:
file = '/SMSSpamCollection'
folder = '/home/ubuntu/data/raw'
data = spark.read.csv(folder+file, inferSchema=True, sep='\t')
data.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)



In [4]:
data.show(5)

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
+----+--------------------+
only showing top 5 rows



In [5]:
#Rename Columns
data = data.withColumnRenamed('_c0','class').withColumnRenamed('_c1','text')
data.show(5)

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
+-----+--------------------+
only showing top 5 rows



# NLP Transforms

In [6]:
#Create new feature called 'length'
from pyspark.sql.functions import length
data = data.withColumn('length',length(data['text']))
data.show(5)

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
|  ham|Nah I don't think...|    61|
+-----+--------------------+------+
only showing top 5 rows



In [7]:
#Find average length of each
from pyspark.sql.functions import mean
data.groupBy('class').mean().show()

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+



In [8]:
#Tokenizer, Stop Words, Count Vectorizer, IDF, String Indexer
from pyspark.ml.feature import (Tokenizer, 
                                CountVectorizer, 
                                IDF, 
                                StringIndexer, 
                                StopWordsRemover)


In [9]:
#Initialize Models - Transform Pipline
tokenizer = Tokenizer(inputCol='text', outputCol='token_text')
sw_remover = StopWordsRemover(inputCol='token_text', outputCol='stop_token')
count_vectorizer = CountVectorizer(inputCol='stop_token', outputCol='c_vec')
idf = IDF(inputCol='c_vec', outputCol='tf_idf')

#Convert Class label to numbers
ham_spam_to_numeric = StringIndexer(inputCol='class', outputCol='label')

In [13]:
#Create Transforms
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features')


# Pipeline

In [20]:
from pyspark.ml import Pipeline
data_prep_pipe = Pipeline(stages=(ham_spam_to_numeric,
                                 tokenizer,
                                 sw_remover,
                                 count_vectorizer,
                                 idf, assembler))
data_prep_pipe_fitted = data_prep_pipe.fit(data)
final_data = data_prep_pipe_fitted.transform(data)
final_data.printSchema()
final_data = final_data.select('label', 'features')
final_data.show(5)

root
 |-- class: string (nullable = true)
 |-- text: string (nullable = true)
 |-- length: integer (nullable = true)
 |-- label: double (nullable = true)
 |-- token_text: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- stop_token: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- c_vec: vector (nullable = true)
 |-- tf_idf: vector (nullable = true)
 |-- features: vector (nullable = true)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(13424,[7,11,31,6...|
|  0.0|(13424,[0,24,297,...|
|  1.0|(13424,[2,13,19,3...|
|  0.0|(13424,[0,70,80,1...|
|  0.0|(13424,[36,134,31...|
+-----+--------------------+
only showing top 5 rows



# ML Modeling

In [21]:
#Train Test Split
train_data, test_data = final_data.randomSplit([0.7, 0.3])

In [24]:
from pyspark.ml.classification import NaiveBayes
nb_clf = NaiveBayes()
nb_fitted = nb_clf.fit(train_data)
train_data.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



# Evaluation

In [34]:
test_results = nb_fitted.transform(test_data)
test_results.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13424,[0,1,2,7,8...|[-794.86190966821...|[1.0,3.4570735118...|       0.0|
|  0.0|(13424,[0,1,4,50,...|[-828.63318383182...|[1.0,5.6657295708...|       0.0|
|  0.0|(13424,[0,1,5,15,...|[-1001.9859120870...|[1.0,1.5155222939...|       0.0|
|  0.0|(13424,[0,1,5,20,...|[-802.89058872661...|[1.0,3.9589460003...|       0.0|
|  0.0|(13424,[0,1,7,15,...|[-662.83854322418...|[1.0,6.4992513897...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [31]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
acc_evaluator = MulticlassClassificationEvaluator()
acc = acc_evaluator.evaluate(test_results)
print('ACC of NB MOdel')
print(acc)

ACC of NB MOdel
0.924193722540286
