# Natural Language Processing
Building a spam filter.  
We'll use a classic dataset for this - UCI Repository SMS Spam Detection:  

https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.getOrCreate()

In [3]:
df = spark.read.csv('SMSSpamCollection', inferSchema=True, sep='\t')
df = df.withColumnRenamed('_c0','class').withColumnRenamed('_c1','text')
df.show(3)

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
+-----+--------------------+
only showing top 3 rows



In [4]:
from pyspark.sql.functions import length

In [5]:
df = df.withColumn('length', length(df['text']))
df.show(3)

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
+-----+--------------------+------+
only showing top 3 rows



In [6]:
df.groupBy('class').mean().show()

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham|71.48663212435233|
| spam|138.6706827309237|
+-----+-----------------+



Clearly spam messages are much longer than ham.

### Feature transforms

In [7]:
from pyspark.ml.feature import Tokenizer, IDF, CountVectorizer, StringIndexer, StopWordsRemover

In [8]:
tokenizer = Tokenizer(inputCol='text', outputCol='tokenized')
stopremover = StopWordsRemover(inputCol='tokenized', outputCol='stopremoved')
countvec = CountVectorizer(inputCol='stopremoved', outputCol='countveced')
idf = IDF(inputCol='countveced', outputCol='tf_idf')
class_num = StringIndexer(inputCol='class', outputCol='label')

In [9]:
from pyspark.ml.feature import VectorAssembler

In [10]:
assembler = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features')

### Modeling with pipeline and evaluation

In [11]:
from pyspark.ml.classification import NaiveBayes

In [12]:
nb = NaiveBayes()

In [13]:
from pyspark.ml import Pipeline

In [14]:
pipe = Pipeline(stages=[class_num, tokenizer, stopremover, countvec, idf, assembler])

In [15]:
data = pipe.fit(df).transform(df)

In [16]:
data.printSchema()

root
 |-- class: string (nullable = true)
 |-- text: string (nullable = true)
 |-- length: integer (nullable = true)
 |-- label: double (nullable = true)
 |-- tokenized: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- stopremoved: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- countveced: vector (nullable = true)
 |-- tf_idf: vector (nullable = true)
 |-- features: vector (nullable = true)



In [17]:
data.head()

Row(class=u'ham', text=u'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', length=111, label=0.0, tokenized=[u'go', u'until', u'jurong', u'point,', u'crazy..', u'available', u'only', u'in', u'bugis', u'n', u'great', u'world', u'la', u'e', u'buffet...', u'cine', u'there', u'got', u'amore', u'wat...'], stopremoved=[u'go', u'jurong', u'point,', u'crazy..', u'available', u'bugis', u'n', u'great', u'world', u'la', u'e', u'buffet...', u'cine', u'got', u'amore', u'wat...'], countveced=SparseVector(13460, {8: 1.0, 12: 1.0, 33: 1.0, 64: 1.0, 74: 1.0, 342: 1.0, 640: 1.0, 781: 1.0, 1430: 1.0, 1644: 1.0, 4542: 1.0, 6323: 1.0, 8742: 1.0, 9308: 1.0, 10236: 1.0, 10950: 1.0}), tf_idf=SparseVector(13460, {8: 3.1123, 12: 3.2052, 33: 3.8217, 64: 4.2068, 74: 4.3216, 342: 5.4068, 640: 5.9176, 781: 6.1408, 1430: 6.6798, 1644: 6.8339, 4542: 7.5271, 6323: 7.9325, 8742: 7.9325, 9308: 7.9325, 10236: 7.9325, 10950: 7.9325}), features=SparseVector(13

In [18]:
final_data = data.select('label', 'features')
final_data.show(3)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(13461,[8,12,33,6...|
|  0.0|(13461,[0,26,308,...|
|  1.0|(13461,[2,14,20,3...|
+-----+--------------------+
only showing top 3 rows



In [19]:
train, test = final_data.randomSplit([0.7, 0.3], seed=123)

In [20]:
model = nb.fit(train)

In [21]:
pred = model.transform(test)
pred.show(3)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13461,[0,1,2,14,...|[-611.84188634421...|[1.0,7.8680349218...|       0.0|
|  0.0|(13461,[0,1,2,44,...|[-1066.8191564181...|[1.0,1.0791739678...|       0.0|
|  0.0|(13461,[0,1,3,10,...|[-605.47111004459...|[1.0,9.2460638127...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 3 rows



In [22]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [23]:
score = MulticlassClassificationEvaluator().evaluate(pred)
print 'Accuracy of this model at predicting spam is', score

Accuracy of this model at predicting spam is 0.928502126077
