We'll build an SMS spam filter: a classifier that labels each text message in the SMS Spam Collection dataset as `ham` or `spam`.

## Import

In [1]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise start one named "nlp".
spark = (
    SparkSession.builder
    .appName("nlp")
    .getOrCreate()
)

## Load data

In [2]:
import os

# List the working directory to confirm the dataset folder is present.
os.listdir(os.curdir)

['.ipynb_checkpoints',
 'My_basic_tools.ipynb',
 'My_code_along_NLP.ipynb',
 'NLP_Code_Along.ipynb',
 'smsspamcollection',
 'Tools_for_NLP.ipynb']

In [3]:
# The file is tab-separated and read without a header, so Spark assigns the
# default column names _c0 and _c1.
df = spark.read.csv(
    "smsspamcollection/SMSSpamCollection",
    sep="\t",
    inferSchema=True,
)

In [4]:
# Preview the first rows of the raw data.
df.show(n=5)

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
+----+--------------------+
only showing top 5 rows



In [5]:
# Give the default column names meaningful ones: the label and the message body.
dt = (
    df
    .withColumnRenamed("_c0", "class")
    .withColumnRenamed("_c1", "text")
)

In [6]:
# Confirm the columns were renamed.
dt.show(n=5)

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
+-----+--------------------+
only showing top 5 rows



## Length can be a good feature

In [7]:
from pyspark.sql.functions import length

In [8]:
# Add the length of each message as a numeric feature column named "len".
dt = dt.withColumn("len", length(dt.text))

In [9]:
# Show the default 20 rows, now including the "len" column.
dt.show(20)

+-----+--------------------+---+
|class|                text|len|
+-----+--------------------+---+
|  ham|Go until jurong p...|111|
|  ham|Ok lar... Joking ...| 29|
| spam|Free entry in 2 a...|155|
|  ham|U dun say so earl...| 49|
|  ham|Nah I don't think...| 61|
| spam|FreeMsg Hey there...|147|
|  ham|Even my brother i...| 77|
|  ham|As per your reque...|160|
| spam|WINNER!! As a val...|157|
| spam|Had your mobile 1...|154|
|  ham|I'm gonna be home...|109|
| spam|SIX chances to wi...|136|
| spam|URGENT! You have ...|155|
|  ham|I've been searchi...|196|
|  ham|I HAVE A DATE ON ...| 35|
| spam|XXXMobileMovieClu...|149|
|  ham|Oh k...i'm watchi...| 26|
|  ham|Eh u remember how...| 81|
|  ham|Fine if thats th...| 56|
| spam|England v Macedon...|155|
+-----+--------------------+---+
only showing top 20 rows



In [10]:
# Average message length per class, using the grouped-data shortcut method.
(
    dt
    .groupBy("class")
    .mean("len")
    .show()
)

+-----+-----------------+
|class|         avg(len)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+



In [11]:
# Same aggregation, expressed through agg() with a {column: function} mapping.
(
    dt
    .groupBy("class")
    .agg({"len": "mean"})
    .show()
)

+-----+-----------------+
|class|         avg(len)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+



## Tokenizing

In [12]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover

# Tokenizer reads the raw "text" column and writes the word list to "tokens".
tk = Tokenizer(
    inputCol="text",
    outputCol="tokens",
)

In [13]:
# Apply the tokenizer; the result keeps the existing columns and adds "tokens".
tokenized = tk.transform(dt)

In [14]:
# Inspect the tokenized messages.
tokenized.show(n=5)

+-----+--------------------+---+--------------------+
|class|                text|len|              tokens|
+-----+--------------------+---+--------------------+
|  ham|Go until jurong p...|111|[go, until, juron...|
|  ham|Ok lar... Joking ...| 29|[ok, lar..., joki...|
| spam|Free entry in 2 a...|155|[free, entry, in,...|
|  ham|U dun say so earl...| 49|[u, dun, say, so,...|
|  ham|Nah I don't think...| 61|[nah, i, don't, t...|
+-----+--------------------+---+--------------------+
only showing top 5 rows



In [15]:
# Drop stop words from "tokens"; the filtered token list goes to "cleaned".
rmv = StopWordsRemover(
    inputCol="tokens",
    outputCol="cleaned",
)

cleaned = rmv.transform(tokenized)

In [16]:
# Compare "tokens" with "cleaned" to see which words were removed.
cleaned.show(n=5)

+-----+--------------------+---+--------------------+--------------------+
|class|                text|len|              tokens|             cleaned|
+-----+--------------------+---+--------------------+--------------------+
|  ham|Go until jurong p...|111|[go, until, juron...|[go, jurong, poin...|
|  ham|Ok lar... Joking ...| 29|[ok, lar..., joki...|[ok, lar..., joki...|
| spam|Free entry in 2 a...|155|[free, entry, in,...|[free, entry, 2, ...|
|  ham|U dun say so earl...| 49|[u, dun, say, so,...|[u, dun, say, ear...|
|  ham|Nah I don't think...| 61|[nah, i, don't, t...|[nah, think, goes...|
+-----+--------------------+---+--------------------+--------------------+
only showing top 5 rows



## TF-IDF

In [17]:
from pyspark.ml.feature import HashingTF, IDF, CountVectorizer

In [18]:
# Vocabulary-based term counts over the cleaned tokens, written to "cnt_vec".
cnt = CountVectorizer(
    inputCol="cleaned",
    outputCol="cnt_vec",
)

In [19]:
# CountVectorizer is an estimator and has no transform() of its own: it must be
# fit first (to learn the vocabulary), and the fitted model then transforms the data.
cnt_dt = cnt.fit(cleaned).transform(cleaned)

In [20]:
# hashed_dt is not defined anywhere in the saved notebook, so this cell fails on
# Restart & Run All. Build it here: HashingTF is a plain transformer (no fit) that
# hashes the cleaned tokens into a term-frequency vector in the "tf" column, which
# is the input the IDF step expects. Its default of 262144 features matches the
# vector size in the recorded output.
hashed_dt = HashingTF(inputCol="cleaned", outputCol="tf").transform(cleaned)
hashed_dt.show(5)

+-----+--------------------+---+--------------------+--------------------+--------------------+
|class|                text|len|              tokens|             cleaned|                  tf|
+-----+--------------------+---+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|111|[go, until, juron...|[go, jurong, poin...|(262144,[3168,172...|
|  ham|Ok lar... Joking ...| 29|[ok, lar..., joki...|[ok, lar..., joki...|(262144,[122516,1...|
| spam|Free entry in 2 a...|155|[free, entry, in,...|[free, entry, 2, ...|(262144,[7958,944...|
|  ham|U dun say so earl...| 49|[u, dun, say, so,...|[u, dun, say, ear...|(262144,[28698,35...|
|  ham|Nah I don't think...| 61|[nah, i, don't, t...|[nah, think, goes...|(262144,[2710,259...|
+-----+--------------------+---+--------------------+--------------------+--------------------+
only showing top 5 rows



In [22]:
# IDF is an estimator: unlike the term-frequency step, it has to be fit on the data
# before it can transform it. Here minDocFreq is set to 3.
idf = IDF(
    minDocFreq=3,
    inputCol="tf",
    outputCol="tf-idf",
)
idf_data = (
    idf
    .fit(hashed_dt)
    .transform(hashed_dt)
)

## Convert label

In [23]:
from pyspark.ml.feature import StringIndexer

In [24]:
# Encode the string "class" column as a numeric "label" column.
Ind = StringIndexer(
    inputCol="class",
    outputCol="label",
)

In [27]:
# Fit the indexer on the class values, then append the numeric label column.
final_data = (
    Ind
    .fit(idf_data)
    .transform(idf_data)
)

In [28]:
# Check the fully featurized data, including the numeric label.
final_data.show(n=5)

+-----+--------------------+---+--------------------+--------------------+--------------------+--------------------+-----+
|class|                text|len|              tokens|             cleaned|                  tf|              tf-idf|label|
+-----+--------------------+---+--------------------+--------------------+--------------------+--------------------+-----+
|  ham|Go until jurong p...|111|[go, until, juron...|[go, jurong, poin...|(262144,[3168,172...|(262144,[3168,172...|  0.0|
|  ham|Ok lar... Joking ...| 29|[ok, lar..., joki...|[ok, lar..., joki...|(262144,[122516,1...|(262144,[122516,1...|  0.0|
| spam|Free entry in 2 a...|155|[free, entry, in,...|[free, entry, 2, ...|(262144,[7958,944...|(262144,[7958,944...|  1.0|
|  ham|U dun say so earl...| 49|[u, dun, say, so,...|[u, dun, say, ear...|(262144,[28698,35...|(262144,[28698,35...|  0.0|
|  ham|Nah I don't think...| 61|[nah, i, don't, t...|[nah, think, goes...|(262144,[2710,259...|(262144,[2710,259...|  0.0|
+-----+---------