### Email Spam Filter

In [1]:
import findspark
findspark.init()
import pyspark

In [2]:
# Reuse the live SparkContext when this cell is re-run: constructing a second
# context in the same kernel fails with "Cannot run multiple SparkContexts at once".
# The app name is only applied when a new context actually has to be created.
spark_conf = pyspark.SparkConf().setAppName('SpamFilter')
sc = pyspark.SparkContext.getOrCreate(spark_conf)

In [3]:
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
from numpy import array
import pandas as pd

##### Load Data

In [4]:
# One RDD of text lines per input file; later cells treat each line as one email.
nospam, spam, query = map(
    sc.textFile,
    ("emails_nospam.txt", "emails_spam.txt", "query.txt"),
)

##### Split words and hash

In [5]:
# Length of the term-frequency vectors produced by the hashing transformer.
NUM_HASH_FEATURES = 1000
features = HashingTF(numFeatures=NUM_HASH_FEATURES)

In [6]:
def _tokenize(line):
    """Split one email line into word tokens.

    Splits on any run of whitespace. Unlike ``line.split(" ")`` this never
    yields empty-string tokens for repeated spaces (which would be hashed as
    if they were a word) and it also separates words joined by tabs.
    """
    return line.split()


# Term-frequency vector for every email line. The query RDD additionally keeps
# the raw text so a prediction can be shown next to the email it belongs to.
features_spam = spam.map(lambda x: features.transform(_tokenize(x)))
features_nospam = nospam.map(lambda x: features.transform(_tokenize(x)))
features_query = query.map(lambda x: (features.transform(_tokenize(x)), x))

##### Label the data used for training and prediction.

In [7]:
def _with_label(label):
    """Build a mapper that wraps a feature vector in a LabeledPoint carrying `label`."""
    def _wrap(vector):
        return LabeledPoint(label, vector)
    return _wrap


# Spam examples get label 1, non-spam examples get label 0.
pos = features_spam.map(_with_label(1))
neg = features_nospam.map(_with_label(0))

# Query emails arrive as (feature vector, raw text) pairs. The 0 label here is
# only a placeholder: the prediction cell reads `.features` and never the label.
query_class = features_query.map(
    lambda pair: (LabeledPoint(0, pair[0]), pair[1])
)

##### Join data and train the logistic model.

In [8]:
# Training set: union of the spam (label 1) and non-spam (label 0) examples.
data = pos.union(neg)
# No hyper-parameters are passed, so train() runs with the library defaults.
# NOTE(review): LogisticRegressionWithSGD is reported as deprecated in newer
# Spark releases — confirm against the Spark version this notebook targets.
logistic_reg = LogisticRegressionWithSGD()
model = logistic_reg.train(data)

In [9]:
def _classify_query(pair):
    """Map a (LabeledPoint, raw email text) pair to (predicted label, raw email text)."""
    labeled_point, email_text = pair[0], pair[1]
    return (model.predict(labeled_point.features), email_text)


classify = query_class.map(_classify_query)

##### Predicted class (1 = spam, 0 = not spam) for each email in 'query.txt'.

In [12]:
# Bring every (predicted label, email text) pair back to the driver so the
# notebook displays them as the cell output.
classify.collect()

[(1,
  u"this is a year of promotion for Galaxy End of YearPromo You have 1 week remaining to retrieve your won prize for the Samsung Galaxy Xmas Promo 'C' draw category winning prize of Seven Hundred and Fifty Thousand Euros each and a Samsung Galaxy S6 EDGE. Winning Ticket Number:WIN-707-COS.  We advise you to keep this winning notification confidential and away from public notice to avoid double claim/mistransfer or impersonation until after remittance/payment to you."),
 (1,
  u"you are the lucky one: We've picked out 10 new matches for you. Meet them now and then check out all the singles in your area! you might win a prize too"),
 (1,
  u'Do not miss your chances: Get Viagra real cheap!  Send money right away to ...'),
 (1,
  u'Get real money fast: With my position in the office i assure you with 100% risk free that this transaction is not a childish game play and i want you to indicate your full interest with assurance of trust that you will not betray me once the fund is transf

##### Accuracy of the model, measured on the training data (an optimistic estimate: the model has already seen these emails).

In [13]:
def _predicted_vs_actual(example):
    """Pair the model's prediction for a labelled example with its true label."""
    return (model.predict(example.features), example.label)


predictedlabel = data.map(_predicted_vs_actual)

In [14]:
# Pull the (prediction, true label) pairs to the driver, then give the two
# positional columns readable names.
label_pairs = predictedlabel.collect()
df = pd.DataFrame(data=label_pairs)
df = df.rename(columns={0: 'Predicted', 1: 'Actual'})

In [15]:
# With 0/1 labels the absolute difference is 1 exactly where the prediction
# and the true label disagree, and 0 where they match.
prediction_error = df['Predicted'] - df['Actual']
df['Misclassified'] = prediction_error.abs()

In [18]:
# Preview the first rows of the prediction table (Predicted, Actual, Misclassified).
df.head()

Unnamed: 0,Predicted,Actual,Misclassified
0,1,1.0,0.0
1,1,1.0,0.0
2,1,1.0,0.0
3,1,1.0,0.0
4,1,1.0,0.0


In [19]:
# Keep only the rows whose true label is 1 (spam).
is_spam = df['Actual'] == 1.0
spam_df = df[is_spam]

In [20]:
# Keep only the rows whose true label is 0 (not spam).
is_nonspam = df['Actual'] == 0.0
nonspam_df = df[is_nonspam]

In [22]:
# Percentage of all labelled emails the model classified correctly:
# 100 * (1 - misclassified rows / total rows).
Overall_Accuracy = 100*(1 - df['Misclassified'].sum()/ df['Actual'].count())
# Single-argument print(...) parses on both Python 2 and Python 3 and emits the
# same space-separated text the former print statement produced.
print(" ".join(["Overall Accuracy of Model:", str(Overall_Accuracy), '%']))

Overall Accuracy of Model: 100.0 %


In [23]:
# Percentage of true-spam emails the model classified correctly:
# 100 * (1 - misclassified spam rows / total spam rows).
Spam_Accuracy = 100*(1 - spam_df['Misclassified'].sum()/ spam_df['Actual'].count())
# Single-argument print(...) parses on both Python 2 and Python 3 and emits the
# same space-separated text the former print statement produced.
print(" ".join(["Spam Accuracy of Model:", str(Spam_Accuracy), '%']))

Spam Accuracy of Model: 100.0 %


In [24]:
# Percentage of true non-spam emails the model classified correctly:
# 100 * (1 - misclassified non-spam rows / total non-spam rows).
nonspam_Accuracy = 100*(1 - nonspam_df['Misclassified'].sum()/ nonspam_df['Actual'].count())
# Single-argument print(...) parses on both Python 2 and Python 3 and emits the
# same space-separated text the former print statement produced.
print(" ".join(["Non-Spam Accuracy of Model:", str(nonspam_Accuracy), '%']))

Non-Spam Accuracy of Model: 100.0 %
