In [1]:
import re
import numpy as np
import pandas as pd
import nltk

In [2]:
# Read the raw SMS dataset into a DataFrame; the relative path is resolved
# against the notebook's working directory.
spam_csv_path = "spam.csv"
spam = pd.read_csv(spam_csv_path)

In [3]:
# Encode the label column as 1 (spam) / 0 (anything else).
# The conversion only runs while the column still holds the raw string
# labels: comparing an already encoded integer column against "spam" would
# silently turn every label into 0 if this cell were executed a second time.
if spam["target"].dtype == object:
    spam["target"] = np.where(spam["target"] == "spam", 1, 0)

In [4]:
# Show the first ten rows of the dataset as the cell's output.
spam.iloc[:10]

Unnamed: 0,text,target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
5,FreeMsg Hey there darling it's been 3 week's n...,1
6,Even my brother is not like to speak with me. ...,0
7,As per your request 'Melle Melle (Oru Minnamin...,0
8,WINNER!! As a valued network customer you have...,1
9,Had your mobile 11 months or more? U R entitle...,1


In [5]:
print spam.shape

(5572, 2)


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Split messages and labels into train and test parts; the fixed
# random_state makes the partition repeatable between runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    spam["text"],
    spam["target"],
    random_state=0,
)

In [8]:
print "Percent of documents that are spam:", np.mean(spam["target"])

Percent of documents that are spam: 0.134063173008


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
# Learn a bag-of-words vocabulary from the text column of the whole dataset.
all_messages = spam["text"]
vect = CountVectorizer().fit(all_messages)

In [18]:
tokens = np.array(vect.get_feature_names())
# Longest token in the vocabulary. On ties max() keeps the first candidate in
# vocabulary order, which is the same element a stable descending sort by
# length would place first.
max(tokens, key=len)

u'hypotheticalhuagauahahuagahyuhagga'

In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score

In [26]:
# Learn the vocabulary on the training messages only, then turn them into a
# document-term count matrix.
vect = CountVectorizer()
vect.fit(xtrain)
xtrain_vect = vect.transform(xtrain)

# Fit a multinomial Naive Bayes classifier with default settings; the bare
# name on the last line displays the fitted estimator as the cell output.
nb = MultinomialNB().fit(xtrain_vect, ytrain)
nb

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [27]:
print len(vect.get_feature_names())

7354


In [21]:
predsNB = nb.predict(vect.transform(xtest))
print "Naive Bayes AUC:", roc_auc_score(ytest, predsNB)

Naive Bayes AUC: 0.958136682342


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
vect = TfidfVectorizer().fit(xtrain)
print len(vect.get_feature_names())

7354


In [29]:
# Feature names as an array so they can be indexed with integer arrays,
# plus the transformed training messages.
feature_names = np.array(vect.get_feature_names())
xtrain_vector = vect.transform(xtrain)

In [30]:
# For every feature take its maximum value over all training documents
# (column-wise max), then get the feature indices ordered by that value,
# ascending.
max_tfidf_per_feature = xtrain_vector.max(axis=0).toarray()[0]
sorted_tfidf = max_tfidf_per_feature.argsort()

In [32]:
print "Smallest tfidf:", feature_names[sorted_tfidf[:10]]
print "Largest tfidf:", feature_names[sorted_tfidf[:-11:-1]]

Smallest tfidf: [u'sympathetic' u'healer' u'aaniye' u'dependable' u'companion' u'listener'
 u'athletic' u'exterminator' u'psychiatrist' u'pest']
Largest tfidf: [u'146tf150p' u'havent' u'home' u'okie' u'thanx' u'er' u'anything' u'lei'
 u'nite' u'yup']


In [33]:
def add_feature(X, feature_to_add):
    """
    Append one or more feature columns to a sparse feature matrix.

    X is the existing matrix with one row per sample. feature_to_add is
    either a single sequence with one value per sample, or a list of such
    sequences; each sequence becomes one new column on the right of X.

    Returns the combined matrix in CSR format.
    """
    from scipy.sparse import csr_matrix, hstack

    # csr_matrix lays each given sequence out as a row, so transpose to get
    # one column per added feature before stacking horizontally.
    extra_columns = csr_matrix(feature_to_add).T
    return hstack([X, extra_columns], format='csr')

In [34]:
vect = TfidfVectorizer(min_df=3).fit(xtrain)
print len(vect.get_feature_names())

2295


In [35]:
xtrain_vect = vect.transform(xtrain)
nb = MultinomialNB(alpha=0.1) #Laplace smoothing parameter = 0.1
nb.fit(xtrain_vect, ytrain)
preds = nb.predict(vect.transform(xtest))
print "AUC:", roc_auc_score(ytest, preds)

AUC: 0.941624365482


In [38]:
print "Average length spam text:",np.mean(len(spam[spam["target"] == 1]["text"]))
print "Average length non-spam text:",np.mean(len(spam[spam["target"] == 0]["text"]))

Average length spam text: 747.0
Average length non-spam text: 4825.0


In [39]:
from sklearn.svm import SVC

In [41]:
vect = TfidfVectorizer(min_df=5).fit(xtrain)
print len(vect.get_feature_names())

1468


In [48]:
# Tf-idf features of the training messages, with the len() of each message
# appended as one extra column.
train_lengths = xtrain.map(len)
xtrain_vect = add_feature(vect.transform(xtrain), train_lengths)

In [51]:
svc = SVC(C=10000)
svc.fit(xtrain_vect, ytrain)
xtest_vect = vect.transform(xtest)
xtest_vect = add_feature(xtest_vect, xtest.apply(lambda x: len(x)))
preds = svc.predict(xtest_vect)
print "AUC:", roc_auc_score(ytest, preds)

AUC: 0.958136682342


In [55]:
print "Average number digits non-spam document:", np.mean(spam[spam["target"] == 0].text.apply(lambda x: sum([c.isdigit() for c in x])))
print "Average number digits spam document:", np.mean(spam[spam["target"] == 1].text.apply(lambda x: sum([c.isdigit() for c in x])))

Average number digits non-spam document: 0.299274611399
Average number digits spam document: 15.7590361446


In [56]:
from sklearn.linear_model import LogisticRegression

In [58]:
vect = CountVectorizer(min_df=5, ngram_range=(1, 3)).fit(xtrain)
xtrain_vect = vect.transform(xtrain)
xtrain_vect = add_feature(xtrain_vect, xtrain.apply(lambda x: sum([c.isdigit() for c in x])))
print len(vect.get_feature_names())

3383


In [60]:
model = LogisticRegression(C=100)
model.fit(xtrain_vect, ytrain)
xtest_vect = vect.transform(xtest)
xtest_vect = add_feature(xtest_vect, xtest.apply(lambda x: sum([c.isdigit() for c in x])))
preds = model.predict(xtest_vect)
print "AUC:", roc_auc_score(ytest, preds)

AUC: 0.968318676468


In [79]:
string = "$kajflafj#&#*&*#"
print len(re.findall(r"\W", string))

8


In [83]:
print "Average non-word characters non-spam:", np.mean(spam[spam["target"] == 0]["text"].apply(lambda x: len(re.findall(r"\W", x))))
print "Average non-word characters spam:", np.mean(spam[spam["target"] == 1]["text"].apply(lambda x: len(re.findall(r"\W", x))))

Average non-word characters non-spam: 17.4864248705
Average non-word characters spam: 30.3895582329


In [84]:
vect = CountVectorizer(min_df=5, ngram_range=(2, 5)).fit(xtrain)
xtrain_vect = vect.transform(xtrain)
xtrain_vect = add_feature(xtrain_vect, xtrain.apply(lambda x: sum([c.isdigit() for c in x])))
xtrain_vect = add_feature(xtrain_vect, xtrain.apply(lambda x: len(re.findall(r"\W", x))))
print len(vect.get_feature_names())

2448


In [86]:
model = LogisticRegression(C=100)
model.fit(xtrain_vect, ytrain)
xtest_vect = vect.transform(xtest)
xtest_vect = add_feature(xtest_vect, xtest.apply(lambda x: sum([c.isdigit() for c in x])))
xtest_vect = add_feature(xtest_vect, xtest.apply(lambda x: len(re.findall(r"\W", x))))
preds = model.predict(xtest_vect)
print "AUC:", roc_auc_score(ytest, preds)

AUC: 0.941236015144
