# SMS Spam

In [3]:
import pandas as pd
import numpy as np

# Read the raw messages and recode the text label in one pass:
# rows labelled 'spam' become 1, every other label becomes 0.
spam_data = pd.read_csv('spam.csv').assign(
    target=lambda frame: np.where(frame['target'] == 'spam', 1, 0)
)
spam_data.head(10)

Unnamed: 0,text,target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
5,FreeMsg Hey there darling it's been 3 week's n...,1
6,Even my brother is not like to speak with me. ...,0
7,As per your request 'Melle Melle (Oru Minnamin...,0
8,WINNER!! As a valued network customer you have...,1
9,Had your mobile 11 months or more? U R entitle...,1


In [4]:
from sklearn.model_selection import train_test_split


# No test_size is passed, so the library default split is used;
# random_state=0 pins the shuffle so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    spam_data['text'],
    spam_data['target'],
    random_state=0,
)

### Percentage of the documents in `spam_data` that are spam

In [26]:
# Share of rows whose target is 1 (spam), expressed as a percentage.
int(spam_data['target'].eq(1).sum()) / len(spam_data) * 100

13.406317300789663

### Longest token in the vocabulary


In [27]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer().fit(X_train)
# `get_feature_names()` no longer exists in recent scikit-learn releases, so
# read the tokens from the fitted `vocabulary_` mapping instead, which is
# available on every version. Sorting the tokens reproduces the feature
# order, so ties on length resolve to the same token as before.
max(sorted(vect.vocabulary_), key=len)

'com1win150ppmx3age16subscription'

###  A multinomial Naive Bayes classifier model with smoothing `alpha=0.1`.

In [28]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score

# Learn the vocabulary on the training messages and build their
# document-term matrix in one step.
vect = CountVectorizer()
X_train_vectorized = vect.fit_transform(X_train)

# Multinomial Naive Bayes with smoothing alpha=0.1.
clf = MultinomialNB(alpha=0.1).fit(X_train_vectorized, y_train)

# The AUC below is computed from hard class predictions on the test set,
# which is transformed with the vocabulary learned from the training set.
predictions = clf.predict(vect.transform(X_test))
roc_auc_score(y_test, predictions)

0.9720812182741116

### Question 

What is the average length of documents (number of characters) for not spam and spam documents?



def answer_six():
    """Return the average document length in characters, by class.

    Reads the module-level ``spam_data`` frame (columns ``text`` and
    ``target``). Lengths are computed on a derived Series so the shared
    frame is left unmodified; no helper column is added to it.

    Returns:
        tuple: (average length of not-spam documents,
                average length of spam documents).
    """
    lengths = spam_data['text'].str.len()
    not_spam_lengths = lengths[spam_data['target'] == 0]
    spam_lengths = lengths[spam_data['target'] == 1]
    return (
        not_spam_lengths.sum() / len(not_spam_lengths),
        spam_lengths.sum() / len(spam_lengths),
    )

In [31]:
answer_six()

(71.02362694300518, 138.8661311914324)

In [33]:
def add_feature(X, feature_to_add):
    """
    Append one or more feature columns to a sparse feature matrix.

    `feature_to_add` may be a single feature (one value per row of X)
    or a list of such features; each becomes a new trailing column.
    The combined matrix is returned in CSR format.
    """
    from scipy.sparse import csr_matrix, hstack

    # csr_matrix lays each feature out as a row; transpose to get columns.
    extra_columns = csr_matrix(feature_to_add).T
    return hstack([X, extra_columns], format='csr')


### SVM

In [34]:
from sklearn.svm import SVC

def answer_seven():
    """Fit an SVC on tf-idf features plus document length and score it.

    Uses the module-level split (``X_train``, ``X_test``, ``y_train``,
    ``y_test``). The tf-idf vocabulary is fitted on the training text only
    with ``min_df=5``; the character length of each document is appended
    as one extra feature column via ``add_feature``.

    Returns:
        The value of ``roc_auc_score`` computed from the model's hard
        class predictions on the test set.
    """
    # Imported here because no earlier cell imports TfidfVectorizer; without
    # this, running the notebook on a fresh kernel raises NameError.
    from sklearn.feature_extraction.text import TfidfVectorizer

    vect = TfidfVectorizer(min_df=5).fit(X_train)

    # Training and test matrices get the same treatment: tf-idf columns
    # followed by a document-length column.
    X_train_vectorized = vect.transform(X_train)
    X_train_vectorized = add_feature(X_train_vectorized, X_train.str.len())
    X_test_vectorized = vect.transform(X_test)
    X_test_vectorized = add_feature(X_test_vectorized, X_test.str.len())

    model = SVC(C=10000)
    model.fit(X_train_vectorized, y_train)
    predictions = model.predict(X_test_vectorized)
    return roc_auc_score(y_test, predictions)

In [35]:
answer_seven()

0.9581366823421557

### Question

What is the average number of digits per document for not spam and spam documents?

*This function should return a tuple (average # digits not spam, average # digits spam).*

In [36]:

import re
def answer_eight():
    """Return the average number of digits per document, by class.

    Reads the module-level ``spam_data`` frame (columns ``text`` and
    ``target``) and counts the characters matching ``[0-9]`` in each
    document.

    Returns:
        tuple: (average # digits not spam, average # digits spam).
    """
    # Vectorized per-document count of ASCII digit characters.
    digit_counts = spam_data['text'].str.count('[0-9]')
    not_spam_mean = digit_counts[spam_data['target'] == 0].mean()
    spam_mean = digit_counts[spam_data['target'] == 1].mean()
    return (not_spam_mean, spam_mean)

(0.2992746113989637, 15.759036144578314)

In [37]:
answer_eight()

(0.2992746113989637, 15.759036144578314)