# SMS Spam Classification

data from: https://archive.ics.uci.edu/ml/machine-learning-databases/00228/

In [1]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
# Preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer

from sklearn.naive_bayes import MultinomialNB
%matplotlib inline

In [2]:
df=pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['class', 'text'])

In [3]:
df.head()

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [47]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['class'], test_size=0.25)

In [48]:
X_train[:3]

2116    Well I wasn't available as I washob nobbing wi...
554     Ok. Every night take a warm bath drink a cup o...
2605    You call times job today ok umma and ask them ...
Name: text, dtype: object

#### Preprocessing 
- Count the occurances
- transform X_train to *csr_parse matrix*
- binary the classes

In [49]:
count_vectorizer = CountVectorizer()

In [50]:
X_train_counts = count_vectorizer.fit_transform(X_train)

In [77]:
from operator import itemgetter
list(count_vectorizer.vocabulary_.items())[:5]

[('well', 7184),
 ('wasn', 7122),
 ('available', 1084),
 ('as', 1017),
 ('washob', 7121)]

In [52]:
type(X_train_counts)

scipy.sparse.csr.csr_matrix

In [78]:
label_bin=LabelBinarizer()
y_train_bin=label_bin.fit_transform(y_train)
y_test_bin=label_bin.fit_transform(y_test)
y_train_bin

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [79]:
y_train_bin.shape

(4179, 1)

In [80]:
#y_train_bin was a columns vector, expected was an 1D array
y_train_bin.shape=(-1)
y_train_bin.shape

(4179,)

#### Train

In [81]:
clf=MultinomialNB().fit(X_train_counts, y_train_bin)

In [82]:
X_train_counts.shape

(4179, 7456)

In [64]:
len(count_vectorizer.vocabulary_)

7456

In [65]:
len(clf.coef_[0])

7456

In [66]:
from collections import Counter

In [83]:
important_words=Counter()

In [94]:
for i,j in zip(count_vectorizer.vocabulary_.keys(), clf.coef_[0]):
    important_words[i]=j
important_words.most_common()[:10]

[('surname', -3.680302702369673),
 ('monos', -4.4084048979631945),
 ('sexiest', -4.575818371668864),
 ('brolly', -4.609075593425346),
 ('choose', -4.8358489127901345),
 ('percentages', -4.879199353663748),
 ('sextextuk', -4.904841784277085),
 ('10am', -4.917913865844438),
 ('radio', -4.978949756430808),
 ('conserve', -5.058992464104344)]

In [95]:
# least important: 
important_words.most_common()[-10:]

[('punch', -9.941794386690715),
 ('wallet', -9.941794386690715),
 ('lightly', -9.941794386690715),
 ('checkboxes', -9.941794386690715),
 ('ambrith', -9.941794386690715),
 ('madurai', -9.941794386690715),
 ('dha', -9.941794386690715),
 ('someday', -9.941794386690715),
 ('amongst', -9.941794386690715),
 ('hooked', -9.941794386690715)]

#### Test 

In [97]:
X_test_counts=count_vectorizer.transform(X_test)

In [98]:
predictions=clf.predict(X_test_counts)

In [105]:
from sklearn.metrics import average_precision_score as aps
from sklearn.metrics import accuracy_score
print('Precision: {:.2f} percent\nand the accuracy: {:.2f}'.format(aps(y_test_bin, predictions)*100, 
                                                               accuracy_score(y_test_bin, predictions)*100))

Precision: 92.92 percent
and the accuracy: 98.99


#### Sanity check 

In [118]:
a=clf.predict(count_vectorizer.transform(['Our records indicate your Pension is under performing to see higher growth and up to 25% cash release reply PENSION for a free review.']))
if a==1: print('Yes, this is spam')
else: print("No spam")

Yes, this is spam


In [119]:
a=clf.predict(count_vectorizer.transform(['Hi Tom, are you well']))
if a==1: print('Yes, this is spam')
else: print("No spam")

No spam


##### Probability: 

In [123]:
clf.predict_proba(count_vectorizer.transform(['Let us meet at 6 PM']))
# the probability for class 0 and 1

array([[0.99729319, 0.00270681]])