In [1]:
# In this notebook:
# I attempt to classify SMS messages as potential spam
# I explore methods different for regular NLP for spam detection
# I conduct TFIDF vectorization
# I utilize hstack to combine sparse matrix with descriptive variables
# I conduct ML via logistic regression / Naive Bayes

In [2]:
import numpy as np
import pandas as pd

In [3]:
pd.set_option('display.max_colwidth', 100)

In [4]:
sorig = pd.read_csv('SMSSpamCollection',sep='\t',header=None,names=['class','phrase'])

print(f'Dataset shape is: {sorig.shape}')

sorig.head()

Dataset shape is: (5572, 2)


Unnamed: 0,class,phrase
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [5]:
# no missing values
sorig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
class     5572 non-null object
phrase    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [6]:
# breakdown of ham to spam; spam == 13%
sorig['class'].value_counts()

ham     4825
spam     747
Name: class, dtype: int64

In [7]:
# a quick look at what is considered spam
# high count of numbers
# all caps words seem to be common
sorig[sorig['class'] == 'spam'][:20]

Unnamed: 0,class,phrase
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
5,spam,FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for ...
8,spam,WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To c...
9,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with came...
11,spam,"SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, ..."
12,spam,"URGENT! You have won a 1 week FREE membership in our £100,000 Prize Jackpot! Txt the word: CLAIM..."
15,spam,"XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here..."
19,spam,England v Macedonia - dont miss the goals/team news. Txt ur national team to 87077 eg ENGLAND to...
34,spam,Thanks for your subscription to Ringtone UK your mobile will be charged £5/month Please confirm ...
42,spam,07732584351 - Rodger Burns - MSG = We tried to call you re your reply to our sms for a free noki...


## Extract phrase detail

In [8]:
def digit_count(x):
    digits = 0
    for char in x:
        if char.isnumeric():
            digits += 1
    return digits

In [9]:
sorig['c_digits'] = sorig['phrase'].apply(digit_count)

sorig.head()

Unnamed: 0,class,phrase,c_digits
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,25
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives around here though",0


In [10]:
def symbol_count(x):
    syms = 0
    for char in x:
        if (char.isalnum() == False) & (char != ' '):
            syms += 1
            
    return syms

In [11]:
sorig['c_symbols'] = sorig['phrase'].apply(symbol_count)

sorig.head()

Unnamed: 0,class,phrase,c_digits,c_symbols
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",0,9
1,ham,Ok lar... Joking wif u oni...,0,6
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,25,6
3,ham,U dun say so early hor... U c already then say...,0,6
4,ham,"Nah I don't think he goes to usf, he lives around here though",0,2


In [12]:
# counts strings in all caps longer than 3 characters

def caps_count(x):
    caps_temp = 0
    caps_tot = 0
    for char in x:
        if char.isupper():
            caps_temp += 1
        else:
            if caps_temp >= 3:
                caps_tot += caps_temp
                caps_temp = 0
            else:
                caps_temp = 0
                
    return caps_tot

In [13]:
sorig['c_caps'] = sorig['phrase'].apply(caps_count)

sorig.head(10)

Unnamed: 0,class,phrase,c_digits,c_symbols,c_caps
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",0,9,0
1,ham,Ok lar... Joking wif u oni...,0,6,0
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,25,6,0
3,ham,U dun say so early hor... U c already then say...,0,6,0
4,ham,"Nah I don't think he goes to usf, he lives around here though",0,2,0
5,spam,FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for ...,4,9,0
6,ham,Even my brother is not like to speak with me. They treat me like aids patent.,0,2,0
7,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,1,6,0
8,spam,WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To c...,19,7,6
9,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with came...,13,2,4


## Clean and preprocess phrase

In [14]:
from nltk.corpus import stopwords

In [15]:
stop = stopwords.words('english')

sorig['phrase'] = sorig['phrase'].apply(lambda x:
                                        " ".join(x for x in str(x).split()
                                        if x.lower() not in stop))

sorig['phrase'].head(10)

0            Go jurong point, crazy.. Available bugis n great world la e buffet... Cine got amore wat...
1                                                                          Ok lar... Joking wif u oni...
2    Free entry 2 wkly comp win FA Cup final tkts 21st May 2005. Text FA 87121 receive entry question...
3                                                              U dun say early hor... U c already say...
4                                                                Nah think goes usf, lives around though
5        FreeMsg Hey darling 3 week's word back! I'd like fun still? Tb ok! XxX std chgs send, £1.50 rcv
6                                                    Even brother like speak me. treat like aids patent.
7    per request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' set callertune Callers. Press *9 ...
8    WINNER!! valued network customer selected receivea £900 prize reward! claim call 09061701461. Cl...
9    mobile 11 months more? U R entitled Update latest 

In [16]:
# separate symbols from words in order to keep for TFIDF
def symbol_sep(x):
    new_str = ''
    for char in x:
        if (char.isalnum() == False) & (char != ' '):
            new_str = new_str + ' ' + char + ' '
        else:
            new_str += char
                        
    return new_str

In [17]:
sorig['phrase'] = sorig['phrase'].apply(symbol_sep)

sorig.head(10)

Unnamed: 0,class,phrase,c_digits,c_symbols,c_caps
0,ham,"Go jurong point , crazy . . Available bugis n great world la e buffet . . . Cine got amore...",0,9,0
1,ham,Ok lar . . . Joking wif u oni . . .,0,6,0
2,spam,Free entry 2 wkly comp win FA Cup final tkts 21st May 2005 . Text FA 87121 receive entry questi...,25,6,0
3,ham,U dun say early hor . . . U c already say . . .,0,6,0
4,ham,"Nah think goes usf , lives around though",0,2,0
5,spam,"FreeMsg Hey darling 3 week ' s word back ! I ' d like fun still ? Tb ok ! XxX std chgs send ,...",4,9,0
6,ham,Even brother like speak me . treat like aids patent .,0,2,0
7,ham,per request ' Melle Melle ( Oru Minnaminunginte Nurungu Vettam ) ' set callertune Callers . ...,1,6,0
8,spam,WINNER ! ! valued network customer selected receivea £ 900 prize reward ! claim call 0906170...,19,7,6
9,spam,mobile 11 months more ? U R entitled Update latest colour mobiles camera Free ! Call Mobile Up...,13,2,4


In [18]:
# keep all-caps words while lowercasing the rest
def caps_sep(x):
    new_str = ''
    for word in str(x).split():
 
        if (len(word) > 2) & (str(word).isupper()):
            new_str = new_str + str(word) + ' '
        else:
            new_str = new_str + str(word).lower() + ' '
                             
    return new_str

In [19]:
sorig['phrase'] = sorig['phrase'].apply(caps_sep)

sorig['phrase'].head(10)

0    go jurong point , crazy . . available bugis n great world la e buffet . . . cine got amore wat ....
1                                                                   ok lar . . . joking wif u oni . . . 
2    free entry 2 wkly comp win fa cup final tkts 21st may 2005 . text fa 87121 receive entry questio...
3                                                       u dun say early hor . . . u c already say . . . 
4                                                              nah think goes usf , lives around though 
5    freemsg hey darling 3 week ' s word back ! i ' d like fun still ? tb ok ! xxx std chgs send , £ ...
6                                                 even brother like speak me . treat like aids patent . 
7    per request ' melle melle ( oru minnaminunginte nurungu vettam ) ' set callertune callers . pres...
8    WINNER ! ! valued network customer selected receivea £ 900 prize reward ! claim call 09061701461...
9    mobile 11 months more ? u r entitled update latest

In [20]:
# remove single letters
def remove_letters(x):
    new_str = ''
    for word in str(x).split():
 
        if (len(word) == 1) & (str(word).isalpha()):
            pass
        else:
            new_str = new_str + str(word) + ' '
                             
    return new_str

In [21]:
sorig['phrase'] = sorig['phrase'].apply(remove_letters)

sorig['phrase'].head(10)

0      go jurong point , crazy . . available bugis great world la buffet . . . cine got amore wat . . . 
1                                                                     ok lar . . . joking wif oni . . . 
2    free entry 2 wkly comp win fa cup final tkts 21st may 2005 . text fa 87121 receive entry questio...
3                                                             dun say early hor . . . already say . . . 
4                                                              nah think goes usf , lives around though 
5    freemsg hey darling 3 week ' word back ! ' like fun still ? tb ok ! xxx std chgs send , £ 1 . 50...
6                                                 even brother like speak me . treat like aids patent . 
7    per request ' melle melle ( oru minnaminunginte nurungu vettam ) ' set callertune callers . pres...
8    WINNER ! ! valued network customer selected receivea £ 900 prize reward ! claim call 09061701461...
9    mobile 11 months more ? entitled update latest col

In [22]:
word_count = pd.Series(' '.join(sorig['phrase']).split()).value_counts()

print(f'Number of unique words: {len(word_count)}')
print(f'Sum of words: {word_count.sum()}')

word_count[:20]

Number of unique words: 9441
Sum of words: 77347


.       11214
,        1980
?        1550
!        1397
'        1391
&         922
;         768
:         745
-         585
call      564
2         533
)         499
/         419
ur        391
get       382
£         329
4         327
gt        318
lt        316
*         311
dtype: int64

In [23]:
word_rare = word_count[(word_count == 1) | (word_count == 2)]

print(f'Number of rare words: {len(word_rare)}')
print(f'Sum of rare words: {word_rare.sum()}')

word_rare[:20]

Number of rare words: 6448
Sum of rare words: 8032


08718730666       2
simpler           2
actor             2
dippeditinadew    2
vth               2
muah              2
hsbc              2
wicklow           2
sunlight          2
teaching          2
nokia6650         2
07734396839       2
BOUT              2
kr                2
EURO              2
bone              2
chances           2
GREAT             2
ryan              2
08000776320       2
dtype: int64

In [24]:
sorig['phrase'] = sorig['phrase'].apply(lambda x:
                                      " ".join(x for x in str(x).split()
                                      if x not in word_rare))

sorig['phrase'].head(10)

0                           go point , crazy . . available bugis great world la . . . cine got wat . . .
1                                                                      ok lar . . . joking wif oni . . .
2    free entry 2 wkly comp win fa cup final tkts 21st may 2005 . text fa 87121 receive entry questio...
3                                                                  dun say early . . . already say . . .
4                                                               nah think goes usf , lives around though
5            freemsg hey darling 3 week ' word back ! ' like fun still ? tb ok ! xxx std send , £ 1 . 50
6                                                              even brother like speak me . treat like .
7    per request ' melle melle ( oru minnaminunginte nurungu vettam ) ' set callertune callers . pres...
8    WINNER ! ! valued network customer selected £ 900 prize reward ! claim call . claim code . valid...
9    mobile 11 months more ? entitled update latest col

## TFIDF

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
tf = TfidfVectorizer(max_features=3000, lowercase=None, analyzer='word',
                        stop_words = 'english', ngram_range=(1,1))

tfvec = tf.fit_transform(sorig['phrase'])

tfvec

<5572x2744 sparse matrix of type '<class 'numpy.float64'>'
	with 35114 stored elements in Compressed Sparse Row format>

In [27]:
first20vocab = {x: tf.vocabulary_[x] for x in list(tf.vocabulary_)[:20]}

first20vocab

{'point': 1911,
 'crazy': 861,
 'available': 544,
 'bugis': 675,
 'great': 1233,
 'world': 2694,
 'la': 1470,
 'cine': 777,
 'got': 1224,
 'wat': 2626,
 'ok': 1796,
 'lar': 1479,
 'joking': 1431,
 'wif': 2659,
 'oni': 1804,
 'free': 1151,
 'entry': 1027,
 'wkly': 2677,
 'comp': 816,
 'win': 2663}

In [28]:
rand20idf = {x: dict(zip(tf.get_feature_names(), tf.idf_))[x] for x in 
              list(dict(zip(tf.get_feature_names(), tf.idf_)))[200:220]}

rand20idf

{'ALRITE': 8.016250875135436,
 'ASAP': 7.546247245889701,
 'AUCTION': 8.239394426449646,
 'AWAITING': 8.239394426449646,
 'BABE': 6.986631457954278,
 'BABY': 8.239394426449646,
 'BACK': 7.8339293183414815,
 'BAK': 8.239394426449646,
 'BIRTHDAY': 7.679778638514223,
 'BIT': 8.016250875135436,
 'BLOOD': 8.016250875135436,
 'BMW': 8.239394426449646,
 'BONUS': 8.239394426449646,
 'BOX': 7.32310369457549,
 'BOX95QU': 8.239394426449646,
 'BOX97N7QP': 8.239394426449646,
 'BSLVYL': 6.986631457954278,
 'CALL': 6.224491405907381,
 'CAMERA': 8.016250875135436,
 'CARD': 8.239394426449646}

## Conduct ML

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack
from sklearn.metrics import classification_report

In [30]:
# add in the phrase detail columns for ML

sml = tfvec.copy()

sml = hstack((sml ,np.array(sorig['c_digits'])[:,None]))
sml = hstack((sml ,np.array(sorig['c_symbols'])[:,None]))
sml = hstack((sml ,np.array(sorig['c_caps'])[:,None]))

sml

<5572x2747 sparse matrix of type '<class 'numpy.float64'>'
	with 42557 stored elements in COOrdinate format>

In [31]:
(x_train, x_test, y_train, y_test) = train_test_split(sml, sorig['class'], test_size=0.25)

print(f'x_train: {x_train.shape}\ny_train: {y_train.shape}')
print(f'x_test: {x_test.shape}\ny_test: {y_test.shape}')

x_train: (4179, 2747)
y_train: (4179,)
x_test: (1393, 2747)
y_test: (1393,)


In [32]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    logreg = LogisticRegression(C=c)
    logreg.fit(x_train, y_train)
    print (f'TFIDF with Logistic Regression Accuracy: C={c} {accuracy_score(y_test, logreg.predict(x_test))}')

TFIDF with Logistic Regression Accuracy: C=0.01 0.9662598707824839
TFIDF with Logistic Regression Accuracy: C=0.05 0.9662598707824839
TFIDF with Logistic Regression Accuracy: C=0.25 0.968413496051687
TFIDF with Logistic Regression Accuracy: C=0.5 0.9705671213208902
TFIDF with Logistic Regression Accuracy: C=1 0.9755922469490309


In [33]:
logreg = LogisticRegression(C=1)
logregmodel = logreg.fit(x_train, y_train)

print ('Logistic regression report:')
print (classification_report(y_test, logregmodel.predict(x_test)))

Logistic regression report:
             precision    recall  f1-score   support

        ham       0.97      1.00      0.99      1204
       spam       0.98      0.84      0.90       189

avg / total       0.98      0.98      0.97      1393



In [34]:
mnb = MultinomialNB()
mnbmodel = mnb.fit(x_train, y_train)

print (f'TFIDF with Naive Bayes Accuracy: {accuracy_score(y_test, mnbmodel.predict(x_test))}')

TFIDF with Naive Bayes Accuracy: 0.955491744436468


In [35]:
print ('Naive Bayes report:')
print (classification_report(y_test, mnbmodel.predict(x_test)))

Naive Bayes report:
             precision    recall  f1-score   support

        ham       0.98      0.97      0.97      1204
       spam       0.82      0.87      0.84       189

avg / total       0.96      0.96      0.96      1393



In [36]:
# Looks like Logistic Regression C=1 is the best predicting model
# Below are some predictions to see the model in action

test_phrase = ['hey Dave, how are you and the fam?',
               'tired of your phone bill? Send us a text at 4012221234',
               'YO you up bro? Been textin you since 10, let me know wen u up!',
               'ONCE IN A LIFE OPPORTUNITY!!! Contact us at team@makemoneyquick.com!!!',
               'CONGRATULATIONS! You are the lucky winner of £100,000']

test_transform = tf.transform(test_phrase)

test_transform = hstack((test_transform ,np.array([0,10,2,1,6])[:,None])) #c_digits
test_transform = hstack((test_transform ,np.array([1,1,2,7,2])[:,None])) #c_symbols
test_transform = hstack((test_transform ,np.array([0,0,0,19,15])[:,None])) #c_caps

In [37]:
test_pred = logregmodel.predict(test_transform)

for name, pred in zip(test_phrase, test_pred):
    print(f'Prediction: {pred.capitalize()}   Phrase: {name}')
    
# we see a misprediction with 4
# May be due to the lack of data cleaning for test cases

Prediction: Ham   Phrase: hey Dave, how are you and the fam?
Prediction: Spam   Phrase: tired of your phone bill? Send us a text at 4012221234
Prediction: Ham   Phrase: YO you up bro? Been textin you since 10, let me know wen u up!
Prediction: Ham   Phrase: ONCE IN A LIFE OPPORTUNITY!!! Contact us at team@makemoneyquick.com!!!
Prediction: Spam   Phrase: CONGRATULATIONS! You are the lucky winner of £100,000


In [38]:
# Part 01 complete!