In [4]:
import pandas as pd
import string
from nltk.corpus import stopwords

In [9]:
# Load the tab-separated spam collection; column names are supplied explicitly via `names`.
# The path is a raw string so the Windows backslashes stay literal: '\D' and '\S' are
# invalid escape sequences in a normal string and raise a SyntaxWarning on recent Python.
df = pd.read_csv(r'I:\DataScience\SpamCollection', sep='\t', names=['response', 'message'])

In [10]:
# preview the first rows of the loaded frame
df.head()

Unnamed: 0,response,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# summary statistics (count, unique, top, freq) for each column
df.describe()

Unnamed: 0,response,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [15]:
# summarize the messages separately for each value of 'response' (ham / spam)
df.groupby('response').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
response,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [16]:
# Add the character length of each message as a new 'length' column.
# The vectorized .str.len() accessor replaces the row-by-row apply(len); a missing
# message yields NaN here instead of raising a TypeError.
df['length'] = df['message'].str.len()

In [17]:
# confirm the 'length' column is now present
df.head()

Unnamed: 0,response,message,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [21]:
#define a function to get rid of punctuation and stopwords present in the messages
def remove_stop_words(message):
    """Tokenize a message, dropping punctuation and English stop words.

    Parameters
    ----------
    message : str
        Raw message text.

    Returns
    -------
    list of str
        The whitespace-separated words of the message after every
        ``string.punctuation`` character has been removed, excluding any
        word whose lowercase form is in ``stopwords.words('english')``.
        The kept words retain their original casing.
    """
    # rebuild the text without punctuation characters
    no_punc = ''.join(char for char in message if char not in string.punctuation)
    # fetch the stop-word list once per call and keep it in a set, instead of
    # calling stopwords.words('english') and scanning a list for every word
    english_stop_words = set(stopwords.words('english'))
    # eliminate stop words such as me, myself, we, yours
    return [word for word in no_punc.split() if word.lower() not in english_stop_words]

In [24]:
# sanity-check the function on the first five messages
df['message'].head(5).apply(remove_stop_words)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: message, dtype: object

In [25]:
#start text processing with vectorizer 
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
# build the bag-of-words vocabulary: fit a CountVectorizer on all messages,
# passing remove_stop_words as the analyzer applied to each message
bag_of_words = CountVectorizer(analyzer=remove_stop_words).fit(df['message'])

In [29]:
# print the number of entries in the fitted vectorizer's vocabulary_ attribute
print(len(bag_of_words.vocabulary_))

11425


In [30]:
# transform every message into its bag-of-words representation with the fitted vectorizer
# (no TF-IDF weighting is applied in this cell)
message_bag_of_words = bag_of_words.transform(df['message'])

In [31]:
# fit a TF-IDF transformer on the bag-of-words matrix
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(message_bag_of_words)

In [36]:
# apply the fitted TF-IDF transformer to the bag-of-words matrix and print the result
# NOTE: this prints the matrix entries themselves, not the shape; use message_tfidf.shape for that
message_tfidf = tfidf_transformer.transform(message_bag_of_words)
print(message_tfidf)

  (0, 11163)	0.23026685592418913
  (0, 10965)	0.19073428545061483
  (0, 8917)	0.24704652376837993
  (0, 8336)	0.17046869292195632
  (0, 7668)	0.26403384065473806
  (0, 7555)	0.31253856260694546
  (0, 6937)	0.1834692413608692
  (0, 6906)	0.15158474664662352
  (0, 6217)	0.18915557732842803
  (0, 5769)	0.24984711892976424
  (0, 5218)	0.26870593862526665
  (0, 5217)	0.29835184088197164
  (0, 4653)	0.31253856260694546
  (0, 2060)	0.24203960256420656
  (0, 1483)	0.31253856260694546
  (0, 1110)	0.2882862016308418
  (1, 11072)	0.40061560982443056
  (1, 10698)	0.2063637481323008
  (1, 8590)	0.5043405901305854
  (1, 7701)	0.3767401070812794
  (1, 3064)	0.2911995411244838
  (1, 2451)	0.561988811929381
  (2, 11123)	0.19104387220509106
  (2, 11084)	0.15898145347176754
  (2, 10686)	0.13995540820792943
  :	:
  (5568, 6882)	0.31367469776242124
  (5568, 6691)	0.47781076401785183
  (5568, 6354)	0.5575721048646767
  (5568, 4880)	0.3853122086093004
  (5569, 10199)	0.520467167163554
  (5569, 8252)	0.432829

In [41]:
# choose a multinomial naive Bayes model to detect spam and fit it on the TF-IDF features
# NOTE(review): the model is fit on every row of df; no hold-out split is made in this cell
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(message_tfidf, df['response'])

In [48]:
# pick the message at index label 2 and run it through the same pipeline
# (fitted vectorizer, then fitted TF-IDF transformer) used for training
message = df.loc[2, 'message']
bagofword = bag_of_words.transform([message])
tfidf = tfidf_transformer.transform(bagofword)

In [49]:
# show the model's prediction for the sample message next to its recorded label
print('predicted', spam_detect_model.predict(tfidf)[0])
print('expected', df.loc[2, 'response'])

predicted spam
expected spam
