# NLP Spam Detection
### Note: Credit for this work goes to Simplilearn; you can visit their site:
https://www.simplilearn.com/

In [2]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.6.5-py3-none-any.whl (1.5 MB)
Collecting regex>=2021.8.3
  Downloading regex-2021.11.10-cp39-cp39-win_amd64.whl (273 kB)
Collecting click
  Downloading click-8.0.3-py3-none-any.whl (97 kB)
Installing collected packages: regex, click, nltk
Successfully installed click-8.0.3 nltk-3.6.5 regex-2021.11.10




In [3]:
import pandas as pd 
import string
from nltk.corpus import stopwords

In [4]:
# Load the tab-separated spam collection; column names are supplied explicitly
df = pd.read_csv(
    'SpamCollection',
    sep='\t',
    names=['response', 'message'],
)

In [5]:
# Preview the first five rows to confirm both columns were parsed
df.head(n=5)

Unnamed: 0,response,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Summary per column: row count, number of unique values, most frequent value and its frequency
df.describe()

Unnamed: 0,response,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [7]:
# Summarise the data separately for each value of 'response'
(
    df
    .groupby('response')
    .describe()
)

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
response,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [8]:
# Add the character count of each message as a new 'length' column.
# The vectorised .str accessor replaces the per-row Python call of .apply(len)
# and yields NaN for a missing message instead of raising TypeError.
df['length'] = df['message'].str.len()

In [9]:
# Preview the first five rows again, now including the 'length' column
df.head(n=5)

Unnamed: 0,response,message,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [14]:
# Make sure the NLTK stopword corpus is available locally.
# Downloading only when the lookup fails keeps this cell re-runnable offline.
import nltk

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\y\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [15]:
#define a function to get rid of punctuation and stopwords present in the messages
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, reading the corpus only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def message_text_process(mess, stop_words=None):
    """Split a message into tokens, dropping punctuation and stopwords.

    Parameters
    ----------
    mess : str
        Raw message text.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, NLTK's English
        stopword list is used (loaded once and cached).

    Returns
    -------
    list of str
        Whitespace-separated tokens, in their original casing, whose
        lower-cased form is not a stopword.
    """
    if stop_words is None:
        # Previously the stopword list was re-read and scanned for every word
        stop_words = _english_stopwords()
    # drop punctuation characters and re-form the sentence
    no_punctuation = ''.join(char for char in mess if char not in string.punctuation)
    # eliminate stopwords; the comparison is case-insensitive, the output keeps original case
    return [word for word in no_punctuation.split() if word.lower() not in stop_words]

In [16]:
# Sanity-check the tokeniser on the first five messages
(
    df['message']
    .head()
    .apply(message_text_process)
)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: message, dtype: object

In [17]:
# Text processing starts with a bag-of-words vectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
# Fit a bag-of-words vectorizer on all messages, with message_text_process as the analyzer
bag_of_words_transformer = CountVectorizer(
    analyzer=message_text_process,
).fit(df['message'])

In [20]:
# Print how many entries the fitted vectorizer stores in its vocabulary_ attribute
print(len(bag_of_words_transformer.vocabulary_))

11425


In [21]:
# Transform every message with the fitted bag-of-words vectorizer
message_bagofwords = bag_of_words_transformer.transform(df['message'])

In [22]:
# Fit a TF-IDF transformer on the already-transformed bag-of-words matrix
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer().fit(message_bagofwords)

In [24]:
# Apply the fitted TF-IDF transformer to the bag-of-words matrix, then print the result's shape
message_tfidf = tfidf_transformer.transform(message_bagofwords)
print(message_tfidf.shape)

(5572, 11425)


In [26]:
# Choose a multinomial naive Bayes model to detect spam and fit it on the TF-IDF data
# NOTE(review): the model is fitted on every row of df, so a prediction for a message
# taken from df is a check on training data, not a held-out evaluation.
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(message_tfidf, df['response'])

In [38]:
# Compare the predicted and expected label for a single message.
# Index 4 is message #5; set message_index to 1 to check message #2 instead.
# Keeping the index in one variable guarantees both prints refer to the same row.
message_index = 4
message = df.loc[message_index, 'message']
bag_of_words_message = bag_of_words_transformer.transform([message])
tfidf = tfidf_transformer.transform(bag_of_words_message)

print('Predicted:', spam_detect_model.predict(tfidf)[0])
print('Expected:', df.loc[message_index, 'response'])

Predicted: ham
Expected: ham


### Thanks for stopping by <3