## Reading Dataset

In [1]:
import pandas as pd

In [2]:
message_df = pd.read_csv("dataset/SMSSpamCollection",sep='\t',names=["label","message"])

In [3]:
message_df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Data cleaning & pre-processing

In [4]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer

In [24]:
ps = PorterStemmer()
corpus = []
nltk.sent_tokenize
for idx in range(len(message_df)):
    review = re.sub("^[A-Za-z]"," ",message_df['message'][idx])
    review = review.lower()
    review = review.strip()
    
    review = [ps.stem(word) for word in nltk.word_tokenize(review) if not word in stopwords.words("english")]
    
    review = " ".join(review)
    corpus.append(review)

In [25]:
print(corpus)

['jurong point , crazi .. avail bugi n great world la e buffet ... cine got amor wat ...', 'k lar ... joke wif u oni ...', "ree entri 2 wkli comp win fa cup final tkt 21st may 2005. text fa 87121 receiv entri question ( std txt rate ) & c 's appli 08452810075over18 's", 'dun say earli hor ... u c alreadi say ...', "ah n't think goe usf , live around though", "reemsg hey darl 's 3 week 's word back ! 'd like fun still ? tb ok ! xxx std chg send , £1.50 rcv", 'ven brother like speak . treat like aid patent .', "per request 'mell mell ( oru minnaminungint nurungu vettam ) ' set callertun caller . press * 9 copi friend callertun", 'inner ! ! valu network custom select receivea £900 prize reward ! claim call 09061701461. claim code kl341 . valid 12 hour .', 'ad mobil 11 month ? u r entitl updat latest colour mobil camera free ! call mobil updat co free 08002986030', "'m gon na home soon n't want talk stuff anymor tonight , k ? 've cri enough today .", 'ix chanc win cash ! 100 20,000 pound t

## Bag Of Words

In [26]:
## CREATING BAG OF WORDS
from sklearn.feature_extraction.text import CountVectorizer

## SELECT ONLY 5000 most frequent words
cv = CountVectorizer(max_features=5000)
X = cv.fit_transform(corpus).toarray()

In [27]:
y = pd.get_dummies(message_df['label'])
y = y.iloc[:,1].values

## Train Test Split

In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

## Training the model

In [None]:
# Training model using Naive bayes classifier
from sklearn.naive_bayes import MultinomialNB

spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train,y_train)

y_pred = spam_detect_model.predict(X_test)

## Model Evaluation

In [31]:
from sklearn.metrics import confusion_matrix,accuracy_score

confusion_m = confusion_matrix(y_test,y_pred)
accuracy = accuracy_score(y_test,y_pred)

In [32]:
confusion_m

array([[944,  11],
       [  6, 154]], dtype=int64)

In [33]:
accuracy

0.9847533632286996