# SPAM CLASSIFICATION

## 1. Import libraries

In [1]:
import pandas as pd #for creating datasets
import numpy as np

import re #regular expression for removing non-alphabetical character from dataset

import nltk #library for NLP

from nltk.corpus import stopwords #set of meaningless words we'll remove from dataset

from nltk.stem import PorterStemmer #for stemming 

from sklearn.feature_extraction.text import CountVectorizer #for creating bag of words model

from sklearn.model_selection import train_test_split #to split the training and testing data

from sklearn.naive_bayes import MultinomialNB #using classification model for prediction

from sklearn.metrics import confusion_matrix #to calculate accuracy
from sklearn.metrics import accuracy_score

##  2. Importing dataset

In [2]:
#dataset of uci machine lerning website

messages=pd.read_csv('SMSSpamCollection',sep='\t',names=["label","message"])

In [3]:
print(messages.count);

<bound method DataFrame.count of      label                                            message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham               Will ü b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]>


## 3. Data Cleaning and Preprocessing

In [4]:
ps=PorterStemmer(); #stemming object creation

corpus=[];

for i in range(0,len(messages)):
    review=re.sub('[^a-zA-Z]',' ',messages['message'][i])#remove non-alphabetical characters
    review=review.lower()#lower the sentence
    review=review.split()#convert sentence into list of words
    review=[ps.stem(word) for word in review if not word in set(stopwords.words('english'))]#remove stopwords
    review=' '.join(review)
    corpus.append(review)

In [5]:
print(corpus)

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat', 'ok lar joke wif u oni', 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli', 'u dun say earli hor u c alreadi say', 'nah think goe usf live around though', 'freemsg hey darl week word back like fun still tb ok xxx std chg send rcv', 'even brother like speak treat like aid patent', 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun', 'winner valu network custom select receivea prize reward claim call claim code kl valid hour', 'mobil month u r entitl updat latest colour mobil camera free call mobil updat co free', 'gonna home soon want talk stuff anymor tonight k cri enough today', 'six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info', 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw', 'search right word thank breather promis wont 

## 4. Creating bag of words model

In [6]:
cv=CountVectorizer(max_features=5000) #object for creating bag of words

In [7]:
x=cv.fit_transform(corpus).toarray();

In [8]:
y=pd.get_dummies(messages['label']) #encoding span/han with 0 and 1
print(y)

      ham  spam
0       1     0
1       1     0
2       0     1
3       1     0
4       1     0
...   ...   ...
5567    0     1
5568    1     0
5569    1     0
5570    1     0
5571    1     0

[5572 rows x 2 columns]


In [9]:
#we select only the spam column i.e. 0 means ham and 1 means spam

y=y.iloc[:,1].values

In [10]:
print(y)

[0 0 1 ... 0 0 0]


## 5. Train Test Split

In [11]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20,random_state=0)

## 6. Using Naive Bayes Classifier

In [12]:
spam_detect_model = MultinomialNB().fit(x_train,y_train) #training the model

In [13]:
y_pred=spam_detect_model.predict(x_test) #predicted data

In [14]:
print(y_pred)

[0 1 0 ... 0 1 0]


## 7. Using confusion martix to calculate accuracy

In [15]:
confusion_m=confusion_matrix(y_test,y_pred)

In [16]:
print(confusion_m)

[[946   9]
 [  8 152]]


In [17]:
accuracy=accuracy_score(y_test,y_pred)

In [18]:
print(accuracy)

0.9847533632286996


### We got this accuracy using stemming and bag of words model
### Now we shall use lemmetization and TFIDF model

## 1. Importing Libraries

In [19]:
from nltk.stem import WordNetLemmatizer #for lemmatization
from sklearn.feature_extraction.text import TfidfVectorizer #for creating TFIDF model

## 2. Cleaning Data and Preprocessing

In [20]:
lemmatizer = WordNetLemmatizer(); #lemmatizer object creation

corpus=[];

for i in range(0,len(messages)):
    review=re.sub('[^a-zA-Z]',' ',messages['message'][i])#remove non-alphabetical characters
    review=review.lower()#lower the sentence
    review=review.split()#convert sentence into list of words
    review=[lemmatizer.lemmatize(word) for word in review if not word in set(stopwords.words('english'))]#remove stopwords
    review=' '.join(review)
    corpus.append(review)

In [21]:
print(corpus)



## 3. Creating TFIDF model

In [22]:
tf=TfidfVectorizer() #tfidf object

In [23]:
X=tf.fit_transform(corpus).toarray();

In [24]:
Y=pd.get_dummies(messages['label']) #encoding span/han with 0 and 1
print(y)

[0 0 1 ... 0 0 0]


In [25]:
#we select only the spam column i.e. 0 means ham and 1 means spam

Y=Y.iloc[:,1].values

In [26]:
print(Y)

[0 0 1 ... 0 0 0]


## 4. Train Test Split

In [27]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.20,random_state=0)

## 5. Using Naive Bayes Classifier

In [28]:
spam_detect_model = MultinomialNB().fit(x_train,y_train) #training the model

In [29]:
y_pred=spam_detect_model.predict(x_test) #predicted data

In [30]:
print(y_pred)

[0 0 0 ... 0 1 0]


## 6. Using confusion matrix to calculate accuracy

In [31]:
confusion_m=confusion_matrix(y_test,y_pred)

In [32]:
print(confusion_m)

[[955   0]
 [ 31 129]]


In [33]:
accuracy=accuracy_score(y_test,y_pred)

In [34]:
print(accuracy)

0.9721973094170404


# CONCLUSION

## We got a better accuracy using stemming and bag of words model