<a href="https://colab.research.google.com/github/sumair-7/SMS-Spam-Prediction/blob/master/SMS_Spam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SMS Spam Prediction using Machine Learning and Deep Learning

In [23]:
#import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [24]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [25]:
#loading the data
df = pd.read_csv('gdrive/My Drive/Spam/spam.csv', engine='python')
df.rename(columns={'v1':'Spam', 'v2':'SMS'}, inplace=True)

In [26]:
#Check whether there are any null values
df.isna().sum()

Spam             0
SMS              0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [27]:
df.describe()

Unnamed: 0,Spam,SMS,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


Note:
    
    There are 5572 records out of which 5169 are unique

In [28]:
#Storing all the stop words in a variable
stopwords_list = stopwords.words('english') 

In [29]:
#Initializing PorterStemmer and WordNetLemmatizer
stemmer = PorterStemmer()
Lemmatizer = WordNetLemmatizer()

In [30]:
#All the processed sentences will be stored in corpus list
corpus = []
for i in range(df.shape[0]):
    sent = re.sub('[^a-zA-Z]',' ',df['SMS'][i]) #Removes all Special Characters and punctuations
    sent = re.sub('\s+[a-zA-Z]\s+',' ',sent) #Removes all single characters
    sent = re.sub('\s+',' ',sent) #Replace multiple spaces with a space
    sent = sent.lower() #Convert all words into lower case
    words = nltk.word_tokenize(sent) #Tokenization of sentence into words
    words = [Lemmatizer.lemmatize(word) for word in words if word not in stopwords_list] #Applying Lemmatization
    processed_sent = ' '.join(words)
    corpus.append(processed_sent)
    

In [31]:
#Top 10 processed rows in corpus
corpus[:10]

['go jurong point crazy available bugis great world la buffet cine got amore wat',
 'ok lar joking wif oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor c already say',
 'nah think go usf life around though',
 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea prize reward claim call claim code kl valid hour',
 'mobile month r entitled update latest colour mobile camera free call mobile update co free']

In [32]:
#Imputing Spam with 1 and Ham with 0
sms_dict = {'ham': 0, 'spam': 1}
df['Spam'] = df['Spam'].map(sms_dict)

In [33]:
#Apply Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features= 5000)
X = cv.fit_transform(corpus).toarray()

In [34]:
y = df[['Spam']]

In [35]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size = 0.25,random_state = 0)

### NAIVE BAYES

In [36]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [37]:
mnb.score(x_test,y_test)

0.9820531227566404

In [38]:
from sklearn.model_selection import cross_val_score, KFold
kfold = KFold(n_splits = 5)

In [39]:
cross_validation = cross_val_score(mnb,x_train,y_train,scoring = 'accuracy',cv = kfold )

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [40]:
print('Cross Validation Score is: {}'.format(cross_validation.mean()))

Cross Validation Score is: 0.9806188579778243


### ANN MODEL

With our data loaded and preprocessed, we’re now well prepared to use neural network architecture to classify the text message.

In [41]:
import tensorflow as tf

In [42]:
from tensorflow import keras
from tensorflow.keras.layers import Dense,ReLU
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD

In [43]:
#Initialize Model
model = Sequential()

#Input Layer
model.add(Dense(7,activation = 'relu',kernel_initializer = 'uniform', input_dim = 5000))

#Hidden Layer
model.add(Dense(9,activation = 'relu',kernel_initializer = 'uniform'))

#Output Layer
model.add(Dense(1,activation = 'sigmoid',kernel_initializer = 'uniform'))


In [44]:
#Compilation
opt = SGD(learning_rate = 0.1,momentum = 0.5)
model.compile(optimizer = opt, loss = 'binary_crossentropy', metrics = ['accuracy'])

In [45]:
#Accuracy on Train Data
model.fit(x_train,y_train,batch_size = 4,epochs = 5,validation_data = (x_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f755c30aed0>

In [46]:
#Accuracy on Test data
model.evaluate(x_test,y_test,batch_size = 4)



[0.07321330159902573, 0.9842067360877991]