# **SMS Spam Detector**

In [1]:
import pandas as pd

In [2]:
# Read the tab-separated SMS Spam Collection file. No header row is read from the
# file, so the two columns are named explicitly: 'label' (spam/ham) and 'message'
# (the SMS text).
column_names = ['label', 'message']
spam_dataset = pd.read_csv('SMSSpamCollection', sep='\t', names=column_names)

In [3]:
# Preview the first five rows to confirm the columns were parsed as expected
spam_dataset.head(n=5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# `re` provides the regex substitution used to clean the messages; `nltk` provides
# the stopword list and the stemmer used during preprocessing.
# nltk.download fetches the 'stopwords' corpus so nltk.corpus.stopwords can be read
# later; its return value is what this cell displays.
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
# Importing stopwords from nltk and PorterStemmer for stemming
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [6]:
# A single PorterStemmer instance, reused for every token in the preprocessing step
port_stem = PorterStemmer()

In [7]:
# Holds one cleaned, stemmed string per SMS; filled by the preprocessing cell below
corpus = []

## **Cleaning and preprocessing the messages**

In [9]:
# Load the English stopwords once, as a set. Calling stopwords.words('english')
# inside the comprehension rebuilds the whole list for every single token and then
# scans it linearly, which dominates the runtime of this cell.
english_stopwords = set(stopwords.words('english'))


def clean_message(message):
  """Normalise one SMS into a space-separated string of stemmed tokens.

  Steps: replace every non-letter character with a space, lowercase, split on
  whitespace, drop English stopwords, and Porter-stem the remaining tokens.
  """
  letters_only = re.sub('[^a-zA-Z]', ' ', message)
  tokens = letters_only.lower().split()
  return ' '.join(
    port_stem.stem(token) for token in tokens if token not in english_stopwords
  )


# Rebuild the list from scratch (rather than appending to an existing one) so that
# re-running this cell never duplicates messages. Iterating over the column itself
# also avoids relying on the DataFrame having a default 0..n-1 index.
corpus = [clean_message(message) for message in spam_dataset['message']]

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoding: keep at most the 3500 most frequent terms in the corpus
# and count how often each one occurs in every message.
# NOTE(review): the vocabulary is fitted on the whole corpus before the train/test
# split further down, so the test messages influence which terms are kept.
cv = CountVectorizer(max_features=3500)
term_counts = cv.fit_transform(corpus)  # sparse document-term matrix
X = term_counts.toarray()               # dense array used by the later cells

In [13]:
# Peek at the dense document-term count matrix (NumPy elides the middle rows/columns)
print(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


**Label encoding:** spam → 1 (`True`), ham → 0 (`False`)

In [15]:
# Encode the target as a boolean vector: True (1) for 'spam', False (0) for 'ham'.
# Comparing against the label directly avoids picking a dummy column by position,
# which silently depends on the alphabetical order of the labels and whose dtype
# differs between pandas versions.
y = (spam_dataset['label'] == 'spam').to_numpy()

In [16]:
# Peek at the target vector (True marks a spam message)
print(y)

[False False  True ... False False False]


## **Splitting the dataset into training and testing sets**

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [18]:
# Shapes of the full, training and test feature matrices: (messages, vocabulary size)
print(X.shape, X_train.shape, X_test.shape)

(5572, 3500) (4457, 3500) (1115, 3500)


In [19]:
# Shapes of the full, training and test label vectors; each length should equal the
# row count of the corresponding feature matrix
print(y.shape, y_train.shape, y_test.shape)

(5572,) (4457,) (1115,)


## **Model Training**

In [20]:
from sklearn.naive_bayes import MultinomialNB

# Fit a Multinomial Naive Bayes classifier on the training split. fit() returns the
# estimator itself, so `model` is the fitted classifier.
classifier = MultinomialNB()
model = classifier.fit(X_train, y_train)

## **Model Evaluation**

In [21]:
# Predict a label for every message in the held-out test set
y_pred = model.predict(X_test)

In [22]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the true test labels against the model's predictions
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [23]:
# Rows are the true labels and columns the predicted labels, ordered [ham, spam]:
# [[ham kept as ham,   ham flagged as spam],
#  [spam missed as ham, spam caught as spam]]
print(cm)

[[961   7]
 [  5 142]]


In [24]:
from sklearn.metrics import accuracy_score

# Share of test messages whose predicted label equals the true label
data_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [25]:
# Fraction of test messages classified correctly (1.0 would be a perfect score)
print(data_accuracy)

0.989237668161435
