Dataset link: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset


In [None]:
import numpy as np
import pandas as pd

In [None]:
import chardet

with open('sms spam detection.csv', 'rb') as f:
    result = chardet.detect(f.read())
    encoding = result['encoding']

df = pd.read_csv('sms spam detection.csv', encoding=encoding)
df.head(10)


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [None]:
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5518
Unnamed: 3    5556
Unnamed: 4    5562
dtype: int64

In [None]:
df.drop(columns = ['Unnamed: 2','Unnamed: 3','Unnamed: 4'], inplace = True)

In [None]:
df.duplicated().sum()

403

In [None]:
df.drop_duplicates(inplace = True)

In [None]:
df.duplicated().sum()

0

In [None]:
df.shape

(5165, 2)

In [None]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['v1'] = le.fit_transform(df['v1'])
df.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
#Rename columns
df.rename(columns = {'v1':'target','v2':'text'}, inplace = True)

In [None]:
df.head(2)

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...


Exploratory Data Analysis

In [None]:
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
df['#char'] = df['text'].apply(len)

In [None]:
df.head()

Unnamed: 0,target,text,#char
0,0,"Go until jurong point, crazy.. Available only ...",111
1,0,Ok lar... Joking wif u oni...,29
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,0,U dun say so early hor... U c already then say...,49
4,0,"Nah I don't think he goes to usf, he lives aro...",61


In [None]:
df['#words'] = df['text'].apply(lambda x: len(word_tokenize(x)))

In [None]:
df.head(2)

Unnamed: 0,target,text,#char,#words
0,0,"Go until jurong point, crazy.. Available only ...",111,24
1,0,Ok lar... Joking wif u oni...,29,8


In [None]:
df['#sentence'] = df['text'].apply(lambda x: len(sent_tokenize(x)))

In [None]:
df.head(3)

Unnamed: 0,target,text,#char,#words,#sentence
0,0,"Go until jurong point, crazy.. Available only ...",111,24,2
1,0,Ok lar... Joking wif u oni...,29,8,2
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,37,2


In [None]:
df[['#char','#words','#sentence']].describe()

Unnamed: 0,#char,#words,#sentence
count,5165.0,5165.0,5165.0
mean,78.996515,18.461955,1.965731
std,58.240593,13.326733,1.448906
min,2.0,1.0,1.0
25%,36.0,9.0,1.0
50%,60.0,15.0,1.0
75%,117.0,26.0,2.0
max,910.0,220.0,38.0


In [None]:
df1 = df.copy()

Text Preprocessing

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')

def data_preprocessing(text):
  #lower casing
  text = text.lower()

  #remove punctuations
  text = re.sub(r'[^\w\s]', '', text)

  #remove stopwords
  new_text = []
  for word in text.split():
    if word in stopwords.words('english'):
      new_text.append('')
    else:
      new_text.append(word)
  x=new_text[:]
  new_text.clear
  text =  ' '.join(x)

  #Tokenize
  text = word_tokenize(text)

  #Stemming
  new_list = []
  ps = PorterStemmer()
  for word in text:
    new_list.append(ps.stem(word))
  text = " ".join(new_list)

  return text

#data_preprocessing("Hi! I am , ashish Malhotra? They will feel the music in a bit. Musician is good. We will have to go there walking. You shouldnt run")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
df1['transformed_text'] = df1['text'].apply(data_preprocessing)

In [None]:
df1.head(5)

Unnamed: 0,target,text,#char,#words,#sentence,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",111,24,2,go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,29,8,2,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,37,2,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,49,13,1,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",61,15,1,nah dont think goe usf live around though


**Vecotrization:Bag of Words**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Initializing CountVectorizer with binary=False for actual word counts
vectorizer = CountVectorizer(binary=False)

X_sparse = vectorizer.fit_transform(df1['transformed_text'])
X = X_sparse.toarray()
y = df1.target
print(X[1])
print(vectorizer.vocabulary_)
print(X.shape)

[0 0 0 ... 0 0 0]
{'go': 3256, 'jurong': 4037, 'point': 5510, 'crazi': 2187, 'avail': 1286, 'bugi': 1687, 'great': 3345, 'world': 7753, 'la': 4178, 'buffet': 1685, 'cine': 1967, 'got': 3308, 'amor': 1097, 'wat': 7549, 'ok': 5142, 'lar': 4213, 'joke': 4003, 'wif': 7666, 'oni': 5172, 'free': 3076, 'entri': 2734, 'wkli': 7712, 'comp': 2063, 'win': 7679, 'fa': 2853, 'cup': 2233, 'final': 2954, 'tkt': 7074, '21st': 429, 'may': 4570, '2005': 416, 'text': 6934, '87121': 822, 'receiv': 5831, 'questionstd': 5744, 'txt': 7251, 'ratetc': 5790, 'appli': 1169, '08452810075over18': 70, 'dun': 2611, 'say': 6099, 'earli': 2627, 'hor': 3628, 'alreadi': 1070, 'nah': 4857, 'dont': 2533, 'think': 7003, 'goe': 3268, 'usf': 7377, 'live': 4331, 'around': 1208, 'though': 7019, 'freemsg': 3084, 'hey': 3537, 'darl': 2288, 'week': 7586, 'word': 7742, 'back': 1330, 'id': 3731, 'like': 4300, 'fun': 3136, 'still': 6641, 'tb': 6876, 'xxx': 7870, 'std': 6618, 'chg': 1916, 'send': 6170, 'å150': 8005, 'rcv': 5801, 'eve

**Tfidf**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(binary = False)
X_sparse = vectorizer.fit_transform(df1['transformed_text'])
X = X_sparse.toarray()
print(X[1])
print(vectorizer.vocabulary_)
print(X.shape)
y = df1.target

[0. 0. 0. ... 0. 0. 0.]
{'go': 3256, 'jurong': 4037, 'point': 5510, 'crazi': 2187, 'avail': 1286, 'bugi': 1687, 'great': 3345, 'world': 7753, 'la': 4178, 'buffet': 1685, 'cine': 1967, 'got': 3308, 'amor': 1097, 'wat': 7549, 'ok': 5142, 'lar': 4213, 'joke': 4003, 'wif': 7666, 'oni': 5172, 'free': 3076, 'entri': 2734, 'wkli': 7712, 'comp': 2063, 'win': 7679, 'fa': 2853, 'cup': 2233, 'final': 2954, 'tkt': 7074, '21st': 429, 'may': 4570, '2005': 416, 'text': 6934, '87121': 822, 'receiv': 5831, 'questionstd': 5744, 'txt': 7251, 'ratetc': 5790, 'appli': 1169, '08452810075over18': 70, 'dun': 2611, 'say': 6099, 'earli': 2627, 'hor': 3628, 'alreadi': 1070, 'nah': 4857, 'dont': 2533, 'think': 7003, 'goe': 3268, 'usf': 7377, 'live': 4331, 'around': 1208, 'though': 7019, 'freemsg': 3084, 'hey': 3537, 'darl': 2288, 'week': 7586, 'word': 7742, 'back': 1330, 'id': 3731, 'like': 4300, 'fun': 3136, 'still': 6641, 'tb': 6876, 'xxx': 7870, 'std': 6618, 'chg': 1916, 'send': 6170, 'å150': 8005, 'rcv': 5801

**Model Building**

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size =0.2, random_state = 42)

In [None]:
def visuals(model,y_pred):
  accuracy = accuracy_score(y_test,y_pred)
  cm = confusion_matrix(y_test,y_pred)
  ps  = precision_score(y_test,y_pred)
  print(f"********{model}***********")
  print("accuracy")
  print(accuracy)
  print("Confusion Matrix")
  print(cm)
  print("Precision Score")
  print(ps)

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score

g_model = GaussianNB()
g_model.fit(X_train,y_train)
y_pred = g_model.predict(X_test)
visuals('gaussianNB',y_pred)


********gaussianNB***********
accuracy
0.8760890609874153
Confusion Matrix
[[782 114]
 [ 14 123]]
Precision Score
0.5189873417721519


In [None]:
m_model = MultinomialNB()
m_model.fit(X_train,y_train)
y_pred = m_model.predict(X_test)
visuals('Multinomial',y_pred)

********Multinomial***********
accuracy
0.9699903194578896
Confusion Matrix
[[896   0]
 [ 31 106]]
Precision Score
1.0


In [None]:
b_model = BernoulliNB()
b_model.fit(X_train,y_train)
y_pred = b_model.predict(X_test)
visuals('Bernoulli',y_pred)

********Bernoulli***********
accuracy
0.978702807357212
Confusion Matrix
[[891   5]
 [ 17 120]]
Precision Score
0.96
