# Spam SMS Detection

In [1]:
import pandas as pd

In [2]:
# Load the raw SMS dataset from disk and preview its first five rows.
sms_path = "./data/spam.csv"
df = pd.read_csv(sms_path)
df.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


## Data preprocessing

### Remove the unwanted columns

In [3]:
# The three trailing "Unnamed" columns are empty in the preview, so remove them.
# `columns=` already identifies the axis, so no `axis` argument is needed, and
# reassigning the result avoids mutating the loaded frame in place.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Rename the columns appropriately

In [4]:
# Give the two remaining columns descriptive names.
column_names = {"v1": "label", "v2": "messages"}
df = df.rename(columns=column_names)
df.head(n=5)

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Rearrange the columns

In [5]:
# Put the message text first and the target last. The explicit copy makes the
# reordered frame independent of the one it was selected from, because later
# cells drop rows from it and overwrite its "label" column.
df = df[["messages", "label"]].copy()

In [6]:
# Confirm the new column order on the first five rows.
df.head(n=5)

Unnamed: 0,messages,label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [7]:
# Count missing values per column (isnull is pandas' alias for isna).
df.isnull().sum()

messages    0
label       0
dtype: int64

In [8]:
# Show rows that exactly repeat an earlier row; duplicated() already returns a
# boolean mask, so it can index the frame directly.
df[df.duplicated()]

Unnamed: 0,messages,label
102,As per your request 'Melle Melle (Oru Minnamin...,ham
153,As per your request 'Melle Melle (Oru Minnamin...,ham
206,"As I entered my cabin my PA said, '' Happy B'd...",ham
222,"Sorry, I'll call later",ham
325,No calls..messages..missed calls,ham
...,...,...
5524,You are awarded a SiPix Digital Camera! call 0...,spam
5535,"I know you are thinkin malaria. But relax, chi...",ham
5539,Just sleeping..and surfing,ham
5553,Hahaha..use your brain dear,ham


### Remove the duplicates

In [9]:
# Keep the first occurrence of each (messages, label) row and discard repeats.
# drop_duplicates selects rows by a boolean mask rather than by index label, so
# it cannot remove extra rows if index labels ever repeat. The surviving rows
# keep their original index labels.
df = df.drop_duplicates()

In [10]:
# Verify the cleanup: no row should be flagged as a duplicate any more.
df[df.duplicated()]

Unnamed: 0,messages,label


### Convert the categorical values into numerical values

In [11]:
from sklearn.preprocessing import LabelEncoder

# Map the string classes to integers; the encoder is kept so the mapping
# (encoder.classes_) stays available after this cell.
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(df["label"])

# Replace the text labels with their integer codes.
df = df.assign(label=encoded_labels)

df.head(n=5)

Unnamed: 0,messages,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: learn the vocabulary from the messages and count term
# occurrences, producing a sparse document-term matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(raw_documents=df["messages"])

# NOTE(review): toarray() builds the full dense matrix just to display it;
# consider previewing a slice such as X[:5].toarray() instead.
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [13]:
# Target vector: the "label" column of the cleaned frame.
y = df["label"]

In [14]:
# Sanity check: one row per message, one column per vocabulary term.
X.shape

(5169, 8673)

In [15]:
# Sanity check: the number of labels must match the number of rows in X.
y.shape

(5169,)

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [17]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
# NOTE(review): the vectorizer was fitted on all messages before this split, so
# test-set vocabulary leaks into the features. Consider splitting first and
# fitting the vectorizer on the training messages only.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

In [18]:
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(solver="liblinear").fit(X_train, y_train)

# Predicted labels for the held-out messages.
predictions = model.predict(X_test)

In [19]:
# Fraction of held-out messages whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=predictions)

0.9806576402321083

In [20]:
def is_message_spam(message: str) -> None:
    """Classify a single SMS message and print the verdict.

    The message is transformed with the notebook-level ``vectorizer`` and the
    result is passed to the notebook-level ``model``; both are expected to have
    been fitted by earlier cells.

    Args:
        message: Raw text of one message.

    Returns:
        None. Prints ``"Spam!!!"`` when the predicted label is 1, otherwise
        ``"Not spam."``.
    """
    # transform() takes an iterable of documents, so wrap the single message.
    vectorized_message = vectorizer.transform([message])
    # One input row gives one prediction; take it explicitly instead of relying
    # on the truthiness of an array-vs-scalar comparison.
    predicted_label = model.predict(vectorized_message)[0]
    if predicted_label == 1:
        print("Spam!!!")
    else:
        print("Not spam.")

In [21]:
# Try a typical prize-scam message.
is_message_spam(
    message="Congratulations! You've won a luxury vacation for two. Click the link to claim your prize!"
)

Spam!!!


In [22]:
# Try an ordinary personal greeting.
is_message_spam(
    message="Hello, Ata how are you."
)

Not spam.


In [23]:
# Try a credential-phishing style message.
is_message_spam(
    message="URGENT: Your bank account is locked. Reply with your login details to unlock."
)

Spam!!!
