<a href="https://colab.research.google.com/github/yklee7285/spam-detector/blob/main/SpamDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import python library for data analysis
import pandas as pd

In [None]:
# Data inspection
# Load the CSV file into the 'data' DataFrame.
data = pd.read_csv(filepath_or_buffer="spam.csv")
# Show the first five rows to get a feel for the columns and their values.
data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Group the rows by category and summarise each group
# (describe() reports count, unique, top and freq per group)
data.groupby(by='Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [None]:
# Encode the label numerically: 1 for spam, 0 for everything else (non-spam).
# A vectorised comparison produces the same 0/1 column as a per-row lambda,
# without calling a Python function once per message.
data['spam'] = (data['Category'] == 'spam').astype(int)
data.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# Split data into random train and test subsets
from sklearn.model_selection import train_test_split

# random_state pins the shuffle so a fresh "Restart & Run All" reproduces the
# same split (and therefore the same scores). stratify keeps the spam/ham
# proportion the same in both subsets, which matters because spam is the
# minority class in this dataset.
x_train, x_test, y_train, y_test = train_test_split(
    data.Message,
    data.spam,
    test_size=0.2,
    random_state=42,
    stratify=data.spam,
)

# x_train: messages (independent variable) used to train the model
# x_test: messages held out from training, used only to test the accuracy of the model
# y_train: spam labels (dependent variable) for x_train, which the model learns to predict
# y_test: spam labels for x_test, used for testing model accuracy

In [None]:
# Convert a collection of text to a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()
# fit_transform learns the vocabulary from the training messages and returns
# their token counts as a sparse matrix (one row per message).
x_train_count = v.fit_transform(x_train.values)
# Preview the first six rows. Slice the sparse matrix first and densify only
# those rows; calling toarray() on the whole matrix would allocate a dense
# (messages x vocabulary) array just to display six of its rows.
x_train_count[:6].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Training machine learning model
# MultinomialNB: Naive Bayes classifier for multinomial models
from sklearn.naive_bayes import MultinomialNB
# fit() returns the estimator itself, so construction and training can be chained.
NB_model = MultinomialNB().fit(x_train_count, y_train)
NB_model

In [None]:
# Testing result: classify a single message typed in by the user
print("Spam Detector")
text = input("Please enter text: ")
# The vectorizer expects a collection of documents, so wrap the text in a list.
message = [text]

# Count tokens with the vocabulary fitted on the training data, then predict.
message_count = v.transform(message)
prediction = NB_model.predict(message_count)

# Label 1 means spam, 0 means non-spam.
print("Spam" if 1 in prediction else "Not Spam")

Spam Detector
Please enter text: For a limited time, grab this promotion coupon at a 10% discount!
Spam


In [None]:
# Checking accuracy of model on the held-out test messages
# Only transform here (no fitting): the test set must be counted with the
# vocabulary that was fitted on the training messages.
x_test_count = v.transform(raw_documents=x_test)
NB_model.score(X=x_test_count, y=y_test)

0.9847533632286996

In [None]:
# Alternative way with simplified code
from sklearn.pipeline import Pipeline

# The pipeline chains vectorizing and classification, so raw text can be
# passed straight to fit/predict/score.
clf = Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])
clf.fit(x_train, y_train)

# A few hand-written sample messages to classify
message = [
    'Hey bro, wanna go to the gym later?',
    'For a limited time, grab this promotion coupon at a 10% discount!',
    'Eat where later?',
    'Congratulations! You’ve won a $500 Amazon gift card. Claim it now.',
]

clf.predict(message)

array([0, 1, 0, 1])

In [None]:
# Score the pipeline on the held-out test messages; raw text goes in
# directly because the pipeline vectorizes it internally.
clf.score(X=x_test, y=y_test)

0.9847533632286996