# Spam Detection Project

In [122]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
from sklearn.feature_extraction.text import CountVectorizer
import re
from sklearn.preprocessing import MinMaxScaler

* Data exploration - let us check how the columns look like and delete unnecessary ones and then move to preprocessing
* We dont need the first column 'Unnamed: 0' as it is just an ID
* The columns 'label' and 'label_num' depict the same thing, let us only leave 'label_num' that represent spam (1) and non-spam (0) label

Import the dataset:

In [86]:
data = pd.read_csv(r"C:\Users\serei\Downloads\spam.csv", encoding = "latin")

In [89]:
data = data[["v1", "v2"]]

In [99]:
data.columns

Index(['spam', 'text'], dtype='object')

In [98]:
data.rename({"v1": "spam", "v2":"text"}, axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [100]:
data.describe()

Unnamed: 0,spam,text
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


Data preprocessing - we only need to take care of the text column
Convert text to lowercase - the iteration is required to go through each mail (it seems that the text was already in lower case but just to make sure let us do it again)

In [101]:
data['text'][0]

for i in range(0,len(data)):
    data['text'][i] = data['text'][i].lower()

## Check if it worked

data['text'][0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'][i] = data['text'][i].lower()


'go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat...'

## Word Frequency

The previous analysis was based on word frequencies appearing in the text. In order to obtain the right word frequencies, we have to remove the stopwords as well as conduct lemmantization to obtain the root of the word and reduce the number of unique words. 
Next, we count the occurence of the top 50 most frequent words, special characters and numbers in each email.

We generate the functions with the help of ChatGPT.

In [102]:
nltk.download('wordnet')
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenize the text into individual words
    words = nltk.word_tokenize(text)
    
    # Lemmatize each word and remove stop words
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words if word.lower() not in stop_words]
    
    # Join the lemmatized words back into a single string
    processed_text = ' '.join(lemmatized_words)
    
    return processed_text

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\serei\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\serei\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [104]:
data['processed_text'] = data['text'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['processed_text'] = data['text'].apply(preprocess_text)


In [105]:
data

Unnamed: 0,spam,text,processed_text
0,ham,"go until jurong point, crazy.. available only ...","go jurong point , crazy .. available bugis n g..."
1,ham,ok lar... joking wif u oni...,ok lar ... joking wif u oni ...
2,spam,free entry in 2 a wkly comp to win fa cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say so early hor... u c already then say...,u dun say early hor ... u c already say ...
4,ham,"nah i don't think he goes to usf, he lives aro...","nah n't think go usf , life around though"
...,...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...,2nd time tried 2 contact u. u å£750 pound priz...
5568,ham,will ì_ b going to esplanade fr home?,ì_ b going esplanade fr home ?
5569,ham,"pity, * was in mood for that. so...any other s...","pity , * mood . ... suggestion ?"
5570,ham,the guy did some bitching but i acted like i'd...,guy bitching acted like 'd interested buying s...


The words are tokenized excluding the numbers (as random selection of integers were appearing in the top 50). We obtain the 50 most frequent ones.

In [106]:
# Tokenize the text into words while excluding numbers
all_text = data['processed_text'].str.cat(sep=' ')
words = nltk.word_tokenize(all_text)
words = [word for word in words if word.isalpha()]

# Create a frequency distribution of the words
freq_dist = FreqDist(words)

# Retrieve the most common words
num_most_common = 50
most_common_words = freq_dist.most_common(num_most_common)

# Print the most common words and their frequencies
for word, frequency in most_common_words:
    print(word, frequency)


u 1184
call 603
get 396
ur 381
gt 318
lt 316
go 305
free 278
know 271
ok 250
come 250
got 249
like 247
day 244
good 242
time 241
text 215
want 214
love 209
send 191
one 184
need 181
today 173
going 173
r 172
txt 169
home 163
lor 162
sorry 159
see 158
still 155
back 153
stop 153
c 152
mobile 150
take 150
da 150
n 148
reply 147
think 146
tell 142
phone 139
dont 138
new 136
later 135
week 132
hi 132
please 130
pls 125
make 124


A list of only the needed words is obtained.

In [107]:
word_list = [word for word, count in most_common_words]

We generate a word frequency matrix for the most common words, as well as adding additional columns for special characters and integers.

In [108]:
vectorizer = CountVectorizer(vocabulary=word_list)
X = vectorizer.fit_transform(data["processed_text"])

In [110]:
matrix = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
data = pd.concat([data[["processed_text", "spam"]], matrix], axis=1)

In [111]:
for char in ["$", "€", "!", "@", "?"]:
    pattern = re.escape(char)  # Escape special characters in the regex pattern
    data[char] = data["processed_text"].str.count(pattern)

data["digit_count"] = data["processed_text"].str.count(r"\d")

In [112]:
data

Unnamed: 0,processed_text,spam,u,call,get,ur,gt,lt,go,free,...,hi,please,pls,make,$,€,!,@,?,digit_count
0,"go jurong point , crazy .. available bugis n g...",ham,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,ok lar ... joking wif u oni ...,ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,free entry 2 wkly comp win fa cup final tkts 2...,spam,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,25
3,u dun say early hor ... u c already say ...,ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"nah n't think go usf , life around though",ham,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,2nd time tried 2 contact u. u å£750 pound priz...,spam,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,21
5568,ì_ b going esplanade fr home ?,ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5569,"pity , * mood . ... suggestion ?",ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5570,guy bitching acted like 'd interested buying s...,ham,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [113]:
data["word_count"] = data["processed_text"].apply(lambda x: len(x.split()))
data[word_list] = data[word_list].div(data["word_count"], axis=0)
data

Unnamed: 0,processed_text,spam,u,call,get,ur,gt,lt,go,free,...,please,pls,make,$,€,!,@,?,digit_count,word_count
0,"go jurong point , crazy .. available bugis n g...",ham,0.0,0.00,0.0,0.0,0.0,0.0,0.050000,0.000000,...,0.0,0.0,0.0,0,0,0,0,0,0,20
1,ok lar ... joking wif u oni ...,ham,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0,0,0,0,0,0,8
2,free entry 2 wkly comp win fa cup final tkts 2...,spam,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.033333,...,0.0,0.0,0.0,0,0,0,0,0,25,30
3,u dun say early hor ... u c already say ...,ham,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0,0,0,0,0,0,11
4,"nah n't think go usf , life around though",ham,0.0,0.00,0.0,0.0,0.0,0.0,0.111111,0.000000,...,0.0,0.0,0.0,0,0,0,0,0,0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,2nd time tried 2 contact u. u å£750 pound priz...,spam,0.0,0.04,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0,0,1,0,0,21,25
5568,ì_ b going esplanade fr home ?,ham,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0,0,0,0,1,0,7
5569,"pity , * mood . ... suggestion ?",ham,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0,0,0,0,1,0,8
5570,guy bitching acted like 'd interested buying s...,ham,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.071429,...,0.0,0.0,0.0,0,0,0,0,0,0,14


In [123]:
columns_to_scale = data.columns[2:57]

# Initialize the MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit and transform the selected columns
data[columns_to_scale] = scaler.fit_transform(data[columns_to_scale])

# Display the scaled DataFrame
data


Unnamed: 0,processed_text,spam,u,call,get,ur,gt,lt,go,free,...,please,pls,make,$,€,!,@,?,digit_count,word_count
0,"go jurong point , crazy .. available bugis n g...",ham,0.0,0.00,0.0,0.0,0.0,0.0,0.150000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0,20
1,ok lar ... joking wif u oni ...,ham,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0,8
2,free entry 2 wkly comp win fa cup final tkts 2...,spam,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.133333,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,25,30
3,u dun say early hor ... u c already say ...,ham,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0,11
4,"nah n't think go usf , life around though",ham,0.0,0.00,0.0,0.0,0.0,0.0,0.333333,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,2nd time tried 2 contact u. u å£750 pound priz...,spam,0.0,0.08,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.000000,21,25
5568,ì_ b going esplanade fr home ?,ham,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.083333,0,7
5569,"pity , * mood . ... suggestion ?",ham,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.083333,0,8
5570,guy bitching acted like 'd interested buying s...,ham,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.285714,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0,14


In [124]:
data.to_csv(r"C:\Users\serei\Desktop\Untitled Folder\new_data_new_words.csv")

# Word frequencies from previous analysis

We also want to create a word frequency matrix with the words used in the previous analysis. We calculate the frequencies as well as dividing them by the total number of words in the text to match the previous dataset.

In [115]:
data_old = pd.DataFrame(data[["processed_text", "spam"]])

words_old = ['make', 'address', 'all', '3d', 'our', 'over', 'remove', 'internet','order',
             'mail', 'receive', 'will', 'people', 'report', 'addresses','free', 'business',
             'email', 'you', 'credit', 'your', 'font', '000', 'money', 'hp','hpl','george',
             '650', 'lab', 'labs', 'telnet', '857', 'data', '415', '85', 'technology', '1999', 
             'parts', 'pm', 'direct', 'cs', 'meeting', 'original', 'project', 're', 'edu', 'table',
             'conference', ';', '(', '[', '!', '$', '#']


In [116]:
for char in words_old:
    pattern = re.escape(char)  # Escape special characters in the regex pattern
    data_old[char] = data["processed_text"].str.count(pattern)

In [117]:
data_old

Unnamed: 0,processed_text,spam,make,address,all,3d,our,over,remove,internet,...,re,edu,table,conference,;,(,[,!,$,#
0,"go jurong point , crazy .. available bugis n g...",ham,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
1,ok lar ... joking wif u oni ...,ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,free entry 2 wkly comp win fa cup final tkts 2...,spam,0,0,0,0,0,1,0,0,...,2,0,0,0,0,1,0,0,0,0
3,u dun say early hor ... u c already say ...,ham,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,"nah n't think go usf , life around though",ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,2nd time tried 2 contact u. u å£750 pound priz...,spam,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5568,ì_ b going esplanade fr home ?,ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,"pity , * mood . ... suggestion ?",ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,guy bitching acted like 'd interested buying s...,ham,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0


In [118]:
data_old["word_count"] = data_old["processed_text"].apply(lambda x: len(x.split()))

In [119]:
data_old[words_old] = data_old[words_old].div(data_old["word_count"], axis=0)

In [120]:
data_old

Unnamed: 0,processed_text,spam,make,address,all,3d,our,over,remove,internet,...,edu,table,conference,;,(,[,!,$,#,word_count
0,"go jurong point , crazy .. available bugis n g...",ham,0.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.00,0.0,0.0,20
1,ok lar ... joking wif u oni ...,ham,0.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.00,0.0,0.0,8
2,free entry 2 wkly comp win fa cup final tkts 2...,spam,0.0,0.0,0.00,0.0,0.0,0.033333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.033333,0.0,0.00,0.0,0.0,30
3,u dun say early hor ... u c already say ...,ham,0.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.00,0.0,0.0,11
4,"nah n't think go usf , life around though",ham,0.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.00,0.0,0.0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,2nd time tried 2 contact u. u å£750 pound priz...,spam,0.0,0.0,0.04,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.04,0.0,0.0,25
5568,ì_ b going esplanade fr home ?,ham,0.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.00,0.0,0.0,7
5569,"pity , * mood . ... suggestion ?",ham,0.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.00,0.0,0.0,8
5570,guy bitching acted like 'd interested buying s...,ham,0.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.00,0.0,0.0,14


In [121]:
data_old.to_csv(r"C:\Users\serei\Desktop\Untitled Folder\new_data_old_words.csv")