# Spam Detection Project

In [18]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
from sklearn.feature_extraction.text import CountVectorizer
import re

* Data exploration - let us check how the columns look like and delete unnecessary ones and then move to preprocessing
* We dont need the first column 'Unnamed: 0' as it is just an ID
* The columns 'label' and 'label_num' depict the same thing, let us only leave 'label_num' that represent spam (1) and non-spam (0) label

Import the dataset:

In [2]:
data = pd.read_csv(r"C:\Users\serei\Downloads\spam_ham_dataset.csv")

In [3]:
data.describe

<bound method NDFrame.describe of       Unnamed: 0 label                                               text  \
0            605   ham  Subject: enron methanol ; meter # : 988291\nth...   
1           2349   ham  Subject: hpl nom for january 9 , 2001\n( see a...   
2           3624   ham  Subject: neon retreat\nho ho ho , we ' re arou...   
3           4685  spam  Subject: photoshop , windows , office . cheap ...   
4           2030   ham  Subject: re : indian springs\nthis deal is to ...   
...          ...   ...                                                ...   
5166        1518   ham  Subject: put the 10 on the ft\nthe transport v...   
5167         404   ham  Subject: 3 / 4 / 2000 and following noms\nhpl ...   
5168        2933   ham  Subject: calpine daily gas nomination\n>\n>\nj...   
5169        1409   ham  Subject: industrial worksheets for august 2000...   
5170        4807  spam  Subject: important online banking alert\ndear ...   

      label_num  
0             0  
1    

In [4]:
data.columns

Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')

Due to the fact that some variables that will be obtained from raw text the 'text' column will be copied to the new one named 'text_raw'

In [5]:
del data['Unnamed: 0']
del data['label']
 
data['text_raw'] = data['text']

Data preprocessing - we only need to take care of the text column
Convert text to lowercase - the iteration is required to go through each mail (it seems that the text was already in lower case but just to make sure let us do it again)

In [6]:
data['text'][0]

for i in range(0,len(data)):
    data['text'][i] = data['text'][i].lower()

## Check if it worked

data['text'][0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'][i] = data['text'][i].lower()


"subject: enron methanol ; meter # : 988291\nthis is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary\nflow data provided by daren } .\nplease override pop ' s daily volume { presently zero } to reflect daily\nactivity you can obtain from gas control .\nthis change is needed asap for economics purposes ."

## Word Frequency

The previous analysis was based on word frequencies appearing in the text. In order to obtain the right word frequencies, we have to remove the stopwords as well as conduct lemmantization to obtain the root of the word and reduce the number of unique words. 
Next, we count the occurence of the top 50 most frequent words, special characters and numbers in each email.

We generate the functions with the help of ChatGPT.

In [7]:
nltk.download('wordnet')
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenize the text into individual words
    words = nltk.word_tokenize(text)
    
    # Lemmatize each word and remove stop words
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words if word.lower() not in stop_words]
    
    # Join the lemmatized words back into a single string
    processed_text = ' '.join(lemmatized_words)
    
    return processed_text

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\serei\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\serei\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
data['processed_text'] = data['text_raw'].apply(preprocess_text)

In [9]:
data

Unnamed: 0,text,label_num,text_raw,processed_text
0,subject: enron methanol ; meter # : 988291\nth...,0,Subject: enron methanol ; meter # : 988291\nth...,Subject : enron methanol ; meter # : 988291 fo...
1,"subject: hpl nom for january 9 , 2001\n( see a...",0,"Subject: hpl nom for january 9 , 2001\n( see a...","Subject : hpl nom january 9 , 2001 ( see attac..."
2,"subject: neon retreat\nho ho ho , we ' re arou...",0,"Subject: neon retreat\nho ho ho , we ' re arou...","Subject : neon retreat ho ho ho , ' around won..."
3,"subject: photoshop , windows , office . cheap ...",1,"Subject: photoshop , windows , office . cheap ...","Subject : photoshop , window , office . cheap ..."
4,subject: re : indian springs\nthis deal is to ...,0,Subject: re : indian springs\nthis deal is to ...,Subject : : indian spring deal book teco pvr r...
...,...,...,...,...
5166,subject: put the 10 on the ft\nthe transport v...,0,Subject: put the 10 on the ft\nthe transport v...,Subject : put 10 ft transport volume decreased...
5167,subject: 3 / 4 / 2000 and following noms\nhpl ...,0,Subject: 3 / 4 / 2000 and following noms\nhpl ...,Subject : 3 / 4 / 2000 following noms hpl ' ta...
5168,subject: calpine daily gas nomination\n>\n>\nj...,0,Subject: calpine daily gas nomination\n>\n>\nj...,Subject : calpine daily gas nomination > > jul...
5169,subject: industrial worksheets for august 2000...,0,Subject: industrial worksheets for august 2000...,Subject : industrial worksheet august 2000 act...


The words are tokenized excluding the numbers (as random selection of integers were appearing in the top 50). We obtain the 50 most frequent ones.

In [10]:
# Tokenize the text into words while excluding numbers
all_text = data['processed_text'].str.cat(sep=' ')
words = nltk.word_tokenize(all_text)
words = [word for word in words if word.isalpha()]

# Create a frequency distribution of the words
freq_dist = FreqDist(words)

# Retrieve the most common words
num_most_common = 50
most_common_words = freq_dist.most_common(num_most_common)

# Print the most common words and their frequencies
for word, frequency in most_common_words:
    print(word, frequency)


ect 13908
hou 7289
enron 6555
Subject 5171
com 3709
deal 3635
please 3198
gas 3036
subject 2891
meter 2716
cc 2391
pm 2343
hpl 2318
e 1999
daren 1901
thanks 1898
need 1846
corp 1776
volume 1690
know 1611
day 1549
price 1524
new 1435
company 1429
may 1382
u 1359
mmbtu 1349
j 1337
get 1320
forwarded 1297
l 1296
http 1242
see 1211
let 1182
contract 1168
farmer 1156
information 1155
change 1147
time 1109
month 1098
attached 1097
would 1078
xl 1049
message 1042
sale 994
one 988
mail 917
th 908
robert 904
question 904


A list of only the needed words is obtained.

In [11]:
word_list = [word for word, count in most_common_words]

We generate a word frequency matrix for the most common words, as well as adding additional columns for special characters and integers.

In [12]:
vectorizer = CountVectorizer(vocabulary=word_list)
X = vectorizer.fit_transform(data["processed_text"])

In [13]:
matrix = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
data = pd.concat([data[["processed_text", "label_num"]], matrix], axis=1)

In [27]:
for char in ["$", "€", "!", "@", "?"]:
    pattern = re.escape(char)  # Escape special characters in the regex pattern
    data[char] = data["processed_text"].str.count(pattern)

data["digit_count"] = data["processed_text"].str.count(r"\d")

In [67]:
data

Unnamed: 0,processed_text,label_num,ect,hou,enron,Subject,com,deal,please,gas,...,mail,th,robert,question,$,€,!,@,?,digit_count
0,Subject : enron methanol ; meter # : 988291 fo...,0,0,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,10
1,"Subject : hpl nom january 9 , 2001 ( see attac...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
2,"Subject : neon retreat ho ho ho , ' around won...",0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,4,0,0,14
3,"Subject : photoshop , window , office . cheap ...",1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Subject : : indian spring deal book teco pvr r...,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5166,Subject : put 10 ft transport volume decreased...,0,0,0,2,0,2,0,0,0,...,0,0,0,0,0,0,0,2,0,56
5167,Subject : 3 / 4 / 2000 following noms hpl ' ta...,0,0,0,3,0,3,0,0,0,...,0,0,0,0,0,0,0,8,0,72
5168,Subject : calpine daily gas nomination > > jul...,0,0,0,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,34
5169,Subject : industrial worksheet august 2000 act...,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,14


# Word frequencies from previous analysis

We also want to create a word frequency matrix with the words used in the previous analysis. We calculate the frequencies as well as dividing them by the total number of words in the text to match the previous dataset.

In [30]:
data_old = pd.DataFrame(data[["processed_text", "label_num"]])

words_old = ['make', 'address', 'all', '3d', 'our', 'over', 'remove', 'internet','order',
             'mail', 'receive', 'will', 'people', 'report', 'addresses','free', 'business',
             'email', 'you', 'credit', 'your', 'font', '000', 'money', 'hp','hpl','george',
             '650', 'lab', 'labs', 'telnet', '857', 'data', '415', '85', 'technology', '1999', 
             'parts', 'pm', 'direct', 'cs', 'meeting', 'original', 'project', 're', 'edu', 'table',
             'conference', ';', '(', '[', '!', '$', '#']


In [31]:
for char in words_old:
    pattern = re.escape(char)  # Escape special characters in the regex pattern
    data_old[char] = data["processed_text"].str.count(pattern)

In [32]:
data_old

Unnamed: 0,processed_text,label_num,make,address,all,3d,our,over,remove,internet,...,re,edu,table,conference,;,(,[,!,$,#
0,Subject : enron methanol ; meter # : 988291 fo...,0,0,0,0,0,0,1,0,0,...,4,0,0,0,1,0,0,0,0,1
1,"Subject : hpl nom january 9 , 2001 ( see attac...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,"Subject : neon retreat ho ho ho , ' around won...",0,1,0,2,0,2,0,0,0,...,30,1,0,1,0,2,0,4,0,0
3,"Subject : photoshop , window , office . cheap ...",1,0,0,0,0,0,0,0,0,...,6,0,0,0,0,0,0,0,0,0
4,Subject : : indian spring deal book teco pvr r...,0,0,0,0,0,0,0,0,0,...,3,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5166,Subject : put 10 ft transport volume decreased...,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,1,0,0,0,0
5167,Subject : 3 / 4 / 2000 following noms hpl ' ta...,0,0,0,1,0,0,0,0,0,...,3,0,0,0,0,1,0,0,0,0
5168,Subject : calpine daily gas nomination > > jul...,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5169,Subject : industrial worksheet august 2000 act...,0,0,0,0,0,0,0,0,0,...,6,0,0,0,0,0,0,0,0,0


In [33]:
data_old["word_count"] = data_old["processed_text"].apply(lambda x: len(x.split()))

In [34]:
data_old[words_old] = data_old[words_old].div(data_old["word_count"], axis=0)

In [35]:
data_old

Unnamed: 0,processed_text,label_num,make,address,all,3d,our,over,remove,internet,...,edu,table,conference,;,(,[,!,$,#,word_count
0,Subject : enron methanol ; meter # : 988291 fo...,0,0.000000,0.0,0.000000,0.0,0.000000,0.02,0.0,0.000000,...,0.000000,0.0,0.000000,0.02,0.000000,0.0,0.000000,0.0,0.02,50
1,"Subject : hpl nom january 9 , 2001 ( see attac...",0,0.000000,0.0,0.000000,0.0,0.000000,0.00,0.0,0.000000,...,0.000000,0.0,0.000000,0.00,0.043478,0.0,0.000000,0.0,0.00,23
2,"Subject : neon retreat ho ho ho , ' around won...",0,0.003311,0.0,0.006623,0.0,0.006623,0.00,0.0,0.000000,...,0.003311,0.0,0.003311,0.00,0.006623,0.0,0.013245,0.0,0.00,302
3,"Subject : photoshop , window , office . cheap ...",1,0.000000,0.0,0.000000,0.0,0.000000,0.00,0.0,0.000000,...,0.000000,0.0,0.000000,0.00,0.000000,0.0,0.000000,0.0,0.00,49
4,Subject : : indian spring deal book teco pvr r...,0,0.000000,0.0,0.000000,0.0,0.000000,0.00,0.0,0.000000,...,0.000000,0.0,0.000000,0.00,0.000000,0.0,0.000000,0.0,0.00,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5166,Subject : put 10 ft transport volume decreased...,0,0.000000,0.0,0.000000,0.0,0.000000,0.00,0.0,0.000000,...,0.000000,0.0,0.000000,0.00,0.007299,0.0,0.000000,0.0,0.00,137
5167,Subject : 3 / 4 / 2000 following noms hpl ' ta...,0,0.000000,0.0,0.003876,0.0,0.000000,0.00,0.0,0.000000,...,0.000000,0.0,0.000000,0.00,0.003876,0.0,0.000000,0.0,0.00,258
5168,Subject : calpine daily gas nomination > > jul...,0,0.000000,0.0,0.000000,0.0,0.000000,0.00,0.0,0.000000,...,0.000000,0.0,0.000000,0.00,0.000000,0.0,0.000000,0.0,0.00,67
5169,Subject : industrial worksheet august 2000 act...,0,0.000000,0.0,0.000000,0.0,0.000000,0.00,0.0,0.000000,...,0.000000,0.0,0.000000,0.00,0.000000,0.0,0.000000,0.0,0.00,84


In [36]:
data_old.to_csv(r"C:\Users\serei\Desktop\Untitled Folder\data_old.csv")