In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset
df = pd.read_csv(r'C:\Users\chava\Downloads\archive (5)\spam_or_not_spam.csv')

# Explore the dataset
print(df.head())
print(df.info())
print(df['label'].value_counts())  # 1 for spam, 0 for not spam


                                               email  label
0   date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...      0
1  martin a posted tassos papadopoulos the greek ...      0
2  man threatens explosion in moscow thursday aug...      0
3  klez the virus that won t die already the most...      0
4   in adding cream to spaghetti carbonara which ...      0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   email   2999 non-null   object
 1   label   2999 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 47.0+ KB
None
0    2500
1     499
Name: label, dtype: int64


In [6]:
# Function to clean text
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.strip()  # Remove leading/trailing whitespace
    return text

# Apply preprocessing
df['cleaned_text'] = df['email'].apply(preprocess_text)
df['cleaned_text'][0]

'date wed number aug number number number number number from chris garrigues cwg dated number numberfanumberd deepeddy com message id number number tmda deepeddy vircio com i can t reproduce this error for me it is very repeatable like every time without fail this is the debug log of the pick happening number number number pick_it exec pick inbox list lbrace lbrace subject ftp rbrace rbrace number number sequence mercury number number number exec pick inbox list lbrace lbrace subject ftp rbrace rbrace number number sequence mercury number number number ftoc_pickmsgs number hit number number number marking number hits number number number tkerror syntax error in expression int note if i run the pick command by hand delta pick inbox list lbrace lbrace subject ftp rbrace rbrace number number sequence mercury number hit that s where the number hit comes from obviously the version of nmh i m using is delta pick version pick nmh number number number compiled on url at sun mar number number n

In [7]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'], df['label'], test_size=0.2, random_state=42)

# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize and train the model
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Accuracy: 0.8783333333333333
Precision: 1.0
Recall: 0.27
F1 Score: 0.4251968503937008


In [33]:
# Predict new emails
new_emails = [""" hyperlink copyright NUMBER all rights reservedif you would no longer like us to contact you or feel that you havereceived this email in error please hyperlink click here to unsubscribe 
"""]
new_emails_cleaned = [preprocess_text(email) for email in new_emails]
new_emails_cleaned
new_emails_tfidf = vectorizer.transform(new_emails_cleaned)
predictions = model.predict(new_emails_tfidf)

print(predictions)  # Output will be '1' for spam and '0' for not spam

[1]
