In [None]:
"""
Project: AI-Based Spam Email Detection System
Internship: Shris Tech
Domain: Artificial Intelligence
"""

In [2]:
import pandas as pd

In [3]:
import numpy as np

In [4]:
df = pd.read_csv("spam.csv")
df


Unnamed: 0,label,text
0,ham,Hey are we still meeting today?
1,spam,Congratulations! You have won a free lottery t...
2,ham,Please call me when you are free.
3,spam,URGENT! Your account has been compromised. Cli...
4,ham,Can you send me the notes from class?
5,spam,Win cash prizes instantly by clicking this link.
6,ham,Let's have lunch tomorrow.
7,spam,You have been selected for a free gift card. A...
8,ham,Are you coming to the party tonight?
9,spam,Limited time offer! Buy now and get 50% off.


In [5]:
df.info()
df['label'].value_counts()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   40 non-null     object
 1   text    40 non-null     object
dtypes: object(2)
memory usage: 772.0+ bytes


label
spam    25
ham     15
Name: count, dtype: int64

In [6]:
import re


In [7]:
def clean_text(text):
    text = text.lower()                      # convert to lowercase
    text = re.sub(r'[^a-zA-Z]', ' ', text)   # remove numbers & symbols
    text = re.sub(r'\s+', ' ', text)         # remove extra spaces
    return text


In [8]:
df['clean_text'] = df['text'].apply(clean_text)
df[['text', 'clean_text']]


Unnamed: 0,text,clean_text
0,Hey are we still meeting today?,hey are we still meeting today
1,Congratulations! You have won a free lottery t...,congratulations you have won a free lottery ti...
2,Please call me when you are free.,please call me when you are free
3,URGENT! Your account has been compromised. Cli...,urgent your account has been compromised click...
4,Can you send me the notes from class?,can you send me the notes from class
5,Win cash prizes instantly by clicking this link.,win cash prizes instantly by clicking this link
6,Let's have lunch tomorrow.,let s have lunch tomorrow
7,You have been selected for a free gift card. A...,you have been selected for a free gift card ac...
8,Are you coming to the party tonight?,are you coming to the party tonight
9,Limited time offer! Buy now and get 50% off.,limited time offer buy now and get off


In [9]:
df.head()


Unnamed: 0,label,text,clean_text
0,ham,Hey are we still meeting today?,hey are we still meeting today
1,spam,Congratulations! You have won a free lottery t...,congratulations you have won a free lottery ti...
2,ham,Please call me when you are free.,please call me when you are free
3,spam,URGENT! Your account has been compromised. Cli...,urgent your account has been compromised click...
4,ham,Can you send me the notes from class?,can you send me the notes from class


In [10]:
X = df['clean_text']
y = df['label']


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [12]:
tfidf = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf.fit_transform(X)


In [13]:
X_tfidf.shape


(40, 79)

In [14]:
from sklearn.model_selection import train_test_split


In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42
)


In [16]:
X_train.shape, X_test.shape


((32, 79), (8, 79))

In [17]:
from sklearn.naive_bayes import MultinomialNB


In [18]:
model = MultinomialNB()
model.fit(X_train, y_train)


0,1,2
,"alpha  alpha: float or array-like of shape (n_features,), default=1.0 Additive (Laplace/Lidstone) smoothing parameter (set alpha=0 and force_alpha=True, for no smoothing).",1.0
,"force_alpha  force_alpha: bool, default=True If False and alpha is less than 1e-10, it will set alpha to 1e-10. If True, alpha will remain unchanged. This may cause numerical errors if alpha is too close to 0. .. versionadded:: 1.2 .. versionchanged:: 1.4  The default value of `force_alpha` changed to `True`.",True
,"fit_prior  fit_prior: bool, default=True Whether to learn class prior probabilities or not. If false, a uniform prior will be used.",True
,"class_prior  class_prior: array-like of shape (n_classes,), default=None Prior probabilities of the classes. If specified, the priors are not adjusted according to the data.",


In [19]:
y_pred = model.predict(X_test)


In [20]:
from sklearn.metrics import accuracy_score, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 1.0
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00         2
        spam       1.00      1.00      1.00         6

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8



In [21]:
test_msg = ["Congratulations! You have won a free prize. Click now!"]

test_msg_clean = [clean_text(test_msg[0])]
test_msg_tfidf = tfidf.transform(test_msg_clean)

prediction = model.predict(test_msg_tfidf)
print("Message:", test_msg[0])
print("Prediction:", prediction[0])


Message: Congratulations! You have won a free prize. Click now!
Prediction: spam


In [22]:
def predict_spam(message):
    message_clean = clean_text(message)
    message_tfidf = tfidf.transform([message_clean])
    prediction = model.predict(message_tfidf)
    return prediction[0]


In [25]:
msg = input("Enter an email/message: ")
result = predict_spam(msg)
print("Prediction:", result)


Enter an email/message:  Congratulations! You are selected for the prize of 1 Million.


Prediction: spam


In [None]:
"""
## Limitations of the System
- The accuracy of the spam detection system depends heavily on the size and quality of the dataset.
- With a small dataset, the model may misclassify some messages.
- The system currently supports only text-based input.
- More advanced NLP techniques could further improve performance.
"""

In [None]:
"""
## Future Scope
- The dataset can be expanded with real-world email data for better accuracy.
- More advanced models such as Logistic Regression or Deep Learning can be used.
- The system can be integrated with an email client for real-time spam filtering.
- Language support can be extended beyond English.
"""


In [None]:
"""## Conclusion
The AI-based spam email detection system successfully classifies messages as spam or non-spam
using machine learning techniques. By applying text preprocessing, TF-IDF feature extraction, and a Naive Bayes classifier,
the system demonstrates how artificial intelligence can automate spam detection efficiently.
"""
