In [1]:
# Shared sample messages: every detector version in this notebook
# (rule-based, Naive Bayes, LLM prompts) is run against this same list.
test_messages: list[str] = [
    "Congratulations! You've won $1,000,000! Click here now!",
    "Hey, how are you doing today?",
    "URGENT: Your account will be closed! Act now!!!",
    "Free viagra! Buy now with no prescription needed!",
    "Meeting at 3pm tomorrow in conference room B",
    "Limited time offer: 90% off everything! www.deals.com",
    "Thanks for your help with the project."
]

v 1.0: `code`

_generated with Qwen3-Max_

In [2]:
import re

# Words and phrases that commonly appear in spam.
_SPAM_KEYWORDS = (
    'free', 'win', 'winner', 'cash', 'prize', 'urgent', 'act now',
    'limited time', 'offer', 'discount', 'click here', 'buy now',
    'guaranteed', 'risk free', 'congratulations', 'you won',
    'money back', 'no obligation', 'credit card', 'viagra',
    'weight loss', 'lottery', 'million dollars', 'nigerian prince',
)

# One pre-compiled pattern per keyword, anchored on word boundaries so that
# 'win' no longer fires on "window" and 'free' no longer fires on "freedom".
# Multi-word phrases tolerate any run of whitespace between their words.
# The patterns are applied to the lower-cased message.
_SPAM_KEYWORD_PATTERNS = tuple(
    re.compile(r'\b' + r'\s+'.join(re.escape(word) for word in keyword.split()) + r'\b')
    for keyword in _SPAM_KEYWORDS
)

# Substrings treated as evidence that the message contains a link.
_URL_INDICATORS = ('http://', 'https://', 'www.', '.com', '.net', '.org')


def is_spam(message):
    """Classify a message as spam with a small additive rule-based score.

    Points are awarded as follows:
      * spam keywords (whole-word match): 1 point for one distinct keyword,
        2 points for two or more;
      * more than half of all characters are upper-case: 1 point;
      * more than three exclamation marks: 1 point;
      * a URL-like substring is present: 1 point;
      * more than five digits in total (rough phone-number heuristic that
        also fires on large amounts such as "$1,000,000"): 1 point;
      * the stripped message is shorter than 20 characters and contains at
        least one keyword: 1 point.

    Args:
        message: The message text. Empty or None input is treated as not spam.

    Returns:
        bool: True when the total score is 2 or higher, otherwise False.
    """
    # Nothing to score; this also avoids dividing by zero in the caps ratio.
    if not message:
        return False

    # Lower-case once instead of once per keyword / URL indicator.
    lowered = message.lower()

    # Number of distinct keywords present as whole words or phrases.
    keyword_count = sum(
        1 for pattern in _SPAM_KEYWORD_PATTERNS if pattern.search(lowered)
    )

    # Suspicious surface patterns.
    uppercase_ratio = sum(1 for c in message if c.isupper()) / len(message)
    excessive_caps = uppercase_ratio > 0.5
    excessive_exclamation = message.count('!') > 3
    contains_urls = any(indicator in lowered for indicator in _URL_INDICATORS)
    contains_phone_numbers = sum(1 for c in message if c.isdigit()) > 5

    # Very short messages that still carry a spam keyword are often spam.
    short_with_keyword = len(message.strip()) < 20 and keyword_count > 0

    # Keywords contribute at most 2 points; every other signal contributes 1.
    spam_score = min(keyword_count, 2) + sum((
        excessive_caps,
        excessive_exclamation,
        contains_urls,
        contains_phone_numbers,
        short_with_keyword,
    ))

    return spam_score >= 2


# Run the rule-based detector over every shared sample message and print
# each verdict followed by a separator line.
if __name__ == "__main__":
    separator = "-" * 50
    for sample in test_messages:
        verdict = is_spam(sample)
        print(
            f"Message: '{sample}'",
            f"Spam detected: {verdict}",
            separator,
            sep="\n",
        )

Message: 'Congratulations! You've won $1,000,000! Click here now!'
Spam detected: True
--------------------------------------------------
Message: 'Hey, how are you doing today?'
Spam detected: False
--------------------------------------------------
Message: 'URGENT: Your account will be closed! Act now!!!'
Spam detected: True
--------------------------------------------------
Message: 'Free viagra! Buy now with no prescription needed!'
Spam detected: True
--------------------------------------------------
Message: 'Meeting at 3pm tomorrow in conference room B'
Spam detected: False
--------------------------------------------------
Message: 'Limited time offer: 90% off everything! www.deals.com'
Spam detected: True
--------------------------------------------------
Message: 'Thanks for your help with the project.'
Spam detected: False
--------------------------------------------------


v 2.0: `weights` 

_Naive Bayes Classifier_

In [3]:
import pandas as pd

# Load the labelled email corpus; the code below relies on a 'spam' column
# holding 1 for spam and 0 for ham.
df = pd.read_csv('./emails.csv')

# DataFrame.info() prints its summary itself and returns None, so it is
# called directly: wrapping it in display() rendered a stray "None".
df.info()
display(df.head())

# Class priors: the fraction of emails labelled spam (1) and ham (0).
total_emails = len(df)
spam_probability = len(df[df['spam'] == 1]) / total_emails
ham_probability = len(df[df['spam'] == 0]) / total_emails

print(f'Spam Probability={spam_probability}')
print(f'Ham Probability={ham_probability}')
display(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


None

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


Spam Probability=0.2388268156424581
Ham Probability=0.7611731843575419


Unnamed: 0,spam
count,5728.0
mean,0.238827
std,0.426404
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Features are the raw email text; the target is the 0/1 spam label.
X, y = df['text'], df['spam']

# Hold out 20% of the emails for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)

# Bag-of-words token counts. The vocabulary is learned from the training
# split only, so nothing from the test split leaks into the features; the
# test split is merely transformed with that vocabulary.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Quick look at the learned vocabulary: its size plus first and last tokens.
_features = vectorizer.get_feature_names_out()
print(len(_features), _features[:10], _features[-10:])

# Fit a multinomial Naive Bayes classifier on the training counts.
clf = MultinomialNB()
clf.fit(X_train_vectorized, y_train)

33790 ['00' '000' '0000' '000000' '00000000' '0000000000' '000000000003619'
 '000000000003991' '000000000003997' '000000000005168'] ['zwwyw' 'zwzm' 'zxghlajf' 'zyban' 'zyc' 'zygoma' 'zymg' 'zzn' 'zzncacst'
 'zzzz']


0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [5]:
from sklearn.metrics import accuracy_score, classification_report

# Predict labels for the held-out test split and show the first ten.
y_predict_test = clf.predict(X_test_vectorized)
display(y_predict_test[:10])

# Overall accuracy, followed by the per-class precision / recall / F1 report.
test_accuracy = accuracy_score(y_test, y_predict_test)
print(f'Accuracy={test_accuracy}')
print(classification_report(y_test, y_predict_test))

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 1], dtype=int64)

Accuracy=0.987783595113438
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       856
           1       0.98      0.97      0.98       290

    accuracy                           0.99      1146
   macro avg       0.98      0.98      0.98      1146
weighted avg       0.99      0.99      0.99      1146



In [13]:
import joblib
# Persist both fitted artifacts: the classifier is only usable together with
# the vectorizer that produced its input features.
for artifact, filename in (
    (vectorizer, 'spam-vectorizer.joblib'),
    (clf, 'spam-classifier.joblib'),
):
    joblib.dump(artifact, filename)
print("Model and vectorizer saved!")

Model and vectorizer saved!


In [6]:
# Score each shared sample message with the trained Naive Bayes model.
for sample in test_messages:
    # The vectorizer expects an iterable of documents, hence the 1-item list.
    sample_vector = vectorizer.transform([sample])
    label = clf.predict(sample_vector)[0]
    # Second column of predict_proba is reported as the spam probability.
    p_spam = clf.predict_proba(sample_vector)[0][1]
    print(
        f'Message: "{sample}"',
        f'Spam Detected: {bool(label)}',
        f'Probability: {p_spam:.4f}',
        '-'*50,
        sep='\n',
    )

Message: "Congratulations! You've won $1,000,000! Click here now!"
Spam Detected: True
Probability: 0.9992
--------------------------------------------------
Message: "Hey, how are you doing today?"
Spam Detected: False
Probability: 0.4555
--------------------------------------------------
Message: "URGENT: Your account will be closed! Act now!!!"
Spam Detected: True
Probability: 0.9753
--------------------------------------------------
Message: "Free viagra! Buy now with no prescription needed!"
Spam Detected: True
Probability: 1.0000
--------------------------------------------------
Message: "Meeting at 3pm tomorrow in conference room B"
Spam Detected: False
Probability: 0.0000
--------------------------------------------------
Message: "Limited time offer: 90% off everything! www.deals.com"
Spam Detected: True
Probability: 0.9540
--------------------------------------------------
Message: "Thanks for your help with the project."
Spam Detected: False
Probability: 0.0089
------------

v 3.0: `prompts`

In [None]:
import random
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
# Load environment variables from ../.env
# (presumably the model provider's API key - confirm).
load_dotenv("./../.env")

# One-shot instruction block; the message to classify is appended after it.
prompt = """You are a spam detection AI model. Your task is to classify emails as 'spam' or not. 
Your input is an email text message, Output should be a a boolean values indicating whether the message is spam.

Example
Input: "Congratulations! You've won a free vacation to the Bahamas! Click here to claim your prize."
Output: True

Now classify the following email:

"""

# Pick one shared sample message at random and append it to the instructions.
msg = random.choice(test_messages)
prompt = prompt + msg

# Send the assembled prompt to the chat model and show its raw text reply.
model = init_chat_model("openai:gpt-5-nano")
response = model.invoke(prompt)
print(
    f'Message: "{msg}"',
    f'Spam Detected: {response.content}',
    '-'*50,
    sep='\n',
)

Message: "Thanks for your help with the project."
Spam Detected: false
--------------------------------------------------


In [26]:
import json
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
# Load environment variables from ../.env
# (presumably the model provider's API key - confirm).
load_dotenv("./../.env")

# Instruction block for batch scoring: JSON in, JSON of probabilities out.
prompt = """You are a spam detection AI model. Your task is to classify emails as 'spam' or not. 
Your input is a json contained one or more emails. 
Output should be a json with the same keys as the input, but with a float continuous values indicating the probability each email is spam.

Example
Input: {"0": "Congratulations! You've won a free vacation to the Bahamas! Click here to claim your prize.", "1": "Meeting at 3pm tomorrow in conference room B"}
Output: {"0": 0.9994, "1": 0.0000}

Now classify the following emails:

"""
model = init_chat_model("openai:gpt-5-nano")

# Key every shared sample message by its position, as a string, to match the
# key format used in the prompt's example.
input_dict = {str(idx): msg for idx, msg in enumerate(test_messages)}

# Serialise with json.dumps rather than str(): str() of a dict is a Python
# repr (single-quoted keys and values), which is not the JSON the prompt
# promises and differs from the format shown in the example.
prompt += json.dumps(input_dict)

response = model.invoke(prompt)

# The reply is expected to be a bare JSON object; json.loads raises
# json.JSONDecodeError if the model returns anything else.
result_json = json.loads(response.content)
for key in result_json:
    print(f'Message: "{input_dict[key]}"')
    print(f'Spam Probability: {result_json[key]}')
    print('-'*50)

Message: "Congratulations! You've won $1,000,000! Click here now!"
Spam Probability: 0.99
--------------------------------------------------
Message: "Hey, how are you doing today?"
Spam Probability: 0.03
--------------------------------------------------
Message: "URGENT: Your account will be closed! Act now!!!"
Spam Probability: 0.95
--------------------------------------------------
Message: "Free viagra! Buy now with no prescription needed!"
Spam Probability: 0.97
--------------------------------------------------
Message: "Meeting at 3pm tomorrow in conference room B"
Spam Probability: 0.05
--------------------------------------------------
Message: "Limited time offer: 90% off everything! www.deals.com"
Spam Probability: 0.92
--------------------------------------------------
Message: "Thanks for your help with the project."
Spam Probability: 0.02
--------------------------------------------------


In [None]:
# Shell escape: run the FastAPI CLI against main.py on port 6001.
# NOTE(review): main.py is not part of this notebook; this presumably starts a
# server that keeps the cell busy until it is interrupted - confirm.
!fastapi run main.py --port 6001