In [1]:
# Load the dataset
import pandas as pd

# Load the dataset
data = pd.read_csv("emails.csv")

# Display first few rows
print(data.head())

                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1


In [2]:
# Display basic info
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB
None


In [3]:
# Check for missing values
print(data.isnull().sum())

text    0
spam    0
dtype: int64


In [4]:
# Check the number of unique values per column
print(data.nunique())

# Check class distribution
print(data['spam'].value_counts())

text    5695
spam       2
dtype: int64
0    4360
1    1368
Name: spam, dtype: int64


In [5]:
# check class distribution
print(data['spam'].value_counts())

0    4360
1    1368
Name: spam, dtype: int64


# Data Preprocessing

In [6]:
# Removing Duplicate Emails
data = data.drop_duplicates(subset=['text'])

# Check new shape
print("New dataset size after removing duplicates:", data.shape)


New dataset size after removing duplicates: (5695, 2)


In [7]:
# Handling class imbalance
# Oversampling with SMOTE (Synthetic Minority Over-sampling Technique) helps balance the classes by creating synthetic samples of the minority class (spam).

In [8]:
# Apply SMOTE to balance the dataset
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Convert text to numerical features
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['text'])  # Transform email text into numeric features
y = data['spam']  # Target variable

# Split data before applying SMOTE
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE only on training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Print class distribution after SMOTE
print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", pd.Series(y_train_resampled).value_counts())

Before SMOTE: 0    3462
1    1094
Name: spam, dtype: int64
After SMOTE: 0    3462
1    3462
Name: spam, dtype: int64


In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Train models and store performance
models = {
    "Multinomial Naïve Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "MLP": MLPClassifier(hidden_layer_sizes=(50,50), max_iter=500, random_state=42)
}

results = {}

for name, model in models.items():
    model.fit(X_train_resampled, y_train_resampled)
    y_pred = model.predict(X_test)
    
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
    }

# Display results
import pandas as pd
results_df = pd.DataFrame(results).T
print(results_df)

                         Accuracy  F1 Score  Precision    Recall
Multinomial Naïve Bayes  0.992976  0.985560   0.975000  0.996350
Random Forest            0.975417  0.949458   0.939286  0.959854
MLP                      0.991220  0.981949   0.971429  0.992701


In [10]:
from sklearn.naive_bayes import MultinomialNB

# Train Multinomial Naïve Bayes model
mnb_model = MultinomialNB()
mnb_model.fit(X_train_resampled, y_train_resampled)

# Check model accuracy on test data
accuracy = mnb_model.score(X_test, y_test)
print(f"Final Model Accuracy: {accuracy:.4f}")

Final Model Accuracy: 0.9930


In [11]:
import pickle

# Save the trained model
with open("email_detection.pkl", "wb") as model_file:
    pickle.dump(mnb_model, model_file)

# Save the vectorizer
with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

print("Model and Vectorizer saved successfully!")

Model and Vectorizer saved successfully!


In [12]:
# Load the saved model
with open("email_detection.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

# Load the saved vectorizer
with open("vectorizer.pkl", "rb") as vectorizer_file:
    loaded_vectorizer = pickle.load(vectorizer_file)

print("Model and Vectorizer loaded successfully!")

Model and Vectorizer loaded successfully!


In [14]:
def predict_email(message):
    message_vector = loaded_vectorizer.transform([message])  # Convert message to numerical features
    prediction = loaded_model.predict(message_vector)  # Predict using the model
    return "Spam" if prediction[0] == 1 else "Ham"  # Return result

# Example: Take user input for testing
user_message = input("Enter email text to predict: ")
prediction = predict_email(user_message)
print(f"The email is: {prediction}")

Enter email text to predict: Congrats! you won an iphone,click here to claim
The email is: Spam
