In [1]:
# Import necessary libraries
import pandas as pd
import re
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression

In [2]:
# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')    

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vijir\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vijir\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Step 1: Load the dataset
data = pd.read_csv('email_spam.csv', encoding='latin-1', dtype=str, keep_default_na=False, na_values=[], skipinitialspace=False, index_col=False)  # Update file path as needed
#data = pd.read_csv('email_spam.csv', encoding='ISO-8859-1')  # Update file path as needed

In [4]:

# Step 2: Preprocessing Function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    #text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize the text
    words = text.split()
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in text.split() if word not in stop_words]
    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    # Join the words back into a single string
    return ' '.join(words)


In [5]:

# Apply preprocessing to the text column
# Assuming the email text is in the column named 'v1' or 'v2' based on the global variable output.
# Change 'v1' to the actual column name if it's different.
data['cleaned_text'] = data['v2'].apply(preprocess_text) # Changed 'text' to 'v2' - Update to correct column name if needed. Print data.columns to confirm column names.


In [6]:

# Step 3: Prepare Features and Labels
texts = data['cleaned_text']  # Use cleaned text for modeling
labels = data['v1']  # Replace 'label' with the actual label column name - Assuming v1 contains the label


In [7]:

# Step 4: Vectorize the text data
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(texts)

In [8]:

# Step 5: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, labels, test_size=0.40, random_state=42)


In [9]:

# Step 6: Train the Decision Tree Classifier
model = MultinomialNB()
model.fit(X_train, y_train)


In [10]:

# Step 7: Make predictions
y_pred = model.predict(X_test)

In [11]:

# Step 8: Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9547085201793722
Classification Report:
               precision    recall  f1-score   support

         ham       0.95      1.00      0.97      1932
        spam       1.00      0.66      0.80       298

    accuracy                           0.95      2230
   macro avg       0.98      0.83      0.89      2230
weighted avg       0.96      0.95      0.95      2230



In [12]:
# Step 9: Test the model with new data
new_messages = ["win cash prize worth $4000", "You're owed a refund!", "You are fool", "Sunshine Quiz Wkly Q!"]
new_vectors = vectorizer.transform(new_messages)  # Transform new messages to numerical vectors
new_predictions = model.predict(new_vectors)     # Predict spam/ham labels
new_texts = vectorizer.inverse_transform(new_vectors)  # Map vectors back to corresponding words

# Print predictions and the corresponding words
#print("New Predictions:", new_predictions)

for i, text in enumerate(new_texts):
    print(f"Message {i + 1}: Predicted as {new_predictions[i]}, Words: {' '.join(text)}")
    #print(f"Message {i + 1}:, Words: {' '.join(text)}, Predicted as {new_predictions[i]}")


Message 1: Predicted as spam, Words: cash prize win worth
Message 2: Predicted as ham, Words: owed refund
Message 3: Predicted as ham, Words: fool
Message 4: Predicted as spam, Words: quiz sunshine wkly


In [13]:
# Step 9: Test the model with new data
new_messages = ["Congratulations, you won a free ticket!", "Hey, how are you doing today?", "Sunshine Quiz Wkly Q!"]
new_messages_cleaned = [preprocess_text(msg) for msg in new_messages]
new_vectors = vectorizer.transform(new_messages_cleaned)
new_predictions = model.predict(new_vectors)

print("New Predictions:", new_predictions)

New Predictions: ['ham' 'ham' 'spam']
