In [177]:
# Import necessary libraries
import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression

In [178]:
# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')    

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vijir\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vijir\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [179]:
# Step 1: Load the dataset
data = pd.read_csv('email_spam.csv', encoding='latin-1', dtype=str, keep_default_na=False, na_values=[], skipinitialspace=False, index_col=False)  # Update file path as needed

In [None]:

# Step 2: Preprocessing Function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize the text
    words = text.split()
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    # Join the words back into a single string
    return ' '.join(words)


In [181]:

# Apply preprocessing to the text column
# Assuming the email text is in the column named 'v1' or 'v2' based on the global variable output.
# Change 'v1' to the actual column name if it's different.
data['cleaned_text'] = data['v2'].apply(preprocess_text) # Changed 'text' to 'v2' - Update to correct column name if needed. Print data.columns to confirm column names.


UnboundLocalError: cannot access local variable 'words' where it is not associated with a value

In [None]:

# Step 3: Prepare Features and Labels
texts = data['v2']  # Use cleaned text for modeling
labels = data['v1']  # Replace 'label' with the actual label column name - Assuming v1 contains the label


In [None]:
print(texts.head())

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: v2, dtype: object


In [None]:
print(labels.tail())

5569     ham
5570     ham
5571     ham
5572    spam
5573    spam
Name: v1, dtype: object


In [None]:

# Step 4: Vectorize the text data
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(texts)

In [None]:
print(x)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 73923 stored elements and shape (5574, 8672)>
  Coords	Values
  (0, 3550)	1
  (0, 8030)	1
  (0, 4350)	1
  (0, 5920)	1
  (0, 2327)	1
  (0, 1303)	1
  (0, 5537)	1
  (0, 4087)	1
  (0, 1751)	1
  (0, 3634)	1
  (0, 8489)	1
  (0, 4476)	1
  (0, 1749)	1
  (0, 2048)	1
  (0, 7645)	1
  (0, 3594)	1
  (0, 1069)	1
  (0, 8267)	1
  (1, 5504)	1
  (1, 4512)	1
  (1, 4318)	1
  (1, 8392)	1
  (1, 5533)	1
  (2, 4087)	1
  (2, 3358)	1
  :	:
  (5570, 1438)	1
  (5570, 5334)	1
  (5570, 2592)	1
  (5570, 8065)	1
  (5570, 1778)	1
  (5570, 7049)	1
  (5570, 2892)	1
  (5570, 3470)	1
  (5570, 1786)	1
  (5570, 3687)	1
  (5570, 4161)	1
  (5570, 903)	1
  (5570, 1546)	1
  (5571, 7756)	1
  (5571, 5244)	1
  (5571, 4225)	2
  (5571, 7885)	1
  (5571, 6505)	1
  (5572, 1224)	1
  (5572, 7669)	1
  (5572, 6853)	1
  (5572, 7105)	1
  (5573, 8609)	1
  (5573, 1183)	1
  (5573, 3299)	1


In [None]:

# Step 5: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, labels, test_size=0.40, random_state=42)


In [None]:

# Step 6: Train the Decision Tree Classifier
model = MultinomialNB()
model.fit(X_train, y_train)


In [None]:
print (X_test)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 29563 stored elements and shape (2230, 8672)>
  Coords	Values
  (0, 3358)	1
  (0, 3308)	1
  (0, 4939)	1
  (0, 8615)	1
  (0, 1813)	2
  (0, 5879)	1
  (0, 1054)	1
  (0, 8222)	1
  (0, 5529)	1
  (0, 6914)	1
  (1, 7756)	2
  (1, 3207)	1
  (1, 5420)	1
  (1, 8609)	2
  (1, 4206)	1
  (1, 8416)	1
  (1, 6073)	1
  (1, 2054)	1
  (1, 1813)	1
  (1, 5570)	1
  (1, 7669)	1
  (1, 5606)	1
  (1, 1183)	1
  (1, 299)	1
  (1, 3666)	1
  :	:
  (2228, 8095)	1
  (2228, 8425)	1
  (2228, 3797)	2
  (2228, 5026)	2
  (2228, 3317)	1
  (2228, 2830)	1
  (2229, 1084)	1
  (2229, 8609)	4
  (2229, 4206)	2
  (2229, 7621)	1
  (2229, 3563)	1
  (2229, 8362)	1
  (2229, 1079)	1
  (2229, 2675)	1
  (2229, 6259)	1
  (2229, 2769)	2
  (2229, 3010)	1
  (2229, 7609)	2
  (2229, 3004)	1
  (2229, 5411)	1
  (2229, 3134)	1
  (2229, 1124)	1
  (2229, 4036)	1
  (2229, 6972)	1
  (2229, 4871)	1


In [None]:

# Step 7: Make predictions
y_pred = model.predict(X_test)

In [None]:
print(y_pred)

['ham' 'spam' 'ham' ... 'ham' 'ham' 'ham']


In [None]:

# Step 8: Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9811659192825112
Classification Report:
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99      1932
        spam       0.93      0.93      0.93       298

    accuracy                           0.98      2230
   macro avg       0.96      0.96      0.96      2230
weighted avg       0.98      0.98      0.98      2230



In [None]:
# Step 9: Test the model with new data
new_messages = ["win cash prize worth $4000", "You're owed a refund!", "You are fool"]
new_vectors = vectorizer.transform(new_messages)
new_predictions = model.predict(new_vectors)

print("New Predictions:", new_predictions)

New Predictions: ['spam' 'ham' 'ham']
