In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# Step 1: Load the dataset
data = pd.read_csv('/content/email_spam.csv', encoding='latin-1')  # Update file path as needed

In [3]:
# Inspect the data (Optional)
print(data.head())


     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [4]:

# Step 2: Preprocessing
# Assuming the dataset has 'v2' and 'v1' columns
# **Change:** Replace 'text' with 'v2' - this is the actual column name for messages in the dataset
texts = data['v2']
# **Change:** Replace 'label' with 'v1' - this is the actual column name for spam/ham labels
labels = data['v1']


In [6]:

# Step 3: Vectorize the text data
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(texts)

In [7]:
print(x)

  (0, 3550)	0.1481298737377147
  (0, 8030)	0.22998520738984352
  (0, 4350)	0.3264252905795869
  (0, 5920)	0.2553151503985779
  (0, 2327)	0.25279391746019725
  (0, 1303)	0.24415547176756056
  (0, 5537)	0.15618023117358304
  (0, 4087)	0.10720385321563428
  (0, 1751)	0.2757654045621182
  (0, 3634)	0.1803175103691124
  (0, 8489)	0.22080132794235655
  (0, 4476)	0.2757654045621182
  (0, 1749)	0.3116082237740733
  (0, 2048)	0.2757654045621182
  (0, 7645)	0.15566431601878158
  (0, 3594)	0.15318864840197105
  (0, 1069)	0.3264252905795869
  (0, 8267)	0.18238655630689804
  (1, 5504)	0.27211951321382544
  (1, 4512)	0.4082988561907181
  (1, 4318)	0.5236458071582338
  (1, 8392)	0.4316010362639011
  (1, 5533)	0.5465881710238072
  (2, 4087)	0.07917128722158312
  (2, 3358)	0.11301399735581102
  :	:
  (5570, 4218)	0.12246610191126918
  (5570, 8313)	0.18723687600522523
  (5570, 1084)	0.11225268140936363
  (5570, 4615)	0.1596552981734164
  (5570, 7039)	0.18426763178390446
  (5570, 3308)	0.1217217261863451

In [8]:

# Step 4: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, labels, test_size=0.40, random_state=42)

In [9]:
# Step 5: Train the Decision Tree Classifier
model = LogisticRegression()
model.fit(X_train, y_train)

In [10]:
# Step 6: Make predictions
pred = model.predict(X_test)

In [11]:
# Step 7: Evaluate the model
print("Accuracy:", accuracy_score(y_test, pred))
print("Classification Report:\n", classification_report(y_test, pred))

Accuracy: 0.955585464333782
Classification Report:
               precision    recall  f1-score   support

         ham       0.95      1.00      0.97      1930
        spam       0.99      0.68      0.80       299

    accuracy                           0.96      2229
   macro avg       0.97      0.84      0.89      2229
weighted avg       0.96      0.96      0.95      2229



In [12]:
# Step 8: Test the model with new data
new_messages = ["win cash prize worth $4000", "You're owed a refund!", "will you complete your task tomorrow?", "Congratulations! You've won a free trip.", "Meeting rescheduled to tomorrow."]
new_vectors = vectorizer.transform(new_messages)
new_predictions = model.predict(new_vectors)

print("New Predictions:", new_predictions)


New Predictions: ['spam' 'ham' 'ham' 'ham' 'ham']
