In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pickle

In [6]:
# Read the dataset from spam.csv (relative to the working directory) into `df`.
# NOTE(review): latin-1 is requested explicitly, presumably because the file is
# not valid UTF-8 — confirm against the data source.
print("Loading dataset...")
df = pd.read_csv("spam.csv", encoding="latin-1")

Loading dataset...


In [7]:
# Give the raw columns descriptive names and keep only those two;
# any other columns read from the CSV are dropped here.
column_names = {"v1": "label", "v2": "message"}
df = df.rename(columns=column_names)[["label", "message"]]

In [8]:
# Encode labels: 'spam' = 1, 'ham' = 0
# The mapping also accepts the already-encoded values 0 and 1 so that re-running
# this cell is a no-op; with only the string keys, a second run would turn every
# label into NaN because Series.map yields NaN for values missing from the dict.
label_codes = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
encoded_labels = df['label'].map(label_codes)

# Fail loudly on labels outside the mapping (missing values, other spellings)
# instead of handing NaN targets to the later training cell.
unmapped = encoded_labels.isna()
if unmapped.any():
    unexpected = df.loc[unmapped, 'label'].unique().tolist()
    raise ValueError(f"Unexpected label values, cannot encode: {unexpected}")

df['label'] = encoded_labels


In [9]:
# Progress message marking the end of the loading and preprocessing cells.
print("Dataset loaded and processed!")

Dataset loaded and processed!


In [10]:
# Step 2: Split Data
# Features are the raw message text; targets are the encoded labels.
X, y = df['message'], df['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Progress message for the train/test split step.
print("Data split into training and testing sets.")

Data split into training and testing sets.


In [12]:
# Step 3: Vectorize Text Data
# The vocabulary is learned from the training messages only; the test messages
# are then encoded with that same vocabulary.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(raw_documents=X_train)
X_test_vec = vectorizer.transform(raw_documents=X_test)

In [13]:
# Progress message for the vectorization step.
print("Text data vectorized.")

Text data vectorized.


In [14]:
# Step 4: Train the Model
# Fit a multinomial Naive Bayes classifier on the training count matrix.
model = MultinomialNB()
model.fit(X=X_train_vec, y=y_train)

In [15]:
# Progress message for the training step.
print("Model trained!")

Model trained!


In [16]:
# Step 5: Evaluate the Model
# Predict labels for the held-out messages and report the accuracy score,
# formatted to two decimal places.
y_pred = model.predict(X=X_test_vec)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.99


In [27]:
# Show the dimensions of both vectorized matrices; the second caption refers
# to the vectorized test split.
for caption, matrix in (
    ("Shape of training data vector:", X_train_vec),
    ("Shape of input vector:", X_test_vec),
):
    print(caption, matrix.shape)


Shape of training data vector: (4457, 7733)
Shape of input vector: (1115, 7733)


In [32]:
# Step 6: Save the Model and Vectorizer
# Persist the fitted classifier so the usage demo further down can reload it
# from disk. The file has to be opened for binary *writing* and the model passed
# to pickle.dump; opening it with "rb" and calling pickle.load never saves
# `model` and raises FileNotFoundError on a fresh run where the file does not
# exist yet.
with open("spam_classifier.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [33]:
# Sanity check on the in-memory `model` fitted earlier in the notebook;
# this does not inspect the pickle file.
print(type(model))


<class 'sklearn.naive_bayes.MultinomialNB'>


In [34]:
# Serialize the fitted vectorizer to vectorizer.pkl so new messages can later
# be encoded with the same vocabulary that was learned from the training data.
with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [35]:
# Sanity check on the in-memory `vectorizer` object.
print(type(vectorizer))

<class 'sklearn.feature_extraction.text.CountVectorizer'>


In [36]:
# Status message for the save step; it reports fixed file names and does not
# itself verify that the files were written.
print("Model and vectorizer saved as 'spam_classifier.pkl' and 'vectorizer.pkl'.")


Model and vectorizer saved as 'spam_classifier.pkl' and 'vectorizer.pkl'.


In [37]:
# Step 7: Demonstrate Usage
# Announces the demo; the cells that follow reload both pickle files and
# classify one sample message.
print("Demonstrating usage of the saved model and vectorizer...")

Demonstrating usage of the saved model and vectorizer...


In [42]:
# Reload the classifier from spam_classifier.pkl into `loaded_model`.
# pickle.load can execute arbitrary code embedded in the file, so only load
# pickle files that come from a trusted source.
with open("spam_classifier.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [39]:
# Reload the vectorizer from vectorizer.pkl into `loaded_vectorizer`.
# As with any pickle file, only load it if it comes from a trusted source.
with open("vectorizer.pkl", "rb") as vectorizer_file:
    loaded_vectorizer = pickle.load(vectorizer_file)

In [24]:
# Test with a new message
test_message = ["Congratulations! You've won a free gift card."]

# Encode the message with the reloaded vectorizer, then classify it with the
# reloaded model.
test_message_vec = loaded_vectorizer.transform(test_message)
prediction = loaded_model.predict(test_message_vec)

# Label 1 is spam (see the label encoding); anything else is reported as not spam.
if prediction[0] == 1:
    result = "Spam"
else:
    result = "Not Spam"

In [25]:
# Report the sample message and its predicted class, one per line.
print(
    f"Test Message: {test_message[0]}",
    f"Prediction: {result}",
    sep="\n",
)

Test Message: Congratulations! You've won a free gift card.
Prediction: Spam
