In [1]:
# Import libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Step 1: Load the dataset
print("Loading dataset...")
df = pd.read_csv(r"C:\Users\varsh\Downloads\spam.csv", encoding='latin-1')
df = df.rename(columns={"v1": "label", "v2": "message"})  # Rename columns
# Take an explicit copy of the two-column selection so the label assignment
# below writes to an independent frame (avoids pandas' SettingWithCopyWarning).
df = df[['label', 'message']].copy()  # Keep only relevant columns
df['label'] = df['label'].map({'spam': 1, 'ham': 0})  # Convert labels to binary

print(f"Dataset loaded. Total messages: {len(df)}")
print(f"Spam messages: {df['label'].sum()}, Non-spam messages: {len(df) - df['label'].sum()}")

# Step 2: Text vectorization
# Bag-of-words counts with English stop words removed. The result is kept as
# the sparse matrix returned by fit_transform: converting it with .toarray()
# would allocate a dense (n_messages x vocabulary_size) array that is almost
# entirely zeros, and both train_test_split and MultinomialNB accept sparse
# input directly.
# NOTE(review): the vocabulary is fitted on the full dataset before the
# train/test split, so test messages contribute to it. Fitting on the training
# portion only would be the stricter evaluation protocol - confirm whether the
# current behaviour is intended before changing it (it would alter the
# reported accuracy).
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['message'])
y = df['label']

# Step 3: Train-test split
# 70/30 split; the fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 4: Train the model
model = MultinomialNB()
model.fit(X_train, y_train)
print("Model trained successfully.")

# Step 5: Evaluate the model
# Accuracy on the held-out 30% of messages.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy * 100:.2f}%")

# Classify a single message and report the verdict
def predict_spam(email):
    """Print whether *email* is classified as spam.

    The text is encoded with the module-level ``vectorizer`` and scored by
    the module-level ``model``. A predicted label of 1 is reported as
    "Spam"; any other label is reported as "Not Spam". Nothing is returned.
    """
    # transform() expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([email]).toarray()
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        verdict = 'Spam'
    else:
        verdict = 'Not Spam'
    print(f"Prediction for email: {verdict}")

# Step 6: Test with a new email
# Run the classifier on two hand-written example messages, in order.
print("\nTesting with sample emails...")
for sample_email in (
    "Congratulations! You've won a free ticket. Claim now!",
    "Hi, can we schedule a meeting tomorrow?",
):
    predict_spam(sample_email)

Loading dataset...
Dataset loaded. Total messages: 5572
Spam messages: 747, Non-spam messages: 4825
Model trained successfully.
Model accuracy: 97.73%

Testing with sample emails...
Prediction for email: Spam
Prediction for email: Not Spam
