In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Sample data
data = {
    "text": [
        "Win a free vacation! Call now to claim your prize!",
        "Reminder: Your bill is due tomorrow.",
        "Congratulations! You've been selected for a $1,000 gift card.",
        "Don't forget the team meeting at 3 PM.",
        "Exclusive deal just for you! Buy now and save 50%.",
        "Can we reschedule our appointment to next week?",
        "Claim your free trial for our premium service today!",
        "Your Amazon order has been shipped.",
        "Get cheap medications online with no prescription!",
        "Here's the report you requested."
    ],
    "label": [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
}

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv("email_dataset.csv", index=False)

print("email_dataset.csv has been created.")

# Check for missing values
if df.isnull().sum().sum() > 0: # Use df instead of data
    print("Dataset contains missing values. Please handle them before proceeding.")
    exit()

# Preprocess the data
# Convert to lowercase, remove punctuation, and strip extra whitespace
df['text'] = df['text'].str.lower().str.replace(r'[^\w\s]', '', regex=True).str.strip()
# Use df['text'] instead of data['text'] as you want to modify the DataFrame

# Rest of your code...
# Tokenization and Feature Extraction
vectorizer = CountVectorizer(stop_words='english')
X_counts = vectorizer.fit_transform(data['text'])

# Apply TF-IDF transformation
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

# Define features and target variable
X = X_tfidf
y = data['label']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model using Naive Bayes
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Evaluation Metrics:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Example usage
sample_emails = ["Win a free iPhone now!", "Meeting is scheduled at 10 AM."]
sample_emails_tfidf = tfidf_transformer.transform(vectorizer.transform(sample_emails))
predictions = model.predict(sample_emails_tfidf)

print("\nSample Predictions:")
for email, pred in zip(sample_emails, predictions):
    print(f"Email: {email} -> {'Spam' if pred == 1 else 'Not Spam'}")

email_dataset.csv has been created.
Evaluation Metrics:
Accuracy: 0.50
Precision: 0.00
Recall: 0.00
F1 Score: 0.00

Classification Report:
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2


Sample Predictions:
Email: Win a free iPhone now! -> Spam
Email: Meeting is scheduled at 10 AM. -> Not Spam


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
