In [3]:
import pandas as pd

df = pd.read_csv("data/emails_labeled.csv")
df.head()
print(df['label'].value_counts())


label
Spam           79701
Billing        72093
Social         25934
Personal        8470
Work            5405
Technical       5336
Promotional     3061
Name: count, dtype: int64


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

X = df['message']      # features (email text)
y = df['label']     # target (category)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')  
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Evaluate
print("Naive Bayes Accuracy:", nb_model.score(X_test_tfidf, y_test))

Naive Bayes Accuracy: 0.625775


In [7]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)

# Evaluate
print("Logistic Regression Accuracy:", lr_model.score(X_test_tfidf, y_test))


Logistic Regression Accuracy: 0.863975


In [8]:
sample = ["I can’t log into my account, please help!",
          "Your invoice for the month is attached.", 
          "Let's have lunch tomorrow.",
          "There was an error processing your payment.",
          "I am facing issues with the new update.",
          "Congratulations! You've won a free trip to Bahamas! Click here to claim.",
          "Don't forget our meeting next week.",
          "This is not spam, just a regular email.",
          "Your subscription will expire soon, renew now to continue enjoying our services.",
          "Win a brand new car by entering our contest! Limited time offer!",
            "Get rich quick with this one simple trick!"]
sample_tfidf = vectorizer.transform(sample)

print("Naive Bayes Prediction:", nb_model.predict(sample_tfidf))
print("LogReg Prediction:", lr_model.predict(sample_tfidf))


Naive Bayes Prediction: ['Billing' 'Billing' 'Spam' 'Billing' 'Billing' 'Spam' 'Spam' 'Billing'
 'Billing' 'Billing' 'Billing']
LogReg Prediction: ['Spam' 'Billing' 'Personal' 'Billing' 'Technical' 'Billing' 'Personal'
 'Spam' 'Billing' 'Billing' 'Spam']
