In [None]:
# Load needed dependencies
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import pickle

In [None]:
# Load the two spam CSV files and stack them into a single frame.
df1 = pd.read_csv('data/spam_1.csv', quotechar='"')
df2 = pd.read_csv('data/spam_2.csv', quotechar='"')

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent. Only the 'Category' and 'Message'
# columns of the second file are kept, and ignore_index=True renumbers
# the rows 0..n-1 so the combined frame has no duplicate index labels.
df = pd.concat([df1, df2[['Category', 'Message']]], ignore_index=True)
df

In [None]:
# Pull the text column (features) and the label column (targets) out of df
X, y = df['Message'], df['Category']

In [None]:
# Split data into training (80%) and testing (20%) sets.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split, and therefore the same accuracy and the same exported model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Create a CountVectorizer to convert text into numerical features.
# No arguments are passed, so the library defaults apply. The object is only
# constructed here; it is fitted on the training data in a later cell.
vectorizer = CountVectorizer()

In [None]:
# Fit the vectorizer on the training messages only (the test split is not
# passed to fit)
vectorizer.fit(raw_documents=X_train)

In [None]:
# Run both splits through the already-fitted vectorizer, training set first
X_train_vectorized, X_test_vectorized = (
    vectorizer.transform(texts) for texts in (X_train, X_test)
)

In [None]:
# Build a multinomial Naive Bayes model and fit it on the vectorized training
# data in one step (fit returns the estimator itself)
classifier = MultinomialNB().fit(X_train_vectorized, y_train)
classifier

In [None]:
# Run the trained classifier over the vectorized test messages
y_pred = classifier.predict(X=X_test_vectorized)

In [None]:
# Score the classifier on the held-out test split and report the result
accuracy = classifier.score(X=X_test_vectorized, y=y_test)
print("Accuracy:", accuracy)

In [None]:
# Use the model to classify a new email
new_email = "Subject: Buy now! Get 20% off on all electronics!"

# Vectorize the new email with the already-fitted vectorizer (transform only,
# no re-fit); transform expects an iterable of documents, hence the list.
new_email_vectorized = vectorizer.transform([new_email])

# Predict the class label and the per-class probabilities for the new email
prediction = classifier.predict(new_email_vectorized)
prediction_proba = classifier.predict_proba(new_email_vectorized)

# Show the predicted label, which was previously computed but never displayed.
print("Predicted label:", prediction[0])

# predict_proba columns are ordered like classifier.classes_, so pair each
# probability with its label instead of relying on column position alone.
for label, proba in zip(classifier.classes_, prediction_proba[0]):
    print(f"  P({label}) = {proba:.2%}")

# NOTE(review): column 1 is the spam probability only if the spam label is the
# second entry of classifier.classes_ — confirm against the listing above.
print("Spam %:", prediction_proba[:, 1] * 100)

In [None]:
# Persist the trained classifier together with its fitted vectorizer as a
# single (classifier, vectorizer) tuple in one pickle file
with open('spam_model_bayes.pickle', mode='wb') as out_file:
    pickle.dump((classifier, vectorizer), out_file)