<a href="https://colab.research.google.com/github/thetestcoder/ml-projects/blob/main/3_ML_Spam_Email_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!mkdir ~/.kaggle

!cp kaggle.json ~/.kaggle

!chmod 600 ~/.kaggle/kaggle.json

In [2]:
# Fetch the spam-mail dataset archive from Kaggle into the working directory.
!kaggle datasets download ue153011/spam-mail-detection-dataset

Downloading spam-mail-detection-dataset.zip to /content
  0% 0.00/208k [00:00<?, ?B/s]
100% 208k/208k [00:00<00:00, 103MB/s]


In [3]:
!unzip spam-mail-detection-dataset.zip

Archive:  spam-mail-detection-dataset.zip
  inflating: spam_mail_data.csv      


In [4]:
# Read the extracted CSV into a DataFrame and preview its first rows.
import pandas as pd

df = pd.read_csv(filepath_or_buffer="spam_mail_data.csv")

df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Per-column summary (count / unique / top / freq for these text columns).
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [6]:
# Column dtypes and non-null counts, to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [8]:
from nltk.stem.snowball import stopwords
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string

# Fetch the NLTK data that preprocess_text relies on: tokenizer models for
# word_tokenize and the English stop-word list.
# Recent NLTK releases load `punkt_tab` (not the older pickled `punkt`) inside
# word_tokenize, so both are requested to keep the cell working across versions.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
  nltk.download(nltk_resource)


# Translation table that deletes every character in string.punctuation.
# Built once instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Lazily-filled cache for the NLTK helpers, so the stemmer and the stop-word
# list are created on the first call rather than once per message.
_TEXT_TOOLS = {}


def _get_text_tools():
  """Return the shared (stemmer, stop-word set), creating them on first use."""
  if not _TEXT_TOOLS:
    _TEXT_TOOLS['stemmer'] = PorterStemmer()
    _TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
  return _TEXT_TOOLS['stemmer'], _TEXT_TOOLS['stop_words']


def preprocess_text(text):
  """Normalize one message into a space-separated string of stemmed tokens.

  Steps, in order: lowercase, delete punctuation, tokenize with
  word_tokenize, drop English stop words, Porter-stem what remains.

  Args:
    text: The raw message as a str.

  Returns:
    The surviving stemmed tokens joined by single spaces (an empty string
    when no token survives).
  """
  stemmer, stop_words = _get_text_tools()

  # NOTE(review): punctuation is removed *before* stop-word filtering, so any
  # stop-word entry that contains an apostrophe cannot match its stripped
  # form here -- confirm whether that is intended.
  cleaned = text.lower().translate(_PUNCT_TABLE)

  stemmed_tokens = [
      stemmer.stem(token)
      for token in word_tokenize(cleaned)
      if token not in stop_words
  ]
  return ' '.join(stemmed_tokens)


# Store the cleaned, stemmed version of every message in a new column.
df['processed_message'] = df['Message'].map(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Features are the preprocessed messages; the target is the Category label.
X, y = df['processed_message'], df['Category']

# Hold out 20% of the rows for testing, with a fixed seed so the split is
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vectorizer on the training split only, then reuse that
# fitted vectorizer to encode the test split.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

# Candidate values for the Naive Bayes additive (Laplace) smoothing parameter.
param_grid = dict(alpha=[0.1, 0.5, 1.0])

# Base estimator whose hyperparameters are tuned below.
naive_bayes_classifier = MultinomialNB()

# Search every grid value, scoring by accuracy with 5-fold cross-validation
# on the training features.
grid_search = GridSearchCV(
    estimator=naive_bayes_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train_tfidf, y_train)

# Report the winning setting and keep the corresponding estimator.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)
best_model = grid_search.best_estimator_

Best Hyperparameters: {'alpha': 0.1}


In [13]:
# model selection, training and evaluation

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# No extra `fit` here: `best_model` is the grid search's `best_estimator_`,
# and the search above was created without overriding `refit`, so
# (per scikit-learn's default refit=True) it has already been refit on the
# full training set with the best alpha.
y_pred = best_model.predict(X_test_tfidf)

# Precision/recall/F1 are reported for the 'spam' class.
positive_label = 'spam'

# Calculate evaluation metrics on the held-out test split.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=positive_label)
recall = recall_score(y_test, y_pred, pos_label=positive_label)
f1 = f1_score(y_test, y_pred, pos_label=positive_label)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


Accuracy: 0.9829596412556054
Precision: 0.9924242424242424
Recall: 0.8791946308724832
F1-score: 0.9323843416370107
