In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os

import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import nn

# download data

In [None]:
from google.colab import drive

# Google Drive has to be mounted before Colab can read anything stored on it.
drive.mount('/drive')

# Print every file under the notebook folder so the dataset location can be
# checked by eye before loading it.
colab_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/drive/MyDrive/Colab Notebooks')
    for name in names
)
for path in colab_files:
    print(path)

# Read the e-mail dataset and show its first rows.
df = pd.read_csv("/drive/MyDrive/Colab Notebooks/emails.csv")
df.head()


In [None]:
# Download every NLTK resource the preprocessing step relies on, in one place.
# 'punkt_tab' is fetched here as well: word_tokenize needs it on recent NLTK
# releases, so having it in the setup cell means tokenization works no matter
# which later cell is run first.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
# Preview three raw e-mails. The fixed seed (same value as the train/test
# split) makes the preview identical on every Restart & Run All.
randomSample = df["text"].sample(3, random_state=42)
for text in randomSample:
  print(text,'\n')

In [None]:
# Preprocessing function

# Lazily-filled cache for the stop-word set and the stemmer. Building them once
# instead of once per e-mail avoids re-reading the stop-word list on every call.
_TEXT_RESOURCES = {}


def _get_text_resources():
  """Return the (stop-word set, stemmer) pair, creating it on first use."""
  if not _TEXT_RESOURCES:
    _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    _TEXT_RESOURCES['stemmer'] = PorterStemmer()
  return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['stemmer']


def preprocessing_text(text):
  """Normalize one e-mail into a space-separated string of stemmed tokens.

  Steps: lowercase, tokenize with ``word_tokenize``, keep only alphanumeric
  tokens, drop English stop words, then apply Porter stemming.

  Args:
    text: The raw e-mail text (a scalar). Missing values (``None``/NaN)
      produce an empty string; any other non-string scalar is converted
      with ``str()`` first.

  Returns:
    The processed text, with tokens joined by single spaces.
  """
  if not isinstance(text, str):
    # A missing cell would otherwise crash on .lower(); treat it as an
    # empty document instead.
    if pd.isna(text):
      return ''
    text = str(text)

  stop_words, stemmer = _get_text_resources()

  # Lowercase first so the stop-word lookup below is case-insensitive.
  tokens = word_tokenize(text.lower())

  # Drop punctuation / non-alphanumeric tokens and stop words, then stem.
  kept = [word for word in tokens
          if word.isalnum() and word not in stop_words]
  stemmed = [stemmer.stem(word) for word in kept]

  # Join tokens back into a processed text
  return ' '.join(stemmed)

# Download punkt_tab (tokenizer data) before the first word_tokenize call below.
nltk.download('punkt_tab')

# Store the cleaned text in a new column next to the raw text.
df['processed_text'] = df['text'].apply(preprocessing_text)

# Preview five processed e-mails; the fixed seed keeps the preview reproducible
# across notebook re-runs.
df['processed_text'].sample(5, random_state=42)

In [None]:
# Features are the preprocessed e-mail texts; the target is the spam flag.
x_text, y = df['processed_text'], df['spam']

# Hold out 20% of the e-mails for testing; the seed keeps the split repeatable.
split_parts = train_test_split(x_text, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Count spam (label 1) and non-spam (label 0) e-mails in each split.
# Counting with a boolean mask yields 0 for a class that is absent from a
# split, whereas indexing value_counts() by label would raise a KeyError.
label_splits = {"Train": y_train, "Test": y_test}

x_labels = list(label_splits)
spam_counts = [int((labels == 1).sum()) for labels in label_splits.values()]
non_spam_counts = [int((labels == 0).sum()) for labels in label_splits.values()]

# Stacked bars: spam at the bottom, non-spam stacked on top of it.
plt.bar(x_labels, spam_counts, label='Spam')
plt.bar(x_labels, non_spam_counts, bottom=spam_counts, label='Not Spam')
plt.title('Spam vs Not Spam in Train and Test Sets')
plt.xlabel('Dataset')
plt.ylabel('Count')
plt.legend()
plt.show()

In [None]:
# Bag-of-words features: learn the vocabulary on the training texts only, then
# reuse that vocabulary to encode the test texts.
vectorize = CountVectorizer()
x_train_vectorized = vectorize.fit_transform(x_train)
x_test_vectorized = vectorize.transform(x_test)

# The sparse matrices expose .shape directly, so there is no need to build a
# dense copy of each matrix just to report its dimensions.
print(f"X_train_vec: {x_train_vectorized.shape}")
print(f"X_test_vec: {x_test_vectorized.shape}")

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
backend_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(backend_name)
device

In [None]:
# Convert the vectorized features and the labels to PyTorch tensors.
# Every tensor shares the same dtype and lives on the selected device.
tensor_options = {'dtype': torch.float32, 'device': device}

X_train_tensor = torch.tensor(x_train_vectorized.toarray(), **tensor_options)
y_train_tensor = torch.tensor(y_train.values, **tensor_options)

X_test_tensor = torch.tensor(x_test_vectorized.toarray(), **tensor_options)
y_test_tensor = torch.tensor(y_test.values, **tensor_options)

In [None]:
# Mini-batch size shared by both loaders.
batch_size = 64

# Training data: paired (features, label) tensors, reshuffled on every pass.
train_set = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)

# Test data: same pairing, kept in its original order.
test_set = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

In [None]:
import numpy as np

class MultinomialNB:
    """Multinomial Naive Bayes classifier for whitespace-tokenized documents.

    Each document is a string that is split on whitespace; the resulting
    tokens are the features. After ``fit`` the following attributes are set:

    * ``classes_``     - set of class labels seen during training
    * ``vocabulary_``  - set of distinct training tokens
    * ``n_vocab_``     - size of ``vocabulary_``
    * ``priors_``      - ``{class: P(class)}``
    * ``likelihoods_`` - ``{class: {word: P(word | class)}}`` with additive
      smoothing controlled by ``alpha``
    """

    def __init__(self, alpha = 1):
        """
        Create an unfitted classifier.

        alpha: Additive (Laplace) smoothing parameter; must be >= 0.

        Raises ValueError if alpha is negative.
        """
        if alpha < 0:
            raise ValueError(f"alpha must be non-negative, got {alpha!r}")
        self.alpha = alpha
        self._reset()

    def _reset(self):
        """Clear everything learned so far so that fit() starts from scratch."""
        self.priors_ = {}
        self.likelihoods_ = {}
        self.classes_ = set()
        self.vocabulary_ = set()
        self.n_vocab_ = 0
        # Log-space copies of priors_/likelihoods_, filled by fit() so that
        # prediction does not recompute logarithms for every token.
        self._log_priors = {}
        self._log_likelihoods = {}

    def fit(self, X_train, y_train):
        """
        Estimate class priors and smoothed word likelihoods.

        X_train: iterable of documents (strings).
        y_train: iterable of class labels, aligned with X_train.

        Any previously learned state is discarded first, so calling fit()
        again on the same instance is safe.

        Raises ValueError if both inputs are sized and their lengths differ.
        """
        if (hasattr(X_train, '__len__') and hasattr(y_train, '__len__')
                and len(X_train) != len(y_train)):
            raise ValueError(
                f"X_train and y_train must have the same length, "
                f"got {len(X_train)} and {len(y_train)}"
            )

        self._reset()

        n_docs = 0
        class_counts = {}          # class -> number of documents
        word_counts_by_class = {}  # class -> {word: occurrences in that class}

        for doc, c in zip(X_train, y_train):
            # Count documents actually consumed so the priors always match
            # the data that was seen.
            n_docs += 1
            class_counts[c] = class_counts.get(c, 0) + 1

            counts = word_counts_by_class.setdefault(c, {})
            for word in doc.split():
                counts[word] = counts.get(word, 0) + 1
                self.vocabulary_.add(word)

        self.classes_ = set(class_counts)
        self.n_vocab_ = len(self.vocabulary_)

        # errstate: with alpha == 0 an unseen word has probability 0, whose
        # log is -inf; that is the intended score, so silence the warning.
        with np.errstate(divide='ignore'):
            # class_counts keeps first-seen order, so priors_ (and therefore
            # tie-breaking in _predict_one) is deterministic.
            for c, n_c in class_counts.items():
                prior = n_c / n_docs
                self.priors_[c] = prior
                self._log_priors[c] = float(np.log(prior))

                counts = word_counts_by_class[c]
                # Shared denominator for class c: N_c + alpha * |V|
                denom = sum(counts.values()) + (self.alpha * self.n_vocab_)

                probs = {}
                log_probs = {}
                # P(word | c) for EVERY word in the vocabulary
                for word in self.vocabulary_:
                    p = (counts.get(word, 0) + self.alpha) / denom
                    probs[word] = p
                    log_probs[word] = float(np.log(p))

                self.likelihoods_[c] = probs
                self._log_likelihoods[c] = log_probs

    def _predict_one(self, doc):
        """
        Predict the class of ONE document (internal helper).

        Tokens that are not in the training vocabulary are ignored.
        Raises ValueError if the model has not been fitted.
        """
        if not self._log_priors:
            raise ValueError(
                "This MultinomialNB instance is not fitted yet; call fit() first."
            )

        # Filter once instead of once per class.
        known_words = [w for w in doc.split() if w in self.vocabulary_]

        scores = {}
        for c, log_prior in self._log_priors.items():
            # Start from the log prior, then add the log likelihood of
            # every known token.
            log_table = self._log_likelihoods[c]
            score = log_prior
            for word in known_words:
                score += log_table[word]
            scores[c] = score

        return max(scores, key=scores.get)

    def predict(self, X_test):
        """
        Predict the class of every document in X_test.

        X_test: iterable of documents (e.g. ["free offer", "call me"]).
        Returns a list of predicted labels, in input order.
        """
        return [self._predict_one(doc) for doc in X_test]

In [None]:
# Create the Naive Bayes spam filter with Laplace smoothing (alpha=1) ...
model_spam_filter = MultinomialNB(alpha=1)

# ... and fit it on the preprocessed training e-mails and their labels.
model_spam_filter.fit(X_train=x_train, y_train=y_train)
print("Model training complete.")


In [None]:
# @title
spam_email = """Here are this week's five freeCodeCamp resources that are worth your time:

1. freeCodeCamp just published a new course taught by legendary Harvard computer science professor Dr. David J. Malan. This comprehensive cybersecurity for beginners course will teach you how to secure accounts, databases, and entire software systems. Dr. Malan also shares tons of practical tips for securing your privacy in an increasingly adversarial world. (8 hour YouTube course): https://www.freecodecamp.org/news/learn-cybersecurity-from-harvard-university

2. freeCodeCamp also published a guide to passing the Certified Kubernetes Administrator Exam. Beau Carnes teaches this course, which will walk you through key DevOps concepts. You'll start by setting up your K8s practice environment. Then you'll bootstrap a multi-node cluster and your control plane. You'll learn about Helm, High Availability Autoscaling, CoreDNS, and more. (2 hour YouTube course): https://www.freecodecamp.org/news/prepare-for-the-kubernetes-administrator-certification-and-pass/

3. On this week's freeCodeCamp podcast, I interview a software engineer who got his first developer job at age 45. Eric Carlson is a self-taught software engineer at Cisco. In his early 20s, he worked his way up to manager at the busiest Domino's Pizza in Canada. He eventually went to college and studied liberal arts, then worked as a teacher for two decades before teaching himself programming using freeCodeCamp. He was able to gradually pivot into a developer role within the big telecom company where he was working. (1 hour watch or listen in your favorite podcast app): https://www.freecodecamp.org/news/first-dev-job-at-45-interview-with-self-taught-freecodecamp-grad-eric-carlson-podcast-194/

4. Learn how to build high-performance mobile apps using Google's open-source Flutter framework. freeCodeCamp uses Flutter for our Android and iPhone apps, and it's way easier than maintaining two separate app codebases. This Flutter handbook will teach you how to efficiently lay out your apps with minimum widget rebuilds. You'll learn state management techniques, asynchronous patterns, and image caching best practices. You'll also learn how to use Isolates and lazy loading to make your apps really snappy. (full length handbook): https://www.freecodecamp.org/news/how-to-build-scalable-and-performant-flutter-applications-a-handbook-for-devs/

5. Learn Serverless Architecture using C# .NET and Azure cloud. This jam-packed course will teach you common microservice patterns, Onion Architecture, IoT functions, and more. (5 hour YouTube course): https://www.freecodecamp.org/news/serverless-and-microservices-with-c-and-azure/

The freeCodeCamp community is working hard on so many improvements to our core curriculum. You should support our charity's mission, and by extension the entire open source ecosystem that relies on our learning resources: https://www.freecodecamp.org/donate

Quote of the Week: “Most developer stories about learning to code involve having time to do things like go to meetups, build side projects, grind LeetCode, and so on. I didn’t have time or energy to do any of these things with a baby at home. So I found another way into my first developer job. For me, I had to find a way to get paid to code at my non-coding job. First I found a way to code for 5% of my job, then 10%, then a jump to 50%, and finally a jump to a 100% coding role.” — Software Engineer Eric Carlson on how he transitioned into a developer role within his current company at age 45, on this week's freeCodeCamp podcast

Until next week, happy coding.

-- Quincy Larson

Teacher and founder of freeCodeCamp.org

If these aren't worth your time, you can turn them off: https://www.freecodecamp.org/ue/7eTUUxGrt9IIdSkFAxxQj

"""

non_spam_email = """Subject: Meeting Agenda for Tomorrow

Hi Team,

I hope this email finds you well. I wanted to remind everyone about the meeting scheduled for tomorrow at 10:00 AM. Below is the agenda:

1. Review of project milestones
2. Discussion on upcoming deadlines
3. Any other business

Please come prepared with any updates or questions you may have. Looking forward to a productive meeting.

Best regards,
Mr. Bannerjee
"""

# Two hand-written e-mails and their expected labels (1 = spam, 0 = not spam).
X_test = [spam_email, non_spam_email]
X_check = [1, 0]

# The model was fitted on preprocessing_text() output (lowercased, stop words
# removed, stemmed). Raw text must go through the same cleaning, otherwise
# most of its tokens do not match the learned vocabulary and are ignored.
X_test_processed = [preprocessing_text(doc) for doc in X_test]

# Predict
predictions = model_spam_filter.predict(X_test_processed)

# Show the results (the original raw e-mail is printed, not the cleaned one)
print(f"--- Kết quả dự đoán ---")
for doc, pred in zip(X_test, predictions):
    verdict = "Spam" if pred == 1 else "Not Spam"
    print(f"'{doc}'  ==>  Dự đoán: {verdict}")
accuracy = np.mean(np.array(predictions) == np.array(X_check))
print(f"Accuracy: {accuracy}")

In [None]:

# Evaluate the model
def evaluate_classification_model(model, X_test, y_test_true, pos_label=1):
    """Print classification metrics for ``model`` and plot its confusion matrix.

    Args:
        model: Any object with a ``predict(X)`` method returning one label per
            input sample.
        X_test: Samples to classify, in whatever form ``model.predict`` expects.
        y_test_true: Ground-truth labels aligned with ``X_test``.
        pos_label: Label treated as the positive class for precision, recall
            and F1. Defaults to 1.

    Returns:
        None. Metrics are printed and the confusion matrix is shown as a heatmap.
    """
    print(f"--- Đánh giá Model: {model.__class__.__name__} ---")

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test_true, y_pred)

    # zero_division=0: report 0 instead of warning when the positive class is
    # never predicted (precision) or never present (recall).
    precision = precision_score(y_test_true, y_pred, pos_label=pos_label, zero_division=0)
    recall = recall_score(y_test_true, y_pred, pos_label=pos_label, zero_division=0)
    f1 = f1_score(y_test_true, y_pred, pos_label=pos_label, zero_division=0)

    print(f"Nhãn thật (True):    {y_test_true}")
    print(f"Nhãn dự đoán (Pred): {y_pred}")
    print("-" * 50)
    print(f"Accuracy:   {accuracy * 100:.2f} %")
    print(f"Precision (cho '{pos_label}'): {precision * 100:.2f} %")
    print(f"Recall (cho '{pos_label}'):    {recall * 100:.2f} %")
    print(f"F1-Score (cho '{pos_label}'):   {f1 * 100:.2f} %")

    # Use the labels from BOTH truth and predictions: a class that was
    # predicted but never occurs in y_test_true would otherwise be left out of
    # the matrix, hiding those errors.
    labels = sorted(set(y_test_true) | set(y_pred))
    cm = confusion_matrix(y_test_true, y_pred, labels=labels)

    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()


# Small hand-labelled sanity set (1 = spam, 0 = ham).
X_test = [
    "free offer",         # Spam
    "call me",            # Ham
    "send free stuff",    # Spam
    "my office tomorrow", # Ham
    "new offer call me"   # Ham
]
y_test_true = [
    1,
    0,
    1,
    0,
    0
]

# The model was trained on preprocessing_text() output, so the raw phrases
# need the same cleaning (e.g. stemming) for their tokens to match the
# learned vocabulary.
X_test_processed = [preprocessing_text(doc) for doc in X_test]

# NOTE(review): the held-out x_test / y_test split is not scored in the cells
# visible here; consider evaluating on it as well.

# Evaluate the model
evaluate_classification_model(model_spam_filter, X_test_processed, y_test_true)