In [1]:
import string
import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download('punkt_tab')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\votaq\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\votaq\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\votaq\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
DATASET_PATH = "2cls_spam_text_cls.csv"
df = pd.read_csv(DATASET_PATH)
print(df.head())
messages = df["Message"].values.tolist()
labels = df["Category"].values.tolist()

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [3]:
nltk.data.path.append('C:/Users/votaq/AppData/Roaming/nltk_data') # the path to your nltk_data directory

def lowercase(text):
    return text.lower()

def punctuation_removal(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)

def tokenize(text):
    return nltk.word_tokenize(text)

def remove_stopwords(tokens):
    stop_words = set(nltk.corpus.stopwords.words("english"))
    return [token for token in tokens if token not in stop_words]

def stemming(tokens):
    stemmer = nltk.stem.PorterStemmer()
    return [stemmer.stem(token) for token in tokens]

def preprocess_text(text):
    text = lowercase(text)
    text = punctuation_removal(text)
    tokens = tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = stemming(tokens)
    return " ".join(tokens)  # Trả về chuỗi, không phải list

messages = [preprocess_text(message) for message in messages]


In [4]:
def create_dictionary(messages):
    dictionary = []
    for tokens in messages:
        for token in tokens.split():
            if token not in dictionary:
                dictionary.append(token)
    return dictionary
dictionary = create_dictionary(messages)

In [5]:
# BoW
def create_features(tokens, dictionary):
    features = np.zeros(len(dictionary))
    for token in tokens:
        if token in dictionary:
            features[dictionary.index(token)] += 1
    return features
X = np.array([create_features(tokens, dictionary) for tokens in messages])

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(messages).toarray()

In [6]:
le = LabelEncoder()
y = le.fit_transform(labels)
print(f"Classes: {le.classes_}")
print(f"Encoded labels: {y}")

Classes: ['ham' 'spam']
Encoded labels: [0 0 1 ... 0 0 0]


In [7]:
def train_and_evaluate_model(X, y, feature_type="BoW", model_type="Gaussian"):
    """
    Train and evaluate a Naive Bayes model with given features
    
    Args:
        X: Feature matrix (either BoW or TF-IDF)
        y: Target labels
        feature_type: String to identify the feature type for output
        model_type: Type of Naive Bayes model ("Gaussian" or "Multinomial")
    
    Returns:
        model: Trained model
        val_accuracy: Validation accuracy
        test_accuracy: Test accuracy
    """
    VAL_SIZE = 0.2
    TEST_SIZE = 0.125 # 0.1 / (1-0.2)
    SEED = 0

    # Split data
    X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                     test_size=VAL_SIZE,
                                                     shuffle=True,
                                                     random_state=SEED)

    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train,
                                                        test_size=TEST_SIZE,
                                                        shuffle=True,
                                                        random_state=SEED)
    
    # Choose model type
    if model_type == "Multinomial":
        model = MultinomialNB()
    else:
        model = GaussianNB()
    
    print(f"Start training {model_type} NB with {feature_type} features...")
    model = model.fit(X_train, y_train)
    print(f"{model_type} NB with {feature_type} training completed.")
    
    # Evaluate model
    y_val_pred = model.predict(X_val)
    y_test_pred = model.predict(X_test)

    val_accuracy = accuracy_score(y_val, y_val_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    print(f"{model_type} NB + {feature_type} - Val accuracy: {val_accuracy:.4f}")
    print(f"{model_type} NB + {feature_type} - Test accuracy: {test_accuracy:.4f}")
    
    return model, val_accuracy, test_accuracy

In [8]:
# Train and evaluate all 4 combinations
print("Training all model combinations...\n")

# Gaussian NB with BoW
gaussian_bow_model, gaussian_bow_val_acc, gaussian_bow_test_acc = train_and_evaluate_model(
    X, y, "BoW", "Gaussian"
)

# Gaussian NB with TF-IDF  
gaussian_tfidf_model, gaussian_tfidf_val_acc, gaussian_tfidf_test_acc = train_and_evaluate_model(
    X_tfidf, y, "TF-IDF", "Gaussian"
)

# Multinomial NB with BoW
multinomial_bow_model, multinomial_bow_val_acc, multinomial_bow_test_acc = train_and_evaluate_model(
    X, y, "BoW", "Multinomial"
)

# Multinomial NB with TF-IDF
multinomial_tfidf_model, multinomial_tfidf_val_acc, multinomial_tfidf_test_acc = train_and_evaluate_model(
    X_tfidf, y, "TF-IDF", "Multinomial"
)

# Store all models and their info for comparison
models_info = [
    {
        'model': gaussian_bow_model,
        'feature_type': 'BoW',
        'model_type': 'Gaussian',
        'feature_matrix': X,
        'val_acc': gaussian_bow_val_acc,
        'test_acc': gaussian_bow_test_acc,
        'name': 'Gaussian NB + BoW'
    },
    {
        'model': gaussian_tfidf_model,
        'feature_type': 'TF-IDF',
        'model_type': 'Gaussian',
        'feature_matrix': X_tfidf,
        'val_acc': gaussian_tfidf_val_acc,
        'test_acc': gaussian_tfidf_test_acc,
        'name': 'Gaussian NB + TF-IDF'
    },
    {
        'model': multinomial_bow_model,
        'feature_type': 'BoW',
        'model_type': 'Multinomial',
        'feature_matrix': X,
        'val_acc': multinomial_bow_val_acc,
        'test_acc': multinomial_bow_test_acc,
        'name': 'Multinomial NB + BoW'
    },
    {
        'model': multinomial_tfidf_model,
        'feature_type': 'TF-IDF',
        'model_type': 'Multinomial',
        'feature_matrix': X_tfidf,
        'val_acc': multinomial_tfidf_val_acc,
        'test_acc': multinomial_tfidf_test_acc,
        'name': 'Multinomial NB + TF-IDF'
    }
]

# Create ranked list of all models (best to worst)
print("\n" + "="*60)
print("📊 RANKED MODEL PERFORMANCE (Best to Worst)")
print("="*60)

# Sort models by validation accuracy (descending order)
ranked_models = sorted(models_info, key=lambda x: x['val_acc'], reverse=True)

for rank, model_info in enumerate(ranked_models, 1):
    print(f"{rank}. {model_info['name']}")
    print(f"   📈 Validation: {model_info['val_acc']:.4f} ({model_info['val_acc']*100:.2f}%)")
    print(f"   🧪 Test: {model_info['test_acc']:.4f} ({model_info['test_acc']*100:.2f}%)")
    
    # Add performance indicator
    if rank == 1:
        print("   🏆 BEST MODEL")
    elif rank == 2:
        print("   🥈 Second Best")
    elif rank == 3:
        print("   🥉 Third Best")
    else:
        print("   📉 Lowest Performance")
    print()

# Find the best model based on validation accuracy
best_model_info = max(models_info, key=lambda x: x['val_acc'])

print(f"🏆 SELECTED BEST MODEL: {best_model_info['name']}")
print(f"   Validation accuracy: {best_model_info['val_acc']:.4f}")
print(f"   Test accuracy: {best_model_info['test_acc']:.4f}")

# Set variables for the best model
best_model = best_model_info['model']
best_features = best_model_info['feature_type']
best_model_type = best_model_info['model_type']
best_feature_matrix = best_model_info['feature_matrix']

Training all model combinations...

Start training Gaussian NB with BoW features...
Gaussian NB with BoW training completed.
Gaussian NB + BoW - Val accuracy: 0.7354
Gaussian NB + BoW - Test accuracy: 0.7348
Start training Gaussian NB with TF-IDF features...
Gaussian NB with TF-IDF training completed.
Gaussian NB + TF-IDF - Val accuracy: 0.8780
Gaussian NB + TF-IDF - Test accuracy: 0.8513
Start training Multinomial NB with BoW features...
Multinomial NB with BoW training completed.
Multinomial NB + BoW - Val accuracy: 0.9722
Multinomial NB + BoW - Test accuracy: 0.9695
Start training Multinomial NB with TF-IDF features...
Multinomial NB with TF-IDF training completed.
Multinomial NB + TF-IDF - Val accuracy: 0.9632
Multinomial NB + TF-IDF - Test accuracy: 0.9659

📊 RANKED MODEL PERFORMANCE (Best to Worst)
1. Multinomial NB + BoW
   📈 Validation: 0.9722 (97.22%)
   🧪 Test: 0.9695 (96.95%)
   🏆 BEST MODEL

2. Multinomial NB + TF-IDF
   📈 Validation: 0.9632 (96.32%)
   🧪 Test: 0.9659 (96.5

In [9]:
import pickle
import os

# Create models directory if it doesn't exist
models_dir = "models"
if not os.path.exists(models_dir):
    os.makedirs(models_dir)
    print(f"✓ Created directory: {models_dir}")

# Save all 4 models with their configurations
print("Saving all 4 trained models and configurations...")

# Save all models individually
model_files = {
    'gaussian_bow_model.pkl': gaussian_bow_model,
    'gaussian_tfidf_model.pkl': gaussian_tfidf_model,
    'multinomial_bow_model.pkl': multinomial_bow_model,
    'multinomial_tfidf_model.pkl': multinomial_tfidf_model
}

for filename, model in model_files.items():
    filepath = os.path.join(models_dir, filename)
    with open(filepath, 'wb') as f:
        pickle.dump(model, f)
    print(f"✓ Saved {filepath}")

# Save all model configurations
all_models_config = {
    'models_info': models_info,
    'best_model_info': best_model_info,
    'ranked_models': ranked_models
}

config_path = os.path.join(models_dir, 'all_nb_models_config.pkl')
with open(config_path, 'wb') as f:
    pickle.dump(all_models_config, f)
print(f"✓ Saved {config_path}")

# Save the best model separately for easy access
best_model_path = os.path.join(models_dir, 'best_model.pkl')
with open(best_model_path, 'wb') as f:
    pickle.dump(best_model, f)
print(f"✓ Saved {best_model_path}")

# Save feature extraction components
# Save dictionary for BoW models
dictionary_path = os.path.join(models_dir, 'dictionary.pkl')
with open(dictionary_path, 'wb') as f:
    pickle.dump(dictionary, f)
print(f"✓ Saved {dictionary_path} (for BoW models)")

# Save vectorizer for TF-IDF models
vectorizer_path = os.path.join(models_dir, 'vectorizer.pkl')
with open(vectorizer_path, 'wb') as f:
    pickle.dump(vectorizer, f)
print(f"✓ Saved {vectorizer_path} (for TF-IDF models)")

# Save label encoder
label_encoder_path = os.path.join(models_dir, 'label_encoder.pkl')
with open(label_encoder_path, 'wb') as f:
    pickle.dump(le, f)
print(f"✓ Saved {label_encoder_path}")

print(f"\n🎯 Summary of saved files in '{models_dir}' folder:")
print("="*60)
print("Individual Models:")
print(f"- {models_dir}/gaussian_bow_model.pkl")
print(f"- {models_dir}/gaussian_tfidf_model.pkl") 
print(f"- {models_dir}/multinomial_bow_model.pkl")
print(f"- {models_dir}/multinomial_tfidf_model.pkl")
print("\nBest Model:")
print(f"- {models_dir}/best_model.pkl (quick access to best performing model)")
print("\nConfigurations:")
print(f"- {models_dir}/all_nb_models_config.pkl (all model info, rankings, and performance)")
print("\nFeature Extraction:")
print(f"- {models_dir}/dictionary.pkl (for BoW feature extraction)")
print(f"- {models_dir}/vectorizer.pkl (for TF-IDF feature extraction)")
print(f"- {models_dir}/label_encoder.pkl (for label encoding)")

print(f"\n🏆 Best model saved: {best_model_info['name']}")
print(f"   Validation accuracy: {best_model_info['val_acc']:.4f}")
print(f"   Test accuracy: {best_model_info['test_acc']:.4f}")

Saving all 4 trained models and configurations...
✓ Saved models\gaussian_bow_model.pkl
✓ Saved models\gaussian_tfidf_model.pkl
✓ Saved models\multinomial_bow_model.pkl
✓ Saved models\multinomial_tfidf_model.pkl
✓ Saved models\all_nb_models_config.pkl
✓ Saved models\best_model.pkl
✓ Saved models\dictionary.pkl (for BoW models)
✓ Saved models\vectorizer.pkl (for TF-IDF models)
✓ Saved models\label_encoder.pkl

🎯 Summary of saved files in 'models' folder:
Individual Models:
- models/gaussian_bow_model.pkl
- models/gaussian_tfidf_model.pkl
- models/multinomial_bow_model.pkl
- models/multinomial_tfidf_model.pkl

Best Model:
- models/best_model.pkl (quick access to best performing model)

Configurations:
- models/all_nb_models_config.pkl (all model info, rankings, and performance)

Feature Extraction:
- models/dictionary.pkl (for BoW feature extraction)
- models/vectorizer.pkl (for TF-IDF feature extraction)
- models/label_encoder.pkl (for label encoding)

🏆 Best model saved: Multinomial NB

In [10]:
def predict(text, model, feature_type, feature_extractor, label_encoder):
    """
    Make prediction on text using the trained model
    
    Args:
        text: Input text to classify
        model: Trained model
        feature_type: "BoW" or "TF-IDF"
        feature_extractor: Dictionary (for BoW) or Vectorizer (for TF-IDF)
        label_encoder: Label encoder for converting predictions back to class names
    
    Returns:
        prediction_cls: Predicted class name
        prediction_score: Confidence score (probability) for the prediction
    """
    processed_text = preprocess_text(text)
    
    if feature_type == "BoW":
        # Use dictionary for BoW features
        features = create_features(processed_text, feature_extractor)
        features = np.array(features).reshape(1, -1)
    else:
        # Use vectorizer for TF-IDF features
        features = feature_extractor.transform([processed_text]).toarray()
    
    # Get prediction and probabilities
    prediction = model.predict(features)
    prediction_proba = model.predict_proba(features)
    
    # Get the class name and confidence score
    prediction_cls = label_encoder.inverse_transform(prediction)[0]
    prediction_score = np.max(prediction_proba[0])  # Highest probability
    
    return prediction_cls, prediction_score

# Load the saved model and components from models folder
models_dir = "models"
print("Loading saved model and components from models folder...")

# Load the best model and configuration
best_model_path = os.path.join(models_dir, 'best_model.pkl')
config_path = os.path.join(models_dir, 'all_nb_models_config.pkl')
label_encoder_path = os.path.join(models_dir, 'label_encoder.pkl')

model_ = pickle.load(open(best_model_path, 'rb'))
all_config_ = pickle.load(open(config_path, 'rb'))
best_model_info_ = all_config_['best_model_info']
le_ = pickle.load(open(label_encoder_path, 'rb'))

# Determine feature type and load appropriate extractor
feature_type_ = best_model_info_['feature_type']

if feature_type_ == "BoW":
    dictionary_path = os.path.join(models_dir, 'dictionary.pkl')
    feature_extractor_ = pickle.load(open(dictionary_path, 'rb'))
else:
    vectorizer_path = os.path.join(models_dir, 'vectorizer.pkl')
    feature_extractor_ = pickle.load(open(vectorizer_path, 'rb'))

print(f"Loaded model: {best_model_info_['name']}")
print(f"Model type: {best_model_info_['model_type']} Naive Bayes")
print(f"Feature type: {best_model_info_['feature_type']}")
print(f"Validation accuracy: {best_model_info_['val_acc']:.4f}")
print(f"Test accuracy: {best_model_info_['test_acc']:.4f}")

# Test the prediction
test_input = "Do u wanna buy a IceCream of this company, Do you want to buy this product?"
prediction_cls, prediction_score = predict(test_input, model_, feature_type_, feature_extractor_, le_)

print(f"\n--- PREDICTION TEST ---")
print(f"Input text: '{test_input}'")
print(f"Prediction: {prediction_cls}")
print(f"Confidence score: {prediction_score:.4f} ({prediction_score*100:.2f}%)")

Loading saved model and components from models folder...
Loaded model: Multinomial NB + BoW
Model type: Multinomial Naive Bayes
Feature type: BoW
Validation accuracy: 0.9722
Test accuracy: 0.9695

--- PREDICTION TEST ---
Input text: 'Do u wanna buy a IceCream of this company, Do you want to buy this product?'
Prediction: ham
Confidence score: 0.9999 (99.99%)
