## 📧 Classificação de SMS: Detecção de Spam com Naive Bayes

Este projeto implementa um classificador de SMS para distinguir entre mensagens **"ham"** (legítimas) e **"spam"** (indesejadas) usando o algoritmo **Naive Bayes Multinomial** e vetorização **TF-IDF**.

In [1]:
import pandas as pd
import re
import os
import matplotlib.pyplot as plt

def load_dataset(file_path, encoding='latin-1'):
    """Load a dataset from a CSV file into a DataFrame.

    Args:
        file_path: Path of the CSV file to read.
        encoding: Text encoding forwarded to ``pandas.read_csv``.

    Returns:
        The loaded ``DataFrame``, or ``None`` when ``file_path`` does not
        exist (a message naming the missing path is printed in that case).
    """
    try:
        df = pd.read_csv(file_path, encoding=encoding)
    except FileNotFoundError:
        # Must be an f-string: otherwise the literal text "{file_path}" is
        # printed instead of the path that could not be found.
        print(f"‚ùå File not found. Please check the file path. {file_path}")
        return None

    print("‚úÖ Dataset load successfully!")
    return df
	
def rename_and_map_labels(df):
    """Rename the raw columns and encode the labels as integers.

    The columns ``v1``/``v2`` are renamed to ``label``/``message``, every
    other column is dropped, and the label text is mapped to ``0`` (ham) or
    ``1`` (spam). Label values outside that mapping become ``NaN``, which is
    how ``Series.map`` treats keys missing from a dict.

    Args:
        df: DataFrame containing at least the columns ``v1`` and ``v2``.

    Returns:
        A new DataFrame with exactly the columns ``label`` and ``message``;
        the input frame is not modified.
    """
    renamed = df.rename(columns={'v1': 'label', 'v2': 'message'})
    # Take an explicit copy of the column subset: assigning into the bare
    # slice is what triggers pandas' SettingWithCopyWarning.
    selected = renamed[['label', 'message']].copy()
    selected['label'] = selected['label'].map({'ham': 0, 'spam': 1})
    return selected

def clean_text(text):
    """Normalize a message for vectorization.

    The text is lowercased, then every character that is not an ASCII
    letter, a digit, or whitespace is removed (not replaced by a space).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z0-9\s]', '', lowered)

def process_message(df):
    """Clean every entry of the ``message`` column with ``clean_text``.

    The column is overwritten on the DataFrame that was passed in, and that
    same DataFrame is returned.
    """
    cleaned_messages = df['message'].apply(clean_text)
    df['message'] = cleaned_messages
    return df

def plot_class_distribution(df, save_dir='data/visualizations'):
    """Plot the class distribution as a bar chart and save it as a PNG.

    Args:
        df: DataFrame with a ``label`` column whose values are counted.
        save_dir: Directory that receives ``class_distribution.png``. It is
            created when missing. Defaults to the location this function
            always used, so existing callers are unaffected.
    """
    class_counts = df['label'].value_counts()

    # Work on an explicit figure/axes pair instead of pyplot's implicit
    # "current figure", so this function only touches the figure it made.
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        class_counts.plot(kind='bar', color=['blue', 'orange'], ax=ax, rot=0)
        ax.set_title('Class Distribution')
        ax.set_xlabel('Class (0: Ham, 1: Spam)')
        ax.set_ylabel('Count')

        # An empty save_dir means "current directory"; makedirs('') raises.
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        file_path = os.path.join(save_dir, 'class_distribution.png')
        fig.savefig(file_path)
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)

    print(f"‚úÖ Class distribution plot saved to: {file_path}")

def load_and_preprocess_data(file_path):
    """Run the data stage: load the CSV, normalize it, and plot the classes.

    Steps, in order: ``load_dataset``, ``rename_and_map_labels``,
    ``process_message``, then ``plot_class_distribution``.

    Args:
        file_path: Path of the CSV file to load.

    Returns:
        The preprocessed DataFrame, or ``None`` when loading failed.
    """
    print("\n--- 1. CARREGAMENTO E PR√â-PROCESSAMENTO DE DADOS ---")

    raw = load_dataset(file_path)
    if raw is None:
        print("‚ùå Data loading failed. Exiting preprocessing.")
        return None

    prepared = process_message(rename_and_map_labels(raw))
    plot_class_distribution(prepared)

    print("‚úÖ Data preprocessing completed!")
    return prepared

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import os
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display



# 1️⃣ Data splitting

def split_data(df, test_size=0.2, random_state=42):
    """Split messages and labels into training and testing subsets.

    Args:
        df: DataFrame with the columns ``message`` and ``label``.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features, target = df['message'], df['label']
    parts = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )
    print("‚úÖ Data split into training and testing sets!")
    return tuple(parts)

# 2️⃣ Vectorization

def vectorize_text(X_train, X_test):
    """Turn raw text into TF-IDF feature matrices.

    The vectorizer is fitted on the training texts only; the test texts are
    transformed with that already-fitted vectorizer.

    Returns:
        The tuple ``(vectorizer, X_train_vec, X_test_vec)``.
    """
    tfidf = TfidfVectorizer()
    train_matrix = tfidf.fit_transform(X_train)
    test_matrix = tfidf.transform(X_test)
    print("‚úÖ Text data vectorized using TF-IDF!")
    return tfidf, train_matrix, test_matrix

# 3️⃣ Model training

def train_model(X_train_vec, y_train):
    """Fit a ``MultinomialNB`` classifier on the vectorized training data.

    Returns:
        The fitted classifier.
    """
    classifier = MultinomialNB()
    classifier.fit(X_train_vec, y_train)
    print("‚úÖ Model trained successfully!")
    return classifier

# 4️⃣ Save artifacts

def save_artifacts(model, vectorizer, model_dir='models'):
    """Persist the model and the vectorizer with ``joblib``.

    Args:
        model: Object written to ``naive_bayes_model.pkl``.
        vectorizer: Object written to ``tfidf_vectorizer.pkl``.
        model_dir: Target directory; created when it does not exist.
    """
    os.makedirs(model_dir, exist_ok=True)

    artifacts = (
        (model, 'naive_bayes_model.pkl'),
        (vectorizer, 'tfidf_vectorizer.pkl'),
    )
    for artifact, filename in artifacts:
        joblib.dump(artifact, os.path.join(model_dir, filename))

    print(f"üíæ Model and vectorizer saved to disk {model_dir}!")

# 5️⃣ Model evaluation

def evaluate_model(model, X_test_vec, y_test):
    """Score the model on the test set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test_vec: Vectorized test features.
        y_test: True labels for the test features.

    Returns:
        The tuple ``(accuracy, report, confusion_matrix, predictions)``,
        where ``report`` is the text form of ``classification_report``.
    """
    predictions = model.predict(X_test_vec)
    return (
        accuracy_score(y_test, predictions),
        classification_report(y_test, predictions, output_dict=False),
        confusion_matrix(y_test, predictions),
        predictions,
    )

# 6️⃣ Plot confusion matrix

def plot_confusion_matrix(cm, save_path='./data/visualizations/confusion_matrix.png'):
    """Draw the confusion matrix as an annotated heatmap and save it.

    Args:
        cm: Confusion matrix of integer counts (cells are formatted with
            ``'d'``); rows are labelled as true classes and columns as
            predicted classes.
        save_path: Output image path. Its parent directory is created when
            missing; a bare file name saves into the current directory.
    """
    # dirname() is '' for a bare file name, and os.makedirs('') raises
    # FileNotFoundError, so only create a directory when there is one.
    target_dir = os.path.dirname(save_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    fig, ax = plt.subplots(figsize=(7, 5))
    try:
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=['HAM (0)', 'SPAM (1)'],
                    yticklabels=['HAM (0)', 'SPAM (1)'],
                    ax=ax)
        ax.set_xlabel('Predito')
        ax.set_ylabel('Real')
        ax.set_title('Matriz de Confus√£o - Classifica√ß√£o de SMS')
        fig.tight_layout()
        fig.savefig(save_path)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)

    print(f"üìä Matriz de confus√£o salva em: {save_path}")

# 7️⃣ Full pipeline

def train_and_evaluate_pipeline(df):
    """Orchestrate the model stage: training, evaluation, and saving.

    Steps, in order: split the data, vectorize the text, train the model,
    save model and vectorizer, evaluate on the test split, print the
    metrics, and save the confusion-matrix plot.

    Args:
        df: Preprocessed DataFrame handed to ``split_data``.

    Returns:
        The tuple ``(model, X_test_vec, y_test)``.
    """
    print("\n--- 2. TREINAMENTO E AVALIA√á√ÉO DO MODELO ---")

    train_texts, test_texts, train_labels, test_labels = split_data(df)
    fitted_vectorizer, train_matrix, test_matrix = vectorize_text(
        train_texts, test_texts
    )
    classifier = train_model(train_matrix, train_labels)
    save_artifacts(classifier, fitted_vectorizer)

    # The fourth element (the predictions) is not needed here.
    accuracy, report_text, matrix, _ = evaluate_model(
        classifier, test_matrix, test_labels
    )

    print(f"\nAcur√°cia: {accuracy:.4f}")
    print("\nRelat√≥rio de Classifica√ß√£o:\n", report_text)
    plot_confusion_matrix(matrix)

    print("‚úÖ Treinamento e avalia√ß√£o conclu√≠dos.")
    return classifier, test_matrix, test_labels

In [3]:
import os
from src.data_pipeline import load_and_preprocess_data
from src.model_pipeline import train_and_evaluate_pipeline, evaluate_model


# Project root; every path below is resolved relative to it.
ROOT_DIR = '.'

# Location of the raw SMS dataset.
DATA_PATH = os.path.join(ROOT_DIR, 'data/spam.csv')

# Create the output folders up front so later saves cannot fail on a
# missing directory.
os.makedirs(os.path.join(ROOT_DIR, 'models'), exist_ok=True)
os.makedirs(os.path.join(ROOT_DIR, 'data/visualizations'), exist_ok=True)

def run_pipeline():
    """Execute the full data and model pipeline.

    Loads and preprocesses the dataset at ``DATA_PATH``, then trains and
    evaluates the model. When loading fails (``load_and_preprocess_data``
    returns ``None``) the function returns without training.
    """
    df_clean = load_and_preprocess_data(DATA_PATH)
    if df_clean is None:
        return

    # train_and_evaluate_pipeline already evaluates the model and prints the
    # metrics; a second evaluate_model call here only repeated the
    # prediction and threw the result away, so it was dropped.
    train_and_evaluate_pipeline(df_clean)
    print("\nProcesso de Treinamento conclu√≠do com sucesso!")

# Run the pipeline only when this code is executed directly, not when it is
# imported as a module.
if __name__ == '__main__':
    run_pipeline()


[92m--- 1. CARREGAMENTO E PR√â-PROCESSAMENTO DE DADOS ---[0m
[92m‚úÖ Dataset load successfully![0m
[92m‚úÖ Class distribution plot saved to: data/visualizations/class_distribution.png[0m
[92m‚úÖ Data preprocessing completed![0m

[94m--- 2. TREINAMENTO E AVALIA√á√ÉO DO MODELO ---[0m
[94m‚úÖ Data split into training and testing sets![0m
[94m‚úÖ Text data vectorized using TF-IDF![0m
[94m‚úÖ Model trained successfully![0m
[94müíæ Model and vectorizer saved to disk models![0m

Acur√°cia: 0.9543

Relat√≥rio de Classifica√ß√£o:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       1.00      0.66      0.80       150

    accuracy                           0.95      1115
   macro avg       0.97      0.83      0.88      1115
weighted avg       0.96      0.95      0.95      1115

üìä Matriz de confus√£o salva em: ./data/visualizations/confusion_matrix.png
‚úÖ Treinamento e avalia√ß√£o conclu√≠dos.

Pro

In [10]:

import pandas as pd

# Re-run the data stage, then keep the cleaned text under its own column so
# it can be shown next to the raw text.
df_clean = load_and_preprocess_data(DATA_PATH)
df_clean['clean_message'] = df_clean['message'].copy()

# Reload the raw CSV to recover the untouched message text. The assignment
# below aligns on the row index, which both frames take from the same file.
df_original = pd.read_csv(DATA_PATH, encoding='latin-1')
df_original = df_original.rename(columns={'v1': 'label', 'v2': 'message'})

df_clean['message'] = df_original['message']


print("\n## Contagem de Classes üìä")
display(df_clean['label'].value_counts().to_frame(name='Contagem'))


print("\n## Compara√ß√£o de Mensagens: Original vs. Pr√©-processada üßºüì©")
# Show full-width text for this one display only. option_context restores
# whatever width was set before (even on error) instead of forcing 50.
with pd.option_context('display.max_colwidth', None):
    display(df_clean[['label', 'message', 'clean_message']].sample(5, random_state=42))


[92m--- 1. CARREGAMENTO E PR√â-PROCESSAMENTO DE DADOS ---[0m
[92m‚úÖ Dataset load successfully![0m
[92m‚úÖ Class distribution plot saved to: data/visualizations/class_distribution.png[0m
[92m‚úÖ Data preprocessing completed![0m

## Contagem de Classes üìä


Unnamed: 0_level_0,Contagem
label,Unnamed: 1_level_1
0,4825
1,747



## Compara√ß√£o de Mensagens: Original vs. Pr√©-processada üßºüì©


Unnamed: 0,label,message,clean_message
3245,0,"Funny fact Nobody teaches volcanoes 2 erupt, tsunamis 2 arise, hurricanes 2 sway aroundn no 1 teaches hw 2 choose a wife Natural disasters just happens",funny fact nobody teaches volcanoes 2 erupt tsunamis 2 arise hurricanes 2 sway aroundn no 1 teaches hw 2 choose a wife natural disasters just happens
944,0,"I sent my scores to sophas and i had to do secondary application for a few schools. I think if you are thinking of applying, do a research on cost also. Contact joke ogunrinde, her school is one me the less expensive ones",i sent my scores to sophas and i had to do secondary application for a few schools i think if you are thinking of applying do a research on cost also contact joke ogunrinde her school is one me the less expensive ones
1044,1,"We know someone who you know that fancies you. Call 09058097218 to find out who. POBox 6, LS15HB 150p",we know someone who you know that fancies you call 09058097218 to find out who pobox 6 ls15hb 150p
2484,0,Only if you promise your getting out as SOON as you can. And you'll text me in the morning to let me know you made it in ok.,only if you promise your getting out as soon as you can and youll text me in the morning to let me know you made it in ok
812,1,Congratulations ur awarded either √•¬£500 of CD gift vouchers & Free entry 2 our √•¬£100 weekly draw txt MUSIC to 87066 TnCs www.Ldew.com1win150ppmx3age16,congratulations ur awarded either 500 of cd gift vouchers free entry 2 our 100 weekly draw txt music to 87066 tncs wwwldewcom1win150ppmx3age16
