# Spam Detection NLP Pipeline

Notebook ini membangun model deteksi spam berbasis teks menggunakan dataset `dataset/spam.csv` dengan dua kolom: `Category` (label: ham/spam) dan `Message` (teks pesan).

## Penjelasan Dataset yang digunakan

In [None]:
## Berikan penjelasan terkait data apa yang digunakan, di antaranya:
# 1. Kasus yang diambil
# 2. Penjelasan setiap kolomnya
# 3. Import library
# 4. Load data

## Data Understanding

In [None]:
# 1. Jumlah baris data
# 2. Panjang rata-rata setiap baris
# 3. Cek data duplikasi
# 4. Cek data kosong
# 5. Distribusi data menggunakan bar chart, line chart atau word cloud, seperti kata yang sering muncul.

## Data Text Processing

In [None]:
# 1. Tokenisasi
# 2. Lemmatization
# 3. Stemming
# 4. Stopword removal (Tanda baca, angka dan kata)
# 5. Text Normalisasi
# 6. Matrix correlation (opsional)
# 7. Labeling data (Lexicon, Bert, atau polarity)
# 8. Text Vektorisasi
# 9. Data splitting dengan skala (0.8, 0.2)(0.9, 0.1)(0.75, 0.25)(0.85, 0.15) pilih salah satu dari beberapa skala yang ditentukan
# catatan (lakukan tahap 1-5 apabila diperlukan, jika tidak langsung ke bagian 6-9)

## Data Modeling

In [None]:
# Berikan Penjelasan tentang model yang dipilih (baik machine learning maupun deep learning)
# Disarankan menggunakan deep learning.

## Data Evaluasi

In [None]:
# 1. Confusion Matrix
# 2. Laporan Klasifikasi (Classification report)

## Perbandingan Model apabila menggunakan beberapa algoritma

In [None]:
# Gunakan line chart atau bar chart untuk perbandingannya

In [None]:
# imports.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns

# Text processing
import re
import string
from collections import Counter

# NLTK (English)
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Optional Indonesian stemmer
# pip install Sastrawi
# from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# ML models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Deep learning
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GlobalAveragePooling1D


In [None]:
# load_data.py
from pathlib import Path

# The notebook description refers to `dataset/spam.csv` while this cell used to
# read `spam.csv`; accept either location (the original path is tried first).
_candidate_paths = (Path('spam.csv'), Path('dataset') / 'spam.csv')
data_path = next((p for p in _candidate_paths if p.is_file()), None)
if data_path is None:
    raise FileNotFoundError(
        "spam.csv not found; looked in: " + ", ".join(str(p) for p in _candidate_paths)
    )
df = pd.read_csv(data_path)

# Some versions of spam.csv ship with different headers / extra columns.
# In that case adapt the load, for example:
#   df = pd.read_csv(data_path, encoding='latin-1')[['v1', 'v2']]
#   df.columns = ['Category', 'Message']

# Keep only the two columns used downstream. `.copy()` makes `df` an
# independent frame, so the columns added later (char_len, word_len, clean)
# cannot trigger pandas' SettingWithCopyWarning when the CSV has extra columns.
df = df[['Category', 'Message']].copy()
df.head()


In [None]:
# data_stats.py
# Number of rows = first entry of the frame's shape.
num_rows = df.shape[0]
print("Jumlah baris:", num_rows)


In [None]:
# avg_length.py
# This cell runs before the missing-value check, so `Message` may still hold
# NaN. Treat such rows as empty text instead of letting `len` fail on a float.
messages = df['Message'].fillna('').astype(str)
df['char_len'] = messages.str.len()               # characters per message
df['word_len'] = messages.str.split().str.len()   # whitespace-separated words per message
print("Rata-rata karakter per pesan:", df['char_len'].mean())
print("Rata-rata kata per pesan:", df['word_len'].mean())


In [None]:
# duplicate_check.py
# Mark every repeat of a message text; the first occurrence is not counted.
repeat_mask = df.duplicated(subset=['Message'])
dupes = repeat_mask.sum()
print("Jumlah duplikat message:", dupes)
# Preview: every row whose text occurs more than once, sorted so copies sit together.
all_copies_mask = df.duplicated(subset=['Message'], keep=False)
df.loc[all_copies_mask].sort_values('Message').head(10)


In [None]:
# missing_check.py
# Count missing values per column.
missing_per_column = df.isna().sum()
print(missing_per_column)
# Discard rows lacking either the text or the label, then renumber the index.
df = df.dropna(subset=['Message','Category'])
df = df.reset_index(drop=True)


In [None]:
# label_distribution.py
# Bar chart of how many messages fall into each category.
sns.countplot(data=df, x='Category')
plt.title('Distribusi Category (ham vs spam)')
plt.show()

# Word clouds: one concatenated text blob per class, rendered separately.
is_spam = df['Category'] == 'spam'
is_ham = df['Category'] == 'ham'
spam_text = " ".join(df.loc[is_spam, 'Message'].astype(str))
ham_text = " ".join(df.loc[is_ham, 'Message'].astype(str))


def _show_wordcloud(text, title):
    """Generate a word cloud for ``text``, display it under ``title`` and return it."""
    cloud = WordCloud(width=800, height=400).generate(text)
    plt.figure(figsize=(10,4))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()
    return cloud


wc_spam = _show_wordcloud(spam_text, 'Wordcloud - Spam')
wc_ham = _show_wordcloud(ham_text, 'Wordcloud - Ham')


In [None]:
# top_words.py
def get_top_n_words(corpus, n=20):
    """Return the ``n`` most frequent word tokens of ``corpus``.

    The text is lower-cased and split into runs of word characters.

    Args:
        corpus: A single string containing the whole text to analyse.
        n: How many of the most common tokens to keep.

    Returns:
        A DataFrame with columns ``word`` and ``count``, most frequent first.
    """
    frequencies = Counter(re.findall(r'\w+', corpus.lower()))
    return pd.DataFrame(frequencies.most_common(n), columns=['word','count'])

# Bar chart of the 20 most frequent tokens in the spam messages.
top_spam = get_top_n_words(spam_text, n=20)
top_spam.plot(kind='bar', x='word', y='count', legend=False)
plt.title('Top words in Spam')
plt.show()


In [None]:
# preprocessing.py
# Fetch the NLTK resources used below. Recent NLTK releases load `punkt_tab`
# in `word_tokenize`, while older releases use `punkt`, so both are requested;
# `omw-1.4` is needed by WordNet on some versions. `nltk.download` does not
# raise by default, so an id unknown to a given NLTK version is simply skipped.
# quiet=True keeps the cell output short.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_resource, quiet=True)

stop_words_en = set(stopwords.words('english'))  # set for O(1) membership tests
lemmatizer = WordNetLemmatizer()

# For Indonesian text (requires the optional Sastrawi package):
# factory = StemmerFactory()
# stemmer = factory.create_stemmer()
# stop_words_id = set([...])  # optionally define extra stopwords

def clean_text(text, lang='en'):
    """Normalise raw text to lowercase ASCII letters separated by single spaces.

    The input is converted with ``str()`` and lower-cased, then URLs,
    @mentions, every character that is not a lowercase ASCII letter, digit or
    whitespace, and finally all digit runs are replaced by a space. Whitespace
    is collapsed and the result is stripped.

    Args:
        text: Value to clean; any object is accepted.
        lang: Language code. Accepted for symmetry with ``preprocess``; the
            cleaning steps here do not depend on it.

    Returns:
        The cleaned string (possibly empty).
    """
    # Order matters: URLs and mentions must go before punctuation is stripped.
    substitutions = (
        (r'http\S+|www\S+', ' '),   # URLs
        (r'@\w+', ' '),             # @mentions
        (r'[^a-z0-9\s]', ' '),      # punctuation / non-ASCII symbols
        (r'\d+', ' '),              # digit runs
        (r'\s+', ' '),              # collapse whitespace
    )
    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def preprocess(text, lang='en', do_lemmatize=True, do_stem=False, remove_stop=True):
    """Clean, tokenize and normalise one message.

    Args:
        text: Raw message; passed through ``clean_text`` first.
        lang: Language code. Stopword removal, lemmatization and stemming are
            only applied for ``'en'``.
        do_lemmatize: Lemmatize each token with the module-level WordNet
            ``lemmatizer`` (English only).
        do_stem: Stem each token. For English this uses NLTK's Porter stemmer
            (previously this flag was silently ignored). For ``'id'`` the
            Sastrawi stemmer is not enabled in this notebook, so a warning is
            issued and the tokens are returned unstemmed.
        remove_stop: Drop tokens found in ``stop_words_en`` (English only).

    Returns:
        The processed tokens joined by single spaces.
    """
    tokens = word_tokenize(clean_text(text, lang=lang))

    if lang == 'en':
        if remove_stop:
            tokens = [t for t in tokens if t not in stop_words_en]
        if do_lemmatize:
            tokens = [lemmatizer.lemmatize(t) for t in tokens]
        if do_stem:
            # Imported here so the stemmer is only required when requested.
            from nltk.stem import PorterStemmer

            stem = PorterStemmer().stem
            tokens = [stem(t) for t in tokens]
    elif lang == 'id' and do_stem:
        import warnings

        # To enable: install Sastrawi, create `stemmer` via StemmerFactory and
        # replace this warning with
        #   tokens = stemmer.stem(" ".join(tokens)).split()
        warnings.warn(
            "Indonesian stemming was requested but the Sastrawi stemmer is not "
            "enabled; tokens are returned unstemmed.",
            stacklevel=2,
        )

    return " ".join(tokens)


In [None]:
# apply_preprocess.py
# Run the English pipeline (stopword removal + lemmatization, no stemming) on every message.
preprocess_options = dict(lang='en', do_lemmatize=True, do_stem=False, remove_stop=True)
df['clean'] = df['Message'].apply(preprocess, **preprocess_options)
df[['Message','clean']].head()


In [None]:
# tfidf_vectorize.py
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_tfidf = tfidf.fit_transform(df['clean'])
print("TF-IDF shape:", X_tfidf.shape)


In [None]:
# tokenizer_seq.py
MAX_VOCAB = 10000
MAX_LEN = 100  # max sequence length

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token='<OOV>')
tokenizer.fit_on_texts(df['clean'])
sequences = tokenizer.texts_to_sequences(df['clean'])
X_seq = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')
print("Sequences shape:", X_seq.shape)


In [None]:
# label_encode.py
le = LabelEncoder()
y = le.fit_transform(df['Category'])  # ham->0, spam->1
print("Label mapping:", dict(zip(le.classes_, le.transform(le.classes_))))


In [None]:
# train_test_split.py
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)

# Untuk sequences (DL)
X_train_seq, X_test_seq, y_train_seq, y_test_seq = train_test_split(X_seq, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
# model_nb.py
# Multinomial Naive Bayes on the TF-IDF features (`fit` returns the estimator itself).
nb = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_nb = nb.predict(X_test_tfidf)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print("NB Accuracy:", nb_accuracy)
print(classification_report(y_test, y_pred_nb))


In [None]:
# model_logreg.py
# Logistic regression on the TF-IDF features, with the iteration cap set to 1000.
lr = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
y_pred_lr = lr.predict(X_test_tfidf)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("LogReg Accuracy:", lr_accuracy)
print(classification_report(y_test, y_pred_lr))


In [None]:
# model_svm.py
# Linear support-vector classifier on the TF-IDF features.
svm = LinearSVC().fit(X_train_tfidf, y_train)
y_pred_svm = svm.predict(X_test_tfidf)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", svm_accuracy)
print(classification_report(y_test, y_pred_svm))


In [None]:
# model_lstm.py
EMBEDDING_DIM = 100

# The input shape is declared with an explicit Input layer instead of
# Embedding's `input_length` argument, which is deprecated in Keras 3. With
# the Input layer the model is built immediately, so `model.summary()` below
# can report output shapes and parameter counts.
# NOTE(review): sequences are post-padded with index 0 and the Embedding does
# not set mask_zero, so the LSTM also reads the padding steps — consider
# mask_zero=True if results on short messages are poor.
model = Sequential([
    tf.keras.Input(shape=(MAX_LEN,)),
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBEDDING_DIM),
    LSTM(64, return_sequences=False),   # keep only the final state
    Dropout(0.4),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')      # single spam probability
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# validation_split holds out 10% of the training rows for validation.
history = model.fit(X_train_seq, y_train_seq, epochs=8, batch_size=64, validation_split=0.1)

# Evaluate on the held-out test split.
loss, acc = model.evaluate(X_test_seq, y_test_seq)
print("LSTM Test acc:", acc)
# Turn sigmoid probabilities into 0/1 labels at the 0.5 threshold and flatten to 1-D.
y_pred_dl = (model.predict(X_test_seq) > 0.5).astype(int).reshape(-1)
print(classification_report(y_test_seq, y_pred_dl))


In [None]:
# eval_report.py
from sklearn.metrics import ConfusionMatrixDisplay

def eval_and_plot(y_true, y_pred, title='Model'):
    """Print a classification report and plot the confusion matrix.

    Args:
        y_true: Ground-truth labels encoded with the module-level ``le``.
        y_pred: Predicted labels in the same encoding.
        title: Model name used in the printed header and the plot title.

    Returns:
        None. The report is printed and the figure is shown.
    """
    print(f"=== {title} ===")
    print(classification_report(y_true, y_pred))
    # Pin the matrix to the encoder's classes, in encoder order. Otherwise a
    # class absent from both y_true and y_pred would shrink the matrix and it
    # would no longer line up with `display_labels`.
    encoded_labels = le.transform(le.classes_)
    cm = confusion_matrix(y_true, y_pred, labels=encoded_labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)
    disp.plot(cmap='Blues')
    plt.title(f'Confusion Matrix - {title}')
    plt.show()

# Report and plot every model in turn: the three classical ML models first,
# then the deep-learning model (if it was trained).
evaluation_runs = (
    (y_test, y_pred_nb, "Naive Bayes"),
    (y_test, y_pred_lr, "Logistic Regression"),
    (y_test, y_pred_svm, "Linear SVM"),
    (y_test_seq, y_pred_dl, "LSTM (DL)"),
)
for truth, predicted, model_title in evaluation_runs:
    eval_and_plot(truth, predicted, model_title)


In [None]:
# compare_models.py
# Test-set accuracy of every model, computed from the predictions made in the
# earlier cells (those cells must have been run first).
predictions_by_model = {
    'NaiveBayes': (y_test, y_pred_nb),
    'LogisticRegression': (y_test, y_pred_lr),
    'LinearSVM': (y_test, y_pred_svm),
    'LSTM_DL': (y_test_seq, y_pred_dl),
}
accs = {
    model_name: accuracy_score(truth, predicted)
    for model_name, (truth, predicted) in predictions_by_model.items()
}

names = list(accs)
vals = list(accs.values())

# One bar per model on a fixed 0-1 accuracy axis.
plt.figure(figsize=(8,5))
sns.barplot(x=names, y=vals)
plt.ylim(0,1)
plt.title('Perbandingan Akurasi Model')
plt.ylabel('Accuracy')
plt.show()
