<a href="https://colab.research.google.com/github/vhiotirta22/NaiveBayes-ClassificationEmail/blob/main/Classification_Email_(Spam_or_Not).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
import re


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive/ so files stored there can be read by path
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Read the e-mail CSV from the mounted Drive, keeping only the label and
# message columns (in that order)
data = pd.read_csv(
    '/content/drive/MyDrive/ML/email_spam_dataset.csv',
    encoding='latin-1',
)[['label', 'message']]

print(data.head())

  label                                            message
0  spam  Selamat! Anda memenangkan hadiah Rp1.000.000. ...
1   ham  Halo, bagaimana kabar Anda hari ini? Apakah An...
2  spam  Dapatkan pinjaman cepat dengan bunga rendah. A...
3   ham  Tolong kirimkan laporan keuangan terbaru ke em...
4  spam  Pil obat kuat pria terbaik! Pesan sekarang dan...


In [None]:
# Fetch NLTK's stopword corpus
nltk.download('stopwords')

# Indonesian stopwords, held in a set for fast membership checks
stopwords_indonesia = {*stopwords.words('indonesian')}

# Data preprocessing function for Indonesian
def preprocess_text(text, stopword_set=None):
    """Normalize one message for vectorization.

    Every character that is not a word character is replaced by a space
    (``\\W``: letters, digits and underscores are kept), the text is
    lowercased, split on whitespace, and tokens found in the stopword set
    are dropped.

    Args:
        text: The raw message. Missing values (``None``, NaN, ``pd.NA``) are
            treated as an empty message; any other non-string value is
            converted with ``str()``.
        stopword_set: Optional collection of lowercase tokens to remove.
            Defaults to the module-level ``stopwords_indonesia``.

    Returns:
        The remaining tokens joined by single spaces ('' if none remain).
    """
    if stopword_set is None:
        stopword_set = stopwords_indonesia

    # Empty CSV cells are loaded by pandas as NaN (a float), which would make
    # re.sub raise TypeError; map missing values to an empty message instead.
    if not isinstance(text, str):
        text = '' if pd.isna(text) else str(text)

    # Punctuation/symbols become spaces so that split() separates on them
    text = re.sub(r'\W', ' ', text).lower()

    # Stopword matching happens after lowercasing
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Clean every message in place with preprocess_text
data['message'] = data['message'].map(preprocess_text)

# Turn the string labels into integers (spam: 1, ham: 0)
label_encoder = LabelEncoder().fit(data['label'])
data['label'] = label_encoder.transform(data['label'])

print(data.head())

   label                                            message
0      1  selamat memenangkan hadiah rp1 000 000 klik me...
1      0                                 halo kabar bertemu
2      1        dapatkan pinjaman cepat bunga rendah ajukan
3      0  tolong kirimkan laporan keuangan terbaru email...
4      1  pil obat kuat pria terbaik pesan rasakan perbe...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF features from the cleaned messages (at most 3000 terms),
# then densify the sparse result into a NumPy array
vectorizer = TfidfVectorizer(max_features=3000)
tfidf_matrix = vectorizer.fit_transform(data['message'])
X = tfidf_matrix.toarray()
y = data['label']

print(X.shape)

(98, 179)


In [None]:
from sklearn.model_selection import train_test_split

# Split the cleaned messages (rather than pre-computed features) into
# training and testing sets, with the same test_size and random_state as before.
messages_train, messages_test, y_train, y_test = train_test_split(
    data['message'], y, test_size=0.2, random_state=42
)

# Refit the TF-IDF vectorizer on the training messages only, so the vocabulary
# and IDF weights are not learned from the held-out test messages (train/test
# leakage). The test messages are only transformed with the fitted vectorizer.
X_train = vectorizer.fit_transform(messages_train).toarray()
X_test = vectorizer.transform(messages_test).toarray()

print(X_train.shape, X_test.shape)

(78, 179) (20, 179)


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Create a multinomial Naive Bayes model with default settings
nb_classifier = MultinomialNB()

# Fit it on the training features and their labels
nb_classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Run the trained classifier over the held-out test features
y_pred = nb_classifier.predict(X_test)

# Share of test messages whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Per-class precision/recall/F1 report, followed by the confusion matrix
for report in (classification_report, confusion_matrix):
    print(report(y_test, y_pred))


Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00        14

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

[[ 6  0]
 [ 0 14]]


In [None]:
# Classify one hand-written message: clean it, vectorize it, predict its label
example_text = ["Selamat! Anda memenangkan hadiah Rp1.000.000. Klik di sini untuk mengklaim hadiah Anda."]
example_text = list(map(preprocess_text, example_text))
example_features = vectorizer.transform(example_text).toarray()

prediction = nb_classifier.predict(example_features)
# A truthy (non-zero) predicted label is reported as spam
if prediction[0]:
    print("Spam")
else:
    print("Not Spam")


Spam
