# Spam Email Classification using NLP and Machine Learning

## Step 1: Importing Libraries

In [None]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import nltk

# Fetch the tokenizer models and stopword list that word_tokenize() and
# stopwords.words() rely on. Recent NLTK releases (3.9+) load the Punkt
# tokenizer from the 'punkt_tab' resource, while older releases use 'punkt',
# so both are requested to keep the notebook runnable across versions.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)
    

## Step 2: Loading and Exploring the Dataset

In [None]:

# Load dataset
df = pd.read_csv('spam.csv', encoding='latin-1')  # Replace with your dataset
# Keep only the label and message columns, under descriptive names.
df = df.rename(columns={"v1": "label", "v2": "text"})[['label', 'text']]
# Encode the target numerically: ham -> 0, spam -> 1.
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit an extra "None" line after the summary.
df.info()
print(df.describe())
print(df['label'].value_counts())
    

## Step 3: Data Preprocessing

In [None]:

# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache so the stopword corpus is read at most once per kernel.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading the corpus once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Normalize a raw message into a space-joined string of tokens.

    The text is lower-cased, stripped of ASCII punctuation, tokenized with
    ``word_tokenize`` and filtered against NLTK's English stopword list.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (empty string if no
        token survives).
    """
    text = text.lower()  # Convert to lowercase
    text = text.translate(_PUNCTUATION_TABLE)  # Remove punctuation
    # The original evaluated stopwords.words('english') once per token and
    # tested membership against a list; a cached frozenset gives the same
    # result with constant-time lookups.
    stop_words = _english_stopwords()
    tokens = [word for word in word_tokenize(text) if word not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

# Run the cleaning function over every message and store the result in a new
# column alongside the raw text, then preview the first rows.
processed_messages = df['text'].apply(preprocess_text)
df['processed_text'] = processed_messages
print(df.head())
    

## Step 4: Feature Extraction

In [None]:

# Using TF-IDF Vectorizer
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split in the next step, so its vocabulary and IDF weights are
# influenced by messages that later end up in the test set. Consider fitting
# on the training partition only (e.g. via a sklearn Pipeline).
vectorizer = TfidfVectorizer(max_features=3000)
# fit_transform returns a SciPy sparse matrix; it is kept sparse because
# train_test_split and MultinomialNB both accept sparse input, and a dense
# (n_messages x 3000) float array would mostly store zeros.
X = vectorizer.fit_transform(df['processed_text'])
y = df['label']
    

## Step 5: Splitting the Data

In [None]:

# Hold out 20% of the messages for evaluation. stratify=y keeps the ham/spam
# proportion the same in both partitions, so the test metrics are not skewed
# by an unlucky draw of the minority class; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
    

## Step 6: Model Training

In [None]:

# Create a Multinomial Naive Bayes classifier with default settings and fit
# it on the training partition.
model = MultinomialNB()
model.fit(X=X_train, y=y_train)
    

## Step 7: Model Evaluation

In [None]:

# Predict labels for the held-out messages and compare them with the truth.
y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
# sep='' stops print() from inserting a space after the "\n", which would
# otherwise push the first line of the report / matrix one column to the
# right of the following lines.
print('Classification Report:\n', classification_report(y_test, y_pred), sep='')
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred), sep='')
    

## Step 8: Testing with New Emails

In [None]:

def classify_new_email(email_text):
    """Classify a single raw message with the fitted vectorizer and model.

    The text goes through ``preprocess_text``, is transformed by the
    module-level ``vectorizer`` and is scored by the module-level ``model``.

    Parameters
    ----------
    email_text : str
        Raw message to classify.

    Returns
    -------
    str
        ``'Spam'`` when the predicted label equals 1, otherwise ``'Ham'``.
    """
    cleaned = preprocess_text(email_text)
    # transform() expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([cleaned]).toarray()
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return 'Spam'
    return 'Ham'

# Example test: classify two hand-written messages and print each verdict.
sample_emails = [
    "Congratulations! You've won a $1,000 gift card. Click to claim.",
    "Hello, let's meet for coffee tomorrow.",
]
for sample_email in sample_emails:
    print(classify_new_email(sample_email))
    

## Conclusion

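This notebook demonstrated how to build a spam email classification system using NLP and machine learning techniques: messages were lower-cased, stripped of punctuation and English stopwords, converted to TF-IDF features, and classified with a Multinomial Naive Bayes model evaluated on a held-out 20% test split. Feel free to modify and improve upon this base implementation.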