<a href="https://colab.research.google.com/github/vijugk/CoreJava/blob/master/Spam_detection_using_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Case Study: Spam Detection Using Text Classification**

In [3]:
import pandas as pd

In [4]:
# Load the labelled message dataset into a DataFrame.
# NOTE(review): the absolute /content path looks Colab-specific; the CSV must be
# present there before this cell runs, otherwise read_csv raises FileNotFoundError.
df=pd.read_csv('/content/spam_dataset.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/content/spam_dataset.csv'

In [None]:
# Show the column labels of the loaded DataFrame.
df.columns

In [None]:
# Show the DataFrame dimensions as a (rows, columns) tuple.
df.shape

In [None]:
# Preview the first rows of the raw dataset.
df.head()

In [None]:
# List the distinct label values present in the 'Category' column.
df['Category'].unique()

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Merge every message into one space-separated string and build a word cloud
# from it, capped at 200 words via max_words.
text = ' '.join(df['Message'])
wordcloud = WordCloud(max_words=200).generate(text)

# Draw the cloud as an image on its own axes, with the axis decorations hidden.
fig, ax = plt.subplots()
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Count the rows per 'Category' value and draw the counts as horizontal bars.
category_counts = df.groupby('Category').size()
ax = category_counts.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))

# Hide the top and right borders of the plot.
ax.spines[['top', 'right']].set_visible(False)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import string
import re
from nltk.tokenize import word_tokenize

# Ensure NLTK resources are downloaded
# - 'punkt' / 'punkt_tab': tokenizer data for word_tokenize. Recent NLTK releases
#   look up 'punkt_tab' and raise LookupError if only the older 'punkt' is present,
#   so both are fetched to stay compatible with old and new versions.
# - 'stopwords': backs stopwords.words('english').
# - 'wordnet': backs WordNetLemmatizer.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
def update(cat):
    """Translate a category label into its numeric class code.

    Args:
        cat: A label value; ``"ham"`` and ``"spam"`` are the recognised labels.

    Returns:
        ``0`` for ``"ham"``, ``1`` for ``"spam"``; any other value is handed
        back unchanged.
    """
    return 0 if cat == "ham" else (1 if cat == "spam" else cat)


# Replace the text labels in 'Category' with their numeric codes, then preview.
df["Category"] = df["Category"].map(update)
df.head()

In [None]:
# Preprocessing function
# Input-independent helpers are built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_URL_PATTERN = re.compile(r'http\S+')
_DIGIT_PATTERN = re.compile(r'\d+')
_NLP_TOOLS = {}  # filled lazily so the NLTK corpora are only touched on first use


def _get_nlp_tools():
    """Return the cached (stop_words, lemmatizer, stemmer) triple, building it on first use."""
    if not _NLP_TOOLS:
        _NLP_TOOLS['stop_words'] = set(stopwords.words('english'))
        _NLP_TOOLS['lemmatizer'] = WordNetLemmatizer()
        _NLP_TOOLS['stemmer'] = PorterStemmer()
    return _NLP_TOOLS['stop_words'], _NLP_TOOLS['lemmatizer'], _NLP_TOOLS['stemmer']


def preprocess_text(text):
    """Normalise one message into a space-separated string of stemmed tokens.

    Steps, in order: lowercase; replace ``http...`` URLs with a space; delete
    punctuation; delete digit runs; tokenize; drop English stop words;
    lemmatize and then stem each remaining token.

    Args:
        text: The raw message string.

    Returns:
        The processed tokens joined by single spaces.
    """
    text = text.lower()  # Lowercasing

    # Remove URLs first, while '://', '.' and '/' are still present to delimit them.
    text = _URL_PATTERN.sub(' ', text)

    # Remove punctuation. str.translate drops every character in
    # string.punctuation; the former f'[{string.punctuation}]' character class
    # parsed '\]' as an escaped bracket and therefore never removed backslashes.
    text = text.translate(_PUNCTUATION_TABLE)

    text = _DIGIT_PATTERN.sub('', text)  # Remove numbers

    stop_words, lemmatizer, stemmer = _get_nlp_tools()

    # Tokenization
    words = word_tokenize(text)

    # Stop word removal, then lemmatization followed by stemming
    words = [stemmer.stem(lemmatizer.lemmatize(word)) for word in words if word not in stop_words]

    return ' '.join(words)  # Convert list of words back to sentence

In [None]:
# Run every raw message through preprocess_text and store the result next to it.
df['CleanMessage'] = df['Message'].map(preprocess_text)

# Features are the cleaned texts; targets are the category labels.
X, y = df['CleanMessage'], df['Category']

In [None]:
# Preview the first rows again to inspect the frame after the preceding transformations.
df.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Initialize CountVectorizer with n-grams (unigrams, bigrams, and trigrams)
vectorizer_ngram = CountVectorizer(ngram_range=(1, 3))

# Features are the cleaned message texts; targets are the category labels
X = df['CleanMessage']
y = df['Category']

# Split the text BEFORE vectorizing so the vocabulary is learned from the
# training messages only. Fitting the vectorizer on the full dataset would leak
# test-set n-grams into the feature space the models are trained on.
# The same test_size and random_state select the same rows as before.
text_train, text_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Learn the n-gram vocabulary on the training split and reuse it for the test split
X_train = vectorizer_ngram.fit_transform(text_train)
X_test = vectorizer_ngram.transform(text_test)

# Initialize models
logistic_regression = LogisticRegression()
naive_bayes = MultinomialNB()

# Train and predict with Logistic Regression
logistic_regression.fit(X_train, y_train)
y_pred_lr = logistic_regression.predict(X_test)

# Train and predict with Naive Bayes
naive_bayes.fit(X_train, y_train)
y_pred_nb = naive_bayes.predict(X_test)

# Evaluate models
accuracy_lr = accuracy_score(y_test, y_pred_lr)
accuracy_nb = accuracy_score(y_test, y_pred_nb)

print("Logistic Regression Accuracy:", accuracy_lr)
print("Naive Bayes Accuracy:", accuracy_nb)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Evaluate Logistic Regression
print("Logistic Regression:")

# Scalar metrics share one call shape, so print them from a label/function table.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, metric(y_test, y_pred_lr))

# The matrix and the report span several lines, hence the embedded newline.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))

In [None]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_lr)

# Visualize Confusion Matrix as an annotated heatmap on an explicit Axes
class_names = ['Not Spam', 'Spam']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix for Logistic Regression')
plt.show()

In [None]:
# Evaluate Naive Bayes
nb_labels = ["Accuracy:", "Precision:", "Recall:", "F1 Score:"]
nb_metrics = [accuracy_score, precision_score, recall_score, f1_score]

print("\nNaive Bayes:")
# Each scalar metric is computed and printed in turn, in the listed order.
for label, metric in zip(nb_labels, nb_metrics):
    print(label, metric(y_test, y_pred_nb))

# Multi-line results are printed under their own heading line.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))
print("Classification Report:\n", classification_report(y_test, y_pred_nb))

In [None]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_nb)

# Visualize Confusion Matrix: heatmap drawn on a fresh 8x6 figure, then labelled
# through the Axes object that seaborn returns.
plt.figure(figsize=(8, 6))
tick_labels = ['Not Spam', 'Spam']
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=tick_labels, yticklabels=tick_labels)
ax.set(
    xlabel='Predicted Labels',
    ylabel='True Labels',
    title='Confusion Matrix for Naive Bayes',
)
plt.show()




### Discussion

1. **Logistic Regression**:
   - The logistic regression model performed excellently with an accuracy of 97.31%.
   - It has a perfect precision score, indicating no false positives.
   - The recall score is lower, suggesting some spam messages were missed.

2. **Naive Bayes**:
   - The Naive Bayes model also performed well with an accuracy of 94.98%.
   - It has a high recall, indicating it successfully identified most spam messages.
   - However, its precision is lower than that of logistic regression, indicating more false positives.

### Conclusion

Both models perform well for spam detection. Logistic regression provides higher precision, making it suitable when minimizing false positives is crucial. Naive Bayes offers higher recall, making it ideal for applications prioritizing the detection of spam messages, even at the cost of some false positives.


