In [None]:
pip install pandas numpy scikit-learn nltk

In [None]:
import pandas as pd
import numpy as np
import nltk
import string
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
import os

# Location of the spam CSV. Set the SPAM_CSV_PATH environment variable to
# point somewhere else; the fallback is the path this notebook was written with.
DATA_PATH = os.environ.get("SPAM_CSV_PATH", "C:/Users/swath/Downloads/spam.csv")

# The file is decoded as latin-1.
df = pd.read_csv(DATA_PATH, encoding='latin-1')
df.head()

In [None]:
# Exploratory data analysis, done before building the spam classifier.
# Print a summary of the dataset: columns, dtypes, and non-null counts.
df.info()

In [None]:
# Number of messages per value of the label column `v1` (ham vs. spam).
df['v1'].value_counts()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many rows carry each label in `v1`.
label_ax = sns.countplot(x=df['v1'])
label_ax.set_title("Spam vs. Ham Distribution")
label_ax.set_xlabel("Email Type (ham = 0, spam = 1)")
label_ax.set_ylabel("Count")
plt.show()

In [None]:
# Count the missing (null) entries in every column.
df.isnull().sum()

In [None]:
# Character count of every message, stored alongside the raw text.
df['message_length'] = df['v2'].map(len)

# Summary statistics of message length, computed separately for each label.
length_by_label = df[['v1', 'message_length']].groupby('v1')
length_by_label.describe()


In [None]:
# Overlay the message-length histograms (with KDE curves) of both classes
# on one figure.
plt.figure(figsize=(10, 5))
for label_value, legend_name, bar_color in (("ham", "Ham", "blue"), ("spam", "Spam", "red")):
    lengths = df.loc[df['v1'] == label_value, 'message_length']
    sns.histplot(lengths, bins=50, label=legend_name, color=bar_color, kde=True)
plt.title("Distribution of Message Lengths")
plt.xlabel("Message Length")
plt.ylabel("Frequency")
plt.legend()
plt.show()

In [None]:
from collections import Counter
import itertools

# Whitespace-tokenize every message and pool the tokens per class.
ham_words = list(itertools.chain.from_iterable(
    message.split() for message in df.loc[df['v1'] == 'ham', 'v2']
))
spam_words = list(itertools.chain.from_iterable(
    message.split() for message in df.loc[df['v1'] == 'spam', 'v2']
))

# The 20 most frequent tokens of each class.
ham_common = Counter(ham_words).most_common(20)
spam_common = Counter(spam_words).most_common(20)

# Tabular form of the (word, count) pairs for plotting.
ham_df = pd.DataFrame(ham_common, columns=['Word', 'Count'])
spam_df = pd.DataFrame(spam_common, columns=['Word', 'Count'])

# Side-by-side horizontal bar charts: ham on the left, spam on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels = (
    (ham_df, "Blues_r", "Top Words in Ham Messages"),
    (spam_df, "Reds_r", "Top Words in Spam Messages"),
)
for axis, (frame, palette_name, panel_title) in zip(axes, panels):
    sns.barplot(x="Count", y="Word", data=frame, ax=axis, palette=palette_name)
    axis.set_title(panel_title)

plt.show()

In [None]:
# Show the current column names of the frame.
print(df.columns) 

In [None]:
# Strip leading/trailing whitespace from every column name, then print the result.
df.columns = df.columns.str.strip()
print(df.columns)

In [None]:
# Keep only the label and text columns under descriptive names.
# - rename() ignores keys that are absent, so re-running this cell after the
#   columns were already renamed does not raise a KeyError.
# - copy() yields an independent frame, so adding columns to `df` later cannot
#   trigger pandas' SettingWithCopyWarning.
df = df.rename(columns={'v1': 'labels', 'v2': 'message'})[['labels', 'message']].copy()
df.head()

In [None]:
# Print the column names to confirm what the frame now contains.
print(df.columns)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import string

# Fetch NLTK's stop-word corpus, which stopwords.words() reads below.
nltk.download('stopwords')

# Module-level helpers used by clean_text: a Porter stemmer and the English
# stop-word list as a set.
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))
def clean_text(text):
    """Normalize a raw message into a space-separated string of stemmed tokens.

    Steps: lowercase, replace every non-alphanumeric character (underscore
    included) with a space, delete digits, drop English stop words, and
    Porter-stem the remaining tokens.

    Args:
        text: Raw message text. Any non-string value (e.g. None or NaN) is
            treated as an empty message.

    Returns:
        str: The cleaned tokens joined by single spaces; "" when nothing
        remains.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # `\W` on its own does not match "_" (underscore counts as a word
    # character), so it is listed explicitly to strip all punctuation.
    text = re.sub(r'[\W_]', ' ', text)
    text = re.sub(r'\d', '', text)  # Remove digits

    # split() with no argument already collapses runs of whitespace.
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]

    return ' '.join(words)


# Clean every message and keep the result next to the raw text for comparison.
df['cleaned_text'] = df['message'].map(clean_text)
df.loc[:, ['message', 'cleaned_text']].head()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Numeric target column: ham -> 0, spam -> 1.
df['label'] = df['labels'].map({'ham': 0, 'spam': 1})
y = df['label'].values

# TF-IDF features with the vocabulary capped at 3000 terms, as a dense array.
tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(df['cleaned_text']).toarray()

# Feature-matrix shape and the first five targets.
print(X.shape)
print(y[:5])


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Target labels taken from the numeric `label` column.
y = df['label']

# Split the cleaned text BEFORE vectorizing (80% train, 20% test) so the
# TF-IDF vocabulary and IDF weights are learned from the training rows only.
# Fitting the vectorizer on the full corpus would leak test-set statistics
# into the features and inflate the evaluation scores.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_text'], y, test_size=0.2, random_state=42
)

# Vocabulary capped at 5000 terms; fit on train, only transform test.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train).toarray()
X_test = vectorizer.transform(text_test).toarray()

# Full-corpus feature matrix in the original row order, produced by the
# train-fitted vectorizer, so `X` stays consistent with `vectorizer`.
X = vectorizer.transform(df['cleaned_text']).toarray()

# Check the shape of training and testing data
X_train.shape, X_test.shape


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Fit a multinomial Naive Bayes classifier on the training features.
model = MultinomialNB().fit(X_train, y_train)

# Predict labels for the held-out test set.
y_pred = model.predict(X_test)

# Overall accuracy, printed with two decimal places.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Text report produced by classification_report for the test predictions.
print(classification_report(y_test, y_pred))


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of true test labels vs. predictions, drawn as an
# annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
class_names = ['Ham', 'Spam']
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
heatmap_ax.set_xlabel("Predicted")
heatmap_ax.set_ylabel("Actual")
heatmap_ax.set_title("Confusion Matrix")
plt.show()

In [None]:
def predict_spam(email):
    """Classify one raw message as "Spam" or "Not Spam".

    The text is cleaned with clean_text, vectorized with the fitted
    module-level `vectorizer`, and scored by the module-level `model`.

    Args:
        email (str): Raw message text.

    Returns:
        str: "Spam" when the model predicts class 1, otherwise "Not Spam".
    """
    # Apply the same cleaning used on the training text so the tokens line
    # up with the vocabulary the vectorizer was fitted on.
    cleaned = clean_text(email)
    vectorized_email = vectorizer.transform([cleaned]).toarray()
    prediction = model.predict(vectorized_email)
    return "Spam" if prediction[0] == 1 else "Not Spam"
    
# Smoke test: classify one sample promotional message and print the verdict.
test_email = "Congratulations! You have won a $1000 gift card. Click here to claim your prize."
print(predict_spam(test_email))
