<a href="https://colab.research.google.com/github/technomancerAdarsh/Spam-and-Ham-Using-NLP-/blob/main/SpamAndHam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string
import nltk



In [6]:
url = 'https://raw.githubusercontent.com/technomancerAdarsh/Spam-and-Ham-Using-NLP-/main/Spam/SMSSpamCollection'

messages = pd.read_csv(url, sep='\t', header=None, names=["label", "message"])

messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# Convert the 'message' column to string
messages['message'] = messages['message'].astype(str)

# Calculate message length
messages['message_length'] = messages['message'].apply(len)

# Summary of the data

messages.head()




Unnamed: 0,label,message,message_length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [11]:
messages.describe()

Unnamed: 0,message_length
count,5572.0
mean,80.48995
std,59.942907
min,2.0
25%,36.0
50%,62.0
75%,122.0
max,910.0


In [12]:
# Longest and shortest messages
print("Longest message has:", messages['message_length'].max(), "Characters")
print("Shortest message has:", messages['message_length'].min(), "Characters")

# Index of messages with the shortest length
shortest_length_index = messages['message_length'].idxmin()
print("Index of messages with shortest length:", shortest_length_index)
print("Shortest message is:", messages.loc[shortest_length_index, 'message'])
print("Longest message is:", messages.loc[messages['message_length'].idxmax(), 'message'])

# Label distribution
print(messages['label'].value_counts())

Longest message has: 910 Characters
Shortest message has: 2 Characters
Index of messages with shortest length: 1925
Shortest message is: Ok
Longest message is: For me the love should start with attraction.i should feel that I need her every time around me.she should be the first thing which comes in my thoughts.I would start the day and end it with her.she should be there every time I dream.love will be then when my every breath has her name.my life should happen around her.my life will be named to her.I would cry for her.will give all my happiness and take all her sorrows.I will be ready to fight with anyone for her.I will be in love when I will be doing the craziest things for her.love will be when I don't have to proove anyone that my girl is the most beautiful lady on the whole planet.I will always be singing praises for her.love will be when I start up making chicken curry and end up makiing sambar.life will be the most beautiful then.will get every morning and thank god for the d

In [13]:
#text preprocessing
def text_preprocessing(text):
    stemmer = SnowballStemmer('english')
    text = "".join([char for char in text if char not in string.punctuation])
    tokens = text.split()
    tokens = [word for word in tokens if word.lower() not in stopwords.words('english')]
    return " ".join([stemmer.stem(word) for word in tokens])

messages['processed_message'] = messages['message'].apply(text_preprocessing)


In [14]:
# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(messages['processed_message'], messages['label'], test_size=0.25, random_state=42)

# Create bag of words using CountVectorizer
vectorizer = CountVectorizer()
X_train_dtm = vectorizer.fit_transform(X_train)
X_test_dtm = vectorizer.transform(X_test)

# Decision Tree Classifier
dt = DecisionTreeClassifier()
dt.fit(X_train_dtm, y_train)
y_pred_dt = dt.predict(X_test_dtm)

# Naive Bayes Classifier
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_nb = nb.predict(X_test_dtm)

# Evaluation
print("\nDecision Tree Classifier:")
print(confusion_matrix(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

print("\nNaive Bayes Classifier:")
print(confusion_matrix(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))



Decision Tree Classifier:
[[1192   15]
 [  30  156]]
              precision    recall  f1-score   support

         ham       0.98      0.99      0.98      1207
        spam       0.91      0.84      0.87       186

    accuracy                           0.97      1393
   macro avg       0.94      0.91      0.93      1393
weighted avg       0.97      0.97      0.97      1393


Naive Bayes Classifier:
[[1201    6]
 [  12  174]]
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1207
        spam       0.97      0.94      0.95       186

    accuracy                           0.99      1393
   macro avg       0.98      0.97      0.97      1393
weighted avg       0.99      0.99      0.99      1393



# Thank You