In [2]:
import pandas as pd
messages=pd.read_csv('/content/SMSSpamCollection',sep='\t',names=['label','message'])
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# lets check the head of the dataset
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [4]:
import pandas as pd
from sklearn.utils import resample


# Separate spam and non-spam messages
spam = messages[messages['label'] == 'spam']
non_spam = messages[messages['label'] == 'ham']

# Undersample non-spam messages to match the number of spam messages
non_spam_undersampled = resample(non_spam,
                                 replace=False,    # Do not sample with replacement
                                 n_samples=len(spam),  # Match the number of spam messages
                                 random_state=42)  # For reproducibility

# Combine undersampled non-spam with spam
undersampled_data = pd.concat([non_spam_undersampled, spam])

# Shuffle the dataset
undersampled_data = undersampled_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the number of labels after undersampling
print(undersampled_data['label'].value_counts())


label
spam    747
ham     747
Name: count, dtype: int64


In [5]:
# cleaning the texts
# importing the libraries for Natural Language Processing

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
# Initialize stemmer
ps = PorterStemmer()
# Create a cleaned corpus from the undersampled data
corpus = []
for i in range(len(undersampled_data)):
    # Remove non-alphabetic characters and convert to lowercase
    review = re.sub('[^a-zA-Z]', ' ', undersampled_data['message'].iloc[i])
    review = review.lower()
    review = review.split()
    # Stem and remove stopwords
    review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [None]:
corpus

['love echo',
 'love',
 'sometim play game answer question correctli alexa say got wrong answer like abl turn light away home',
 'lot fun thing yr old learn dinosaur control light play game like categori nice sound play music well',
 'music',
 'receiv echo gift need anoth bluetooth someth play music easili access found smart speaker wait see els',
 'without cellphon cannot use mani featur ipad see use great alarm u r almost deaf hear alarm bedroom live room reason enough keep fun ask random question hear respons seem smartbon polit yet',
 'think th one purchas work get one everi room hous realli like featur offer specifili play music echo control light throughout hous',
 'look great',
 'love listen song heard sinc childhood get news weather inform great',
 'sent year old dad talk constantli',
 'love learn knew thing eveyday still figur everyth work far easi use understand make laugh time',
 'purchas mother knee problem give someth tri come get around fast like enjoy littl big thing ale

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer object
vectorizer = TfidfVectorizer(max_features=2500) # You can adjust max_features

# Fit the vectorizer to the corpus and transform it
X = vectorizer.fit_transform(corpus).toarray()

# Get the target variable (labels)
y = undersampled_data['label']

In [11]:
# splitting the data into training and testing sets

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 15)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)


(1045, 2500)
(1045,)
(449, 2500)
(449,)


In [12]:
#Support Vector machine
from sklearn.svm import SVC

model = SVC()
model.fit(x_train, y_train)



In [13]:
print("Training Accuracy :", model.score(x_train, y_train))
print("Testing Accuracy :", model.score(x_test, y_test))

Training Accuracy : 0.999043062200957
Testing Accuracy : 0.9354120267260579


In [14]:
y_pred = model.predict(x_test)

In [15]:
#model evaluation,confusion matrix and classification report
from sklearn.metrics import confusion_matrix ,classification_report
cm=confusion_matrix(y_test,y_pred)
print(cm)

[[215   2]
 [ 27 205]]


In [16]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         ham       0.89      0.99      0.94       217
        spam       0.99      0.88      0.93       232

    accuracy                           0.94       449
   macro avg       0.94      0.94      0.94       449
weighted avg       0.94      0.94      0.94       449



In [None]:
#Naive Bayes

In [17]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(x_train, y_train)


print("Training Accuracy :", model.score(x_train, y_train))
print("Testing Accuracy :", model.score(x_test, y_test))

Training Accuracy : 0.9827751196172249
Testing Accuracy : 0.955456570155902


In [18]:
y_pred = model.predict(x_test)
y_pred

array(['ham', 'ham', 'ham', 'spam', 'spam', 'spam', 'spam', 'ham', 'ham',
       'ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'spam', 'ham', 'spam',
       'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam',
       'spam', 'ham', 'ham', 'spam', 'spam', 'spam', 'ham', 'ham', 'ham',
       'spam', 'ham', 'ham', 'spam', 'spam', 'ham', 'spam', 'ham', 'ham',
       'ham', 'spam', 'spam', 'spam', 'ham', 'spam', 'ham', 'ham', 'spam',
       'spam', 'spam', 'ham', 'spam', 'ham', 'ham', 'spam', 'ham', 'spam',
       'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'spam', 'ham', 'spam',
       'ham', 'spam', 'ham', 'ham', 'spam', 'spam', 'ham', 'spam', 'ham',
       'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'spam', 'ham', 'ham',
       'ham', 'ham', 'spam', 'spam', 'spam', 'spam', 'ham', 'spam', 'ham',
       'spam', 'ham', 'spam', 'spam', 'spam', 'ham', 'ham', 'spam', 'ham',
       'ham', 'ham', 'spam', 'spam', 'ham', 'spam', 'ham', 'ham', 'spam',
       'ham', 'ham', 'ham', 'ham', 'spa

In [19]:
#model evaluation,confusion matrix and classification report
from sklearn.metrics import confusion_matrix ,classification_report
cm=confusion_matrix(y_test,y_pred)
print(cm)

[[210   7]
 [ 13 219]]


In [20]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         ham       0.94      0.97      0.95       217
        spam       0.97      0.94      0.96       232

    accuracy                           0.96       449
   macro avg       0.96      0.96      0.96       449
weighted avg       0.96      0.96      0.96       449



In [21]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

# List of models
models = {
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
}

# Train each model and evaluate accuracy
for model_name, model in models.items():
    # Train the model
    model.fit(x_train, y_train)

    # Make predictions
    y_pred = model.predict(x_test)

    # Evaluate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    print(f"{model_name} Accuracy: {accuracy * 100:.2f}%")


SVM Accuracy: 93.54%
Random Forest Accuracy: 95.55%
Logistic Regression Accuracy: 93.10%
Naive Bayes Accuracy: 95.55%
K-Nearest Neighbors Accuracy: 93.54%
Decision Tree Accuracy: 92.87%
Gradient Boosting Accuracy: 91.76%




AdaBoost Accuracy: 92.43%


Implementation with bag of word

In [22]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
import pandas as pd
messages=pd.read_csv('/content/SMSSpamCollection',sep='\t',names=['label','message'])
messages.head()





# Separate spam and non-spam messages
spam = messages[messages['label'] == 'spam']
non_spam = messages[messages['label'] == 'ham']

# Undersample non-spam messages to match the number of spam messages
non_spam_undersampled = resample(non_spam,
                                 replace=False,    # Do not sample with replacement
                                 n_samples=len(spam),  # Match the number of spam messages
                                 random_state=42)  # For reproducibility

# Combine undersampled non-spam with spam
undersampled_data = pd.concat([non_spam_undersampled, spam])

# Shuffle the dataset
undersampled_data = undersampled_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Ensure NLTK resources are downloaded
nltk.download('stopwords')

# Initialize stemmer
ps = PorterStemmer()

# Create a cleaned corpus from the undersampled data
corpus = []
for i in range(len(undersampled_data)):
    # Remove non-alphabetic characters and convert to lowercase
    review = re.sub('[^a-zA-Z]', ' ', undersampled_data['message'].iloc[i])
    review = review.lower()
    review = review.split()
    # Stem and remove stopwords
    review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

# Convert the cleaned corpus into a bag-of-words model
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()

# Create the binary target variable from the undersampled data
y = pd.get_dummies(undersampled_data['label'], drop_first=True).values
# y will have 0 for 'ham' and 1 for 'spam'

# Check shapes of X and y to ensure they match
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

# Proceed to split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Shape of X: (1494, 3047)
Shape of y: (1494, 1)


Bag of word

In [None]:
# Initialize the CountVectorizer
vectorizer = CountVectorizer(max_features=5000)  # Limit to 5000 features
X_bow = vectorizer.fit_transform(X)

# Convert to array for compatibility with scikit-learn
X_bow = X_bow.toarray()


In [None]:
# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)


In [24]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Assuming X and y are already defined
# Convert y to a 1D array
y = y.ravel()  # Flatten the array

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Initialize models
models = {
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier()
}

# Train models and evaluate
for name, model in models.items():
    model.fit(X_train, y_train)  # Fit model
    accuracy = model.score(X_test, y_test) * 100  # Calculate accuracy
    print(f"{name} Accuracy: {accuracy:.2f}%")



SVM Accuracy: 92.31%
Random Forest Accuracy: 94.31%
Logistic Regression Accuracy: 93.98%
Naive Bayes Accuracy: 95.65%
K-Nearest Neighbors Accuracy: 74.92%
Decision Tree Accuracy: 91.97%
Gradient Boosting Accuracy: 91.30%




AdaBoost Accuracy: 92.64%
