In [2]:
# importing the Dataset

import pandas as pd
# The SMS Spam Collection file is tab-separated and has no header row,
# so the two column names are supplied explicitly.
messages = pd.read_csv(
    'smsspamcollection/SMSSpamCollection',
    sep='\t',
    names=["label", "message"],
)


In [3]:
# Preview the first rows to confirm the label/message columns parsed correctly.
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Data cleaning and preprocessing
import re
import nltk
# Make sure the NLTK stop-word corpus is available locally; the call reports
# "already up-to-date" when the package has been downloaded before.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\sai
[nltk_data]     sushanth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Stemmer instance used by the corpus-cleaning loop that follows.
ps = PorterStemmer()

In [6]:
# Build the cleaned corpus: one normalized, stemmed string per SMS message.
#
# The English stop-word list is materialized once as a set. Calling
# stopwords.words('english') inside the comprehension repeats that call for
# every token of every message and tests membership against a list.
english_stopwords = set(stopwords.words('english'))

corpus = []
for raw_message in messages['message']:
    # Keep ASCII letters only; digits, punctuation and symbols become spaces.
    review = re.sub('[^a-zA-Z]', ' ', raw_message)
    review = review.lower().split()

    # Drop stop words, then reduce the remaining tokens to their Porter stems.
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    corpus.append(' '.join(review))

In [7]:
# Spot-check the first few cleaned messages.
corpus[:4]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say']

In [8]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features, capped at 4500 vocabulary entries.
# NOTE(review): the vectorizer is fitted on the full corpus, i.e. before the
# train/test split done later, so test-set messages contribute to the vocabulary.
cv = CountVectorizer(max_features=4500)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

In [9]:
# One-hot encode the label column and keep the second indicator column as the
# target array (presumably the 'spam' indicator, given the ham/spam labels
# shown above - confirm the column order if the label set changes).
y = pd.get_dummies(messages['label']).iloc[:, 1].values

In [10]:
# Display the encoded target vector (numpy abbreviates the printed array).
y

array([False, False,  True, ..., False, False, False])

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [12]:
# Training model using Naive bayes classifier
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the training split.
nb_classifier = MultinomialNB()
spam_detect_model = nb_classifier.fit(X_train, y_train)

In [13]:
# Predict labels for the held-out test features.
y_pred=spam_detect_model.predict(X_test)
# Bare expression so the notebook displays the predicted array.
y_pred

array([False,  True, False, ..., False,  True, False])

In [14]:
from sklearn.metrics import confusion_matrix,accuracy_score
# Evaluate on the held-out split: confusion matrix plus overall accuracy.
confusion_m = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(confusion_m, accuracy)


[[946   9]
 [  7 153]] 0.9856502242152466


In [15]:
# Collect the evaluation results in a one-row frame for tabular display.
metrics = pd.DataFrame(
    {
        'confusion matrix': [confusion_m],
        'accuracy score': [accuracy],
    }
)
metrics

Unnamed: 0,confusion matrix,accuracy score
0,"[[946, 9], [7, 153]]",0.98565


In [16]:
def predict_spam_or_ham(input_text):
    """Classify a single raw message as "SPAM" or "HAM".

    The text is cleaned the same way as the training corpus (non-letters
    replaced by spaces, lower-cased, English stop words removed, tokens
    Porter-stemmed), vectorized with the notebook-level ``cv`` and scored
    with the notebook-level ``spam_detect_model``.

    Args:
        input_text: Raw message text.

    Returns:
        "SPAM" when the model predicts the positive class, otherwise "HAM".
    """
    stemmer = PorterStemmer()
    # Build the stop-word set once per call instead of calling
    # stopwords.words('english') again for every token.
    stop_words = set(stopwords.words('english'))

    # Preprocess the input
    letters_only = re.sub('[^a-zA-Z]', ' ', input_text)
    tokens = letters_only.lower().split()
    cleaned = ' '.join(stemmer.stem(tok) for tok in tokens if tok not in stop_words)

    # Vectorize the input (a single-row feature matrix)
    input_vector = cv.transform([cleaned]).toarray()

    # predict() returns one label per input row; take the single element
    # explicitly rather than using a length-1 array as an `if` condition.
    prediction = spam_detect_model.predict(input_vector)[0]

    return "SPAM" if prediction == 1 else "HAM"

# Test the function on a message that should be flagged as spam.
unknown_input = "Congratulations! You've won a free cruise. Reply 'YES' to claim your prize."
# predict_spam_or_ham returns the string "SPAM" or "HAM".
prediction = predict_spam_or_ham(unknown_input)
print("Prediction:", prediction)

Prediction: SPAM


In [17]:
# Test with a non-spam input: an ordinary conversational message.
non_spam_input = "Hey, how are you doing? Are you free for lunch tomorrow?"
# predict_spam_or_ham returns the string "SPAM" or "HAM".
prediction = predict_spam_or_ham(non_spam_input)
print("Prediction:", prediction)


Prediction: HAM


In [21]:
import pickle

# Persist the exact objects that were fitted and evaluated above.
#
# The vectorizer is NOT re-fitted here: the model was trained on features
# produced by the already-fitted `cv`, so the pickled vectorizer must be that
# same fitted object. Re-fitting at save time risks a vocabulary/column
# mismatch with the model if `corpus` has changed in the meantime. The model
# is likewise saved as trained, not retrained.

# Save the fitted CountVectorizer
with open('cv.pkl', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)

# Save the trained model
with open('spam_model.pkl', 'wb') as model_file:
    pickle.dump(spam_detect_model, model_file)


In [22]:
# Load both persisted artifacts from disk. The vectorizer is loaded together
# with the model so that the prediction below exercises the saved pair, not
# whatever `cv` happens to be left in the kernel.
#
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickle files that were produced by this notebook or another trusted source.
with open('cv.pkl', 'rb') as file:
    cv = pickle.load(file)

with open('spam_model.pkl', 'rb') as file:
    spam_detect_model = pickle.load(file)

In [23]:
# Example text for prediction
input_text = "Congratulations! You've won a free cruise. Reply 'YES' to claim your prize."

# Reuse predict_spam_or_ham instead of repeating its preprocessing inline.
# The function reads the notebook-level `cv` and `spam_detect_model`, so it
# scores the text with the model loaded from disk.
prediction = predict_spam_or_ham(input_text)

# Output the prediction
print("Prediction:", prediction)


Prediction: SPAM
