In [1]:
# importing the Dataset

import pandas as pd

# Read the tab-separated SMS file into a two-column frame. Column names are
# supplied explicitly, so the first line of the file is treated as data
# rather than as a header row.
messages = pd.read_csv(
    'SMSSpamCollection.txt',
    sep='\t',
    header=None,
    names=["label", "message"],
)

In [2]:
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Data cleaning and preprocessing
import re
import nltk

In [4]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Turn every SMS into a normalised, stemmed string for the bag-of-words step.
ps = PorterStemmer()

# Build the stop-word lookup once instead of once per word: calling
# stopwords.words() inside the comprehension rebuilds the list for every
# token, and a set gives constant-time membership tests.
# Requires the NLTK 'stopwords' corpus (nltk.download('stopwords')).
english_stopwords = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

corpus = []
# Iterate over the values directly rather than looking rows up by integer
# label, so the loop does not depend on the frame having a default 0..n-1 index.
for message in messages['message']:
    # Replace everything except ASCII letters with spaces, lower-case, tokenise.
    tokens = non_letters.sub(' ', message).lower().split()
    # Drop stop words, then reduce each remaining word to its Porter stem.
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(stems))

In [5]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
# Count-vectorise the cleaned corpus, capping the vocabulary at 2500 terms,
# and materialise the sparse result as a dense array.
cv = CountVectorizer(max_features=2500)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

# One-hot encode the labels and keep the second dummy column as the target.
# NOTE(review): with the ham/spam labels shown in the preview this should be
# the 'spam' indicator (1 = spam) - confirm the column order.
label_dummies = pd.get_dummies(messages['label'])
y = label_dummies.iloc[:, 1].values

In [6]:
# Train Test Split

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)


In [7]:
# Training model using Naive bayes classifier

from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes model on the training term counts, then
# predict labels for the held-out rows.
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

y_pred_1 = nb_classifier.predict(X_test)

In [8]:
# calculate accuracy of class predictions
from sklearn import metrics
# Share of held-out messages where the Naive Bayes prediction equals the true label.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred_1)

0.9856502242152466

In [9]:
from sklearn.linear_model import LogisticRegression
# Train a logistic-regression model (default settings) on the same split so
# it can be compared with the Naive Bayes classifier.
lr_classifier = LogisticRegression().fit(X_train, y_train)

y_pred_2 = lr_classifier.predict(X_test)

In [10]:
from sklearn import metrics
# Share of held-out messages where the logistic-regression prediction equals the true label.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred_2)

0.9847533632286996

In [11]:
# Sanity-check the trained Naive Bayes model on a single raw SMS.
msg='Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...t'

# The vectoriser was fitted on cleaned, stop-word-filtered, stemmed text, so a
# new message has to go through the same steps before cv.transform().
# Feeding the raw text means unstemmed words never match the fitted
# vocabulary and stop words are not filtered out.
english_stopwords = set(stopwords.words('english'))
msg_tokens = re.sub('[^a-zA-Z]', ' ', msg).lower().split()
msg_clean = ' '.join(
    ps.stem(token) for token in msg_tokens if token not in english_stopwords
)

# transform() expects an iterable of documents, hence the one-element list.
nb_classifier.predict(cv.transform([msg_clean]))

array([0], dtype=uint8)

In [12]:
from pickle import dump


# Persist the fitted vectoriser and both classifiers so they can be reloaded
# without retraining. Only unpickle these files from a trusted location.
import os

# open() does not create missing directories, so make sure the target exists.
os.makedirs('models', exist_ok=True)

# Context managers flush and close each file even if pickling fails part-way;
# the original bare open() calls left the handles to the garbage collector.
for artefact, path in (
    (cv, 'models/cv.pkl'),
    (lr_classifier, 'models/lr_model.pkl'),
    (nb_classifier, 'models/nb_model.pkl'),
):
    with open(path, 'wb') as fh:
        dump(artefact, fh)