### Importing the libraries

In [1]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

### Import the dataset

In [2]:
dataset = pd.read_csv('SMSSpamCollection', delimiter = '\t', names =['labels', 'messages'])
dataset

Unnamed: 0,labels,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
dataset.isnull().sum()

labels      0
messages    0
dtype: int64

### Text Cleaning

In [4]:
ps = PorterStemmer()
wnl = WordNetLemmatizer()
corpus = []

for i in range(len(dataset['messages'])):
    sentence = re.sub('[^a-zA-Z]', ' ', dataset.iloc[i, 1])
    tokens = nltk.word_tokenize(sentence)
    tokens = [ps.stem(word) for word in tokens if word not in stopwords.words('english')]
    tokens = ' '.join(tokens)
    corpus.append(tokens)    

### Creating the Bag of Words (BoW) model

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 3000)
X = cv.fit_transform(corpus).toarray()

### Declaring the variables

In [6]:
y = pd.get_dummies(dataset['labels'], drop_first = True)
y = y.values.reshape(-1)

### Splitting the dataset into Training set and Test set

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 5)

### Using Multinomial Naive Bayes model for Classification

In [8]:
from sklearn.naive_bayes import MultinomialNB
mnb_classifier = MultinomialNB()
mnb_classifier.fit(X_train, y_train)

MultinomialNB()

### Predicting the Test set results

In [9]:
y_pred = mnb_classifier.predict(X_test)

# Accuracy Score and Confusion Matrix
from sklearn.metrics import accuracy_score, confusion_matrix
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy_score :', round(accuracy*100, 2))
cm = confusion_matrix(y_test, y_pred)
print('Confusion matix :\n', cm)

Accuracy_score : 99.37
Confusion matix :
 [[967   3]
 [  4 141]]


### Using the model to compare the Test set predicted values with original values

In [10]:
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(dataset['messages'].values, y, test_size = 0.20, random_state = 5)
df_results = pd.DataFrame()
df_results['Messages'] = X_test_new
df_results['Spam_actual'] = y_test
df_results['Spam_predicted'] = y_pred
df_results.to_csv('results.csv')
df_results

Unnamed: 0,Messages,Spam_actual,Spam_predicted
0,PRIVATE! Your 2004 Account Statement for 07742...,1,1
1,No go. No openings for that room 'til after th...,0,0
2,GENT! We are trying to contact you. Last weeke...,1,1
3,Can you plz tell me the ans. BSLVYL sent via f...,0,0
4,I know girls always safe and selfish know i go...,0,0
...,...,...,...
1110,I think I‘m waiting for the same bus! Inform m...,0,0
1111,Dude we should go sup again,0,0
1112,Guess which pub im in? Im as happy as a pig in...,0,0
1113,URGENT! Your Mobile number has been awarded wi...,1,1


### Applying k-fold Cross Validation for checking model performance

In [11]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = mnb_classifier, X = X_train, y = y_train, cv = 20)
print('Best accuracy:', round(max(accuracies)*100, 2), '%')
print('Worst accuracy:', round(min(accuracies)*100, 2), '%')
print('Average accuracy:', round(accuracies.mean()*100, 2), '%')
print('Standard Deviation of accuracies:', round(accuracies.std()*100, 2), '%')

Best accuracy: 99.55 %
Worst accuracy: 96.41 %
Average accuracy: 98.25 %
Standard Deviation of accuracies: 0.72 %
