# SMS Classification using Machine Learning

## By: [Tahsin Jahin Khalid](https://tahsinjahinkhalid.github.io/)

### Import Required Modules

In [21]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

### Set Up NLTK

In [2]:
# Download the NLTK stop-word corpus that the preprocessing step reads from.
STOPWORDS_PACKAGE = 'stopwords'
nltk.download(STOPWORDS_PACKAGE)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### Load Data

In [4]:
# The file is tab-separated; column names are supplied here rather than
# being read from the file.
column_names = ["label", "message"]
messages = pd.read_csv('../data/messages.txt', sep='\t', names=column_names)

### Explore the Dataset

In [5]:
# Peek at the first five rows of the loaded frame.
messages.head(n=5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Peek at the last five rows of the loaded frame.
messages.tail(n=5)

Unnamed: 0,label,message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [7]:
# List the distinct class labels present in the data.
messages.loc[:, 'label'].unique()

array(['ham', 'spam'], dtype=object)

In [9]:
# Report the dimensions of the loaded frame.
n_rows, n_cols = messages.shape
print(f"Rows: {n_rows}")
print(f"Columns: {n_cols}")

Rows: 5572
Columns: 2


### Data Preprocessing

In [10]:
# Accumulator for the cleaned messages, plus the stemmer used to normalise tokens.
corpus = list()
ps = PorterStemmer()

In [11]:
# Build the stop-word set once. In the original loop,
# stopwords.words('english') was evaluated again for every token and the
# membership test ran against a list; a set built up front gives the same
# filtering with constant-time lookups.
english_stopwords = set(stopwords.words('english'))

# Rebuild `corpus` from scratch instead of appending to an existing list, so
# re-running this cell does not duplicate entries (which would make the list
# longer than `messages` and break the later column assignment).
corpus = []
for raw_message in messages['message']:
    # Replace every non-letter with a space, lower-case, and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_message).lower().split()
    # Drop stop words, then reduce the remaining tokens to their Porter stems.
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(stems))

In [13]:
# Store the cleaned text next to the raw message, one entry per row.
messages["corpus"] = pd.Series(corpus, index=messages.index)

In [14]:
# Check the first five rows now that the cleaned-text column is attached.
messages.head(n=5)

Unnamed: 0,label,message,corpus
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt st m...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


### Model Building

In [15]:
# Bag-of-words representation, with the vocabulary capped at 2500 features.
# NOTE(review): the vocabulary is fit on the whole corpus here, i.e. before
# the train/test split performed further down.
cv = CountVectorizer(max_features=2500)
bow_counts = cv.fit_transform(corpus)
var_X = bow_counts.toarray()

In [16]:
# Encode the target explicitly: 1 for 'spam', 0 for 'ham'.
# Taking the second get_dummies column relied on the dummy columns being in
# alphabetical order, and its dtype depends on the pandas version (boolean on
# recent releases). Comparing against the positive class name gives the same
# labelling as a stable integer array.
var_y = (messages['label'] == 'spam').astype(int).to_numpy()

In [18]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
split_parts = train_test_split(var_X, var_y, test_size=0.20, random_state=0)
X_train, X_test, y_train, y_test = split_parts

### Model Training

In [19]:
# Fit a multinomial Naive Bayes classifier on the training split.
# fit() returns the estimator itself, so construction and training chain.
spam_detect_model = MultinomialNB().fit(X_train, y_train)
# Leave the fitted estimator as the cell's displayed value.
spam_detect_model

### Model Prediction

In [20]:
# Predict labels for the held-out rows.
y_pred = spam_detect_model.predict(X=X_test)

### Model Evaluation

In [28]:
# Fraction of held-out messages whose predicted label matches the true label.
# Positional order for accuracy_score is (y_true, y_pred).
acc_metric = accuracy_score(y_test, y_pred)
print(f"Accuracy Score: {acc_metric:.02f} out of 1.00")

Accuracy Score: 0.99 out of 1.00


In [29]:
# Rows are true classes and columns are predicted classes.
# Positional order for confusion_matrix is (y_true, y_pred).
confusion_matrix(y_test, y_pred)

array([[946,   9],
       [  7, 153]])