In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('SMSSpamCollection', sep='\t')

# for text data always used sep=\t

In [3]:
df.head()

Unnamed: 0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
0,ham,Ok lar... Joking wif u oni...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,U dun say so early hor... U c already then say...
3,ham,"Nah I don't think he goes to usf, he lives aro..."
4,spam,FreeMsg Hey there darling it's been 3 week's n...


In [5]:
df = pd.read_csv('SMSSpamCollection', sep='\t',
                names=['Label','Messages'])

In [6]:
df.head()

Unnamed: 0,Label,Messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Ham means real message and spam means fake message

In [8]:
# Label is dependent variable, hence classification problem
# It is sentiment analysis, its text data, based on message have to identify ham or spam 

In [9]:
df

Unnamed: 0,Label,Messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [11]:
df.isnull().sum()

Label       0
Messages    0
dtype: int64

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Label     5572 non-null   object
 1   Messages  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [13]:
df['Label'].value_counts()

ham     4825
spam     747
Name: Label, dtype: int64

In [14]:
df['Label'].value_counts()/len(df)*100

ham     86.593683
spam    13.406317
Name: Label, dtype: float64

In [None]:
# Data is imbalanced, Oversampling method to be used

In [15]:
ham = df[df['Label']=='ham']
spam = df[df['Label']=='spam']

In [16]:
print(ham.shape,spam.shape)

(4825, 2) (747, 2)


In [17]:
ham.shape[0]

4825

In [20]:
# Balancing the data

spam = spam.sample(ham.shape[0], replace=True)

# As there is large difference in the data of ham and spam,
# we have to use replacement method
# Here, spam data is added till it reaches the ham data value ie 4825

In [21]:
data = ham.append(spam, ignore_index=True)

  data = ham.append(spam, ignore_index=True)


In [22]:
data.shape

(9650, 2)

In [23]:
data.head()

Unnamed: 0,Label,Messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,ham,U dun say so early hor... U c already then say...
3,ham,"Nah I don't think he goes to usf, he lives aro..."
4,ham,Even my brother is not like to speak with me. ...


In [24]:
data['Label'].value_counts()

ham     4825
spam    4825
Name: Label, dtype: int64

In [25]:
# Now data is balanced

# Data Cleaning Part

In [26]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/Sumit/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [27]:
# stopwords means conjunctions, prepositions like I, we, is, are, do, will which are insignificant and frequently repeated

In [28]:
from nltk.corpus import stopwords 

# corpus means complete document

from nltk.stem.porter import PorterStemmer
ps= PorterStemmer()


# PorterStemmer / Limitizer (words like go, going, goes, gone words are similar but since python is case sensitive
# will consider different) is method which makes these words into one word which reduces complexity

In [29]:
corpus = []

for i in range(0, len(data)):
    review = re.sub('[^a-zA-Z]',' ',data['Messages'][i]) # considers only alphabets and no numericals or symbols
    review = review.lower() # converts into lower case
    review = review.split() # splits the sentences
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review) # joins into sentence
    corpus.append(review)

In [30]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'search right word thank breather promis wont take help grant fulfil promis wonder bless time',
 'date sunday',
 'oh k watch',
 'eh u rememb spell name ye v naughti make v wet',
 'fine way u feel way gota b',
 'serious spell name',
 'go tri month ha ha joke',
 'pay first lar da stock comin',
 'aft finish lunch go str lor ard smth lor u finish ur lunch alreadi',
 'ffffffffff alright way meet sooner',
 'forc eat slice realli hungri tho suck mark get worri know sick turn pizza lol',
 'lol alway convinc',
 'catch bu fri egg make tea eat mom left dinner feel love',
 'back amp pack

# Bag of Words

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
x=cv.fit_transform(corpus).toarray()

In [32]:
x.shape

(9650, 6285)

In [33]:
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [34]:
data['Label'] = data['Label'].astype('category')
data['Label'] = data['Label'].cat.codes

In [35]:
data.head()

Unnamed: 0,Label,Messages
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,0,U dun say so early hor... U c already then say...
3,0,"Nah I don't think he goes to usf, he lives aro..."
4,0,Even my brother is not like to speak with me. ...


In [36]:
data.shape

(9650, 2)

# Split the data into train and test

In [37]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, data['Label'], test_size=0.25, random_state=1)

# Building the Naive Bayes Theorm

In [38]:
from sklearn.naive_bayes import MultinomialNB
nbmodel = MultinomialNB().fit(x_train, y_train)

In [39]:
y_pred_train = nbmodel.predict(x_train)
y_pred_test = nbmodel.predict(x_test)

In [40]:
from sklearn.metrics import classification_report, accuracy_score

In [41]:
print(classification_report(y_train, y_pred_train))
print()
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98      3625
           1       0.98      0.99      0.98      3612

    accuracy                           0.98      7237
   macro avg       0.98      0.98      0.98      7237
weighted avg       0.98      0.98      0.98      7237


              precision    recall  f1-score   support

           0       0.98      0.97      0.97      1200
           1       0.97      0.98      0.97      1213

    accuracy                           0.97      2413
   macro avg       0.97      0.97      0.97      2413
weighted avg       0.97      0.97      0.97      2413



In [42]:
print(accuracy_score(y_train, y_pred_train))
print()
print(accuracy_score(y_test, y_pred_test))

0.9839712588088987

0.9738914214670534
