In [1]:
#needed libraries
!pip install pandas 
!pip install nltk
!pip install sklearn



In [2]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
pd.set_option('min_rows', 50)

In [3]:
dataset = pd.read_csv('spam.csv', encoding='ISO-8859-1')

In [4]:
dataset = dataset.iloc[:,0:2]

In [5]:
dataset

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


# Text Preprocessing

In [6]:
#taking only text which are alphabets
#iterating through all the sms data and removing characters other than the alphabets

dataset['cleaned'] = dataset['v2'].apply(lambda x:re.sub('[^A-Za-z]',' ',x))

In [7]:
dataset.head()

Unnamed: 0,v1,v2,cleaned
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only ...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I don t think he goes to usf he lives aro...


### Text Normalization

##### Making all text lower case,this will easy when we come to word vertorization

In [8]:
dataset['lower'] = dataset['cleaned'].apply(lambda x:x.lower())

In [9]:
dataset.head()

Unnamed: 0,v1,v2,cleaned,lower
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only ...,go until jurong point crazy available only ...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I don t think he goes to usf he lives aro...,nah i don t think he goes to usf he lives aro...


##### Removing stops words

In [10]:
#downloading the needed model from nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Menanv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
# first we are going to tokenize the messages.
#example
from nltk.tokenize import word_tokenize
s = "hello this is awesome"
t = word_tokenize(s)
t

['hello', 'this', 'is', 'awesome']

In [12]:
dataset['tokenized'] = dataset['lower'].apply(lambda x: word_tokenize(x))
dataset

Unnamed: 0,v1,v2,cleaned,lower,tokenized
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only ...,go until jurong point crazy available only ...,"[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup fina...,"[free, entry, in, a, wkly, comp, to, win, fa, ..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I don t think he goes to usf he lives aro...,nah i don t think he goes to usf he lives aro...,"[nah, i, don, t, think, he, goes, to, usf, he,..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...,FreeMsg Hey there darling it s been week s n...,freemsg hey there darling it s been week s n...,"[freemsg, hey, there, darling, it, s, been, we..."
6,ham,Even my brother is not like to speak with me. ...,Even my brother is not like to speak with me ...,even my brother is not like to speak with me ...,"[even, my, brother, is, not, like, to, speak, ..."
7,ham,As per your request 'Melle Melle (Oru Minnamin...,As per your request Melle Melle Oru Minnamin...,as per your request melle melle oru minnamin...,"[as, per, your, request, melle, melle, oru, mi..."
8,spam,WINNER!! As a valued network customer you have...,WINNER As a valued network customer you have...,winner as a valued network customer you have...,"[winner, as, a, valued, network, customer, you..."
9,spam,Had your mobile 11 months or more? U R entitle...,Had your mobile months or more U R entitle...,had your mobile months or more u r entitle...,"[had, your, mobile, months, or, more, u, r, en..."


##### Removing stopwords

In [13]:
#example
# nltk.download('stopwords')
from nltk.corpus import stopwords 

s= 'Hi this is a great building, this building is pretty cool'
t = word_tokenize(s)
print("Tokenized words:",t)
#iterating through each word and checking whether these are stop words
for word in t:
    if word in stopwords.words('english'):
        print(word)

Tokenized words: ['Hi', 'this', 'is', 'a', 'great', 'building', ',', 'this', 'building', 'is', 'pretty', 'cool']
this
is
a
this
is


In [14]:
dataset['stopword_removed'] = dataset['tokenized'].apply(lambda t: [x for x in t if x not in stopwords.words('english')])

In [15]:
dataset.head()

Unnamed: 0,v1,v2,cleaned,lower,tokenized,stopword_removed
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only ...,go until jurong point crazy available only ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup fina...,"[free, entry, in, a, wkly, comp, to, win, fa, ...","[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I don t think he goes to usf he lives aro...,nah i don t think he goes to usf he lives aro...,"[nah, i, don, t, think, he, goes, to, usf, he,...","[nah, think, goes, usf, lives, around, though]"


##### Lemmatizing

In [16]:
# Lemmatize with POS Tag
nltk.download("averaged_perceptron_tagger")
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Menanv\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [17]:
#example
nltk.download("wordnet")
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
s = "The boy ran the race by running. Then boy fell came. The fallen boy got up"
lower = s.lower()
tokenize = word_tokenize(lower)
stopwords_removed = []
for word in tokenize:
    if word not in stopwords.words('english'):
        stopwords_removed.append(word)
stemmed = []
for word in stopwords_removed:
    stemmed.append(lemmatizer.lemmatize(word, get_wordnet_pos(word)))
stemmed

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Menanv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['boy',
 'ran',
 'race',
 'run',
 '.',
 'boy',
 'fell',
 'come',
 '.',
 'fall',
 'boy',
 'get']

In [18]:
dataset["lemmatized"] = dataset["stopword_removed"].apply(lambda words:[lemmatizer.lemmatize(x, get_wordnet_pos(x)) for x in words])

In [19]:
dataset.head()

Unnamed: 0,v1,v2,cleaned,lower,tokenized,stopword_removed,lemmatized
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only ...,go until jurong point crazy available only ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup fina...,"[free, entry, in, a, wkly, comp, to, win, fa, ...","[free, entry, wkly, comp, win, fa, cup, final,...","[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I don t think he goes to usf he lives aro...,nah i don t think he goes to usf he lives aro...,"[nah, i, don, t, think, he, goes, to, usf, he,...","[nah, think, goes, usf, lives, around, though]","[nah, think, go, usf, life, around, though]"


##### Final data

In [20]:
dataset["final_data"] = dataset["lemmatized"].apply(lambda x: " ".join(x))
dataset

Unnamed: 0,v1,v2,cleaned,lower,tokenized,stopword_removed,lemmatized,final_data
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only ...,go until jurong point crazy available only ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazy, available, bugis, n...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]",ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup fina...,"[free, entry, in, a, wkly, comp, to, win, fa, ...","[free, entry, wkly, comp, win, fa, cup, final,...","[free, entry, wkly, comp, win, fa, cup, final,...",free entry wkly comp win fa cup final tkts st ...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, early, hor, u, c, already, say]",u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I don t think he goes to usf he lives aro...,nah i don t think he goes to usf he lives aro...,"[nah, i, don, t, think, he, goes, to, usf, he,...","[nah, think, goes, usf, lives, around, though]","[nah, think, go, usf, life, around, though]",nah think go usf life around though
5,spam,FreeMsg Hey there darling it's been 3 week's n...,FreeMsg Hey there darling it s been week s n...,freemsg hey there darling it s been week s n...,"[freemsg, hey, there, darling, it, s, been, we...","[freemsg, hey, darling, week, word, back, like...","[freemsg, hey, darling, week, word, back, like...",freemsg hey darling week word back like fun st...
6,ham,Even my brother is not like to speak with me. ...,Even my brother is not like to speak with me ...,even my brother is not like to speak with me ...,"[even, my, brother, is, not, like, to, speak, ...","[even, brother, like, speak, treat, like, aids...","[even, brother, like, speak, treat, like, aid,...",even brother like speak treat like aid patent
7,ham,As per your request 'Melle Melle (Oru Minnamin...,As per your request Melle Melle Oru Minnamin...,as per your request melle melle oru minnamin...,"[as, per, your, request, melle, melle, oru, mi...","[per, request, melle, melle, oru, minnaminungi...","[per, request, melle, melle, oru, minnaminungi...",per request melle melle oru minnaminunginte nu...
8,spam,WINNER!! As a valued network customer you have...,WINNER As a valued network customer you have...,winner as a valued network customer you have...,"[winner, as, a, valued, network, customer, you...","[winner, valued, network, customer, selected, ...","[winner, value, network, customer, select, rec...",winner value network customer select receivea ...
9,spam,Had your mobile 11 months or more? U R entitle...,Had your mobile months or more U R entitle...,had your mobile months or more u r entitle...,"[had, your, mobile, months, or, more, u, r, en...","[mobile, months, u, r, entitled, update, lates...","[mobile, month, u, r, entitle, update, late, c...",mobile month u r entitle update late colour mo...


In [21]:
# creating the feature matrix 
from sklearn.feature_extraction.text import CountVectorizer
matrix = CountVectorizer(max_features=1000)
X = matrix.fit_transform(dataset["final_data"].to_list()).toarray()

In [22]:
X[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [31]:
print(matrix.get_feature_names(),len(matrix.get_feature_names()))

['abiola', 'able', 'abt', 'ac', 'accept', 'access', 'account', 'across', 'actually', 'add', 'address', 'admirer', 'aft', 'afternoon', 'age', 'ago', 'ah', 'aight', 'al', 'almost', 'alone', 'already', 'alright', 'also', 'always', 'amp', 'an', 'angry', 'another', 'answer', 'anyone', 'anything', 'anytime', 'anyway', 'apply', 'appreciate', 'ard', 'area', 'around', 'as', 'asap', 'ask', 'askd', 'attempt', 'auction', 'available', 'await', 'award', 'away', 'awesome', 'babe', 'baby', 'back', 'bad', 'bag', 'balance', 'bank', 'bath', 'bathe', 'bb', 'bcoz', 'beautiful', 'bed', 'believe', 'best', 'bid', 'big', 'bill', 'birthday', 'bit', 'blood', 'blue', 'bluetooth', 'bold', 'bonus', 'book', 'bore', 'bos', 'bout', 'box', 'boy', 'boytoy', 'break', 'bring', 'brings', 'brother', 'bslvyl', 'bt', 'bus', 'busy', 'buy', 'buying', 'bx', 'cake', 'call', 'caller', 'callertune', 'camcorder', 'camera', 'cancel', 'cant', 'car', 'card', 'care', 'carlos', 'case', 'cash', 'catch', 'cause', 'cd', 'chance', 'change', 

In [23]:
#preparing labels
y = dataset.iloc[:,0]

# split train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [24]:
# Naive Bayes 
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_test)

# Accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.7666905958363245


In [25]:
X.shape

(5572, 1000)

In [39]:
test = matrix.transform(["Hey how are you?","Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"])
test.shape

(2, 1000)

In [40]:
print(classifier.predict(test.toarray()))

['spam' 'spam']


In [28]:
from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression(solver = 'lbfgs')
logisticRegr.fit(X_train, y_train)

LogisticRegression()

In [29]:
# Predict Class
y_pred = logisticRegr.predict(X_test)

# Accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.9806173725771715


In [41]:
logisticRegr.predict(test)

array(['ham', 'spam'], dtype=object)