#### BOW,TFIDF,Machine Learning Algorithms

1. Preprocessing And Cleaning
2. Train Test Split
3. BOW And TF-IDF (Sentences--->vectors) {Preventing Data Leakage}
4. Trained Our Models

In [1]:
import pandas as pd
messages=pd.read_csv('SMSSpamCollection',
                    sep='\t',names=["label","message"])

In [2]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
## Data Cleaning And Preprocessing
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dev\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [4]:
## Corpus-> a collection of authentic text or audio that is organized into datasets
## PorterStemmer -> Process of Stemming results in root stem / word stem
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [12]:
corpus=[]
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]',' ',messages['message'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)

In [13]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl week word back like fun still tb ok xxx std chg send rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour',
 'mobil month u r entitl updat latest colour mobil camera free call mobil updat co free',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'search right word thank breather

## Create Bag of words

In [14]:
## Output Features
y=pd.get_dummies(messages['label'])
y

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
5567,0,1
5568,1,0
5569,1,0
5570,1,0


In [15]:
# first column of data frame with all rows
y=y.iloc[:,0].values
y

array([1, 1, 0, ..., 1, 1, 1], dtype=uint8)

## iloc examples
df.iloc[:3] # slice your object, i.e. first three rows of your dataframe
df.iloc[0:3] # same
df.iloc[0, 1] # index both axis. Select the element from the first row, second column.
df.iloc[:, 0:5] # first five columns of data frame with all rows

In [17]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(corpus,y,test_size=0.20)

In [21]:
X_train

['usual iam fine happi amp well',
 'ya referin mei ex wat ah waitin u treat somebodi shld b rich liao gd den u dun work frm tmr onward',
 'gran onlyfound afew day ago cusoon honi',
 'text way cup stop work bu',
 'thanx yup come back sun finish dinner go back hotel time fli tog exactli mth today hope haf mani mth come',
 'good afternoon sexi bun goe job search wake first thought alway love wish fine happi know ador',
 'good afternoon loverboy goe day luck come way think sweeti send love across sea make smile happi',
 'costa del sol holiday await collect call toclaim sae tc pobox stockport sk xh cost pm max min',
 'hey next sun basic yoga cours bugi go pilat intro next sat tell time r free',
 'think get away trek long famili town sorri',
 'uncl atlanta wish guy great semest',
 'howz person stori',
 'ye princess toledo',
 'yeah lol luckili star role like',
 'msg rajini come',
 'keep ten rs shelf buy two egg',
 'friend got say up order gram got lt gt get',
 'ok right later',
 'amaz rearran

In [27]:
## Create the Bag OF Words model
from sklearn.feature_extraction.text import CountVectorizer
## for Binary BOW enable binary=True
cv=CountVectorizer(max_features=2500,ngram_range=(1,2))

In [28]:
len(X_train),len(y_train)

(4457, 4457)

In [29]:
## independent features
X_train=cv.fit_transform(X_train).toarray()
X_test=cv.transform(X_test).toarray()

In [30]:
import numpy as np
np.set_printoptions(edgeitems=30, linewidth=100000, 
    formatter=dict(float=lambda x: "%.3g" % x))
X_train

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0

In [31]:
from sklearn.naive_bayes import MultinomialNB

In [32]:
spam_detect_model=MultinomialNB().fit(X_train,y_train)

In [33]:
y_pred=spam_detect_model.predict(X_test)

In [34]:
from sklearn.metrics import accuracy_score,classification_report

In [35]:
accuracy_score(y_test,y_pred)

0.9739910313901345

In [36]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.95      0.87      0.91       162
           1       0.98      0.99      0.98       953

    accuracy                           0.97      1115
   macro avg       0.96      0.93      0.95      1115
weighted avg       0.97      0.97      0.97      1115



## Creating The TF-IDF Model

In [37]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(corpus,y,test_size=0.20)

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
# top 2500 words having maximum features
tv=TfidfVectorizer(max_features=2500,ngram_range=(1,2))
X_train=tv.fit_transform(X_train).toarray()
X_test=tv.transform(X_test).toarray()

In [39]:
X_train

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0.216, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0.152, 0, 0, 0, 0.23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0,

In [40]:
tv.vocabulary_

{'show': 1881,
 'wot': 2439,
 'say': 1809,
 'could': 443,
 'realli': 1711,
 'quit': 1679,
 'funni': 763,
 'lor': 1214,
 'wat': 2343,
 'shd': 1872,
 'haf': 897,
 'run': 1787,
 'lor wat': 1217,
 'leav': 1144,
 'soon': 1947,
 'littl': 1185,
 'sometim': 1941,
 'put': 1671,
 'around': 97,
 'heart': 923,
 'safe': 1794,
 'get': 782,
 'hurt': 995,
 'find': 698,
 'care': 301,
 'enough': 620,
 'break': 216,
 'amp': 64,
 'care enough': 302,
 'pl': 1579,
 'stop': 2003,
 'invit': 1039,
 'friend': 748,
 'repli': 1740,
 'ye': 2473,
 'see': 1828,
 'www': 2452,
 'sm': 1920,
 'ac': 4,
 'send': 1842,
 'frnd': 754,
 'invit friend': 1041,
 'friend repli': 749,
 'repli ye': 1746,
 'ye see': 2477,
 'see www': 1832,
 'www sm': 2456,
 'sm ac': 1921,
 'stop send': 2009,
 'send stop': 1849,
 'stop frnd': 2007,
 'special': 1964,
 'congrat': 418,
 'year': 2481,
 'cinema': 351,
 'pass': 1532,
 'call': 247,
 'etc': 630,
 'free': 728,
 'bx': 239,
 'ip': 1043,
 'pm': 1604,
 'dont': 569,
 'miss': 1334,
 'etc free': 631

In [41]:
from sklearn.naive_bayes import MultinomialNB
spam_tfidf_model = MultinomialNB().fit(X_train, y_train)

In [42]:

#prediction
y_pred=spam_tfidf_model.predict(X_test)

In [43]:
score=accuracy_score(y_test,y_pred)
print(score)

0.9838565022421525


In [44]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.89      0.99      0.94       135
           1       1.00      0.98      0.99       980

    accuracy                           0.98      1115
   macro avg       0.94      0.99      0.96      1115
weighted avg       0.99      0.98      0.98      1115

