## **Text Representation - Bag Of Words (BOW)**

In [1]:
# import libraries
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("./datasets/emails_spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [5]:
df['spam'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)

In [6]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
df.shape

(5572, 3)

**Train test split**

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [9]:
X_train.shape

(4457,)

In [10]:
X_test.shape

(1115,)

In [11]:
type(X_train)

pandas.core.series.Series

**Create bag of words representation using CountVectorizer**

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

V = CountVectorizer()

X_train_c = V.fit_transform(X_train.values)
X_train_c

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 59620 stored elements and shape (4457, 7762)>

In [13]:
X_train_c.toarray()

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(4457, 7762))

In [15]:
V.get_feature_names_out()[1771]

'cherish'

In [16]:
V.vocabulary_

{'final': 2853,
 'chance': 1727,
 'claim': 1822,
 'ur': 7219,
 '150': 299,
 'worth': 7626,
 'of': 4899,
 'discount': 2344,
 'vouchers': 7349,
 'today': 6948,
 'text': 6793,
 'yes': 7710,
 'to': 6943,
 '85023': 676,
 'now': 4848,
 'savamob': 5933,
 'member': 4443,
 'offers': 4907,
 'mobile': 4547,
 'cs': 2080,
 'pobox84': 5294,
 'm263uz': 4285,
 '00': 0,
 'subs': 6566,
 '16': 309,
 'pls': 5271,
 'go': 3163,
 'there': 6837,
 'lt': 4258,
 'gt': 3258,
 'dont': 2406,
 'want': 7398,
 'any': 992,
 'excuses': 2700,
 'this': 6860,
 'weeks': 7464,
 'are': 1049,
 'accessible': 772,
 'just': 3879,
 'call': 1616,
 '08709501522': 93,
 'for': 2940,
 'details': 2267,
 'pobox': 5283,
 '139': 290,
 'la3': 3991,
 '2wu': 420,
 'only': 4952,
 '50': 530,
 'week': 7459,
 'eh': 2542,
 'laptop': 4017,
 'got': 3200,
 'no': 4801,
 'stock': 6499,
 'lei': 4078,
 'he': 3365,
 'say': 5938,
 'mon': 4571,
 'muz': 4667,
 'come': 1899,
 'again': 855,
 'take': 6693,
 'look': 4200,
 'not': 4835,
 'urgent': 7223,
 'we': 74

In [17]:
X_train_np = X_train_c.toarray()
X_train_np[0]

array([1, 0, 0, ..., 0, 0, 0], shape=(7762,))

In [18]:
np.where(X_train_np[0]!=0)

(array([   0,  299,  309,  676, 1727, 1822, 2080, 2344, 2853, 4285, 4443,
        4547, 4848, 4899, 4907, 5294, 5933, 6566, 6793, 6943, 6948, 7219,
        7349, 7626, 7710]),)

**Train the naive bayes model**

In [19]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_c, y_train)

In [20]:
X_test_c = V.transform(X_test)

**Evaluate Performance**

In [21]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_c)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       958
           1       0.97      0.94      0.95       157

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [24]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = V.transform(emails)
model.predict(emails_count)

array([0, 1])

**Train the model using sklearn pipeline and reduce number of lines of code**

In [25]:
from sklearn.pipeline import Pipeline

clf_pipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [26]:
clf_pipe.fit(X_train, y_train)

In [28]:
y_pred = clf_pipe.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       958
           1       0.97      0.94      0.95       157

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115

