<h2 align='center'>NLP Tutorial: Text Representation - Bag Of Words (BOW)</h2>

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("/kaggle/input/spam-file/spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [4]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [5]:
df.shape

(5572, 3)

In [6]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


<h3>Train test split</h3>

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)



In [8]:
X_train.shape

(4457,)

In [9]:
X_test.shape

(1115,)

In [10]:
type(X_train)

pandas.core.series.Series

In [11]:
X_train[:4]

4691                                           Ok no prob
365     Good afternoon sunshine! How dawns that day ? ...
2627        Unni thank you dear for the recharge..Rakhesh
2630    Hey there! Glad u r better now. I hear u treat...
Name: Message, dtype: object

In [12]:
type(y_train)

pandas.core.series.Series

In [13]:
y_train[:4]

4691    0
365     0
2627    0
2630    0
Name: spam, dtype: int64

In [14]:
type(X_train.values)

numpy.ndarray

<h3>Create bag of words representation using CountVectorizer</h3>

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7730 sparse matrix of type '<class 'numpy.int64'>'
	with 59184 stored elements in Compressed Sparse Row format>

In [16]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0])

In [17]:
X_train_cv.shape

(4457, 7730)

In [18]:
v.get_feature_names_out()[1771]

'chill'

In [19]:
v.vocabulary_

{'ok': 4923,
 'no': 4799,
 'prob': 5440,
 'good': 3168,
 'afternoon': 838,
 'sunshine': 6587,
 'how': 3505,
 'dawns': 2158,
 'that': 6787,
 'day': 2159,
 'are': 1030,
 'we': 7396,
 'refreshed': 5667,
 'and': 939,
 'happy': 3321,
 'to': 6908,
 'be': 1259,
 'alive': 892,
 'do': 2355,
 'breathe': 1483,
 'in': 3630,
 'the': 6791,
 'air': 865,
 'smile': 6246,
 'think': 6821,
 'of': 4898,
 'you': 7693,
 'my': 4671,
 'love': 4229,
 'as': 1063,
 'always': 910,
 'unni': 7148,
 'thank': 6780,
 'dear': 2170,
 'for': 2918,
 'recharge': 5644,
 'rakhesh': 5573,
 'hey': 3410,
 'there': 6809,
 'glad': 3133,
 'better': 1320,
 'now': 4850,
 'hear': 3365,
 'treated': 7009,
 'urself': 7190,
 'digi': 2299,
 'cam': 1606,
 'is': 3739,
 'it': 3751,
 'off': 4899,
 'at': 1097,
 '9pm': 728,
 'have': 3343,
 'fab': 2717,
 'new': 4761,
 'year': 7666,
 'coupla': 2014,
 'wks': 7544,
 'ugh': 7093,
 'can': 1612,
 'just': 3873,
 'apologize': 994,
 'admit': 808,
 'were': 7443,
 'wrong': 7613,
 'ask': 1073,
 'me': 4412,
 

In [20]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [21]:
np.where(X_train_np[0]!=0)

(array([4799, 4923, 5440]),)

In [22]:
#X_train[:4][1579]

In [23]:
X_train_np[0][1771]

0

<h3>Train the naive bayes model</h3>

In [24]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [25]:
X_test_cv = v.transform(X_test)

<h3>Evaluate Performance</h3>

In [26]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       962
           1       0.98      0.90      0.94       153

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [27]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

<h3>Train the model using sklearn pipeline and reduce number of lines of code</h3>

In [28]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [29]:
clf.fit(X_train, y_train)

In [30]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       962
           1       0.98      0.90      0.94       153

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

