### Bag of words model

In [1]:
# load all necessary libraries
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Show up to 100 characters per cell when printing frames, so long text is not cut short.
# The full option key is used; abbreviated keys only work while they stay unambiguous.
pd.set_option('display.max_colwidth', 100)

#### Let's build a basic bag of words model on three sample documents

In [12]:
# toy corpus: three short sentences, one string per document
documents = [
    "Gangs of Wasseypur is a great movie.",
    "The success of a movie depends on the performance of the actors.",
    "There are no new movies releasing this week.",
]
print(documents)

['Gangs of Wasseypur is a great movie.', 'The success of a movie depends on the performance of the actors.', 'There are no new movies releasing this week.']


In [13]:
import nltk

# Fetch the NLTK resources used below (each call is a no-op when the package is up to date).
nltk.download('punkt')
# Recent NLTK releases load the tokenizer models from 'punkt_tab' instead of 'punkt';
# downloading both keeps word_tokenize working on a fresh machine with either version.
nltk.download('punkt_tab')
nltk.download('stopwords')

def preprocess(document):
    """Lower-case a document and remove English stop words.

    The text is lower-cased, split into tokens with NLTK's ``word_tokenize``,
    filtered against NLTK's English stop-word list and re-joined with single
    spaces. Punctuation tokens are not stop words, so they are kept.

    Parameters
    ----------
    document : str
        Raw text of one document.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # Build the stop-word collection once per call instead of once per token,
    # and use a set so each membership test is a hash lookup rather than a list scan.
    stop_words = set(stopwords.words("english"))

    # lower-case first so tokens match the lower-case stop-word list
    tokens = word_tokenize(document.lower())

    kept = [token for token in tokens if token not in stop_words]

    return " ".join(kept)

# run every sample sentence through preprocess, keeping the original order
documents = list(map(preprocess, documents))
print(documents)


['gangs wasseypur great movie .', 'success movie depends performance actors .', 'new movies releasing week .']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Creating a bag-of-words model using scikit-learn's `CountVectorizer`

In [14]:
# learn the vocabulary from the preprocessed sentences and count term occurrences;
# the result is a sparse document-term matrix (one row per document)
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(documents)
print(bow_model)  # prints the (row, column) coordinates of the stored entries together with their counts

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 13 stored elements and shape (3, 12)>
  Coords	Values
  (0, 2)	1
  (0, 10)	1
  (0, 3)	1
  (0, 4)	1
  (1, 4)	1
  (1, 9)	1
  (1, 1)	1
  (1, 7)	1
  (1, 0)	1
  (2, 6)	1
  (2, 5)	1
  (2, 8)	1
  (2, 11)	1


In [33]:
# convert the sparse matrix to a dense array and print it
print(bow_model.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [34]:
# Total of all counts in the bag of words model.
# The sparse matrix is summed directly; converting it to a dense array first is unnecessary.
bow_sum = bow_model.sum()
print("Sum of all values in the bag of words model:", bow_sum)

Sum of all values in the bag of words model: 934


In [None]:
# dimensions of the document-term matrix: (rows, columns)
print(bow_model.shape)
# vocabulary learned by the vectorizer
print(vectorizer.get_feature_names_out())

(3, 12)
['actors' 'depends' 'gangs' 'great' 'movie' 'movies' 'new' 'performance'
 'releasing' 'success' 'wasseypur' 'week']


### Let's create a bag of words model on the spam dataset.

In [17]:
# read the tab-separated SMS file, supplying the two column names explicitly
spam = pd.read_csv(
    "SMSSpamCollection.txt",
    sep="\t",
    names=["label", "message"],
)
spam.shape

(5572, 2)

##### Let's take a subset of the data (the first 100 rows only) and create a bag-of-words model on it.

In [19]:
# keep only the first 100 rows (all columns)
spam = spam.iloc[:100]
print(spam)

   label  \
0    ham   
1    ham   
2   spam   
3    ham   
4    ham   
..   ...   
95  spam   
96   ham   
97   ham   
98   ham   
99   ham   

                                                                                                message  
0   Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...  
1                                                                         Ok lar... Joking wif u oni...  
2   Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...  
3                                                     U dun say so early hor... U c already then say...  
4                                         Nah I don't think he goes to usf, he lives around here though  
..                                                                                                  ...  
95  Your free ringtone is waiting to be collected. Simply text the password "MIX" to 85069 to verify...  
96     

In [20]:
# pull the text column out of the dataframe
messages = spam["message"]
print(messages)

0     Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...
1                                                                           Ok lar... Joking wif u oni...
2     Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3                                                       U dun say so early hor... U c already then say...
4                                           Nah I don't think he goes to usf, he lives around here though
                                                     ...                                                 
95    Your free ringtone is waiting to be collected. Simply text the password "MIX" to 85069 to verify...
96                                                                      Watching telugu movie..wat abt u?
97                                                    i see. When we finish we have loads of loans to pay
98    Hi. Wk been ok - on hols now! Yes on for

In [21]:
# turn the messages into a plain Python list
messages = list(messages)
print(messages)

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'Ok lar... Joking wif u oni...', "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'U dun say so early hor... U c already then say...', "Nah I don't think he goes to usf, he lives around here though", "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv", 'Even my brother is not like to speak with me. They treat me like aids patent.', "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune", 'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.', 'Had your mobile 

In [22]:
# lower-case and strip stop words from every message via preprocess
messages = list(map(preprocess, messages))
print(messages)

['go jurong point , crazy .. available bugis n great world la e buffet ... cine got amore wat ...', 'ok lar ... joking wif u oni ...', "free entry 2 wkly comp win fa cup final tkts 21st may 2005. text fa 87121 receive entry question ( std txt rate ) & c 's apply 08452810075over18 's", 'u dun say early hor ... u c already say ...', "nah n't think goes usf , lives around though", "freemsg hey darling 's 3 week 's word back ! 'd like fun still ? tb ok ! xxx std chgs send , £1.50 rcv", 'even brother like speak . treat like aids patent .', "per request 'melle melle ( oru minnaminunginte nurungu vettam ) ' set callertune callers . press * 9 copy friends callertune", 'winner ! ! valued network customer selected receivea £900 prize reward ! claim call 09061701461. claim code kl341 . valid 12 hours .', 'mobile 11 months ? u r entitled update latest colour mobiles camera free ! call mobile update co free 08002986030', "'m gon na home soon n't want talk stuff anymore tonight , k ? 've cried enoug

In [23]:
# bag of words model for the SMS messages
# NOTE: this rebinds `vectorizer` and `bow_model`, which an earlier cell used for the
# three-sentence example; any cell run after this one sees the SMS model instead.
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(messages)

In [31]:
# view the counts as a dataframe: one row per message, one column per vocabulary term
feature_names = vectorizer.get_feature_names_out()
df = pd.DataFrame(data=bow_model.toarray(), columns=feature_names)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Columns: 640 entries, 000 to ú1
dtypes: int64(640)
memory usage: 500.1 KB


In [32]:
# summarise the vocabulary itself (number of terms, dtype, memory use)
pd.Series(feature_names).info()

<class 'pandas.core.series.Series'>
RangeIndex: 640 entries, 0 to 639
Series name: None
Non-Null Count  Dtype 
--------------  ----- 
640 non-null    object
dtypes: object(1)
memory usage: 5.1+ KB


In [11]:
# print the vocabulary held by the current vectorizer
print(vectorizer.get_feature_names_out())

['actors' 'depends' 'gangs' 'great' 'movie' 'movies' 'new' 'performance'
 'releasing' 'success' 'wasseypur' 'week']


* Many tokens are variants of the same word, such as 'win' and 'winner'; 'reply' and 'replying'; 'want' and 'wanted', etc. The bag-of-words model counts each variant as a separate feature.

In [25]:
# dimensions of the SMS document-term matrix: (rows, columns)
bow_model.shape

(100, 640)