# Word counts with bag-of-words
- Bag of Words (BoW) is a simple and powerful technique for converting text into numerical features that can be used in machine learning algorithms.

![q1.png](q1.png)

In [2]:
# Toy two-sentence input used for the first bag-of-words example
text = "The cat is in the box. The cat box."

In [3]:
from nltk.tokenize import word_tokenize

In [4]:
# Tokenization: lowercase the text first so that "The" and "the" become the
# same token, then split it into word and punctuation tokens with NLTK
tokens = word_tokenize(text.lower())
tokens

['the', 'cat', 'is', 'in', 'the', 'box', '.', 'the', 'cat', 'box', '.']

In [5]:
# Vocabulary = the distinct tokens, deduplicated and put in sorted order
vocab = list(set(tokens))
vocab.sort()
vocab

['.', 'box', 'cat', 'in', 'is', 'the']

In [6]:
from collections import Counter

In [7]:
# Frequency table: start with an empty Counter and tally every token
word_freq = Counter()
word_freq.update(tokens)
word_freq

Counter({'the': 3, 'cat': 2, 'box': 2, '.': 2, 'is': 1, 'in': 1})

#### Building a Counter with bag-of-words

![q2.png](q2.png)

In [8]:
# Sample paragraph about artificial intelligence used as the input article
# (the triple-quoted string starts and ends with a newline)
article = """
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of "intelligent agents": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals. Colloquially, the term "artificial intelligence" is often used to describe machines (or computers) that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem-solving".
"""

In [9]:
from collections import Counter
from nltk.tokenize import word_tokenize

In [10]:
# Tokenize the article: tokens
# (original casing is kept here; punctuation comes out as separate tokens)
tokens = word_tokenize(article)
tokens

['Artificial',
 'intelligence',
 '(',
 'AI',
 ')',
 'is',
 'intelligence',
 'demonstrated',
 'by',
 'machines',
 ',',
 'in',
 'contrast',
 'to',
 'the',
 'natural',
 'intelligence',
 'displayed',
 'by',
 'humans',
 'and',
 'animals',
 '.',
 'Leading',
 'AI',
 'textbooks',
 'define',
 'the',
 'field',
 'as',
 'the',
 'study',
 'of',
 '``',
 'intelligent',
 'agents',
 "''",
 ':',
 'any',
 'device',
 'that',
 'perceives',
 'its',
 'environment',
 'and',
 'takes',
 'actions',
 'that',
 'maximize',
 'its',
 'chance',
 'of',
 'successfully',
 'achieving',
 'its',
 'goals',
 '.',
 'Colloquially',
 ',',
 'the',
 'term',
 '``',
 'artificial',
 'intelligence',
 "''",
 'is',
 'often',
 'used',
 'to',
 'describe',
 'machines',
 '(',
 'or',
 'computers',
 ')',
 'that',
 'mimic',
 '``',
 'cognitive',
 "''",
 'functions',
 'that',
 'humans',
 'associate',
 'with',
 'the',
 'human',
 'mind',
 ',',
 'such',
 'as',
 '``',
 'learning',
 "''",
 'and',
 '``',
 'problem-solving',
 "''",
 '.']

In [11]:
# Normalize case by lowercasing every token: lower_tokens
lower_tokens = list(map(str.lower, tokens))
lower_tokens

['artificial',
 'intelligence',
 '(',
 'ai',
 ')',
 'is',
 'intelligence',
 'demonstrated',
 'by',
 'machines',
 ',',
 'in',
 'contrast',
 'to',
 'the',
 'natural',
 'intelligence',
 'displayed',
 'by',
 'humans',
 'and',
 'animals',
 '.',
 'leading',
 'ai',
 'textbooks',
 'define',
 'the',
 'field',
 'as',
 'the',
 'study',
 'of',
 '``',
 'intelligent',
 'agents',
 "''",
 ':',
 'any',
 'device',
 'that',
 'perceives',
 'its',
 'environment',
 'and',
 'takes',
 'actions',
 'that',
 'maximize',
 'its',
 'chance',
 'of',
 'successfully',
 'achieving',
 'its',
 'goals',
 '.',
 'colloquially',
 ',',
 'the',
 'term',
 '``',
 'artificial',
 'intelligence',
 "''",
 'is',
 'often',
 'used',
 'to',
 'describe',
 'machines',
 '(',
 'or',
 'computers',
 ')',
 'that',
 'mimic',
 '``',
 'cognitive',
 "''",
 'functions',
 'that',
 'humans',
 'associate',
 'with',
 'the',
 'human',
 'mind',
 ',',
 'such',
 'as',
 '``',
 'learning',
 "''",
 'and',
 '``',
 'problem-solving',
 "''",
 '.']

In [12]:
# Create a Counter with the lowercase tokens: bow_simple
# (maps each distinct token, punctuation included, to its number of occurrences)
bow_simple = Counter(lower_tokens)
bow_simple

Counter({'the': 5,
         '``': 5,
         "''": 5,
         'intelligence': 4,
         'that': 4,
         ',': 3,
         'and': 3,
         '.': 3,
         'its': 3,
         'artificial': 2,
         '(': 2,
         'ai': 2,
         ')': 2,
         'is': 2,
         'by': 2,
         'machines': 2,
         'to': 2,
         'humans': 2,
         'as': 2,
         'of': 2,
         'demonstrated': 1,
         'in': 1,
         'contrast': 1,
         'natural': 1,
         'displayed': 1,
         'animals': 1,
         'leading': 1,
         'textbooks': 1,
         'define': 1,
         'field': 1,
         'study': 1,
         'intelligent': 1,
         'agents': 1,
         ':': 1,
         'any': 1,
         'device': 1,
         'perceives': 1,
         'environment': 1,
         'takes': 1,
         'actions': 1,
         'maximize': 1,
         'chance': 1,
         'successfully': 1,
         'achieving': 1,
         'goals': 1,
         'colloquially': 1,
       

In [13]:
# Print the 10 most common tokens as (token, count) pairs, highest count first
print(bow_simple.most_common(10))

[('the', 5), ('``', 5), ("''", 5), ('intelligence', 4), ('that', 4), (',', 3), ('and', 3), ('.', 3), ('its', 3), ('artificial', 2)]


# Simple text preprocessing

![q4.png](q4.png)

In [23]:
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [24]:
# Sample text for the preprocessing walkthrough, split into tokens with NLTK
article = "This is an example text for testing bag of words. The cat is in the box. The cat box."
tokens = word_tokenize(article)
tokens

['This',
 'is',
 'an',
 'example',
 'text',
 'for',
 'testing',
 'bag',
 'of',
 'words',
 '.',
 'The',
 'cat',
 'is',
 'in',
 'the',
 'box',
 '.',
 'The',
 'cat',
 'box',
 '.']

In [25]:
# Lowercase every token so that counts are case-insensitive
lower_tokens = [str.lower(token) for token in tokens]
lower_tokens

['this',
 'is',
 'an',
 'example',
 'text',
 'for',
 'testing',
 'bag',
 'of',
 'words',
 '.',
 'the',
 'cat',
 'is',
 'in',
 'the',
 'box',
 '.',
 'the',
 'cat',
 'box',
 '.']

In [26]:
# Keep only purely alphabetic tokens (drops punctuation such as ".")
alpha_only = list(filter(str.isalpha, lower_tokens))
alpha_only

['this',
 'is',
 'an',
 'example',
 'text',
 'for',
 'testing',
 'bag',
 'of',
 'words',
 'the',
 'cat',
 'is',
 'in',
 'the',
 'box',
 'the',
 'cat',
 'box']

In [27]:
# Drop stop words: keep only tokens that are absent from NLTK's English list.
# The list is turned into a set so each membership check is a hash lookup.
english_stops = set(stopwords.words('english'))
no_stops = list(filter(lambda token: token not in english_stops, alpha_only))
no_stops

['example', 'text', 'testing', 'bag', 'words', 'cat', 'box', 'cat', 'box']

In [28]:
# Initialize the WordNetLemmatizer (one instance, reused for every token below)
wordnet_lemmatizer = WordNetLemmatizer()

In [31]:
# Reduce each remaining token to its lemma using the lemmatizer's defaults
lemmatized = list(map(wordnet_lemmatizer.lemmatize, no_stops))
lemmatized

['example', 'text', 'testing', 'bag', 'word', 'cat', 'box', 'cat', 'box']

In [32]:
# Bag-of-words for the cleaned text: tally the lemmatized tokens
bow = Counter()
bow.update(lemmatized)
bow

Counter({'cat': 2,
         'box': 2,
         'example': 1,
         'text': 1,
         'testing': 1,
         'bag': 1,
         'word': 1})

In [33]:
# Print up to 10 of the most common tokens as (token, count) pairs
# (fewer are shown when the bag holds fewer than 10 distinct tokens)
print(bow.most_common(10))

[('cat', 2), ('box', 2), ('example', 1), ('text', 1), ('testing', 1), ('bag', 1), ('word', 1)]


# Introduction to gensim
- Gensim is an open-source library designed for natural language processing (NLP), particularly focused on unsupervised topic modeling and natural language understanding.

![q5.png](q5.png)

In [36]:
# Import Dictionary
from gensim.corpora.dictionary import Dictionary

In [39]:
# Five pre-tokenized documents. Each one is written as a space-separated
# string and split on whitespace, giving one list of word tokens per document.
articles = [
    "computer science study of computers and computational systems".split(),
    "artificial intelligence branch of computer science concerned with making computers behave like humans".split(),
    "machine learning subset of artificial intelligence focused on algorithms that allow computers to learn from data".split(),
    "neural networks computing systems inspired by the biological neural networks that constitute animal brains".split(),
    "data science interdisciplinary field that uses scientific methods processes algorithms and systems to extract knowledge and insights from data".split(),
]

In [42]:
# Create a Dictionary from the articles: dictionary
# (gensim assigns an integer id to each token; see dictionary.token2id)
dictionary = Dictionary(articles)

In [44]:
# Select the id for "computer": computer_id
# (token2id maps token -> integer id; .get is used for the lookup, so a token
# that is not in the mapping would give None rather than raise)
computer_id = dictionary.token2id.get("computer")
computer_id

2

In [45]:
# Use computer_id with the dictionary to print the word
# (reverse lookup: integer id -> token)
print(dictionary.get(computer_id))

computer


In [46]:
# Create the bag-of-words corpus: corpus
# This is a plain Python list (not a gensim MmCorpus) holding one doc2bow
# vector per article; each vector is a list of (token_id, count) pairs.
corpus = [dictionary.doc2bow(article) for article in articles]
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1)],
 [(3, 1),
  (4, 1),
  (8, 1),
  (13, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1)],
 [(7, 1),
  (27, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 2),
  (37, 2),
  (38, 1)],
 [(0, 2),
  (5, 1),
  (7, 1),
  (17, 1),
  (19, 2),
  (21, 1),
  (27, 1),
  (28, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1)]]

In [47]:
# Print the first 10 (word id, frequency count) pairs from the fifth document
# (index 4, since the corpus list is zero-indexed)
print(corpus[4][:10])

[(0, 2), (5, 1), (7, 1), (17, 1), (19, 2), (21, 1), (27, 1), (28, 1), (39, 1), (40, 1)]


#### Gensim bag-of-words

![q6.png](q6.png)

In [48]:
import itertools
from collections import defaultdict

In [49]:
# Save the fifth document (index 4 of the corpus): doc
doc = corpus[4]

In [50]:
# Order the document's (word_id, count) pairs by count, largest first: bow_doc
# A copy is sorted so that `doc` itself is left untouched.
bow_doc = list(doc)
bow_doc.sort(key=lambda pair: pair[1], reverse=True)

In [51]:
# Show the five highest-count words of the document with their counts,
# translating each word id back to its token through the dictionary
for word_id, word_count in itertools.islice(bow_doc, 5):
    print(dictionary.get(word_id), word_count)

and 2
data 2
science 1
systems 1
algorithms 1


In [53]:
# Accumulate corpus-wide totals per word id: total_word_count
# The generator flattens the corpus into one stream of (word_id, count) pairs.
total_word_count = defaultdict(int)
for word_id, word_count in (pair for bow_vector in corpus for pair in bow_vector):
    total_word_count[word_id] += word_count
total_word_count

defaultdict(int,
            {0: 3,
             1: 1,
             2: 2,
             3: 3,
             4: 3,
             5: 3,
             6: 1,
             7: 3,
             8: 2,
             9: 1,
             10: 1,
             11: 1,
             12: 1,
             13: 2,
             14: 1,
             15: 1,
             16: 1,
             17: 2,
             18: 1,
             19: 3,
             20: 1,
             21: 2,
             22: 1,
             23: 1,
             24: 1,
             25: 1,
             26: 1,
             27: 3,
             28: 2,
             29: 1,
             30: 1,
             31: 1,
             32: 1,
             33: 1,
             34: 1,
             35: 1,
             36: 2,
             37: 2,
             38: 1,
             39: 1,
             40: 1,
             41: 1,
             42: 1,
             43: 1,
             44: 1,
             45: 1,
             46: 1,
             47: 1})

![q7.png](q7.png)

In [54]:
import itertools
from collections import defaultdict

In [55]:

# Save the fifth document (index 4 of the corpus): doc
doc = corpus[4]

In [57]:
# Rank the document's (word_id, count) pairs from most to least frequent: bow_doc
# Sorting ascending on the negated count gives descending counts and keeps
# equal-count pairs in their original order.
bow_doc = sorted(doc, key=lambda pair: -pair[1])

In [58]:
# Print the document's five most frequent words next to their counts
for word_id, word_count in bow_doc[:5]:
    print(f"{dictionary.get(word_id)} {word_count}")

and 2
data 2
science 1
systems 1
algorithms 1


In [60]:
# Sum the counts of every word id over all documents: total_word_count
total_word_count = defaultdict(int)
for word_id, word_count in itertools.chain(*corpus):
    total_word_count[word_id] = total_word_count[word_id] + word_count

In [61]:
# List the (word_id, total_count) pairs, highest total first: sorted_word_count
sorted_word_count = list(total_word_count.items())
sorted_word_count.sort(key=lambda item: item[1], reverse=True)

In [62]:
# Show the five words with the highest totals across all documents,
# translating each word id back to its token through the dictionary
for word_id, word_count in itertools.islice(sorted_word_count, 5):
    print(f"{dictionary.get(word_id)} {word_count}")

and 3
computers 3
of 3
science 3
systems 3


# Tf-idf with gensim