In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.tokenize import word_tokenize

In [2]:
corpus = [
    "The quick brown fox jumps over the lazy dog",
    "I love natural language processing and deep learning",
    "Word embeddings capture semantic relationships between words",
    "The fox is quick and the dog is lazy",
    "Deep learning models learn representations automatically",
    "I love to learn about NLP and machine learning",
    "The king and the queen shared their kingdom fairly",
    "A man and a woman walked into the room",
    "The queen loves her people and the king protects them",
    "Natural language tasks include translation and summarization"
]

In [5]:
vocab_list = []
for sentence in corpus:
    for word in sentence.lower().split():
        if word not in vocab_list:
            vocab_list.append(word)


In [12]:
vocabulary = set(vocab_list)

In [13]:
vocabulary

{'a',
 'about',
 'and',
 'automatically',
 'between',
 'brown',
 'capture',
 'deep',
 'dog',
 'embeddings',
 'fairly',
 'fox',
 'her',
 'i',
 'include',
 'into',
 'is',
 'jumps',
 'king',
 'kingdom',
 'language',
 'lazy',
 'learn',
 'learning',
 'love',
 'loves',
 'machine',
 'man',
 'models',
 'natural',
 'nlp',
 'over',
 'people',
 'processing',
 'protects',
 'queen',
 'quick',
 'relationships',
 'representations',
 'room',
 'semantic',
 'shared',
 'summarization',
 'tasks',
 'the',
 'their',
 'them',
 'to',
 'translation',
 'walked',
 'woman',
 'word',
 'words'}

In [14]:
word_index = {}
for idx, word in enumerate(sorted(vocabulary)):
    word_index[word] = idx

In [15]:
word_index

{'a': 0,
 'about': 1,
 'and': 2,
 'automatically': 3,
 'between': 4,
 'brown': 5,
 'capture': 6,
 'deep': 7,
 'dog': 8,
 'embeddings': 9,
 'fairly': 10,
 'fox': 11,
 'her': 12,
 'i': 13,
 'include': 14,
 'into': 15,
 'is': 16,
 'jumps': 17,
 'king': 18,
 'kingdom': 19,
 'language': 20,
 'lazy': 21,
 'learn': 22,
 'learning': 23,
 'love': 24,
 'loves': 25,
 'machine': 26,
 'man': 27,
 'models': 28,
 'natural': 29,
 'nlp': 30,
 'over': 31,
 'people': 32,
 'processing': 33,
 'protects': 34,
 'queen': 35,
 'quick': 36,
 'relationships': 37,
 'representations': 38,
 'room': 39,
 'semantic': 40,
 'shared': 41,
 'summarization': 42,
 'tasks': 43,
 'the': 44,
 'their': 45,
 'them': 46,
 'to': 47,
 'translation': 48,
 'walked': 49,
 'woman': 50,
 'word': 51,
 'words': 52}

In [16]:
bow_matrix = np.zeros((len(corpus), len(vocabulary)), dtype=int)

In [17]:
# Fill the matrix
for i, sentence in enumerate(corpus):
    words = sentence.lower().split()
    for word in words:
        if word in word_index:  # safety check
            bow_matrix[i, word_index[word]] += 1

In [19]:
bow_df = pd.DataFrame(bow_matrix, columns=sorted(vocabulary))
bow_df

Unnamed: 0,a,about,and,automatically,between,brown,capture,deep,dog,embeddings,...,tasks,the,their,them,to,translation,walked,woman,word,words
0,0,0,0,0,0,1,0,0,1,0,...,0,2,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,1
3,0,0,1,0,0,0,0,0,1,0,...,0,2,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6,0,0,1,0,0,0,0,0,0,0,...,0,2,1,0,0,0,0,0,0,0
7,2,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,1,0,0
8,0,0,1,0,0,0,0,0,0,0,...,0,2,0,1,0,0,0,0,0,0
9,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0


In [23]:
def get_bow_representation(corpus, frequency = True):
    vocabulary = set([x for x in " ".join(corpus).lower().split(" ")])

    bow_rep = []
    for sentence in corpus:
        sentence_rep = dict([(v,0) for v in vocabulary])
        for word in word_tokenize(sentence.lower()):
            if frequency:
                sentence_rep[word] += 1
            else:
                sentence_rep[word] = 1
        bow_rep.append(sentence_rep)
    return bow_rep

In [25]:
bow_representation = get_bow_representation(corpus, True)
df = pd.DataFrame(bow_representation)
df.index = corpus
display(df.head())

Unnamed: 0,them,protects,fairly,embeddings,shared,is,love,relationships,semantic,quick,...,into,fox,translation,language,to,people,woman,kingdom,dog,room
The quick brown fox jumps over the lazy dog,0,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
I love natural language processing and deep learning,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
Word embeddings capture semantic relationships between words,0,0,0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
The fox is quick and the dog is lazy,0,0,0,0,0,2,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
Deep learning models learn representations automatically,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
for x in " ".join(corpus).lower().split(" "):
    print(x,end=" ")

the quick brown fox jumps over the lazy dog i love natural language processing and deep learning word embeddings capture semantic relationships between words the fox is quick and the dog is lazy deep learning models learn representations automatically i love to learn about nlp and machine learning the king and the queen shared their kingdom fairly a man and a woman walked into the room the queen loves her people and the king protects them natural language tasks include translation and summarization 