## Let’s create some sample sentences.

In [None]:
# Toy corpus: three short sentences, each treated as one document.
texts = [
    "blue car and blue window",
    "black crow in the window",
    "i see my reflection in the window",
]

## First we need to create the vocabulary

In [9]:
# Union the whitespace-separated tokens of every sentence, then order them
# alphabetically so each word gets a stable position in the vocabulary.
vocab = sorted(set().union(*(sentence.split() for sentence in texts)))
print(len(vocab), vocab)

12 ['and', 'black', 'blue', 'car', 'crow', 'i', 'in', 'my', 'reflection', 'see', 'the', 'window']


## Binary Encoding

In [10]:
import numpy as np
def binary_transform(text, vocabulary=None):
    """Encode ``text`` as a binary bag-of-words vector.

    Parameters
    ----------
    text : str
        Document to encode. It is tokenized by splitting on whitespace.
    vocabulary : sequence of str, optional
        Ordered vocabulary that defines the vector positions. When omitted,
        the notebook-level ``vocab`` is used.

    Returns
    -------
    numpy.ndarray
        Float vector with one entry per vocabulary word: 1.0 if the word
        occurs in ``text``, 0.0 otherwise.
    """
    if vocabulary is None:
        # Looked up at call time so the function follows the current `vocab`.
        vocabulary = vocab
    # A set makes each membership test O(1) and ignores repeated words.
    words = set(text.split())
    return np.array([term in words for term in vocabulary], dtype=float)

# "saw" is not in the vocabulary, so only the slots for "crow" and "i" are set.
sample_vector = binary_transform("i saw crow")
print(sample_vector)

[0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0.]


## Transform a collection of documents into the feature matrix.

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# binary=True records only presence/absence of a term, not its count.
vec = CountVectorizer(binary=True).fit(texts)
# Note: the default token pattern keeps only tokens of two or more characters,
# so the single-letter word "i" does not appear in the learned vocabulary.
print(sorted(vec.vocabulary_))

['and', 'black', 'blue', 'car', 'crow', 'in', 'my', 'reflection', 'see', 'the', 'window']


## Each column corresponds to a word in the vocabulary, and each row represents a document.

In [12]:
import pandas as pd

# One row per document, one column per vocabulary term (1 = term present).
pd.DataFrame(
    vec.transform(texts).toarray(),
    # Label columns by each term's actual column index in the matrix rather
    # than alphabetically, so the labels stay correct even if the vectorizer
    # is given a custom (non-sorted) vocabulary.
    columns=sorted(vec.vocabulary_, key=vec.vocabulary_.get),
)

Unnamed: 0,and,black,blue,car,crow,in,my,reflection,see,the,window
0,1,0,1,1,0,0,0,0,0,0,1
1,0,1,0,0,1,1,0,0,0,1,1
2,0,0,0,0,0,1,1,1,1,1,1


## Count Encoding: count how many times each word appears in each document.

In [13]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# binary=False is the default, so the argument could be omitted; it is spelled
# out here to contrast with the binary encoding above.
vec = CountVectorizer(binary=False)
vec.fit(texts)

# One row per document, one column per term; each cell is the number of times
# the term occurs in that document.
pd.DataFrame(
    vec.transform(texts).toarray(),
    # Label columns by each term's actual column index in the matrix rather
    # than alphabetically, so the labels stay correct even if the vectorizer
    # is given a custom (non-sorted) vocabulary.
    columns=sorted(vec.vocabulary_, key=vec.vocabulary_.get),
)

Unnamed: 0,and,black,blue,car,crow,in,my,reflection,see,the,window
0,1,0,2,1,0,0,0,0,0,0,1
1,0,1,0,0,1,1,0,0,0,1,1
2,0,0,0,0,0,1,1,1,1,1,1


## TF-IDF

In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Default settings: term counts are re-weighted by inverse document frequency,
# so words shared by every document (e.g. "window") get the lowest weights.
vec = TfidfVectorizer()
vec.fit(texts)

# One row per document, one column per term; each cell is the TF-IDF weight.
pd.DataFrame(
    vec.transform(texts).toarray(),
    # Label columns by each term's actual column index in the matrix rather
    # than alphabetically, so the labels stay correct even if the vectorizer
    # is given a custom (non-sorted) vocabulary.
    columns=sorted(vec.vocabulary_, key=vec.vocabulary_.get),
)

Unnamed: 0,and,black,blue,car,crow,in,my,reflection,see,the,window
0,0.396875,0.0,0.793749,0.396875,0.0,0.0,0.0,0.0,0.0,0.0,0.2344
1,0.0,0.534093,0.0,0.0,0.534093,0.406192,0.0,0.0,0.0,0.406192,0.315444
2,0.0,0.0,0.0,0.0,0.0,0.358291,0.47111,0.47111,0.47111,0.358291,0.278245
