In [2]:
from nltk import FreqDist
import numpy as np
import re

In [24]:
def buildDict(docs):
    """Tokenize documents and build a frequency-ranked vocabulary.

    Each document is lower-cased and split on runs of whitespace, commas
    and periods; empty fragments produced by the split are discarded.

    Args:
        docs: Iterable of document strings. May be empty.

    Returns:
        A 4-tuple ``(doc_tokens, vocab, word_to_id, id_to_word)``:

        * ``doc_tokens`` -- list of token lists, one per document, in
          input order.
        * ``vocab`` -- list of ``(word, count)`` pairs sorted by descending
          count; words with equal counts keep first-seen order.
        * ``word_to_id`` -- dict mapping each word to its rank in ``vocab``.
        * ``id_to_word`` -- dict mapping each rank back to its word.
    """
    # Imported locally so this cell does not depend on a later import cell.
    # Only Counter behaviour (counting + most_common) is needed here.
    from collections import Counter

    # Loop-invariant: compile the delimiter pattern once, not per document.
    delim = re.compile(r'[\s,.]+')

    doc_tokens = []
    for doc in docs:
        # re.split yields '' at the start and/or end when the text begins or
        # ends with a delimiter; filter those out on both sides.
        tokens = [tok for tok in delim.split(doc.lower()) if tok]
        doc_tokens.append(tokens)

    # Count over the flattened token stream with plain Python strings.
    # (np.hstack fails on an empty corpus and yields numpy.str_ keys.)
    counts = Counter(tok for tokens in doc_tokens for tok in tokens)
    vocab = counts.most_common()

    # Avoid the name `id`, which would shadow the builtin.
    word_to_id = {word: idx for idx, (word, _) in enumerate(vocab)}
    id_to_word = {idx: word for idx, (word, _) in enumerate(vocab)}

    return doc_tokens, vocab, word_to_id, id_to_word

In [25]:
# Toy corpus: four short documents used by the hand-rolled TF-IDF cells.
docs = [
    'To do is to be. To be is to do.',
    'To be or not to be, I am what I am.',
    'I think therefore I am. Do be do be do.',
    'Do do do, da da da, Let it be, let it be',
]

# Tokenize the corpus and build the vocabulary plus both lookup tables.
doc_tokens, vocab, word_to_id, id_to_word = buildDict(docs)

In [12]:
from collections import Counter
import math


In [13]:
# Log-scaled term frequency: one dense vector per document, indexed by word
# id, holding 1 + log2(count) for words that occur and 0.0 for the rest.
vocab_size = len(word_to_id)
tf_vectors = []
for tokens in doc_tokens:
    weights = [0.0] * vocab_size
    for term, count in Counter(tokens).items():
        weights[word_to_id[term]] = 1 + math.log2(count)
    tf_vectors.append(weights)
    

In [14]:
import pandas as pd
# One row per document, one column per vocabulary word.
column_names = list(id_to_word.values())
df = pd.DataFrame(data=tf_vectors, columns=column_names)
df

Unnamed: 0,do,be,to,i,am,da,is,let,it,or,not,what,think,therefore
0,2.0,2.0,3.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
2,2.584963,2.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,2.584963,2.0,0.0,0.0,0.0,2.584963,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Document frequency: for every word id, count the documents whose TF weight
# for that word is non-zero. Counts are stored as floats. At this point `idf`
# holds raw counts; the following cell converts them to IDF values.
# A comprehension is used so no loop variable (previously `id`) is left bound
# at notebook scope, where it would shadow the builtin for later cells.
idf = {
    word_id: float(sum(1 for tf_vec in tf_vectors if tf_vec[word_id] > 0))
    for word_id in id_to_word
}

In [16]:
# Inverse document frequency: log2(N / document_count) per word id, where N
# is the number of documents.
# NOTE: this cell overwrites its own input (`idf`), so it must be run exactly
# once after the counting cell; re-running it feeds IDF values back in.
N = len(tf_vectors)
idf = {
    word_id: math.log2(N / doc_count)
    for word_id, doc_count in idf.items()
}

In [17]:
# Show the IDF weight for each word id.
idf_ids = list(idf.keys())
idf_weights = list(idf.values())
df = pd.Series(data=idf_weights, index=idf_ids)
print(df)

0     0.415037
1     0.000000
2     1.000000
3     1.000000
4     1.000000
5     2.000000
6     2.000000
7     2.000000
8     2.000000
9     2.000000
10    2.000000
11    2.000000
12    2.000000
13    2.000000
dtype: float64


In [18]:
import numpy as np

# TF-IDF: scale every document's TF vector element-wise by the IDF weights.
# `idf` is keyed by word id in insertion order, so its values line up with
# the TF vector positions.
idf_list = list(idf.values())
tfidf = np.array([np.multiply(tf_vec, idf_list) for tf_vec in tf_vectors])

In [19]:
# Print the TF-IDF matrix twice: documents as rows, then words as rows.
df = pd.DataFrame(data=tfidf, columns=list(id_to_word.values()))
for view in (df, df.T):
    print(view)

         do   be   to    i   am        da   is  let   it   or  not  what  \
0  0.830075  0.0  3.0  0.0  0.0  0.000000  4.0  0.0  0.0  0.0  0.0   0.0   
1  0.000000  0.0  2.0  2.0  2.0  0.000000  0.0  0.0  0.0  2.0  2.0   2.0   
2  1.072856  0.0  0.0  2.0  1.0  0.000000  0.0  0.0  0.0  0.0  0.0   0.0   
3  1.072856  0.0  0.0  0.0  0.0  5.169925  0.0  4.0  4.0  0.0  0.0   0.0   

   think  therefore  
0    0.0        0.0  
1    0.0        0.0  
2    2.0        2.0  
3    0.0        0.0  
                  0    1         2         3
do         0.830075  0.0  1.072856  1.072856
be         0.000000  0.0  0.000000  0.000000
to         3.000000  2.0  0.000000  0.000000
i          0.000000  2.0  2.000000  0.000000
am         0.000000  2.0  1.000000  0.000000
da         0.000000  0.0  0.000000  5.169925
is         4.000000  0.0  0.000000  0.000000
let        0.000000  0.0  0.000000  4.000000
it         0.000000  0.0  0.000000  4.000000
or         0.000000  2.0  0.000000  0.000000
not        0.0

In [26]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Fit a label encoder on the vocabulary words (frequency order) and show the
# integer code assigned to each word, followed by the encoder's class list.
vocab_words = [entry[0] for entry in vocab]
encoder = LabelEncoder()
labels = encoder.fit_transform(vocab_words)
for code in labels:
    print('[{:2d} : {}]'.format(code, encoder.classes_[code]))
print(encoder.classes_)

[ 3 : do]
[ 1 : be]
[12 : to]
[ 4 : i]
[ 0 : am]
[ 2 : da]
[ 5 : is]
[ 7 : let]
[ 6 : it]
[ 9 : or]
[ 8 : not]
[13 : what]
[11 : think]
[10 : therefore]
['am' 'be' 'da' 'do' 'i' 'is' 'it' 'let' 'not' 'or' 'therefore' 'think'
 'to' 'what']


In [28]:
# Encode every tokenized document as an array of integer labels, then decode
# each one again to confirm the round trip reproduces the tokens.
encode_data = []
for tokens in doc_tokens:
    encode_data.append(encoder.transform(tokens))
print(encode_data)
for encoded in encode_data:
    decoded = encoder.inverse_transform(encoded)
    print(decoded)

[array([12,  3,  5, 12,  1, 12,  1,  5, 12,  3]), array([12,  1,  9,  8, 12,  1,  4,  0, 13,  4,  0]), array([ 4, 11, 10,  4,  0,  3,  1,  3,  1,  3]), array([3, 3, 3, 2, 2, 2, 7, 6, 1, 7, 6, 1])]
['to' 'do' 'is' 'to' 'be' 'to' 'be' 'is' 'to' 'do']
['to' 'be' 'or' 'not' 'to' 'be' 'i' 'am' 'what' 'i' 'am']
['i' 'think' 'therefore' 'i' 'am' 'do' 'be' 'do' 'be' 'do']
['do' 'do' 'do' 'da' 'da' 'da' 'let' 'it' 'be' 'let' 'it' 'be']


In [30]:
from sklearn.preprocessing import OneHotEncoder

# Reshape the label codes into a single-column 2-D array before fitting the
# one-hot encoder on them.
labels = labels.reshape(-1, 1)
oh_encoder = OneHotEncoder(categories='auto')
oh_labels = oh_encoder.fit_transform(labels)

In [31]:
# One dense one-hot matrix per document: each encoded token becomes one row.
oh_vectors = [
    oh_encoder.transform(codes.reshape(-1, 1)).toarray()
    for codes in encode_data
]

In [33]:
# Print each document's decoded tokens followed by its one-hot matrix.
for codes, one_hot in zip(encode_data, oh_vectors):
    print(encoder.inverse_transform(codes), one_hot, sep='\n')

['to' 'do' 'is' 'to' 'be' 'to' 'be' 'is' 'to' 'do']
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
['to' 'be' 'or' 'not' 'to' 'be' 'i' 'am' 'what' 'i' 'am']
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Same toy corpus, rebinding `docs`, with the pronoun spelled 'II'.
# NOTE(review): CountVectorizer's default token pattern keeps only tokens of
# two or more alphanumeric characters, so a lone 'I' would be dropped --
# presumably the reason the text was altered; confirm.
docs = [
    'To do is to be. To be is to do.',
    'To be or not to be. II am what II am',
    'II think therefore II am. Do be do be do.',
    'Do do do da da da. Let it be let it be.',
]

# Learn the vocabulary and build the document-term count matrix.
cnt_vectr = CountVectorizer()
vectors = cnt_vectr.fit_transform(docs)
feature_names = cnt_vectr.get_feature_names_out()
counts = vectors.toarray()

print(cnt_vectr.vocabulary_)
print(feature_names)
print(counts)
print(pd.DataFrame(counts, columns=feature_names))

{'to': 12, 'do': 3, 'is': 5, 'be': 1, 'or': 9, 'not': 8, 'ii': 4, 'am': 0, 'what': 13, 'think': 11, 'therefore': 10, 'da': 2, 'let': 7, 'it': 6}
['am' 'be' 'da' 'do' 'ii' 'is' 'it' 'let' 'not' 'or' 'therefore' 'think'
 'to' 'what']
[[0 2 0 2 0 2 0 0 0 0 0 0 4 0]
 [2 2 0 0 2 0 0 0 1 1 0 0 2 1]
 [1 2 0 3 2 0 0 0 0 0 1 1 0 0]
 [0 2 3 3 0 0 2 2 0 0 0 0 0 0]]
   am  be  da  do  ii  is  it  let  not  or  therefore  think  to  what
0   0   2   0   2   0   2   0    0    0   0          0      0   4     0
1   2   2   0   0   2   0   0    0    1   1          0      0   2     1
2   1   2   0   3   2   0   0    0    0   0          1      1   0     0
3   0   2   3   3   0   0   2    2    0   0          0      0   0     0


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Fit scikit-learn's TF-IDF vectorizer on the corpus and show the resulting
# document-term matrix, followed by the vocabulary sorted by word.
# Note: `tfidf` now refers to the fitted vectorizer, replacing the NumPy
# array of the same name built in an earlier cell.
tfidf = TfidfVectorizer()
tfidf.fit(docs)
dtm = tfidf.transform(docs).toarray()

feature_names = tfidf.get_feature_names_out()
df = pd.DataFrame(data=dtm, columns=feature_names)
print(df)
print(sorted(tfidf.vocabulary_.items()))

         am        be        da        do        ii        is        it  \
0  0.000000  0.255666  0.000000  0.312717  0.000000  0.489931  0.000000   
1  0.464005  0.307120  0.000000  0.000000  0.464005  0.000000  0.000000   
2  0.251031  0.332310  0.000000  0.609695  0.502063  0.000000  0.000000   
3  0.000000  0.223758  0.643179  0.410533  0.000000  0.000000  0.428786   

        let       not        or  therefore     think        to      what  
0  0.000000  0.000000  0.000000   0.000000  0.000000  0.772535  0.000000  
1  0.000000  0.294266  0.294266   0.000000  0.000000  0.464005  0.294266  
2  0.000000  0.000000  0.000000   0.318401  0.318401  0.000000  0.000000  
3  0.428786  0.000000  0.000000   0.000000  0.000000  0.000000  0.000000  
[('am', 0), ('be', 1), ('da', 2), ('do', 3), ('ii', 4), ('is', 5), ('it', 6), ('let', 7), ('not', 8), ('or', 9), ('therefore', 10), ('think', 11), ('to', 12), ('what', 13)]
