## defaultdict를 활용한 TDM

In [None]:
# Toy corpus: four short Korean documents whose tokens are separated by
# single spaces.
docs = [
    '동물원 코끼리',
    '동물원 원숭이 바나나',
    '엄마 코끼리 아기 코끼리',
    '원숭이 바나나 코끼리 바나나',
]

In [None]:
# Tokenize every document on single spaces: one list of tokens per document.
doc_ls = [doc.split(' ') for doc in docs]
doc_ls

[['동물원', '코끼리'],
 ['동물원', '원숭이', '바나나'],
 ['엄마', '코끼리', '아기', '코끼리'],
 ['원숭이', '바나나', '코끼리', '바나나']]

In [None]:
from collections import defaultdict
# Assign each distinct token an integer id in order of first appearance.
# The default factory runs only when a key is missing; at that moment
# len(word2id) equals the number of tokens registered so far, which is
# exactly the next unused id (0, 1, 2, ...).
word2id = defaultdict(lambda:len(word2id))
for doc in doc_ls:
  for token in doc:
    # A bare lookup is enough: a missing token triggers the factory and the
    # returned id is stored under that token.
    word2id[token]
# NOTE: word2id stays a defaultdict, so looking up an unseen token later
# silently registers a new id instead of raising KeyError.
word2id

defaultdict(<function __main__.<lambda>>,
            {'동물원': 0, '바나나': 3, '아기': 5, '엄마': 4, '원숭이': 2, '코끼리': 1})

In [None]:
import numpy as np

# Term-document matrix: one row per vocabulary id, one column per document.
matrix_shape = (len(word2id), len(docs))
TDM = np.zeros(matrix_shape, dtype=int)

# Count every token occurrence in the cell (token id, document index).
for col, tokens in enumerate(doc_ls):
    for tok in tokens:
        row = word2id[tok]
        TDM[row, col] += 1
TDM

array([[1, 1, 0, 0],
       [1, 0, 2, 1],
       [0, 1, 0, 1],
       [0, 1, 0, 2],
       [0, 0, 1, 0],
       [0, 0, 1, 0]])

In [None]:
import pandas as pd

# Column labels, one per document: '문서0', '문서1', ...
doc_name = ['문서{}'.format(idx) for idx in range(len(doc_ls))]

# (id, word) pairs ordered by id, so the vocabulary lines up with TDM's rows.
sorted_vocab = sorted(zip(word2id.values(), word2id.keys()))
vocab = [word for _, word in sorted_vocab]

# Attach the words as a column, then display the table indexed by word.
df_TDM = pd.DataFrame(TDM, columns=doc_name)
df_TDM['단어'] = vocab
df_TDM.set_index('단어')

Unnamed: 0_level_0,문서0,문서1,문서2,문서3
단어,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
동물원,1,1,0,0
코끼리,1,0,2,1
원숭이,0,1,0,1
바나나,0,1,0,2
엄마,0,0,1,0
아기,0,0,1,0


## sklearn을 활용한 DTM
 - 참고: sklearn의 CountVectorizer는 결과를 DTM(문서 × 단어) 형태로 만들도록 설정되어 있음

In [None]:
# Display the raw corpus again before it is vectorized in the next cell.
docs

['동물원 코끼리', '동물원 원숭이 바나나', '엄마 코끼리 아기 코끼리', '원숭이 바나나 코끼리 바나나']

In [None]:
# Token frequency counting: CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
# NOTE(review): with default settings CountVectorizer presumably keeps only
# tokens of two or more word characters, so one-character words would be
# dropped, unlike the split(' ') tokenization used elsewhere -- confirm.
# fit_transform learns the vocabulary and returns a documents x terms matrix.
DTM = count_vect.fit_transform(docs)
# Densify and transpose so rows are terms and columns are documents.
DTM.toarray().T

array([[1, 1, 0, 0],
       [0, 1, 0, 2],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 1],
       [1, 0, 2, 1]])

In [None]:
import pandas as pd

# Column labels, one per document: '문서0', '문서1', ...
doc_names = ['문서' + str(i) for i in range(len(docs))]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on old installations.
# list(...) keeps vocab a plain Python list, as it was before.
if hasattr(count_vect, 'get_feature_names_out'):
    vocab = list(count_vect.get_feature_names_out())
else:
    vocab = count_vect.get_feature_names()

# DTM is documents x terms, so transpose it to get one row per term.
df_TDM = pd.DataFrame(DTM.toarray().T, columns=doc_names)
df_TDM['단어'] = vocab
df_TDM.set_index('단어')

Unnamed: 0_level_0,문서0,문서1,문서2,문서3
단어,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
동물원,1,1,0,0
바나나,0,1,0,2
아기,0,0,1,0
엄마,0,0,1,0
원숭이,0,1,0,1
코끼리,1,0,2,1


## gensim을 활용한 TDM

In [None]:
# Display the raw corpus again before the gensim dictionary is built from it.
docs

['동물원 코끼리', '동물원 원숭이 바나나', '엄마 코끼리 아기 코끼리', '원숭이 바나나 코끼리 바나나']

In [None]:
from gensim import corpora

# Tokenize every document on single spaces.
doc_ls = [doc.split(' ') for doc in docs]

# Dictionary maps each distinct token to an integer id.
id2word = corpora.Dictionary(doc_ls)
print(id2word)

# Bag-of-words corpus: one list of (token_id, count) pairs per document.
# NOTE: despite its name, TDM here is a sparse per-document representation,
# not a dense term-document matrix.
TDM = [id2word.doc2bow(tokens) for tokens in doc_ls]
TDM

Dictionary(6 unique tokens: ['동물원', '코끼리', '바나나', '원숭이', '아기']...)


[[(0, 1), (1, 1)],
 [(0, 1), (2, 1), (3, 1)],
 [(1, 2), (4, 1), (5, 1)],
 [(1, 1), (2, 2), (3, 1)]]

In [None]:
from gensim.matutils import sparse2full
import pandas as pd
import numpy as np

# Build the column labels here so this cell does not depend on a variable
# that is only defined in the scikit-learn section.
doc_names = ['문서' + str(i) for i in range(len(TDM))]

# sparse2full puts the count for token id k at position k, so the vocabulary
# is listed in ascending id order to line up with the matrix rows.
vocab = [id2word[i] for i in sorted(id2word.keys())]

# One dense count vector per document (documents x terms).
DTM_matrix = [sparse2full(doc, len(vocab)).tolist() for doc in TDM]

# Transpose to terms x documents and cast the float counts to int.
df_TDM = pd.DataFrame(np.array(DTM_matrix, dtype=int).T,
                      columns=doc_names)
df_TDM['단어'] = vocab
df_TDM.set_index('단어')

Unnamed: 0_level_0,문서0,문서1,문서2,문서3
단어,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
동물원,1,1,0,0
코끼리,1,0,2,1
바나나,0,1,0,2
원숭이,0,1,0,1
아기,0,0,1,0
엄마,0,0,1,0
