In [1]:
import pprint

In [2]:
# corpus

text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [3]:
# preprocessing

# Create a set of frequent words
stoplist = set('for a of the and to in'.split(' '))

# Lowercase each document, split it by white space and filter out stopwords
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in text_corpus]

# Count word frequencies
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
pprint.pprint(processed_corpus)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [4]:
# make dictionary

from gensim import corpora

dictionary = corpora.Dictionary(processed_corpus)
print(dictionary.token2id)

{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


In [5]:
# doc 2 bow

# just one test
new_doc = "Human computer interaction"
bow_new_doc = dictionary.doc2bow(new_doc.lower().split())
print("BoW one document")
print(bow_new_doc)
print("="*50)

# corpus
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
print("BoW test corpus")
pprint.pprint(bow_corpus)


BoW one document
[(0, 1), (1, 1)]
BoW test corpus
[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]


In [None]:
# TF-IDF (손코딩)
from math import log

# test doc
new_doc_words = "system minors".lower().split()
new_doc_bow = dictionary.doc2bow(new_doc_words)
print("words of corpus")
pprint.pprint(processed_corpus)
print('='*50)
print("new doc words")
print(new_doc_words)
print("new doc BoW")
print(new_doc_bow)
print("new doc TF-IDF")

# 손코딩 검증
tf_system = len([word for word in new_doc_words if word == 'system']) / len(new_doc_words)
idf_system = log(len(processed_corpus) / len([doc for doc in processed_corpus if 'system' in doc]))
tfidf_system = tf_system * idf_system
tf_minors = len([word for word in new_doc_words if word == 'minors']) / len(new_doc_words)
idf_minors = log(len(processed_corpus) / len([doc for doc in processed_corpus if 'minors' in doc]))
tfidf_minors = tf_minors * idf_minors
print("손코딩 검증 / log는 자연로그(ln) 사용")
print(f'word : system / id : {dictionary.token2id["system"]} / tf : {tf_system}, / idf : {idf_system} / tfidf : {tfidf_system}')
print(f'word : minors / id : {dictionary.token2id["minors"]} / tf : {tf_minors}, / idf : {idf_minors} / tfidf : {tfidf_minors}')



words of corpus
[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]
new doc words

['system', 'minors']
new doc BoW
[(5, 1), (11, 1)]
new doc TF-IDF
[(5, np.float64(0.5898341626740045)), (11, np.float64(0.8075244024440723))]
손코딩 검증
word : system / id : 5 / tf : 0.5, / idf : 1.0986122886681098 / tfidf : 0.5493061443340549
word : minors / id : 11 / tf : 0.5, / idf : 1.5040773967762742 / tfidf : 0.7520386983881371
gensim 은 tf를 단순 해당 단어의 출현 횟수로 계산하며
gensim 은 idf를 log(문서수+1/단어출현문서수+1) + 1 로 계산하여 0이 되는 것을 방지한다.
또한 gensim은 최종적으로 문서 길이 정규화(L2norm)를 수행해 벡터의 유클리드 거리가 1이 되도록 한다
4.264833145637557
word : system / id : 5 / tf : 1, / idf : 1.916290731874155 / tfidf : 0.4493237288390295


In [31]:
from gensim import models

# train tf-idf model from corpus
tfidf = models.TfidfModel(bow_corpus) # bow_corpus : test_corpus 를 BoW 한 리스트

# test doc
new_doc_words = "system minors".lower().split()
new_doc_tfidf = tfidf[new_doc_bow]
print(new_doc_tfidf)

[(5, np.float64(0.5898341626740045)), (11, np.float64(0.8075244024440723))]


In [None]:
# gensim 라이브러리와 손코딩 내용이 다른 이유
print('='*50)
print('gensim 은 tf를 단순 해당 단어의 출현 횟수로 계산하며')
print('gensim 은 idf를 log(문서수+1/단어출현문서수+1) + 1 로 계산하여 0이 되는 것을 방지한다.')
print('또한 gensim은 최종적으로 문서 길이 정규화(L2norm)를 수행해 벡터의 유클리드 거리가 1이 되도록 한다')
tf_system_compare = len([word for word in new_doc_words if word == 'system'])
idf_system_compare = log((len(processed_corpus)+1) / (len([doc for doc in processed_corpus if 'system' in doc])+1)) + 1
tf_minors_compare = len([word for word in new_doc_words if word == 'minors'])
idf_minors_compare = log((len(processed_corpus)+1) / (len([doc for doc in processed_corpus if 'minors' in doc])+1)) + 1
norm = ((tf_system_compare * idf_system_compare)**2 + (tf_minors_compare * idf_minors_compare)**2)**1/2
print(norm)
tfidf_system_compare = tf_system_compare * idf_system_compare
print(f'word : system / id : {dictionary.token2id["system"]} / tf : {tf_system_compare}, / idf : {idf_system_compare} / tfidf : {tfidf_system_compare/norm}')