In [8]:
import pprint

In [9]:
# Corpus: nine short English document titles, one string per document.
# Every later cell (tokenizing, cleaning, BoW, TF-IDF) is built from this list.

text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [10]:
# tokenizer
class WhitespaceTokenizer():
    """Tokenizer that splits a string on single space characters."""

    def tokenize(self, input:str) -> list[str]:
        """Split ``input`` into tokens on the space character.

        Splitting uses ``str.split(" ")``, so consecutive spaces produce
        empty-string tokens and an empty string yields ``[""]``.

        Args:
            input: The text to tokenize.

        Returns:
            The list of tokens in their original order.

        Raises:
            TypeError: If ``input`` is not a ``str``. (Previously this path
                fell through to an unbound local variable.)
        """
        if not isinstance(input, str):
            raise TypeError(
                f"tokenize() expects a str, got {type(input).__name__}"
            )
        return input.split(" ")

# Text Cleaner
class TextCleaner:
    """Lowercases tokens and removes a small fixed set of stopwords."""

    def __init__(self):
        # Very common function words that are discarded during cleaning.
        self.stopwords = set('for a of the and to in'.split(' '))

    def clean_text(self, words:list[str]) -> list[str]:
        """Return the lowercased tokens of ``words`` that are not stopwords.

        The stopword check is done on the lowercased form, so "The" and
        "the" are both removed. Token order is preserved.
        """
        kept = []
        for raw_token in words:
            lowered = raw_token.lower()
            if lowered in self.stopwords:
                continue
            kept.append(lowered)
        return kept

# filter by frequency
class FilterByFrequency:
    """Drops tokens whose corpus-wide frequency is at or below a threshold."""

    def __init__(self):
        # Token -> number of occurrences seen so far by make_filter().
        from collections import defaultdict
        self.frequency_dict = defaultdict(int)

    def make_filter(self, docs:list[list[str]]):
        """Count every token occurrence in ``docs``.

        Counts are added to the existing table, so calling this more than
        once accumulates frequencies across calls.
        """
        for text in docs:
            for token in text:
                self.frequency_dict[token] += 1

    def filter(self, words:list[str], threshold:int=1):
        """Return the tokens of ``words`` whose frequency exceeds ``threshold``.

        With the default ``threshold=1`` only tokens that were counted more
        than once are kept. Tokens never seen by ``make_filter`` count as
        frequency 0.

        The lookup uses ``.get`` rather than indexing: indexing a
        ``defaultdict`` would insert each unseen token with count 0 and so
        mutate the frequency table as a side effect of filtering.
        """
        counts = self.frequency_dict
        return [token for token in words if counts.get(token, 0) > threshold]

# bow
class BagOfWords:
    """Wraps a gensim ``corpora.Dictionary`` to build bag-of-words vectors."""

    def __init__(self):
        # Set by create_dictionary(); stays None until that has been called.
        self.dictionary:dict[str,int]|None=None

    def create_dictionary(self, input:list[list[str]]):
        """Build ``self.dictionary`` from a list of tokenized documents."""
        from gensim import corpora
        vocabulary = corpora.Dictionary(input)
        self.dictionary = vocabulary

    def represent_bow(self, input:list[list[str]]):
        """Convert each tokenized document with ``self.dictionary.doc2bow``.

        Returns one converted entry per document, in input order.
        ``create_dictionary`` must have been called first.
        """
        vectors = []
        for tokens in input:
            vectors.append(self.dictionary.doc2bow(tokens))
        return vectors

In [12]:
# (1) Tokenize: split every document on spaces.
tokenizer = WhitespaceTokenizer()
tokenized_docs = list(map(tokenizer.tokenize, text_corpus))
# (2) Clean the text: lowercase and remove stopwords.
text_cleaner = TextCleaner()
cleaned_docs = list(map(text_cleaner.clean_text, tokenized_docs))
# (3) Frequency-based filtering: drop words that occur only once in the corpus.
# NOTE: the name `filter` shadows the built-in filter() from here on; later
# cells refer to this object by that name.
filter = FilterByFrequency()
filter.make_filter(cleaned_docs)
processed_corpus = [filter.filter(tokens, threshold=1) for tokens in cleaned_docs]
# (4) Build the bag-of-words representation.
bow_model = BagOfWords()
bow_model.create_dictionary(processed_corpus)
bow = bow_model.represent_bow(processed_corpus)

In [23]:
# TF-IDF (hand-coded)
from math import log

class CustomTFIDF:
    """Unsmoothed TF-IDF over a corpus of tokenized documents.

    tf(w, d) = (occurrences of w in d) / len(d)
    idf(w)   = ln(N / df(w)), where N is the number of documents and
               df(w) the number of documents that contain w.
    """

    def __init__(self, Documents):
        # Reference corpus used for the document-frequency statistics.
        self.Documents:list[list[str]] = Documents

    def tf(self, word, document):
        """Relative frequency of ``word`` in ``document``.

        Returns 0.0 for an empty document instead of dividing by zero.
        """
        all_word_count = len(document)
        if all_word_count == 0:
            return 0.0
        target_word_count = sum(1 for dw in document if dw == word)
        return target_word_count/all_word_count

    def idf(self, word):
        """Natural-log inverse document frequency of ``word``.

        The unsmoothed formula is undefined when no document contains the
        word (this also covers an empty corpus); 0.0 is returned in that
        case so such words contribute nothing to the TF-IDF score instead
        of raising ZeroDivisionError.
        """
        document_count = len(self.Documents)
        include_word_document_count = sum(1 for doc in self.Documents if word in doc)
        if include_word_document_count == 0:
            return 0.0
        return log(document_count/include_word_document_count)

    def tfidf(self, word, document):
        """Product of ``tf(word, document)`` and ``idf(word)``."""
        return self.tf(word, document) * self.idf(word)

In [None]:
# Compute the TF-IDF of each word in a new sentence

# New sentence
new_sentence = "system minors"
# Same preprocessing as the corpus: tokenize -> clean -> frequency filter
# (`filter` is the FilterByFrequency instance built earlier; default threshold).
cleaned_words = filter.filter(text_cleaner.clean_text(tokenizer.tokenize(new_sentence)))
print(cleaned_words)

# TF-IDF computation: TF is measured within the new sentence's own tokens,
# IDF against processed_corpus.
tfidf_model = CustomTFIDF(processed_corpus)
for word in cleaned_words:
    print(f"===== {word} =====")
    tf = tfidf_model.tf(word=word, document=cleaned_words)
    idf = tfidf_model.idf(word=word)
    tfidf = tfidf_model.tfidf(word=word, document=cleaned_words)
    print(f"{word}'s tf : {tf}\nidf : {idf}\ntfidf : {tfidf}")

['system', 'minors']
===== system =====
system's tf : 0.5
idf : 1.0986122886681098
tfidf : 0.5493061443340549
===== minors =====
minors's tf : 0.5
idf : 1.5040773967762742
tfidf : 0.7520386983881371


In [None]:
# Same computation using the gensim library

from gensim import models

# train tf-idf model from corpus
# (bow_corpus is built with the same doc2bow call that produced `bow` earlier)
bow_corpus = [bow_model.dictionary.doc2bow(doc) for doc in processed_corpus]
# NOTE: rebinds the name `tfidf`, which the previous cell's loop also assigns.
tfidf = models.TfidfModel(bow_corpus)

# test doc
new_sentence = "system minors"
# Preprocess the test sentence exactly like the corpus before converting to BoW.
cleaned_words = filter.filter(text_cleaner.clean_text(tokenizer.tokenize(new_sentence)))
new_doc_bow = bow_model.dictionary.doc2bow(cleaned_words)
new_doc_tfidf = tfidf[new_doc_bow]
print(new_doc_tfidf)

[(5, np.float64(0.5898341626740045)), (11, np.float64(0.8075244024440723))]


In [44]:
# TF-IDF with smoothing applied
from math import log
class SmoothingTFIDF:
    """TF-IDF with add-one smoothing on the document frequencies.

    tf(w, d) = (occurrences of w in d) / len(d)
    idf(w)   = ln((N + 1) / (df(w) + 1)) + 1, where N is the number of
               documents and df(w) the number of documents that contain w.

    The +1 terms keep the IDF defined for words that occur in no document.
    """

    def __init__(self, Documents):
        # Reference corpus used for the document-frequency statistics.
        self.Documents:list[list[str]] = Documents

    def tf(self, word, document):
        """Relative frequency of ``word`` in ``document``.

        Returns 0.0 for an empty document instead of dividing by zero.
        """
        all_word_count = len(document)
        if all_word_count == 0:
            return 0.0
        target_word_count = sum(1 for dw in document if dw == word)
        return target_word_count/all_word_count

    def idf(self, word):
        """Smoothed natural-log inverse document frequency of ``word``."""
        document_count = len(self.Documents) + 1
        include_word_document_count = sum(1 for doc in self.Documents if word in doc) + 1
        return log(document_count/include_word_document_count) + 1

    def tfidf(self, word, document):
        """Product of ``tf(word, document)`` and ``idf(word)``."""
        return self.tf(word, document) * self.idf(word)

# Compute the TF-IDF of each word in a new sentence (smoothed variant)

# New sentence
new_sentence = "system minors"
# Same preprocessing as the corpus: tokenize -> clean -> frequency filter.
cleaned_words = filter.filter(text_cleaner.clean_text(tokenizer.tokenize(new_sentence)))
print(cleaned_words)

# TF-IDF computation: TF is measured within the new sentence's own tokens,
# smoothed IDF against processed_corpus.
tfidf_model = SmoothingTFIDF(processed_corpus)
for word in cleaned_words:
    print(f"===== {word} =====")
    tf = tfidf_model.tf(word=word, document=cleaned_words)
    idf = tfidf_model.idf(word=word)
    tfidf = tfidf_model.tfidf(word=word, document=cleaned_words)
    print(f"{word}'s tf : {tf}\nidf : {idf}\ntfidf : {tfidf}")

['system', 'minors']
===== system =====
system's tf : 0.5
idf : 1.916290731874155
tfidf : 0.9581453659370776
===== minors =====
minors's tf : 0.5
idf : 2.203972804325936
tfidf : 1.101986402162968


In [None]:
# Manual cross-check: L2-normalised, smoothed TF-IDF of "system" in the new
# sentence. TF here is the raw count (not divided by the sentence length) and
# IDF is the same smoothed formula as SmoothingTFIDF.idf.
# `cleaned_words` holds the preprocessed tokens of the new sentence and
# `bow_model.dictionary` is the gensim dictionary built earlier; the names
# previously used here (`new_doc_words`, `dictionary`) are not defined anywhere
# in the notebook.
tf_system_compare = cleaned_words.count('system')
idf_system_compare = log((len(processed_corpus)+1) / (len([doc for doc in processed_corpus if 'system' in doc])+1)) + 1
tf_minors_compare = cleaned_words.count('minors')
idf_minors_compare = log((len(processed_corpus)+1) / (len([doc for doc in processed_corpus if 'minors' in doc])+1)) + 1
# Euclidean norm of the two-component TF-IDF vector. The exponent must be
# parenthesised: `x**1/2` parses as `(x**1)/2`, i.e. half the sum of squares.
norm = ((tf_system_compare * idf_system_compare)**2 + (tf_minors_compare * idf_minors_compare)**2)**(1/2)
print(norm)
tfidf_system_compare = tf_system_compare * idf_system_compare
print(f'word : system / id : {bow_model.dictionary.token2id["system"]} / tf : {tf_system_compare}, / idf : {idf_system_compare} / tfidf : {tfidf_system_compare/norm}')