In [1]:
import pprint

In [2]:
# corpus

text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [3]:
# tokenizer
class WhitespaceTokenizer():
    def tokenize(self, input:str) -> list[str]:
        if isinstance(input, str):
            result = input.split(" ")
        return result

# Text Cleaner
class TextCleaner:
    def __init__(self):
        # Create a set of frequent words
        self.stopwords = set('for a of the and to in'.split(' '))
    def clean_text(self, words:list[str]) -> list[str]:
        # Lowercase each document, split it by white space and filter out stopwords
        words = [word.lower() for word in words if word.lower() not in self.stopwords]
        return words

# filter by frequency
class FilterByFrequency:
    def __init__(self):
        # Count word frequencies
        from collections import defaultdict
        self.frequency_dict = defaultdict(int)
    def make_filter(self, docs:list[list[str]]):
        for text in docs:
            for token in text:
                self.frequency_dict[token] += 1
    def filter(self, words:list[str], threshold:int=1):
        # Only keep words that appear more than once
        filtered_words = [token for token in words if self.frequency_dict[token] > threshold]
        return filtered_words

# bow
class BagOfWords:
    def __init__(self):
        self.dictionary:dict[str,int]|None=None
    def create_dictionary(self, input:list[list[str]]):
        from gensim import corpora
        self.dictionary = corpora.Dictionary(input)
    def represent_bow(self, input:list[list[str]]):
        bow_corpus = [self.dictionary.doc2bow(text) for text in input]
        return bow_corpus

In [4]:
# (1) 토크나이징 : 공백을 기준으로
tokenizer = WhitespaceTokenizer()
tokenized_docs = [tokenizer.tokenize(doc) for doc in text_corpus]
# (2) 텍스트 클리닝 - lower + stopwords
text_cleaner = TextCleaner()
cleaned_docs = [text_cleaner.clean_text(words) for words in tokenized_docs]
# (3) 빈도 기반 필터링 : 1회 발생 단어는 제외
filter = FilterByFrequency()
filter.make_filter(cleaned_docs)
processed_corpus = [filter.filter(doc, 1) for doc in cleaned_docs]
# (4) BoW 생성
bow_model = BagOfWords()
bow_model.create_dictionary(processed_corpus)
bow = bow_model.represent_bow(processed_corpus)

In [None]:
# TF-IDF (전통적 TF-IDF)
from collections import Counter, defaultdict
from math import log

class StandardTFIDF:
    def __init__(self, corpus:list[list[str]]=None):
        self.corpus:list[list[str]] = corpus
        self.vocabulary:dict[int,str]
        self.total_doc_count:int
        self.document_frequency:dict[str,int]
        if corpus:
            self.fit(corpus)
    
    def _make_vocabulary(self, corpus:list[list[str]]) -> dict[str,int]:
        unique_tokens = sorted(set([token for doc in corpus for token in doc]))
        vocabulary = dict()
        for idx, token in enumerate(unique_tokens):
            vocabulary[token] = idx
        self.vocabulary = vocabulary
        return vocabulary
    
    def _total_doc_count(self, corpus:list[list[str]]) -> int:
        return len(corpus)
    
    def _document_frequency(self, corpus:list[list[str]]) -> dict[str,int]:
        df = defaultdict(int)
        for doc in corpus:
            for token in set(doc):
                df[token] += 1
        return df
    
    def id2token(self) -> dict[int,str]:
        id2token = {idx:token for token, idx in self.vocabulary.items()}
        return id2token
    
    def token2id(self) -> dict[str,int]:
        token2id = self.vocabulary
        return token2id
    
    def _convert_to_vector(self, tfidf:dict[str,float|int]):
        vector = [0] * len(self.vocabulary)
        for token, score in tfidf.items():
            idx = self.vocabulary[token]
            vector[idx] = score
        return vector
    
    def _tf(self, document:list[str]) -> dict[str,int|float]:
        
        cnt = Counter(document)
        tf = {token : (cnt[token]/len(document)) for token in cnt}
        return tf
    
    def _idf(self, document:list[str]) -> dict[str,int|float]:
        idf = dict()
        document_count = self.total_doc_count
        cnt = Counter(document)
        for token in cnt:
            idf[token] = log(document_count / self.document_frequency[token])
        return idf
    
    def _tfidf(self, document:list[str]) -> dict[str,int|float]:
        tfidf = dict()
        tf = self._tf(document)
        idf = self._idf(document)
        cnt = Counter(document)
        for token in cnt:
            tfidf[token] = tf[token] * idf[token]
        return tfidf
    
    def fit(self, corpus:list[list[str]]) -> None:
        self.corpus = corpus
        self.vocabulary = self._make_vocabulary(corpus)
        self.total_doc_count = self._total_doc_count(corpus)
        self.document_frequency = self._document_frequency(corpus)
    
    def transform(self, documents:list[list[str]] | list[str]):
        result:list[list[float|int]] = list()
        if isinstance(documents, list) and documents and isinstance(documents[0], str):
            documents = [documents]
        if len(documents) == 0:
            return []
        for doc in documents:
            if len(doc) == 0:
                result.append([0] * len(self.vocabulary))
            else:
                tfidf = self._tfidf(doc)
                result.append(self._convert_to_vector(tfidf))
        return result
    
    def fit_transform(self, corpus:list[list[str]], documents:list[list[str]] | list[str]=None):
        if documents is None:
            documents = corpus.copy()
        self.fit(corpus)
        result = self.transform(documents)
        return result

In [8]:
# 신규 문장의 특정 단어 TF-IDF 계산

# 신규 문장
new_sentence = "system minors"
cleaned_words = filter.filter(text_cleaner.clean_text(tokenizer.tokenize(new_sentence)))
print(cleaned_words)

# TF-IDF 계산
tfidf_model = StandardTFIDF(processed_corpus)
tf = tfidf_model._tf(cleaned_words)
idf = tfidf_model._idf(cleaned_words)
tfidf = tfidf_model._tfidf(cleaned_words)
print(f"tf : {tf}")
print(f"idf : {idf}")
print(f"tfidf : {tfidf}")

['system', 'minors']
tf : {'system': 0.5, 'minors': 0.5}
idf : {'system': 1.0986122886681098, 'minors': 1.5040773967762742}
tfidf : {'system': 0.5493061443340549, 'minors': 0.7520386983881371}


In [7]:
# gensim 라이브러리를 이용하는 경우

from gensim import models

# train tf-idf model from corpus
bow_corpus = [bow_model.dictionary.doc2bow(doc) for doc in processed_corpus]
tfidf = models.TfidfModel(bow_corpus)

# test doc
new_sentence = "system minors"
cleaned_words = filter.filter(text_cleaner.clean_text(tokenizer.tokenize(new_sentence)))
new_doc_bow = bow_model.dictionary.doc2bow(cleaned_words)
new_doc_tfidf = tfidf[new_doc_bow]
print(new_doc_tfidf)

[(5, np.float64(0.5898341626740045)), (11, np.float64(0.8075244024440723))]


In [None]:
# TF-IDF (스무딩 적용)
from math import log

class SmoothingTFIDF(StandardTFIDF):
    def __init__(self, corpus:list[list[str]]=None):
        super().__init__(corpus)
    
    def _idf(self, document:list[str]) -> dict[str,int|float]:
        idf = dict()
        document_count = self.total_doc_count
        cnt = Counter(document)
        for token in cnt:
            idf[token] = log((document_count + 1) / (self.document_frequency[token] + 1)) + 1
        return idf

# 신규 문장의 특정 단어 TF-IDF 계산

# 신규 문장
new_sentence = "system minors"
cleaned_words = filter.filter(text_cleaner.clean_text(tokenizer.tokenize(new_sentence)))
print(cleaned_words)

# TF-IDF 계산
tfidf_model = SmoothingTFIDF(processed_corpus)
tf = tfidf_model._tf(cleaned_words)
idf = tfidf_model._idf(cleaned_words)
tfidf = tfidf_model._tfidf(cleaned_words)
print(f"tf : {tf}")
print(f"idf : {idf}")
print(f"tfidf : {tfidf}")

['system', 'minors']
tf : {'system': 0.5, 'minors': 0.5}
idf : {'system': 1.916290731874155, 'minors': 2.203972804325936}
tfidf : {'system': 0.9581453659370776, 'minors': 1.101986402162968}


In [9]:
tf_system_compare = len([word for word in new_doc_words if word == 'system'])
idf_system_compare = log((len(processed_corpus)+1) / (len([doc for doc in processed_corpus if 'system' in doc])+1)) + 1
tf_minors_compare = len([word for word in new_doc_words if word == 'minors'])
idf_minors_compare = log((len(processed_corpus)+1) / (len([doc for doc in processed_corpus if 'minors' in doc])+1)) + 1
norm = ((tf_system_compare * idf_system_compare)**2 + (tf_minors_compare * idf_minors_compare)**2)**1/2
print(norm)
tfidf_system_compare = tf_system_compare * idf_system_compare
print(f'word : system / id : {dictionary.token2id["system"]} / tf : {tf_system_compare}, / idf : {idf_system_compare} / tfidf : {tfidf_system_compare/norm}')

NameError: name 'new_doc_words' is not defined