# In[188]:
import re
import math
from collections import Counter

# In[171]:
# Problem 1
class Tokenizer():
    """Map lower-cased alphabetic words to integer indices.

    Index 0 is reserved for out-of-vocabulary words under the key ``'oov'``;
    words learned by ``fit`` receive indices starting at 1 in order of first
    appearance.
    """

    def __init__(self):
        # word -> integer index; 'oov' (index 0) stands for unknown words.
        self.word_dict = {'oov': 0}
        # True once fit() has completed at least once.
        self.fit_checker = False

    def preprocessing(self, sequences):
        """Lower-case, strip non-letters and split each sequence into words.

        Args:
            sequences: iterable of strings.

        Returns:
            A list with one list of word strings per input sequence.
        """
        result = []
        for seq in sequences:
            cleaned = re.sub(r"[^a-zA-Z\s]", "", seq.lower())
            # split() without an argument splits on any whitespace run, so
            # removed punctuation or repeated spaces never yield '' tokens.
            result.append(cleaned.split())
        return result

    def fit(self, sequences):
        """Add every unseen word in ``sequences`` to the vocabulary.

        Calling ``fit`` again keeps the existing entries and continues the
        numbering after them, so indices never collide.

        Args:
            sequences: iterable of strings.
        """
        self.fit_checker = False

        # Continue after the indices already handed out ('oov' occupies 0).
        index = len(self.word_dict)
        for tokens in self.preprocessing(sequences):
            for word in tokens:
                # Membership test rather than truthiness of the stored index:
                # index 0 is falsy, and the reserved 'oov' entry must never
                # be overwritten by a literal "oov" in the text.
                if word not in self.word_dict:
                    self.word_dict[word] = index
                    index += 1
        self.fit_checker = True

    def transform(self, sequences):
        """Convert each sequence into a list of vocabulary indices.

        Words that are not in the vocabulary map to the 'oov' index.

        Args:
            sequences: iterable of strings.

        Returns:
            A list with one list of integer indices per input sequence.

        Raises:
            Exception: if ``fit`` has not been called yet.
        """
        if not self.fit_checker:
            raise Exception("Tokenizer instance is not fitted yet.")

        oov_index = self.word_dict['oov']
        return [
            [self.word_dict.get(word, oov_index) for word in tokens]
            for tokens in self.preprocessing(sequences)
        ]

    def fit_transform(self, sequences):
        """Fit on ``sequences`` and return their index representation."""
        self.fit(sequences)
        return self.transform(sequences)

# In[281]:
# Problem 2
class TfidfVectorizer:
    """Compute TF-IDF weights on top of an index-producing tokenizer.

    The tokenizer must provide ``fit_transform(sequences)`` and
    ``transform(sequences)``, each returning one list of integer token
    indices per sequence. Index 0 is treated as "unknown" and carries no
    IDF weight.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        # True once fit() has completed at least once.
        self.fit_checker = False
        # idf_list[i - 1] is the IDF of token index i, learned by fit().
        self.idf_list = []

    def fit(self, sequences):
        """Fit the tokenizer and learn one IDF value per token index.

        The IDF of token ``i`` is ``log(n / (1 + df(i)))`` where ``n`` is the
        number of sequences and ``df(i)`` the number of sequences containing
        token ``i``.

        Args:
            sequences: iterable of strings.

        Returns:
            A list of IDF values for token indices ``1..max_index`` (empty
            when ``sequences`` yields no tokens).
        """
        tokenized = self.tokenizer.fit_transform(sequences)
        n = len(tokenized)

        # Document frequency: each sequence counts a token at most once.
        doc_freq = Counter()
        for seq in tokenized:
            doc_freq.update(set(seq))

        # default=0 keeps empty input from raising; the list is then empty.
        max_index = max(doc_freq, default=0)
        self.idf_list = [
            math.log(n / (1 + doc_freq[i])) for i in range(1, max_index + 1)
        ]
        self.fit_checker = True
        # Return a copy so callers cannot mutate the fitted state.
        return list(self.idf_list)

    def transform(self, sequences):
        """Return the TF-IDF weights of ``sequences`` using the fitted IDF.

        Each output row holds one value per *distinct* known token of the
        corresponding sequence, ordered by ascending token index; the value
        is ``count_in_sequence * idf``. Tokens without a fitted IDF (index 0
        or any index beyond the fitted range) are skipped.

        Args:
            sequences: iterable of strings.

        Returns:
            A list of lists of floats, also stored in ``self.tfidf_matrix``.

        Raises:
            Exception: if ``fit`` has not been called yet.
        """
        if not self.fit_checker:
            raise Exception("TfidfVectorizer instance is not fitted yet.")

        idf = self.idf_list
        known = len(idf)
        self.tfidf_matrix = []
        for token in self.tokenizer.transform(sequences):
            counts = Counter(token)
            self.tfidf_matrix.append([
                idf[key - 1] * counts[key]
                for key in sorted(counts)
                if 1 <= key <= known
            ])
        return self.tfidf_matrix

    def fit_transform(self, sequences):
        """Fit on ``sequences`` and return their TF-IDF weights."""
        self.fit(sequences)
        return self.transform(sequences)