In [5]:
import re

class Tokenizer():
  """Regex/whitespace tokenizer that maps words to integer indices.

  Index 0 is reserved for the 'oov' (out-of-vocabulary) entry. Every new
  word encountered by `fit` receives the next unused integer.
  """

  def __init__(self):
    # Vocabulary: word -> integer index. 'oov' occupies index 0.
    self.word_dict = {'oov': 0}
    # Becomes True once `fit` has completed.
    self.fit_checker = False

  def preprocessing(self, sequences):
    """Problem 1-1: lower-case, strip special characters, split into words.

    Args:
      sequences: iterable of strings.

    Returns:
      A nested list with one list of word tokens per input string.
    """
    result = []
    for seq in sequences:
      # Every character that is not a letter, digit or whitespace is
      # replaced by a space so punctuation never sticks to a word.
      cleaned = re.sub(r'[^a-zA-Z\d\s]', ' ', seq.lower())
      # split() without an argument splits on any run of whitespace
      # (spaces, tabs, newlines) and never yields empty tokens.
      result.append(cleaned.split())
    return result

  def fit(self, sequences):
    """Problem 1-2: assign an integer index to every previously unseen word.

    Words already present in `word_dict` keep their index, so calling `fit`
    again extends the existing vocabulary instead of replacing it.

    Args:
      sequences: iterable of strings.
    """
    self.fit_checker = False
    for tokens in self.preprocessing(sequences):
      for word in tokens:
        if word not in self.word_dict:
          # len() is always the next free index because indices are dense.
          self.word_dict[word] = len(self.word_dict)
    self.fit_checker = True

  def transform(self, sequences):
    """Problem 1-3: convert each string into a list of vocabulary indices.

    Words that are not in the vocabulary map to the 'oov' index.

    Args:
      sequences: iterable of strings.

    Returns:
      A nested list of integer indices, one inner list per input string.

    Raises:
      Exception: if the tokenizer has not been fitted yet.
    """
    # Fail fast before doing any work on the input.
    if not self.fit_checker:
      raise Exception("Tokenizer instance is not fitted yet.")
    oov_index = self.word_dict['oov']
    return [
      [self.word_dict.get(word, oov_index) for word in tokens]
      for tokens in self.preprocessing(sequences)
    ]

  def fit_transform(self, sequences):
    """Fit the vocabulary on `sequences`, then return their index lists."""
    self.fit(sequences)
    return self.transform(sequences)

In [3]:
import math
import numpy as np
from collections import Counter


class TfidfVectorizer:
  """TF-IDF vectorizer built on top of an injected tokenizer.

  The tokenizer must expose `fit_transform(sequences)` and
  `transform(sequences)` returning nested lists of integer indices, and a
  `word_dict` mapping whose first entry is the out-of-vocabulary slot.
  """

  def __init__(self, tokenizer):
    self.tokenizer = tokenizer
    # Becomes True once `fit` has completed.
    self.fit_checker = False

  def fit(self, sequences):
    """Problem 2-1: fit the tokenizer and compute the IDF vector.

    The IDF of a word is log(N / (df + 1)), where N is the number of
    documents and df is the number of documents containing the word.

    Args:
      sequences: iterable of strings (the training documents).

    Returns:
      The IDF values as a list, ordered like the vocabulary indices with
      the first (out-of-vocabulary) entry excluded.
    """
    tokenized = self.tokenizer.fit_transform(sequences)
    # Skip the first vocabulary entry: it is the out-of-vocabulary slot.
    word_index = list(self.tokenizer.word_dict.values())[1:]
    n_docs = len(tokenized)

    # Document frequency: count each index at most once per document.
    doc_freq = Counter()
    for seq in tokenized:
      doc_freq.update(set(seq))

    idf_list = [math.log(n_docs / (doc_freq[idx] + 1)) for idx in word_index]

    # Keep the fitted state so `transform` never has to refit.
    self.word_index = word_index
    self.idf_matrix = idf_list
    self.fit_checker = True
    return idf_list

  def transform(self, sequences):
    """Problem 2-2: return the TF-IDF rows for `sequences`.

    Uses the vocabulary and IDF values stored by `fit`; neither the
    tokenizer nor the IDF vector is modified here.

    Args:
      sequences: iterable of strings.

    Returns:
      A nested list of floats: one row per input string, one column per
      vocabulary index captured at fit time.

    Raises:
      Exception: if the vectorizer has not been fitted yet.
    """
    if not self.fit_checker:
      raise Exception("TfidfVectorizer instance is not fitted yet.")

    tokenized = self.tokenizer.transform(sequences)
    idf = np.array(self.idf_matrix, dtype=float)

    self.tfidf_matrix = []
    for seq in tokenized:
      counts = Counter(seq)
      # Raw term frequency per vocabulary index; indices absent from the
      # fitted vocabulary simply contribute nothing.
      tf = np.array([counts[idx] for idx in self.word_index], dtype=float)
      self.tfidf_matrix.append((tf * idf).tolist())

    return self.tfidf_matrix

  def fit_transform(self, sequences):
    """Fit on `sequences` and return their TF-IDF rows."""
    self.fit(sequences)
    return self.transform(sequences)

In [6]:
# Two short example sentences used to exercise the vectorizer.
test = ['I go to school.', 'I LIKE pizza!']

# Fit the TF-IDF vectorizer on the examples and transform them in one call.

tokenizer = Tokenizer()
# The vectorizer keeps a reference to this tokenizer and fits it on `test`.
tfidfvectorizer = TfidfVectorizer(tokenizer)
ex_transformed=tfidfvectorizer.fit_transform(test)
# Show the container type, then the TF-IDF rows (one per sentence).
print(type(ex_transformed))
print(ex_transformed)

<class 'list'>
[[-0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0], [-0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0]]


In [7]:
# Display the word -> index vocabulary held by the tokenizer after fitting.
tokenizer.word_dict

{'oov': 0, 'i': 1, 'go': 2, 'to': 3, 'school': 4, 'like': 5, 'pizza': 6}