### **문제 1) Tokenizer 생성하기**


In [1]:
# 정규표현식
import re

In [2]:
class Tokenizer():
  """Whitespace tokenizer that maps English sentences to vocabulary indices.

  ``word_dict`` maps every token seen by ``fit`` to its cumulative occurrence
  count. The placeholder key ``'oov'`` is inserted first, so it always sits at
  position 0. A token's index is its insertion position in ``word_dict``.
  """

  # Every character that is neither whitespace nor an ASCII letter/digit.
  # Whitespace is kept so that tabs and newlines still separate tokens.
  _NON_TOKEN_CHARS = re.compile(r"[^\sa-zA-Z0-9]")

  def __init__(self):
    self.word_dict = {'oov': 0}
    self.fit_checker = False

  def preprocessing(self, sequences):
    """Clean and tokenize each sentence.

    Special characters are removed, the text is lower-cased and then split on
    runs of whitespace.

    Args:
      sequences: iterable of sentence strings.

    Returns:
      A nested list with one token list per sentence. A sentence without any
      letters or digits yields an empty list.
    """
    # split() without an argument collapses repeated whitespace and drops
    # leading/trailing blanks, so no empty-string tokens are produced.
    return [
      self._NON_TOKEN_CHARS.sub('', sentence).lower().split()
      for sentence in sequences
    ]

  def fit(self, sequences):
    """Add the tokens of ``sequences`` to ``word_dict``.

    Counts accumulate across calls; existing entries are never removed, so
    indices already handed out by ``transform`` stay stable.

    Args:
      sequences: iterable of sentence strings.
    """
    self.fit_checker = False
    for tokens in self.preprocessing(sequences):
      for token in tokens:
        self.word_dict[token] = self.word_dict.get(token, 0) + 1
    self.fit_checker = True

  def transform(self, sequences):
    """Encode each sentence as a list of vocabulary indices.

    Args:
      sequences: iterable of sentence strings.

    Returns:
      A nested list of ints; tokens missing from ``word_dict`` are mapped to
      the index of ``'oov'``.

    Raises:
      Exception: if ``fit`` has not completed yet.
    """
    if not self.fit_checker:
      raise Exception("Tokenizer instance is not fitted yet.")

    # Index of a token = its insertion position in word_dict.
    index_of = {word: idx for idx, word in enumerate(self.word_dict)}
    oov_index = index_of['oov']

    return [
      [index_of.get(token, oov_index) for token in tokens]
      for tokens in self.preprocessing(sequences)
    ]

  def fit_transform(self, sequences):
    """Fit on ``sequences`` and return their index encoding."""
    self.fit(sequences)
    return self.transform(sequences)

#### **Test code**

In [3]:
# Two sample sentences with mixed case and punctuation for exercising the tokenizer.
sentences = ['I go to school.', 'I LIKE pizza!']
# Fresh, unfitted tokenizer: its word_dict holds only the 'oov' placeholder.
t = Tokenizer()

In [4]:
# Strip special characters, lower-case and split each sample sentence into tokens.
t.preprocessing(sentences)

[['i', 'go', 'to', 'school'], ['i', 'like', 'pizza']]

In [5]:
# Count the tokens of the sample sentences into the tokenizer's vocabulary ...
t.fit(sentences)
# ... and display the resulting token -> count mapping.
t.word_dict

{'go': 1, 'i': 2, 'like': 1, 'oov': 0, 'pizza': 1, 'school': 1, 'to': 1}

In [6]:
# Encode each sentence as a list of vocabulary indices (needs the fit from the previous cell).
t.transform(sentences)

[[1, 2, 3, 4], [1, 5, 6]]

### **문제 2) TfidfVectorizer 생성하기**

In [7]:
from math import log
import numpy as np

In [8]:
class TfidfVectorizer:
  """TF-IDF vectorizer built on top of a tokenizer object.

  The tokenizer must provide ``fit_transform(sequences)``,
  ``preprocessing(sequences)`` (returning one token list per sentence) and a
  ``word_dict`` mapping whose keys are the vocabulary plus the ``'oov'``
  placeholder.

  After ``fit``:
    vocabulary: list of vocabulary words (without 'oov'), one per column.
    idf_matrix: list of IDF weights aligned with ``vocabulary``.
  After ``transform``:
    tfidf_matrix: the most recently returned TF-IDF rows.
  """

  def __init__(self, tokenizer):
    self.tokenizer = tokenizer
    self.fit_checker = False

  def fit(self, sequences):
    """Fit the tokenizer and compute IDF weights for its vocabulary.

    ``idf(w) = log(N / (1 + df(w)))`` where ``N`` is the number of sentences
    and ``df(w)`` is the number of sentences that contain ``w``.

    Args:
      sequences: iterable of sentence strings (the corpus).
    """
    # Materialize once: the corpus is traversed by the tokenizer and again here.
    sequences = list(sequences)
    self.fit_checker = False

    # Only the side effect (growing tokenizer.word_dict) is needed here.
    self.tokenizer.fit_transform(sequences)
    documents = self.tokenizer.preprocessing(sequences)
    n_docs = len(documents)

    # Freeze the column order now so transform() stays aligned with
    # idf_matrix even if the tokenizer's vocabulary grows later.
    self.vocabulary = [word for word in self.tokenizer.word_dict if word != 'oov']

    # Document frequency is counted per sentence (set() ignores repeats inside
    # one sentence) and from this corpus only. The tokenizer's word_dict holds
    # total occurrence counts that also accumulate across fits, so it cannot
    # be used as df.
    doc_freq = dict.fromkeys(self.vocabulary, 0)
    for tokens in documents:
      for word in set(tokens):
        if word in doc_freq:
          doc_freq[word] += 1

    self.idf_matrix = [np.log(n_docs / (1 + doc_freq[word])) for word in self.vocabulary]
    self.fit_checker = True

  def transform(self, sequences):
    """Return the TF-IDF rows of ``sequences``.

    Args:
      sequences: iterable of sentence strings.

    Returns:
      A nested list of floats with one row per sentence and one column per
      entry of ``vocabulary``; words outside the vocabulary are ignored.
      An empty input yields ``[]``.

    Raises:
      Exception: if ``fit`` has not completed yet.
    """
    if not self.fit_checker:
      raise Exception("TfidfVectorizer instance is not fitted yet.")

    documents = self.tokenizer.preprocessing(sequences)
    column_of = {word: col for col, word in enumerate(self.vocabulary)}

    # Raw term counts. The explicit 2-D shape keeps the multiplication below
    # valid for zero sentences or an empty vocabulary.
    term_freq = np.zeros((len(documents), len(self.vocabulary)))
    for row, tokens in enumerate(documents):
      for token in tokens:
        col = column_of.get(token)
        if col is not None:
          term_freq[row, col] += 1

    idf = np.asarray(self.idf_matrix, dtype=float)
    self.tfidf_matrix = (term_freq * idf).tolist()
    return self.tfidf_matrix

  def fit_transform(self, sequences):
    """Fit on ``sequences`` and return their TF-IDF rows."""
    self.fit(sequences)
    return self.transform(sequences)

### **Test code**


In [9]:
# End-to-end check: fit the vectorizer (and its tokenizer) on the sample
# sentences, then display their TF-IDF rows.
sentences = ['I go to school.', 'I LIKE pizza!']
t = Tokenizer()
tv = TfidfVectorizer(t)
tv.fit(sentences)
tv.transform(sentences)

[[-0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0],
 [-0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0]]