# 2022.02.07. 이창석

# 문제1) Tokenizer 생성하기

In [1]:
# 필요한 라이브러리 로드
import re # 특수문자 제거
from math import log # tf-idf 행렬곱 계산
import numpy as np # tf-idf 행렬곱 계산
import pandas as pd # 데이터프레임

In [2]:
class Tokenizer():
    def __init__(self):
        self.word_dict = {'oov' : 0}
        self.fit_checker = False

    def preprocessing(self, sequences):
        result = []
        '''
        문제 1-1.
        '''
        def clean_text(seq):
            seq = re.sub('[-=+,#/\?:^.@*\"※~ㆍ!』‘|\(\)\[\]`\'…》\”\“\’·]', ' ', seq)
            return seq

        for seq in sequences:
            seq = seq.lower()
            seq = clean_text(seq)
            seq = seq.split()
            result.append((seq))
        return result

    def fit(self, sequences):
        self.fit_checker = False
        '''
        문제 1-2.
        '''
        preprocessed_seqs = self.preprocessing(sequences)
        pre_result = []

        for preprocessed_seq in preprocessed_seqs:
            pre_result += preprocessed_seq

        pre_result = set(pre_result)

        for num, word in enumerate(pre_result,1):
            self.word_dict[word] = num

        self.fit_checker = True

    def transform(self, sequences):
        result = []
        tokens = self.preprocessing(sequences)

        if self.fit_checker:
            '''
            문제 1-3.
            '''

            for token in tokens:
                token2index = []
                for word in token:
                    try:
                        token2index.append((self.word_dict[word]))
                    except KeyError:
                        token2index.append((self.word_dict['oov']))
                result.append(token2index)

            return result

        else:
            raise Exception("Tokenizer instance is not fitted yet.")

    def fit_transform(self, sequences):
        self.fit(sequences)
        result = self.transform(sequences)
        return result

In [3]:
# 예시 문장
sequences = ['I go to school.', 'I LIKE pizza!']
oov_sequences = ['I go to school.', 'I LIKE pizza!', 'i HaTe You!~']
tokenizer_seqs = Tokenizer()

# 1-1) preprocessing()
print('1-1) preprocessing : ', tokenizer_seqs.preprocessing(sequences))
print('\n')

# 1-2) fit()
tokenizer_seqs.fit(sequences)
print('1-2) word_dict : ', tokenizer_seqs.word_dict)
print('\n')

# 1-3) transform()
print('1-3) transform : ', tokenizer_seqs.transform(sequences))
print('1-3) oov_transform : ', tokenizer_seqs.transform(oov_sequences))
print('\n')

# 1-4) fit_transform()
print('1-4) fit_transform : ', tokenizer_seqs.fit_transform(sequences))

1-1) preprocessing :  [['i', 'go', 'to', 'school'], ['i', 'like', 'pizza']]


1-2) word_dict :  {'oov': 0, 'go': 1, 'to': 2, 'i': 3, 'school': 4, 'like': 5, 'pizza': 6}


1-3) transform :  [[3, 1, 2, 4], [3, 5, 6]]
1-3) oov_transform :  [[3, 1, 2, 4], [3, 5, 6], [3, 0, 0]]


1-4) fit_transform :  [[3, 1, 2, 4], [3, 5, 6]]


---

# 문제2) TfidfVectorizer 생성하기

In [4]:
class TfidfVectorizer:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.fit_checker = False

    def fit(self, sequences):
        tokenized = self.tokenizer.fit_transform(sequences)
        '''
        문제 2-1.
        '''
        vocab = list(self.tokenizer.word_dict.keys())
        idf_matrix = []
        n = len(sequences)

        fixed_seq = []
        sequences = self.tokenizer.preprocessing(sequences)
        for sequence in sequences:
            fixed_seq.append((' '.join(sequence)))

        def idf(t):
            df = 0
            for seq in fixed_seq:
                df += t in seq
            return log(n/(df+1))

        for j in range(len(vocab)):
            t = vocab[j]
            idf_matrix.append(idf(t))

        self.idf_matrix = idf_matrix
        self.fit_checker = True

    def transform(self, sequences):

        vocab = list(self.tokenizer.word_dict.keys())
        n = len(sequences)
        tf_matrix = []
        tfidf_matrix = []

        fixed_seq = []
        sequences = self.tokenizer.preprocessing(sequences)
        for sequence in sequences:
            fixed_seq.append((' '.join(sequence)))

        def tf(t, s):
            cnt = 0
            ss = s.split( )
            for s in ss:
                if t == s:
                    cnt += 1
            return cnt

        if self.fit_checker:
            '''
            문제 2-2.
            '''
            for i in range(n):
                tf_matrix.append([])
                s = fixed_seq[i]
                for j in range(len(vocab)):
                    t = vocab[j]
                    tf_matrix[-1].append(tf(t,s))
            tf_df_matrix = pd.DataFrame(tf_matrix, columns=vocab) # TF dataframe

            self.tf_df_matrix = tf_df_matrix
            self.tf_matrix = tf_matrix

            for idx in range(n):
                tfidf_matrix.append(np.multiply(tf_matrix[idx], self.idf_matrix))
            tfidf_df_matrix = pd.DataFrame(tfidf_matrix, columns=vocab) # TF-IDF dataframe

            self.tfidf_df_matrix = tfidf_df_matrix
            self.tfidf_matrix = tfidf_matrix

            return self.tfidf_matrix

        else:
            raise Exception("TfidfVectorizer instance is not fitted yet.")

    def fit_transform(self, sequences):
        self.fit(sequences)
        return self.transform(sequences)





In [5]:
# 예시 문장
sequences = ['I go to school.', 'I LIKE pizza!']
# oov_sequences = ['I go to school.', 'I LIKE pizza!', 'i HaTe You!~']
tokenizer_seqs = Tokenizer()
Tfidf_seqs = TfidfVectorizer(tokenizer_seqs)
Tfidf_seqs.fit(sequences)
Tfidf_seqs.transform(sequences)

print('word dict : ', tokenizer_seqs.word_dict)
print('\n')

# 2-1) fit()
print('2-1) IDF matrix : ', Tfidf_seqs.idf_matrix)
print('\n')

# 2-2) transform()
print('2-2)')
print('TF DataFrame : ')
print(Tfidf_seqs.tf_df_matrix)
print('\n')
print('TF matrix : ')
print(Tfidf_seqs.tf_matrix)
print('\n')
print('TF-IDF DataFrame : ')
print(Tfidf_seqs.tfidf_df_matrix)
print('\n')
print('TF-IDF matrix : ')
print(Tfidf_seqs.tfidf_matrix)
print('\n')

word dict :  {'oov': 0, 'go': 1, 'to': 2, 'i': 3, 'school': 4, 'like': 5, 'pizza': 6}


2-1) IDF matrix :  [0.6931471805599453, 0.0, 0.0, -0.40546510810816444, 0.0, 0.0, 0.0]


2-2)
TF DataFrame : 
   oov  go  to  i  school  like  pizza
0    0   1   1  1       1     0      0
1    0   0   0  1       0     1      1


TF matrix : 
[[0, 1, 1, 1, 1, 0, 0], [0, 0, 0, 1, 0, 1, 1]]


TF-IDF DataFrame : 
   oov   go   to         i  school  like  pizza
0  0.0  0.0  0.0 -0.405465     0.0   0.0    0.0
1  0.0  0.0  0.0 -0.405465     0.0   0.0    0.0


TF-IDF matrix : 
[array([ 0.        ,  0.        ,  0.        , -0.40546511,  0.        ,
        0.        ,  0.        ]), array([ 0.        ,  0.        ,  0.        , -0.40546511,  0.        ,
        0.        ,  0.        ])]




---