## 🏷️ 학습 데이터 기반의 사전 구축을 통한 전처리

In [None]:
import pandas as pd

In [None]:
# Load the train and test splits in that order. The 'utf-8-sig' codec decodes
# UTF-8 and drops a leading byte-order mark if the file starts with one.
train, test = (
    pd.read_csv(csv_path, encoding='utf-8-sig')
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [None]:
# Show the number of rows in each split, one count per line (train, then test).
print(len(train), len(test), sep='\n')

In [None]:
# Preview the first five rows of the training frame (five is head()'s default).
train.head(n=5)

In [None]:
# Inspect the 'input' value of the row labelled 0 (the first row when the
# frame keeps the default integer index produced by read_csv).
train.loc[0, 'input']

In [None]:
# Inspect the 'output' value of the row labelled 0 (the first row when the
# frame keeps the default integer index produced by read_csv).
train.loc[0, 'output']

## 🏷️ 형태소 기반 단어 사전 생성

In [None]:
from kiwipiepy import Kiwi
from collections import defaultdict

kiwi = Kiwi()

# Maps each surface form found in the 'output' text to the set of spellings
# that occupy the same character positions in the paired 'input' text.
obfuscation_vocab = defaultdict(set)

# The dictionary is built purely by character position: the span a token
# covers in 'output' is assumed to hold its counterpart in 'input'.
# NOTE(review): this relies on token.start / token.len being character
# offsets into the tokenized string, as the slicing below requires - confirm
# against the kiwipiepy Token documentation.
for original_text, obfuscated_text in zip(train['output'], train['input']):
    # Positional alignment is only meaningful when both strings have the same
    # length. Otherwise spans past the end of the shorter text would be
    # recorded truncated or empty, and an empty entry matches every string
    # when the dictionary is applied later.
    if len(original_text) != len(obfuscated_text):
        continue

    for token in kiwi.tokenize(original_text):
        if token.len == 1:  # single-character tokens are excluded
            continue
        span = slice(token.start, token.start + token.len)
        obfuscation_vocab[original_text[span]].add(obfuscated_text[span])

## 🏷️ 사전을 바탕으로 전처리

In [None]:
# Flatten the dictionary into (obfuscated, original) pairs once, instead of
# re-walking it for every test row.
#  - Dictionary (insertion) order of the original words is kept.
#  - Variants of one word are tried longest-first, with ties broken
#    alphabetically: set iteration order is not stable between runs, and a
#    short variant replaced first could break up a longer one.
#  - Empty variants are dropped because str.replace('', x) inserts x between
#    every character; variants equal to the original word are no-ops.
replacement_pairs = [
    (obfuscated_word, noun)
    for noun, obfuscated_list in obfuscation_vocab.items()
    for obfuscated_word in sorted(obfuscated_list, key=lambda w: (-len(w), w))
    if obfuscated_word and obfuscated_word != noun
]


def _restore_text(text):
    """Return *text* with every known obfuscated spelling replaced.

    Replacements are applied sequentially in the order of
    ``replacement_pairs``, so later pairs see the result of earlier ones.
    Non-string values (e.g. NaN for a missing cell) are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    for obfuscated_word, noun in replacement_pairs:
        if obfuscated_word in text:
            text = text.replace(obfuscated_word, noun)
    return text


# Assign the result back to the column. Mutating the `row` objects yielded by
# DataFrame.iterrows() only changes per-row copies and leaves `test` untouched,
# so the processed text would never reach the saved file.
test['input'] = test['input'].map(_restore_text)

## 🏷️ 전처리된 데이터 저장

In [None]:
# Write the test frame to disk without the index column, encoded as
# 'utf-8-sig' (UTF-8 with a leading byte-order mark).
test.to_csv(
    "../data/test_processed.csv",
    encoding='utf-8-sig',
    index=False,
)