# LLM 훈련 데이터 정제
## 당혹도 계산

In [2]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [4]:
def calculate_perplexity(model, tokenizer, text, max_length=1024):
    """Return the perplexity of ``text`` under a causal language model.

    Perplexity is ``exp(loss)``, where ``loss`` is what the model reports
    when the input token ids are also supplied as the labels.

    Args:
        model: Callable invoked as ``model(**inputs, labels=input_ids)``; the
            returned object must expose a scalar ``loss`` tensor.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="pt", truncation=True,
            max_length=max_length)``; must return a mapping containing
            ``"input_ids"``.
        text: The string to score.
        max_length: Maximum number of tokens passed to the model; longer
            inputs are truncated by the tokenizer. The default of 1024 is
            GPT-2's context window.

    Returns:
        The perplexity as a Python float, or ``nan`` when the tokenized text
        has fewer than two tokens.
    """
    # Truncate so that over-long documents are scored on their first
    # ``max_length`` tokens instead of overflowing the model's context.
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=max_length
    )

    # A next-token loss needs at least one (context, target) pair; with zero
    # or one token perplexity is undefined, so report NaN without calling
    # the model.
    if inputs["input_ids"].shape[-1] < 2:
        return float("nan")

    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
    return torch.exp(outputs.loss).item()

# Load the pretrained GPT-2 language model and its matching tokenizer.
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# A well-formed sentence and a character-substituted variant of the same sentence.
clean_text = "The quick brown fox jumps over the lazy dog."
noisy_text = "Th3 qu1ck br0wn f0x jumps 0ver th3 l@zy d0g."

# Score both sentences with the same model and tokenizer.
clean_perplexity = calculate_perplexity(model, tokenizer, clean_text)
noisy_perplexity = calculate_perplexity(model, tokenizer, noisy_text)

# Print both scores (labels read "clean text perplexity" / "noisy text perplexity").
print("깨끗한 텍스트 당혹도:", clean_perplexity)
print("노이즈 텍스트 당혹도:", noisy_perplexity)

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


깨끗한 텍스트 당혹도: 162.47128295898438
노이즈 텍스트 당혹도: 587.935302734375


## 텍스트 품질 검사

In [11]:
%pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.3-py3-none-any.whl.metadata (9.5 kB)
Downloading pyspellchecker-0.8.3-py3-none-any.whl (7.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/7.2 MB[0m [31m9.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m4.5/7.2 MB[0m [31m65.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.2/7.2 MB[0m [31m79.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.3


In [12]:
import spacy
from spellchecker import SpellChecker
from collections import Counter

# spaCy pipeline loaded from the "en_core_web_sm" package; analyze_text_quality reads this global.
nlp = spacy.load("en_core_web_sm")
# Spell checker built with its default constructor arguments; also read as a global.
spell = SpellChecker()

def analyze_text_quality(text):
    """Build a small quality report for ``text``.

    Relies on the module-level spaCy pipeline ``nlp`` and the module-level
    spell checker ``spell``.

    Args:
        text: The string to analyze.

    Returns:
        A dict with three keys:

        * ``"misspelled_words"``: alphabetic tokens (original casing,
          document order, repeats kept) whose lower-cased form the spell
          checker reports as unknown.
        * ``"grammar_score"``: the number of tokens tagged NOUN, VERB, ADJ
          or ADV. This is a content-word count, not a measure of
          grammaticality.
        * ``"incomplete_sentences"``: the text of every sentence whose span
          contains fewer than three tokens.
    """
    doc = nlp(text)

    # Spell check: ask the checker once for the whole document instead of
    # once per token, then filter the tokens against the returned set.
    alpha_tokens = [token for token in doc if token.is_alpha]
    unknown_words = spell.unknown(
        [token.text.lower() for token in alpha_tokens]
    )
    misspelled = [
        token.text for token in alpha_tokens
        if token.text.lower() in unknown_words
    ]

    # "Grammar" score: a simple count of content-word part-of-speech tags.
    pos_counts = Counter(token.pos_ for token in doc)
    grammar_score = sum(
        pos_counts[tag] for tag in ('NOUN', 'VERB', 'ADJ', 'ADV')
    )

    # Sentence completeness: very short sentences are flagged as incomplete.
    incomplete_sentences = [sent.text for sent in doc.sents if len(sent) < 3]

    return {
        "misspelled_words": misspelled,
        "grammar_score": grammar_score,
        "incomplete_sentences": incomplete_sentences
    }

# Sample input with deliberate misspellings and a trailing one-word fragment.
text = "This iz a smple txt with sum issues. Incomplet"
quality_report = analyze_text_quality(text)
# Show the resulting report dict.
print(quality_report)


{'misspelled_words': ['iz', 'smple', 'txt', 'Incomplet'], 'grammar_score': 5, 'incomplete_sentences': ['Incomplet']}


## 텍스트 전처리

In [16]:
import unicodedata
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Download the NLTK resources this notebook uses (tokenizer data and stop-word lists).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [14]:
def preprocess_text(text):
    """Normalize ``text`` into a space-joined string of content tokens.

    The steps, in order: lower-case; NFKD-normalize and drop every character
    that cannot be encoded as ASCII; delete every character that is neither
    a word character nor whitespace; collapse whitespace runs; tokenize with
    ``word_tokenize``; drop NLTK's English stop words.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    lowered = text.lower()

    # Decompose accented characters, then discard whatever is not ASCII.
    decomposed = unicodedata.normalize('NFKD', lowered)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8')

    # Strip punctuation and squeeze whitespace down to single spaces.
    without_punctuation = re.sub(r'[^\w\s]', '', ascii_only)
    collapsed = ' '.join(without_punctuation.split())

    candidate_tokens = word_tokenize(collapsed)

    # Keep only the tokens that are not English stop words.
    english_stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for candidate in candidate_tokens:
        if candidate in english_stop_words:
            continue
        kept_tokens.append(candidate)

    return ' '.join(kept_tokens)


In [17]:
# Example input with mixed case, an ellipsis, an apostrophe and stop words.
raw_text = "This is an EXAMPLE of text preprocessing... It's quite useful!"
cleaned_text = preprocess_text(raw_text)
# Print the original and the preprocessed text (labels read "original" / "preprocessing result").
print(f"원문: {raw_text}")
print(f"전처리 결과: {cleaned_text}")


원문: This is an EXAMPLE of text preprocessing... It's quite useful!
전처리 결과: example text preprocessing quite useful


## 언어 감지 및 정규화

In [19]:
%pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m696.3/981.5 kB[0m [31m20.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=efc9f5e8611a29508256dd4bef7fd78d8d037bd32321011d1365b25e9746f25e
  Stored in directory: /root/.cache/pip/wheels/c1/67/88/e844b5b022812e15a52e4eaa38a1e709e99f06f6639d7e3ba7
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [21]:
%pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/235.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.4.0


In [22]:
from langdetect import detect
from unidecode import unidecode
from nltk import word_tokenize
import nltk

# Download the NLTK tokenizer data needed by word_tokenize.
nltk.download('punkt')

def handle_multilingual_text(text):
    """Detect the language of ``text``, transliterate it to ASCII, and tokenize it.

    Args:
        text: The input string, in any language.

    Returns:
        A dict with four keys:

        * ``'original'``: the input, unchanged.
        * ``'language'``: the value returned by ``detect``, or ``'unknown'``
          when detection raises.
        * ``'transliterated'``: the result of ``unidecode(text)``.
        * ``'tokens'``: ``word_tokenize`` applied to the transliterated text.

    NOTE(review): langdetect's README states that results on short or
    ambiguous text can differ between runs unless ``DetectorFactory.seed``
    is set; confirm whether reproducibility matters here.
    """
    # Language detection is best-effort: any failure is reported as
    # 'unknown'. Catch Exception rather than using a bare ``except`` so that
    # KeyboardInterrupt and SystemExit still propagate.
    try:
        lang = detect(text)
    except Exception:
        lang = 'unknown'

    # Transliterate non-ASCII characters.
    transliterated_text = unidecode(text)

    tokens = word_tokenize(transliterated_text)

    return {
        'original': text,
        'language': lang,
        'transliterated': transliterated_text,
        'tokens': tokens
    }


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [23]:
# Sample inputs: English, German, Japanese, and an English/French mix.
texts = [
    "This is English text.",
    "Dies ist deutscher Text.",
    "これは日本語のテキストです。",
    "This is mixed language text avec un peu de français."
]

# Print every field of the result for each sample.
# NOTE(review): the label "번역문" means "translation", but the value printed
# under it is result['transliterated'] (a transliteration); consider relabeling.
for text in texts:
    result = handle_multilingual_text(text)
    print(f"원문: {result['original']}")
    print(f"언어: {result['language']}")
    print(f"번역문: {result['transliterated']}")
    print(f"토큰: {result['tokens']}\n")


원문: This is English text.
언어: en
번역문: This is English text.
토큰: ['This', 'is', 'English', 'text', '.']

원문: Dies ist deutscher Text.
언어: de
번역문: Dies ist deutscher Text.
토큰: ['Dies', 'ist', 'deutscher', 'Text', '.']

원문: これは日本語のテキストです。
언어: ja
번역문: korehaRi Ben Yu notekisutodesu. 
토큰: ['korehaRi', 'Ben', 'Yu', 'notekisutodesu', '.']

원문: This is mixed language text avec un peu de français.
언어: fr
번역문: This is mixed language text avec un peu de francais.
토큰: ['This', 'is', 'mixed', 'language', 'text', 'avec', 'un', 'peu', 'de', 'francais', '.']



## 중복 제거

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def deduplicate_corpus(corpus, similarity_threshold=0.9):
    """Remove near-duplicate documents using TF-IDF cosine similarity.

    Every pair ``(i, j)`` with ``i < j`` is compared; document ``j`` is
    dropped when its similarity to any earlier document ``i`` is strictly
    greater than ``similarity_threshold``.

    Args:
        corpus: Iterable of document strings. It is materialized into a list,
            so generators and other one-shot iterables are accepted.
        similarity_threshold: Cosine-similarity cut-off above which the later
            document of a pair is treated as a duplicate.

    Returns:
        A new list with the surviving documents in their original order.
    """
    # len() and repeated iteration are needed below, so take a list copy.
    corpus = list(corpus)

    # With fewer than two documents there is nothing to compare, and the
    # vectorizer would raise on an empty corpus.
    if len(corpus) < 2:
        return corpus

    # Build the TF-IDF representation of the corpus.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    # Pairwise similarities between all documents.
    similarity_matrix = cosine_similarity(tfidf_matrix)

    duplicates = set()
    for i in range(len(corpus)):
        for j in range(i + 1, len(corpus)):
            if similarity_matrix[i, j] > similarity_threshold:
                duplicates.add(j)

    deduplicated_corpus = [
        doc for i, doc in enumerate(corpus)
        if i not in duplicates
    ]

    return deduplicated_corpus


In [25]:
# Four documents; the first and third are identical strings.
corpus = [
    "The quick brown fox jumps over the lazy dog.",
    "A fast auburn fox leaps above the sleepy canine.",
    "The quick brown fox jumps over the lazy dog.",
    "An entirely different sentence about cats.",
]

deduplicated = deduplicate_corpus(corpus)
# Print the corpus size before and after, then each surviving document.
print(f"원래 말뭉치 크기: {len(corpus)}")
print(f"중복 제거된 말뭉치 크기: {len(deduplicated)}")
print("중복 제거된 말뭉치:")
for doc in deduplicated:
    print(f"- {doc}")


원래 말뭉치 크기: 4
중복 제거된 말뭉치 크기: 3
중복 제거된 말뭉치:
- The quick brown fox jumps over the lazy dog.
- A fast auburn fox leaps above the sleepy canine.
- An entirely different sentence about cats.


## 데이터 정제 파이프라인 자동화

In [26]:
import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

# Download the NLTK stop-word lists.
nltk.download('stopwords')
# English stop words as a set; DataCleaningPipeline.preprocess reads this global.
stop_words = set(stopwords.words('english'))

class DataCleaningPipeline:
    """Clean a CSV with a ``text`` column: normalize, length-filter, de-duplicate.

    ``preprocess`` reads the module-level ``stop_words`` set.
    """

    def __init__(
        self, similarity_threshold=0.9, min_length=10,
        max_length=1000
    ):
        """Store the pipeline settings.

        Args:
            similarity_threshold: Cosine-similarity cut-off above which the
                later row of a pair is dropped as a duplicate.
            min_length: Minimum text length in characters (inclusive).
            max_length: Maximum text length in characters (inclusive).
        """
        self.similarity_threshold = similarity_threshold
        self.min_length = min_length
        self.max_length = max_length
        self.vectorizer = TfidfVectorizer(stop_words='english')

    def preprocess(self, text):
        """Lower-case ``text``, strip punctuation, and remove stop words.

        Args:
            text: One cell of the ``text`` column. ``None`` and NaN (how
                pandas represents an empty CSV cell) yield an empty string;
                other non-string values are converted with ``str``.

        Returns:
            The remaining whitespace-separated words joined by single spaces.
        """
        # ``text != text`` is true only for NaN; this avoids calling
        # ``.lower()`` on a float when the CSV has empty cells.
        if text is None or (isinstance(text, float) and text != text):
            return ''

        text = str(text).lower()
        text = re.sub(r'[^\w\s]', '', text)
        tokens = [
            word for word in text.split()
            if word not in stop_words
        ]
        return ' '.join(tokens)

    def filter_by_length(self, df):
        """Return the rows whose ``text`` length lies in ``[min_length, max_length]``.

        Rows whose length is NaN (missing text) are excluded.
        """
        lengths = df['text'].str.len()
        # ``between`` is inclusive on both ends, matching ``>=`` / ``<=``.
        return df[lengths.between(self.min_length, self.max_length)]

    def deduplicate(self, df):
        """Drop rows whose text is a near-duplicate of an earlier row.

        Row ``j`` is dropped when its TF-IDF cosine similarity to any earlier
        row ``i`` is strictly greater than ``similarity_threshold``.

        Returns:
            A new DataFrame with the surviving rows; order and index labels
            are preserved.
        """
        n_rows = len(df)
        # Nothing to compare with fewer than two rows, and the vectorizer
        # would raise on an empty frame.
        if n_rows < 2:
            return df.copy()

        tfidf_matrix = self.vectorizer.fit_transform(df['text'])
        similarity_matrix = cosine_similarity(tfidf_matrix)

        duplicates = set()
        for i in range(n_rows):
            for j in range(i + 1, n_rows):
                if similarity_matrix[i, j] > self.similarity_threshold:
                    duplicates.add(j)

        # Select by position so that repeated index labels cannot cause
        # extra rows to be removed.
        kept_positions = [
            pos for pos in range(n_rows) if pos not in duplicates
        ]
        return df.iloc[kept_positions]

    def clean(self, input_file, output_file):
        """Run the full pipeline from ``input_file`` to ``output_file``.

        Reads the CSV, preprocesses the ``text`` column, filters by length,
        removes near-duplicates, writes the result without the index, and
        prints a confirmation message.

        Args:
            input_file: Path of the CSV to read; it must have a ``text`` column.
            output_file: Path of the CSV to write.
        """
        # Read the data.
        df = pd.read_csv(input_file)

        # Preprocess.
        df['text'] = df['text'].apply(self.preprocess)

        # Filter by length.
        df = self.filter_by_length(df)

        # Remove duplicates.
        df = self.deduplicate(df)

        # Save the cleaned data.
        df.to_csv(output_file, index=False)

        print(f"정제한 데이터를 {output_file}에 저장했습니다.")



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
import pandas as pd

# A small example data set
data = {
    "text": [
        "This is a clean sentence for testing.",
        "This is a clean sentence for testing.",   # duplicate
        "Short",                                   # too short
        "Another example sentence with some noise!!!",
        "This text is way toooooooooooooooooooooooooooooooooooooooooooooooooo long"
    ]
}

# Save as CSV
pd.DataFrame(data).to_csv("input_data.csv", index=False)


In [29]:
# Run the pipeline with its default settings on the sample CSV.
pipeline = DataCleaningPipeline()
pipeline.clean('input_data.csv', 'cleaned_data.csv')


정제한 데이터를 cleaned_data.csv에 저장했습니다.


In [30]:
# Read the cleaned CSV back and print it to inspect the result.
print(pd.read_csv("cleaned_data.csv"))


                                                text
0                             clean sentence testing
1                     another example sentence noise
2  text way toooooooooooooooooooooooooooooooooooo...


## 데이터 검증

In [35]:
def validate_cleaned_data(
    file_path, sample_size=100, min_length=10, random_state=None
):
    """Print a validation report for a cleaned CSV with a ``text`` column.

    The report contains: row count, mean text length, number of unique
    texts, number of texts shorter than ``min_length``, the head of a random
    sample for manual review, counts of rows matching a few common-issue
    patterns, and the mean GPT-2 perplexity over the sample.

    Args:
        file_path: Path of the CSV to validate.
        sample_size: Maximum number of rows to sample for manual review and
            perplexity scoring; capped at the number of rows.
        min_length: Texts with fewer characters than this are counted as
            short.
        random_state: Passed to ``DataFrame.sample``; set it to make the
            sample (and therefore the perplexity figure) reproducible.

    Returns:
        None. Everything is printed.
    """
    df = pd.read_csv(file_path)
    text_lengths = df['text'].str.len()

    # Basic statistics.
    print(f"전체 샘플: {len(df)}")
    print(
        f"평균 텍스트 길이: "
        f"{text_lengths.mean():.2f}"
    )

    print(f"고유 샘플: {df['text'].nunique()}")

    short_texts = df[text_lengths < min_length]
    print(
        f"{min_length}자 미만의 텍스트 수: "
        f"{len(short_texts)}"
    )

    sample = df.sample(
        n=min(sample_size, len(df)), random_state=random_state
    )
    print("\n수동 검토를 위한 샘플:")
    print(sample['text'].head())

    # Check for common issues.
    common_issues = {
        'special_chars': df['text'].str.contains(
            r'[^a-zA-Z0-9\s]'
        ),
        'numbers': df['text'].str.contains(r'\d'),
        'all_caps': df['text'].str.isupper()
    }
    for issue, mask in common_issues.items():
        print(f"{issue}이 포함된 샘플: {mask.sum()}")

    model = GPT2LMHeadModel.from_pretrained('gpt2')
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    def perplexity_of(text):
        # Truncate to 1024 tokens so long texts do not overflow the model.
        inputs = tokenizer(
            text, return_tensors='pt', truncation=True, max_length=1024
        )
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs['input_ids'])
        return torch.exp(outputs.loss).item()

    # Missing texts (NaN from empty CSV cells) cannot be tokenized; skip them
    # instead of letting the tokenizer raise.
    scorable_texts = sample['text'].dropna()
    sample_perplexities = scorable_texts.apply(perplexity_of)
    print(
            f"\n샘플의 평균 당혹도: "
            f"{sample_perplexities.mean():.2f}"
    )

In [36]:
# Validate the pipeline output using the default sample size.
validate_cleaned_data('cleaned_data.csv')


전체 샘플: 3
평균 텍스트 길이: 39.00
고유 샘플: 3
10자 미만의 텍스트 수: 0

수동 검토를 위한 샘플:
0                               clean sentence testing
1                       another example sentence noise
2    text way toooooooooooooooooooooooooooooooooooo...
Name: text, dtype: object
special_chars이 포함된 샘플: 0
numbers이 포함된 샘플: 0
all_caps이 포함된 샘플: 0

샘플의 평균 당혹도: 49874.19
