#  데이터 증강


## 텍스트 데이터 증강 기법
### 동의어 대체

In [2]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
import random
from nltk.corpus import wordnet

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().replace("_", " "))
    return synonyms

text = "The movie was boring but the actors were great."


In [4]:
def synonym_replacement(text, n=1):
    words = text.split()
    new_words = words.copy()
    random_word_list = list(
        set([word for word in words if word.isalnum()])
    )
    random.shuffle(random_word_list)
    num_replaced = 0

    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(list(synonyms))
            new_words = [
                synonym if word == random_word else word
                for word in new_words
            ]
            num_replaced += 1
        if num_replaced >= n:
            break

    return ' '.join(new_words)


In [5]:
print("원문:", text)
print("동의어 대체:", synonym_replacement(text, n=2))


원문: The movie was boring but the actors were great.
동의어 대체: The movie was dull just the actors were great.


### 역번역
(옮긴이) googletrans가 최신 httpx와 충돌을 일으키는 문제를 피하고자 deep_translator로 대체함


In [6]:
!pip install deep-translator

Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep-translator
Successfully installed deep-translator-1.11.4


In [7]:
# 교체한 패키지에 맞게 코드도 일부 수정함
from deep_translator import GoogleTranslator

def back_translation(text, target_lang="fr"):
    translated = GoogleTranslator(source="en", target=target_lang).translate(text)
    back_translated = GoogleTranslator(source=target_lang, target="en").translate(translated)
    return back_translated


In [8]:
print("원문:", text)
print("역번역 (영→불→영):", back_translation(text, target_lang="fr"))


원문: The movie was boring but the actors were great.
역번역 (영→불→영): The film was boring but the actors were great.


### T5를 사용한 텍스트 생성

In [9]:
from transformers import (T5ForConditionalGeneration, T5Tokenizer)

t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
def t5_augmentation(text, model, tokenizer, num_return_sequences=1):
    input_ids = tokenizer.encode(
        f"paraphrase: {text}",
        return_tensors="pt",
        max_length=512,
        truncation=True
    )
    outputs = model.generate(
        input_ids=input_ids,
        max_length=150,
        num_return_sequences=num_return_sequences,
        num_beams=5,
        no_repeat_ngram_size=2,
        top_k=50,
        top_p=0.95,
    )
    return [
        tokenizer.decode(
            output, skip_special_tokens=True
        ) for output in outputs
    ]


In [11]:
print("원문:", text)
augmented = t5_augmentation(text, t5_model, t5_tokenizer, num_return_sequences=3)
print("T5 증강 결과:")
for i, aug in enumerate(augmented, 1):
    print(f"{i}. {aug}")


The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


원문: The movie was boring but the actors were great.
T5 증강 결과:
1. Paraphrase: The movie was boring but the actors were great.
2. Paraphrase: The movie was boring, but the actors were great.
3. paraphrase: The movie was boring but the actors were great.


## 기존 LLM을 활용한 데이터 생성

In [12]:
import openai
import os
from google.colab import userdata

# Colab 보안 비밀에 API 키를 등록한 것으로 가정
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')  # API 키를 가져와 환경 변수에 등록

In [13]:
from openai import OpenAI

def gpt4o_data_generation(prompt, num_samples=5):
    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=150,
        n=num_samples,
        temperature=0.7,
    )
    return [choice.message.content.strip()
        for choice in response.choices
    ]

In [14]:
text = "The movie was boring but the actors were great."
prompt = f"Generate three variations of the sentence: {text}"

generated_texts = gpt4o_data_generation(prompt, num_samples=3)

print("원문:", text)
print("GPT-4o 생성 결과:")
for i, gen in enumerate(generated_texts, 1):
    print(f"{i}. {gen}")


원문: The movie was boring but the actors were great.
GPT-4o 생성 결과:
1. 1. The film dragged on, yet the performances by the actors were outstanding.
2. Although the movie lacked excitement, the cast delivered exceptional performances.
3. The storyline was dull, but the actors' talent truly shone through.
2. 1. The film dragged on, yet the performances by the actors were outstanding.  
2. Although the movie lacked excitement, the actors delivered excellent performances.  
3. The storyline was dull, but the actors' talent truly shone through.
3. 1. While the film itself was dull, the performances by the actors were outstanding.
2. The storyline may have been uninteresting, but the acting was exceptional.
3. Despite the tedious plot, the actors delivered impressive performances.


## 다국어 데이터 증강 전략
### 언어 간 역번역

In [15]:
def cross_lingual_back_translation(text, target_langs=['fr', 'de', 'es']):
    augmented_texts = []
    for lang in target_langs:
        translated = GoogleTranslator(source="en", target=lang).translate(text)
        back_translated = GoogleTranslator(source=lang, target="en").translate(translated)
        augmented_texts.append(back_translated)
    return augmented_texts


In [16]:
print("원문:", text)
augmented_cross = cross_lingual_back_translation(text, target_langs=['fr', 'de', 'es'])
print("언어 간 역번역 결과:")
for lang, aug in zip(['fr→en', 'de→en', 'es→en'], augmented_cross):
    print(f"{lang}: {aug}")


원문: The movie was boring but the actors were great.
언어 간 역번역 결과:
fr→en: The film was boring but the actors were great.
de→en: The film was boring, but the actors were great.
es→en: The film was boring, but the actors were great.


### 다국어 증강

In [24]:
from transformers import MarianMTModel, MarianTokenizer

def marian_translation(text, src_lang="en", tgt_lang="fr"):
    model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    # 번역 (src → tgt)
    input_ids = tokenizer(text, return_tensors="pt", padding=True).input_ids
    outputs = model.generate(input_ids, max_length=100)
    translated = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return translated

def multilingual_marian_augmentation(text, target_langs=['fr', 'de', 'es']):
    augmented_texts = []
    for lang in target_langs:
        translated = marian_translation(text, src_lang="en", tgt_lang=lang)
        # 역번역 back translation
        back_translated = marian_translation(translated, src_lang=lang, tgt_lang="en")
        augmented_texts.append(back_translated)
    return augmented_texts


# 테스트
text = "She quickly finished her work and left for the meeting."
print("원문:", text)

augmented = multilingual_marian_augmentation(text, target_langs=['fr','de','es'])
for lang, aug in zip(['fr→en','de→en','es→en'], augmented):
    print(f"{lang}: {aug}")


원문: She quickly finished her work and left for the meeting.
fr→en: She quickly completed her work and left for the meeting.
de→en: She quickly finished her work and went to the meeting.
es→en: He quickly finished his job and went to the meeting.


## 텍스트 증강 시 의미 보존
### 문장 임베딩 사용

In [25]:
!pip install -q sentence-transformers


In [26]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# 임베딩 모델 로드
similarity_model = SentenceTransformer("all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [27]:
def semantic_similarity(original, augmented, model):
    original_embedding = model.encode(original)
    augmented_embedding = model.encode(augmented)
    similarity = cosine_similarity([original_embedding], [augmented_embedding])[0][0]
    return similarity

def filter_by_semantic_similarity(original, augmented_list, model, threshold=0.8):
    return [
        aug for aug in augmented_list
        if semantic_similarity(original, aug, model) >= threshold
    ]

In [28]:
# 테스트
text = "She quickly finished her work and left for the meeting."
augmented_list = [
    "She quickly completed her work and left for the meeting.",
    "He rapidly ended his task and went to the party.",   # 의미가 달라진 경우
    "She quickly finished her work and went to the meeting."
]

print("원문:", text)
for aug in augmented_list:
    sim = semantic_similarity(text, aug, similarity_model)
    print(f"'{aug}' → 유사도: {sim:.2f}")

filtered = filter_by_semantic_similarity(text, augmented_list, similarity_model, threshold=0.8)
print("\n필터링 후:", filtered)

원문: She quickly finished her work and left for the meeting.
'She quickly completed her work and left for the meeting.' → 유사도: 0.98
'He rapidly ended his task and went to the party.' → 유사도: 0.52
'She quickly finished her work and went to the meeting.' → 유사도: 0.92

필터링 후: ['She quickly completed her work and left for the meeting.', 'She quickly finished her work and went to the meeting.']


### 동의어 대체를 위한 문맥적 단어 임베딩

다음은 책의 3.4.2절에 제시된 뼈대 코드를 바탕으로, 실행 가능하도록 보강한 예시 구현입니다.
책의 예제는 아이디어 수준으로만 제시되어 있으며, 후보 단어 생성이나 문맥 기반 유사도 계산 부분은 직접 구현해야 합니다.


In [126]:
!pip install -q sentence-transformers nltk

In [129]:
import random
import nltk
from nltk.corpus import wordnet
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download("wordnet")
nltk.download("omw-1.4")

# SBERT 로딩
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

# WordNet 후보 수집 (하이퍼님/하이포님까지 확장)
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        # 기본 lemma
        for lemma in syn.lemmas():
            cand = lemma.name().replace("_", " ")
            if 1 <= len(cand.split()) <= 3:
                synonyms.add(cand)
        # 하이퍼님/하이포님 lemma 추가
        for related in syn.hypernyms() + syn.hyponyms():
            for lemma in related.lemmas():
                cand = lemma.name().replace("_", " ")
                if 1 <= len(cand.split()) <= 3:
                    synonyms.add(cand)
    return list(synonyms)

# 문맥적 동의어 교체
def contextual_synonym_replacement(text, target_word, model=sbert_model, top_k=5, seed=None):
    if seed is not None:
        random.seed(seed)

    candidates = get_synonyms(target_word)
    if not candidates:
        return text, []

    # 유사도 점수 계산
    text_emb = model.encode([text])
    cand_embs = model.encode(candidates)
    sims = cosine_similarity(text_emb, cand_embs)[0]

    scored = sorted(zip(candidates, sims), key=lambda x: x[1], reverse=True)

    # 원 단어 제외
    scored = [(cand, score) for cand, score in scored if cand.lower() != target_word.lower()]

    # 가장 유사한 후보로 교체
    if scored:
        best = scored[0][0]
        new_text = text.replace(target_word, best, 1)
    else:
        new_text = text

    return new_text, scored[:top_k]

# 출력 함수
def print_candidates(original, new_text, candidates, title=""):
    print(f"{title}원문: {original}")
    print(f"{title}대체: {new_text}\n")
    print(f"{title}후보:")
    for i, (cand, score) in enumerate(candidates, 1):
        print(f"  {i}) {cand:<20} (유사도={score:.3f})")
    print("-"*60 + "\n")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [130]:
text1 = "He sat on the bank of the river and watched the boats pass by."
text2 = "She went to the bank to deposit some money."

new1, cand1 = contextual_synonym_replacement(text1, "bank", seed=42)
print_candidates(text1, new1, cand1, title="문맥1 ")

new2, cand2 = contextual_synonym_replacement(text2, "bank", seed=42)
print_candidates(text2, new2, cand2, title="문맥2 ")


문맥1 원문: He sat on the bank of the river and watched the boats pass by.
문맥1 대체: He sat on the riverbank of the river and watched the boats pass by.

문맥1 후보:
  1) riverbank            (유사도=0.506)
  2) blood bank           (유사도=0.298)
  3) eye bank             (유사도=0.285)
  4) merchant bank        (유사도=0.272)
  5) vertical bank        (유사도=0.264)
------------------------------------------------------------

문맥2 원문: She went to the bank to deposit some money.
문맥2 대체: She went to the deposit to deposit some money.

문맥2 후보:
  1) deposit              (유사도=0.569)
  2) depositary           (유사도=0.523)
  3) depository           (유사도=0.506)
  4) funds                (유사도=0.480)
  5) depository financial institution (유사도=0.459)
------------------------------------------------------------



## 증강과 데이터 품질의 균형
### 품질 필터링

In [132]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch, math
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

# 의미론적 유사성 모델 (3.4.1에서 사용했던 것과 동일)
similarity_model = SentenceTransformer("all-MiniLM-L6-v2")

def semantic_similarity(original, augmented, model=similarity_model):
    original_embedding = model.encode(original)
    augmented_embedding = model.encode(augmented)
    similarity = cosine_similarity([original_embedding], [augmented_embedding])[0][0]
    return similarity

# perplexity 모델
perplexity_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
perplexity_model = GPT2LMHeadModel.from_pretrained("gpt2")
perplexity_model.eval()

def calculate_perplexity(text, model=perplexity_model, tokenizer=perplexity_tokenizer):
    encodings = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**encodings, labels=encodings["input_ids"])
        loss = outputs.loss.item()
    return math.exp(loss)

def quality_filter(
    augmented_texts, original_text,
    similarity_threshold=0.8, perplexity_threshold=100
):
    filtered_texts = []
    for aug_text in augmented_texts:
        if (
            semantic_similarity(original_text, aug_text, similarity_model)
            >= similarity_threshold
            and calculate_perplexity(aug_text, perplexity_model, perplexity_tokenizer)
            <= perplexity_threshold
        ):
            filtered_texts.append(aug_text)
    return filtered_texts

def demo_quality_filter(original, augmented_texts,
                        similarity_threshold=0.8, perplexity_threshold=100):
    print("원문:", original)
    print("\n후보 평가:")
    for aug in augmented_texts:
        sim = semantic_similarity(original, aug, similarity_model)
        ppl = calculate_perplexity(aug, perplexity_model, perplexity_tokenizer)
        status = "✅ 통과" if (sim >= similarity_threshold and ppl <= perplexity_threshold) else "❌ 탈락"
        print(f"- {aug}")
        print(f"   • 유사도={sim:.3f}, Perplexity={ppl:.1f} → {status}")
    print("\n최종 채택 문장:")
    filtered = quality_filter(augmented_texts, original,
                              similarity_threshold, perplexity_threshold)
    for f in filtered:
        print("  >", f)
    return filtered




In [133]:
# 테스트
original = "She quickly finished her work and left for the meeting."
augmented = [
    "She quickly finished her work and went to the meeting.",
    "She work finished quickly left meeting the for."
]

demo_quality_filter(original, augmented)



원문: She quickly finished her work and left for the meeting.

후보 평가:
- She quickly finished her work and went to the meeting.
   • 유사도=0.922, Perplexity=33.6 → ✅ 통과
- She work finished quickly left meeting the for.
   • 유사도=0.865, Perplexity=5437.8 → ❌ 탈락

최종 채택 문장:
  > She quickly finished her work and went to the meeting.


['She quickly finished her work and went to the meeting.']

### HITL 검증

In [134]:
augmented_texts = [
    "She quickly finished her work and went to the meeting.",
    "She work finished quickly left meeting the for."
]

def human_validation(augmented_texts):
    validated_texts = []
    for text in augmented_texts:
        ans = input(f"이 텍스트가 유효한가요? (y/n)\n{text}\n")
        if ans.lower() == "y":
            validated_texts.append(text)
    return validated_texts

validated = human_validation(augmented_texts)
print("최종 채택 문장:", validated)


이 텍스트가 유효한가요? (y/n)
She quickly finished her work and went to the meeting.
y
이 텍스트가 유효한가요? (y/n)
She work finished quickly left meeting the for.
n
최종 채택 문장: ['She quickly finished her work and went to the meeting.']


## 데이터 증강의 영향 평가
### 당혹도

In [136]:
import math
import torch

def evaluate_perplexity(model, tokenizer, test_data):
    model.eval()
    total_loss = 0
    total_tokens = 0
    with torch.no_grad():
        for text in test_data:
            inputs = tokenizer(text, return_tensors="pt", truncation=True).to(model.device)
            outputs = model(**inputs, labels=inputs["input_ids"])
            total_loss += (outputs.loss.item() * inputs["input_ids"].size(1))
            total_tokens += inputs["input_ids"].size(1)
    perplexity = math.exp(total_loss / total_tokens)
    return perplexity

# 원본 / 증강 데이터 준비
original_data = [
    "The movie was boring but the actors were great.",
    "She quickly finished her work and left for the meeting."
]

augmented_data = [
    "The film was dull but the performers were excellent.",
    "She rapidly completed her job and headed to the meeting."
]

# Perplexity 계산
ppl_original = evaluate_perplexity(model, tokenizer, original_data)
ppl_augmented = evaluate_perplexity(model, tokenizer, augmented_data)

# 결과 출력
print("📊 Perplexity 비교")
print(f" - 원본 데이터:   {ppl_original:.2f}")
print(f" - 증강 데이터:   {ppl_augmented:.2f}")
print()
if ppl_augmented < ppl_original:
    print("✅ 증강 데이터가 모델에 더 익숙해져 Perplexity가 낮아졌습니다.")
else:
    print("⚠️ 증강 데이터가 모델에 더 어렵게 느껴져 Perplexity가 높아졌습니다.")


📊 Perplexity 비교
 - 원본 데이터:   46.64
 - 증강 데이터:   99.24

⚠️ 증강 데이터가 모델에 더 어렵게 느껴져 Perplexity가 높아졌습니다.


### 과업별 지표

In [137]:
!pip install -q transformers datasets scikit-learn

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score
import torch

# 사전 학습된 감정 분류 모델 불러오기
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
clf_tokenizer = AutoTokenizer.from_pretrained(model_name)
clf_model = AutoModelForSequenceClassification.from_pretrained(model_name)

def evaluate_classification(model, tokenizer, test_data, test_labels):
    model.eval()
    predictions = []
    with torch.no_grad():
        for text in test_data:
            inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)
            outputs = model(**inputs)
            predictions.append(torch.argmax(outputs.logits).item())

    accuracy = accuracy_score(test_labels, predictions)
    f1 = f1_score(test_labels, predictions, average="weighted")
    return accuracy, f1

# 간단한 예제 데이터
texts = ["I love this movie!", "This was the worst experience ever.", "It was okay, not great."]
labels = [1, 0, 1]  # 긍정=1, 부정=0 (SST-2 태스크 기준)

acc, f1 = evaluate_classification(clf_model, clf_tokenizer, texts, labels)
print(f"정확도: {acc:.2f}, F1 점수: {f1:.2f}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

정확도: 0.67, F1 점수: 0.67


### 다양성 지표

In [138]:
def calculate_diversity_metrics(texts):
    all_words = [word for text in texts for word in text.split()]
    vocab_size = len(set(all_words))
    all_trigrams = [text[i:i+3] for text in texts
        for i in range(len(text)-2)]
    unique_trigrams = len(set(all_trigrams))
    return {
        "vocabulary_size": vocab_size,
        "unique_trigrams": unique_trigrams
    }


In [139]:
# 원문 + 증강문 예시
original_texts = [
    "She quickly finished her work and left for the meeting."
]

augmented_texts = [
    "She quickly completed her work and went to the meeting.",
    "She rapidly finished her job and left for the conference.",
    "She quickly finished the task and departed for the session."
]

print("원문 다양성:", calculate_diversity_metrics(original_texts))
print("증강 다양성:", calculate_diversity_metrics(augmented_texts))


원문 다양성: {'vocabulary_size': 10, 'unique_trigrams': 52}
증강 다양성: {'vocabulary_size': 19, 'unique_trigrams': 114}
