In [2]:
import tensorflow as tf

# Configure on-demand (growing) GPU memory allocation for every visible GPU.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
    except RuntimeError as err:
        # Report the failure instead of aborting the notebook.
        print(err)
    else:
        # Only reached when memory growth was enabled on all devices.
        print("GPU 메모리 동적 할당 활성화 완료")

GPU 메모리 동적 할당 활성화 완료


In [1]:
import os
import zipfile
import glob
import json
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

# 1. Locate the extracted dataset and prepare the folder for cleaned copies
extract_dir = "공감형동적대화(압축푼것)/train"
cleaned_dir = "./cleaned_data"
# JSON file that records which cleaned TSV files have been written
checkpoint_file = os.path.join(cleaned_dir, "cleaned_files.json")

cleaned_dir_missing = not os.path.exists(cleaned_dir)
if cleaned_dir_missing:
    os.makedirs(cleaned_dir)

# Collect every TSV file below the extraction folder, at any depth
tsv_pattern = os.path.join(extract_dir, "**/*.tsv")
tsv_files = glob.glob(tsv_pattern, recursive=True)
print(f"TSV 파일 개수: {len(tsv_files)}")

  from .autonotebook import tqdm as notebook_tqdm


TSV 파일 개수: 25456


In [2]:
# Text cleaning helper
def clean_text(text):
    """Return `text` with markup tags removed and whitespace normalized.

    Args:
        text: The raw utterance. Anything that is not a `str` (for example
            `None` or a float NaN) is treated as missing.

    Returns:
        The cleaned string: `<...>` tags deleted, every run of whitespace
        collapsed to a single space, and no leading/trailing whitespace.
        Returns "" for non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'\s+', ' ', text)
    # Trim last: deleting tags or collapsing whitespace can expose a new
    # leading/trailing space that an earlier strip() would have missed.
    return text.strip()

# Load the list of cleaned files from the checkpoint, or build it by cleaning every TSV
if os.path.exists(checkpoint_file):
    # A previous run already cleaned the data; reuse its file list.
    with open(checkpoint_file, "r", encoding="utf-8") as f:
        cleaned_tsv_files = json.load(f)
else:
    cleaned_tsv_files = []
    for tsv_file in tsv_files:
        try:
            df = pd.read_csv(tsv_file, delimiter="\t", on_bad_lines="skip", engine="python")
            # Files without an utterance column are skipped entirely.
            if "utterance_text" in df.columns:
                df["utterance_text"] = df["utterance_text"].astype(str).apply(clean_text)
                # tsv_files comes from a recursive glob, so two files in different
                # sub-folders can share a base name. Mirror the path relative to
                # extract_dir so they cannot overwrite each other in cleaned_dir.
                cleaned_file = os.path.join(cleaned_dir, os.path.relpath(tsv_file, extract_dir))
                os.makedirs(os.path.dirname(cleaned_file), exist_ok=True)
                df.to_csv(cleaned_file, sep="\t", index=False)
                cleaned_tsv_files.append(cleaned_file)
        except Exception as e:
            # Best effort: report the unreadable file and continue with the rest.
            print(f"오류 발생: {tsv_file}: {e}")
    # Persist the list so later runs can skip the cleaning pass.
    with open(checkpoint_file, "w", encoding="utf-8") as f:
        json.dump(cleaned_tsv_files, f, ensure_ascii=False, indent=2)

In [3]:
import pandas as pd
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, PreTrainedTokenizerFast

# 2. Load the data: gather every non-null utterance from the cleaned TSV files
texts = []
for cleaned_path in cleaned_tsv_files:
    frame = pd.read_csv(cleaned_path, sep="\t")
    if "utterance_text" not in frame.columns:
        continue
    texts += frame["utterance_text"].dropna().tolist()

print(f"총 수집된 문장 수: {len(texts)}")

# 3. Load the KoGPT2 tokenizer and model (using the fast tokenizer class)
model_name = "skt/kogpt2-base-v2"
# Special tokens passed explicitly to the tokenizer loader.
special_tokens = {
    "bos_token": '</s>',
    "eos_token": '</s>',
    "unk_token": '<unk>',
    "pad_token": '<pad>',
    "mask_token": '<mask>',
}
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name, **special_tokens)

# from_pt=True: initialize the TensorFlow model from PyTorch weights.
model = TFGPT2LMHeadModel.from_pretrained(model_name, from_pt=True)
# Size the embedding matrix to the tokenizer's vocabulary length.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

# 4. Tokenize the data
def encode_sentences(sentences, max_length=128):
    """Tokenize sentences into fixed-length tensors for language-model training.

    All sentences are tokenized in a single batched tokenizer call instead of
    one call (and one small tensor) per sentence, which yields the same
    tensors with far less Python overhead on large corpora.

    Args:
        sentences: Iterable of strings to encode.
        max_length: Every sequence is truncated or padded to this many tokens.

    Returns:
        A tuple `(input_ids, attention_mask, labels)` of TensorFlow tensors,
        each of shape `(len(sentences), max_length)`. `labels` holds the same
        values as `input_ids`.
    """
    sentences = list(sentences)
    if not sentences:
        # Nothing to encode: return well-formed empty tensors instead of failing.
        empty = tf.zeros((0, max_length), dtype=tf.int32)
        return empty, empty, empty

    enc = tokenizer(
        sentences,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="tf",
    )
    input_ids = enc["input_ids"]
    # For GPT-2 style training the targets are the inputs themselves.
    labels = tf.identity(input_ids)
    return input_ids, enc["attention_mask"], labels

# 5. Encode the loaded sentences (e.g. the text list built from cleaned_tsv_files)
encoded = encode_sentences(texts)
X, A, y = encoded
print("✅ Tensor 준비 완료:", *[tensor.shape for tensor in encoded])

총 수집된 문장 수: 378562


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
2025-04-17 01:27:45.259358: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Pro
2025-04-17 01:27:45.259672: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 24.00 GB
2025-04-17 01:27:45.259704: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 8.00 GB
I0000 00:00:1744820865.260565 21025700 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1744820865.260604 21025700 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, n

✅ Tensor 준비 완료: (378562, 128) (378562, 128) (378562, 128)


In [4]:
# Show shape and dtype of each encoded tensor, e.g. (378562, 128) and tf.int32
for encoded_tensor in (X, A, y):
    print(encoded_tensor.shape, encoded_tensor.dtype)

(378562, 128) <dtype: 'int32'>
(378562, 128) <dtype: 'int32'>
(378562, 128) <dtype: 'int32'>


In [5]:
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
import tensorflow as tf

# NOTE: the pad token is intentionally NOT reassigned to EOS here. X and A were
# already built with the tokenizer's '<pad>' token, so changing it at this point
# would only make the saved tokenizer disagree with the training data.

# 6. Compile the model
# No `loss=` is passed: Hugging Face TF models compute their own causal-LM loss
# when `labels` are supplied, shifting logits/labels so that position t predicts
# token t+1. An external Keras loss would compare the unshifted logits with the
# input ids and train the model to copy its input instead.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer)

# Exclude padding from the loss: the built-in loss ignores label value -100.
labels = tf.where(tf.cast(A, tf.bool), y, tf.constant(-100, dtype=y.dtype))

# 7. Train
model.fit(
    {"input_ids": X, "attention_mask": A, "labels": labels},
    batch_size=4,
    epochs=3
)

# 8. Save the fine-tuned model and its tokenizer side by side
model.save_pretrained("./kogpt2-finetuned-final")
tokenizer.save_pretrained("./kogpt2-finetuned-final")



Epoch 1/3


2025-04-17 01:34:04.685811: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
E0000 00:00:1744821245.252467 21025700 meta_optimizer.cc:967] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Adam/AssignAddVariableOp.


Epoch 2/3
Epoch 3/3


('./kogpt2-finetuned-final/tokenizer_config.json',
 './kogpt2-finetuned-final/special_tokens_map.json',
 './kogpt2-finetuned-final/tokenizer.json')

In [8]:
from transformers import TFGPT2LMHeadModel, PreTrainedTokenizerFast
import tensorflow as tf

# 1. Load the tokenizer and model
# NOTE(review): this loads the base "skt/kogpt2-base-v2" checkpoint, not the
# "./kogpt2-finetuned-final" directory saved earlier — confirm this is intended.
base_checkpoint = "skt/kogpt2-base-v2"
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    base_checkpoint,
    bos_token='</s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
)
model = TFGPT2LMHeadModel.from_pretrained(base_checkpoint, from_pt=True)

# 2. Prompt text
text = "오늘은 너무 팔이 아파서"
input_ids = tokenizer.encode(text, return_tensors="tf")
# Single unpadded prompt, so every position is attended to.
attention_mask = tf.ones_like(input_ids)

# 3. Generate a continuation
generation_options = dict(
    max_length=64,
    repetition_penalty=2.0,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
    use_cache=True,
)
gen_ids = model.generate(
    input_ids,
    attention_mask=attention_mask,
    **generation_options,
)

# 4. Decode the first generated sequence, dropping special tokens
first_sequence = gen_ids[0]
generated = tokenizer.decode(first_sequence, skip_special_tokens=True)

# 5. Drop the last line when it does not end with a period (.)
def remove_unfinished_sentence(text):
    """Remove a trailing line that is not terminated by a period.

    The text is stripped and split on newlines; if the final line (ignoring
    surrounding whitespace) does not end with '.', that line is discarded.

    Args:
        text: The generated text, one sentence per line.

    Returns:
        The remaining lines joined with newlines; "" if nothing is left.
    """
    lines = text.strip().split('\n')
    last_line_complete = lines[-1].strip().endswith('.')
    kept_lines = lines if last_line_complete else lines[:-1]
    return '\n'.join(kept_lines)

# Post-process the decoded text with remove_unfinished_sentence.
cleaned_output = remove_unfinished_sentence(generated)

# 6. Print the result
print("🧾 생성 결과:\n", cleaned_output)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2LMHeadModel: ['transformer.h.4.attn.masked_bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.8.attn.masked_bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.0.attn.masked_bias', 'lm_head.weight', 'transformer.h.5.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.6.attn.masked_bias', 'transformer.h.9.attn.masked_bias', 'transformer.h.1.attn.masked_bias']
- This IS expected if you are initializing TFGPT2LMHeadModel from a PyTorch model trained on another task or with another architecture (e.g. initial

🧾 생성 결과:
 오늘은 너무 팔이 아파서 병원에 갔더니 그게 안 되더라고요.
그래 가지고 제가 그때는 그냥 저한테 이렇게 얘기했었어요.
그랬는데 이제 이거 어떻게 해야 될지 모르겠다.
