In [4]:
!pip install transformers
!pip install transformers[torch]
!pip install transformers[torch] accelerate -U datasets torch

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [6]:
import json
from datasets import Dataset
from transformers import GPT2Tokenizer

# JSON 파일 로드
with open('/content/KAKAO_1208_20.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Q&A 형식으로 데이터 변환
dialogs = []
for item in data['info']:
    lines = item['annotations']['lines']
    for i in range(len(lines) - 1):
        input_text = lines[i]['text']
        output_text = lines[i + 1]['text']
        dialogs.append({"input": input_text, "output": output_text})

# 데이터셋 생성
dataset = Dataset.from_dict({"input": [d['input'] for d in dialogs], "output": [d['output'] for d in dialogs]})

# 토크나이저 로드
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# 데이터 토크나이징
def tokenize_function(examples):
    return tokenizer(examples['input'], padding='max_length', truncation=True, max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

In [7]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel,   Trainer, TrainingArguments, DataCollatorForLanguageModeling

# 토크나이저 로드 및 패딩 토큰 설정
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # 패딩 토큰을 별도로 설정
tokenizer.add_special_tokens({'pad_token': '[PAD]'})  # 패딩 토큰 추가

# 모델 로드
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))  # 모델에 새로운 토큰 추가

# 데이터 로드 및 전처리
import json
from datasets import Dataset

with open('/content/KAKAO_1208_20.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

dialogs = []
for item in data['info']:
    lines = item['annotations']['lines']
    for i in range(len(lines) - 1):
        input_text = lines[i]['text']
        output_text = lines[i + 1]['text']
        dialogs.append({"input": input_text, "output": output_text})

dataset = Dataset.from_dict({"input": [d['input'] for d in dialogs], "output": [d['output'] for d in dialogs]})

# 데이터 토크나이징 및 형식 조정
def tokenize_function(examples):
    input_encoding = tokenizer(examples['input'], padding='max_length', truncation=True, max_length=128)
    output_encoding = tokenizer(examples['output'], padding='max_length', truncation=True, max_length=128)
    labels = output_encoding['input_ids']
    return {
        'input_ids': input_encoding['input_ids'],
        'attention_mask': input_encoding['attention_mask'],
        'labels': labels
    }

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 데이터 콜레이터 설정
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Masked Language Modeling 사용 안 함
)

# 트레이닝 인자 설정
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=200,
)

# 트레이너 설정
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

# 모델 학습
trainer.train()

# 모델 저장
model.save_pretrained("./fine_tuned_gpt2")
tokenizer.save_pretrained("./fine_tuned_gpt2")

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Step,Training Loss


('./fine_tuned_gpt2/tokenizer_config.json',
 './fine_tuned_gpt2/special_tokens_map.json',
 './fine_tuned_gpt2/vocab.json',
 './fine_tuned_gpt2/merges.txt',
 './fine_tuned_gpt2/added_tokens.json')

In [9]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# 파인튜닝한 모델 로드
model = GPT2LMHeadModel.from_pretrained('./fine_tuned_gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_gpt2')

# 챗봇 응답 생성 함수
def generate_response(prompt):
    inputs = tokenizer.encode(prompt, return_tensors='pt', padding=True, truncation=True)
    outputs = model.generate(inputs, max_length=50, num_return_sequences=1, pad_token_id=tokenizer.pad_token_id)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# 예시 대화
user_input = "추천해줄만한 액션 영화 있어?"
response = generate_response(user_input)
print(response)


추천해줄만한 액션 영화 있어? 가지아 가�
