# 이 노트북에서의 특성 변환

이 노트북에서는 원시 텍스트를 토큰화하여 허깅페이스 훈련 스크립트에서 사용할 수 있는 입력으로 변환합니다. 이 과정에서 토큰화와 프롬프트 생성 단계를 훈련 과정과 분리하는 것이 중요합니다. 이를 통해 각 단계에 가장 적합한 컴퓨팅 자원을 사용할 수 있습니다. 예를 들어 저렴한 CPU는 데이터 준비 단계에 가장 적합하고 고성능 GPU는 모델 훈련에 가장 효율적입니다.

## 준비사항
아래 실습은 AWS ml.m5.2xlarge 인스턴스에서 수행했습니다.

## 커널 및 필요한 종속성 설정하기

In [2]:
# Install pinned versions of the libraries this notebook relies on into the
# active kernel environment. `-U` upgrades packages that are already present
# and `--disable-pip-version-check` suppresses pip's own new-version notice.
# As pip reports after the install, the kernel may need a restart before the
# updated packages are picked up.
%pip install -U --disable-pip-version-check \
    torch==2.3.1 \
    transformers==4.44.0 \
    datasets==2.21.0 \
    accelerate==0.33.0 \
    evaluate==0.4.2 \
    py7zr==0.22.0 \
    sentencepiece==0.2.0 \
    rouge-score==0.1.2 \
    loralib==0.1.2 \
    peft==0.12.0 \
    trl==0.9.6

Collecting torch==2.3.1
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.3.1+cu121 requires torch==2.3.1+cu121, but you have torch 2.3.1 which is incompatible.
torchvision 0.18.1+cu121 requires torch==2.3.1+cu121, but you have torch 2.3.1 which is incompatible.



  Downloading torch-2.3.1-cp311-cp311-win_amd64.whl.metadata (26 kB)
Downloading torch-2.3.1-cp311-cp311-win_amd64.whl (159.8 MB)
   ---------------------------------------- 0.0/159.8 MB ? eta -:--:--
   ---------------------------------------- 0.1/159.8 MB 1.5 MB/s eta 0:01:45
   ---------------------------------------- 0.2/159.8 MB 2.3 MB/s eta 0:01:10
   ---------------------------------------- 0.4/159.8 MB 2.5 MB/s eta 0:01:03
   ---------------------------------------- 0.5/159.8 MB 2.8 MB/s eta 0:00:57
   ---------------------------------------- 0.7/159.8 MB 2.9 MB/s eta 0:00:55
   ---------------------------------------- 0.9/159.8 MB 3.1 MB/s eta 0:00:51
   ---------------------------------------- 1.3/159.8 MB 4.0 MB/s eta 0:00:40
   ---------------------------------------- 1.8/159.8 MB 4.8 MB/s eta 0:00:33
    --------------------------------------- 2.2/159.8 MB 5.2 MB/s eta 0:00:31
    --------------------------------------- 2.6/159.8 MB 5.7 MB/s eta 0:00:28
    --------------

In [3]:
from transformers import AutoTokenizer
from datasets import load_dataset, DatasetDict
import os
import time

## 기본 데이터 세트가 다운로드되었는지 확인하기

In [4]:
from datasets import concatenate_datasets

# Download the dialogsum dataset and merge its train/test/validation splits
# into a single dataset; the notebook re-splits the data itself later on.
dataset = load_dataset("knkarthick/dialogsum")
dataset = concatenate_datasets([dataset['train'], dataset['test'], dataset['validation']])

# Create the output directory from Python instead of `!mkdir`: this works the
# same way on every OS and does not emit an error when the directory already
# exists (for example when this cell is executed a second time).
os.makedirs('data-summarization', exist_ok=True)

# Split the merged data in half with a fixed seed and store each half as its
# own CSV file under ./data-summarization.
dataset = dataset.train_test_split(0.5, seed=1234)
dataset['test'].to_csv('./data-summarization/dialogsum-1.csv', index=False)
dataset['train'].to_csv('./data-summarization/dialogsum-2.csv', index=False)

Downloading readme:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12460 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

6579654

## 토크나이저 및 허깅페이스 데이터 세트 적재

In [5]:
# Hugging Face model identifier whose tokenizer is used throughout the notebook.
model_checkpoint = "google/flan-t5-base"

In [6]:
# Fetch the tokenizer that belongs to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
)

# Load the CSV files written to the local folder as one Hugging Face dataset.
dataset = load_dataset(path='./data-summarization/')

# Leave the dataset as the last expression so the notebook displays it.
dataset

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 14460
    })
})

## 예제 프롬프트 탐색하기

In [8]:
# Pick one record and pull out its dialogue and human-written summary.
idx = 0
sample = dataset['train'][idx]
diag, baseline_human_summary = sample['dialogue'], sample['summary']

# Wrap the dialogue in the summarization instruction and tokenize the result
# into PyTorch tensors.
prompt = f'Summarize the following conversation.\n\n{diag}\n\nSummary:'
encoded_prompt = tokenizer(prompt, return_tensors="pt")
input_ids = encoded_prompt["input_ids"]

# Show the assembled prompt between two divider lines, then the reference summary.
divider = '--------------------------'
print(f'프롬프트:\n{divider}\n{prompt}\n{divider}')
print(f'기본 인간 요약 : {baseline_human_summary}')

프롬프트:
--------------------------
Summarize the following conversation.

#Person1#: If we employ you, what starting salary would you expect?
#Person2#: I'd like to start at 3000 yuan a month.
#Person1#: I think your background and experience are worth the compensation.
#Person2#: Does it include bonuses?
#Person1#: No, there are annual bonuses, one week paid vacation a year, and health insurance.
#Person2#: Very good.

Summary:
--------------------------
기본 인간 요약 : #Person1# agrees #Person2#'s starting monthly salary would be 3000 yuan and tells #Person2# about other benefits.


## 데이터 세트 토큰화하기

In [9]:
def tokenize_function(example):
    """Add tokenized prompt and summary columns to a batch of examples.

    Every dialogue in ``example["dialogue"]`` is wrapped in the summarization
    instruction and tokenized into ``input_ids``; the entries of
    ``example["summary"]`` are tokenized into ``labels``. The notebook-level
    ``tokenizer`` is used for both. The batch is modified in place and
    returned.
    """
    instruction = 'Summarize the following conversation.\n\n'
    answer_cue = '\n\nSummary: '
    # Both tokenizer calls share the same padding/truncation settings.
    encode_opts = dict(padding="max_length", truncation=True, return_tensors="pt")

    prompts = []
    for dialogue in example["dialogue"]:
        prompts.append(instruction + dialogue + answer_cue)

    example['input_ids'] = tokenizer(prompts, **encode_opts).input_ids
    example['labels'] = tokenizer(example["summary"], **encode_opts).input_ids
    return example

# Tokenize the whole dataset batch-wise, then drop the raw columns so only the
# token-id columns produced by tokenize_function remain.
raw_columns = ['id', 'topic', 'dialogue', 'summary']
tokenized_datasets = dataset.map(tokenize_function, batched=True).remove_columns(raw_columns)

Map:   0%|          | 0/14460 [00:00<?, ? examples/s]

## 전처리 과정을 반복 가능한 함수로 만들기

In [10]:
def tokenize_function(example, tok=None):
    """Add tokenized prompt and summary columns to a batch of examples.

    Args:
        example: Batch dict as handed over by ``map(..., batched=True)``; it
            must provide the ``"dialogue"`` and ``"summary"`` lists.
        tok: Tokenizer callable to use. When omitted, the notebook-level
            ``tokenizer`` is used, so existing one-argument calls keep working.

    Returns:
        The same batch, with ``input_ids`` (tokenized prompts) and ``labels``
        (tokenized summaries) added.
    """
    if tok is None:
        # Backward-compatible fallback to the tokenizer defined at notebook level.
        tok = tokenizer
    prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    inp = [prompt + i + end_prompt for i in example["dialogue"]]
    example['input_ids'] = tok(inp, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tok(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    return example


def transform_dataset(input_data,
                      output_data,
                      huggingface_model_name,
                      train_split_percentage,
                      test_split_percentage,
                      validation_split_percentage,
                      ):
    """Load, split, tokenize and store a summarization dataset as parquet.

    Args:
        input_data: Path handed to ``load_dataset``; its ``'train'`` split is
            the data that gets re-split here.
        output_data: Directory that receives the ``train/``, ``test/`` and
            ``validation/`` sub-directories with one parquet file each.
        huggingface_model_name: Model identifier whose tokenizer is loaded and
            used for tokenization.
        train_split_percentage: Fraction of the rows kept for training.
        test_split_percentage: Relative weight of the test split within the
            held-out rows.
        validation_split_percentage: Relative weight of the validation split
            within the held-out rows.

    Raises:
        ValueError: If the test and validation percentages do not sum to a
            positive number, because the held-out rows could not be divided.
    """
    holdout_weight = test_split_percentage + validation_split_percentage
    if holdout_weight <= 0:
        # Fail early with a clear message instead of a bare ZeroDivisionError
        # in the middle of the pipeline.
        raise ValueError(
            'test_split_percentage와 validation_split_percentage의 합은 0보다 커야 합니다.'
        )

    # Load the raw dataset.
    dataset = load_dataset(input_data)
    print(f'데이터 세트를 다음 경로에서 불러왔습니다: {input_data}\n{dataset}')

    # Load the tokenizer that belongs to the requested model.
    print(f'모델 {huggingface_model_name}의 토크나이저를 불러오는 중...')
    tokenizer = AutoTokenizer.from_pretrained(huggingface_model_name)

    # First carve off the held-out rows, then divide them into test and
    # validation. The fixed seed keeps the splits reproducible.
    train_testvalid = dataset['train'].train_test_split(1 - train_split_percentage, seed=1234)
    test_valid = train_testvalid['test'].train_test_split(test_split_percentage / holdout_weight, seed=1234)
    train_test_valid_dataset = DatasetDict(
        {
            'train': train_testvalid['train'],
            'test': test_valid['test'],
            'validation': test_valid['train']
        }
    )
    print(f'데이터 세트 분할 후:\n{train_test_valid_dataset}')

    # Tokenize with the tokenizer loaded above. Passing it explicitly makes
    # sure `huggingface_model_name` is honoured rather than whatever tokenizer
    # happens to exist at notebook level.
    print('데이터 세트 토큰화하는 중...')
    tokenized_datasets = train_test_valid_dataset.map(
        tokenize_function,
        batched=True,
        fn_kwargs={'tok': tokenizer},
    )
    tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary'])
    print('토큰화 완료!')

    # A millisecond timestamp serves as the file name of this run's output.
    file_root = str(int(time.time()*1000))

    # Write each split into its own sub-directory. The directory and the file
    # path are built from the same joined path, so relative and absolute
    # `output_data` values both end up in the directory that was created.
    print(f'데이터 세트 {output_data}에 저장하는 중...')
    for split_name in ('train', 'test', 'validation'):
        split_dir = os.path.join(output_data, split_name)
        os.makedirs(split_dir, exist_ok=True)
        tokenized_datasets[split_name].to_parquet(os.path.join(split_dir, f'{file_root}.parquet'))
    print('전처리 완료!')

In [11]:
def process(args):
    """Run the dataset transformation described by ``args``.

    Prints the entries of ``args.input_data``, calls ``transform_dataset`` with
    the settings stored on ``args`` and finally prints the entries of
    ``args.output_data``.
    """

    def show_listing(path):
        # Print a header line followed by one directory entry per line.
        print(f"내용 목록 - {path}")
        for entry in os.listdir(path):
            print(entry)

    show_listing(args.input_data)

    transform_dataset(
        input_data=args.input_data,  # e.g. './data-summarization/'
        output_data=args.output_data,  # e.g. './data-summarization-processed/'
        huggingface_model_name=args.model_checkpoint,
        train_split_percentage=args.train_split_percentage,  # e.g. 0.90
        test_split_percentage=args.test_split_percentage,  # e.g. 0.05
        validation_split_percentage=args.validation_split_percentage,  # e.g. 0.05
    )

    show_listing(args.output_data)

# 데이터 세트 로컬에서 처리하기

In [12]:
class Args:
    """Bare attribute holder for the settings that ``process`` reads.

    Only annotations are declared here; the values are assigned on an
    instance after it has been created.
    """
    # Path the raw dataset is loaded from.
    input_data: str
    # Directory the processed splits are written to.
    output_data: str
    # Model identifier used to load the tokenizer.
    model_checkpoint: str
    # Fraction of the data used for training (0.9 in this notebook).
    train_split_percentage: float
    # Share assigned to the test split (0.05 in this notebook).
    test_split_percentage: float
    # Share assigned to the validation split (0.05 in this notebook).
    validation_split_percentage: float

import shutil

# Collect the settings for a local run on a fresh Args instance.
args = Args()
args.input_data = './data-summarization'
args.output_data = './data-summarization-processed'
args.model_checkpoint = model_checkpoint
args.train_split_percentage = 0.9
args.validation_split_percentage = 0.05
args.test_split_percentage = 0.05

# Remove output left over from an earlier run so only files written by this
# run are present afterwards.
stale_output_exists = os.path.isdir(args.output_data)
if stale_output_exists:
    shutil.rmtree(args.output_data)

process(args)

내용 목록 - ./data-summarization
dialogsum-1.csv
dialogsum-2.csv
데이터 세트를 다음 경로에서 불러왔습니다: ./data-summarization
DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 14460
    })
})
모델 google/flan-t5-base의 토크나이저를 불러오는 중...
데이터 세트 분할 후:
DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 13014
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 723
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 723
    })
})
데이터 세트 토큰화하는 중...




Map:   0%|          | 0/13014 [00:00<?, ? examples/s]

Map:   0%|          | 0/723 [00:00<?, ? examples/s]

Map:   0%|          | 0/723 [00:00<?, ? examples/s]

토큰화 완료!
데이터 세트 ./data-summarization-processed에 저장하는 중...


Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

전처리 완료!
내용 목록 - ./data-summarization-processed
test
train
validation


## 데이터 세트를 올바르게 불러왔는지 확인하기

In [13]:
# Map every split name to the parquet files stored in its sub-directory.
split_files = {name: f'{name}/*.parquet' for name in ('train', 'test', 'validation')}

# Reload the processed data to confirm that all three splits can be read back.
dataset = load_dataset('./data-summarization-processed/', data_files=split_files)
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 13014
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 723
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 723
    })
})