In [3]:
import os
import pandas as pd
from datasets import Dataset, DatasetDict, load_from_disk

# Re-split an on-disk DatasetDict: pool its "train" and "validation" splits,
# shuffle the pooled rows with a fixed seed, cut them into new train/validation
# splits, and save the result as a new DatasetDict.
#
# NOTE(review): the input is named "augmented_dataset2". If it contains
# augmented variants of original examples, pooling and re-splitting can place
# near-duplicates of training rows in the validation split — confirm that this
# is acceptable for the evaluation being run.

# Resolve the input location: <parent of the working directory>/data/augmented_dataset2.
# This relies on the kernel's working directory at the time the cell runs.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
augmented_data_path = os.path.join(parent_dir, 'data', 'augmented_dataset2')

# Load the previously saved DatasetDict from disk.
dataset = load_from_disk(augmented_data_path)

# Convert both splits to DataFrames so they can be pooled and shuffled with pandas.
train_df = pd.DataFrame(dataset["train"])
valid_df = pd.DataFrame(dataset["validation"])

# Pool train and validation into one frame with a fresh 0..n-1 index.
combined_df = pd.concat([train_df, valid_df], ignore_index=True)

# Shuffle every row; the fixed seed makes the new split reproducible.
random_seed = 42
shuffled_df = combined_df.sample(frac=1, random_state=random_seed).reset_index(drop=True)

# Fraction of the pooled rows assigned to the new train split
# (0.9 -> 90% train, 10% validation).
train_ratio = 0.9
train_size = int(len(shuffled_df) * train_ratio)

# Positional cut: the first `train_size` rows become train, the rest validation.
new_train_df = shuffled_df.iloc[:train_size]
new_valid_df = shuffled_df.iloc[train_size:]

# The two slices must partition the shuffled frame exactly.
assert len(new_train_df) + len(new_valid_df) == len(shuffled_df), \
    "train/validation split lost or duplicated rows"

# Convert back to Dataset objects. preserve_index=False guarantees the pandas
# index (which starts at `train_size` for the validation slice) is never
# written out as an extra "__index_level_0__" column.
new_train_dataset = Dataset.from_pandas(new_train_df, preserve_index=False)
new_valid_dataset = Dataset.from_pandas(new_valid_df, preserve_index=False)

# Bundle the new splits under the same split names as the input.
new_dataset_dict = DatasetDict({
    "train": new_train_dataset,
    "validation": new_valid_dataset,
})

# Write the re-split dataset next to the input, under data/shuffled_dataset.
output_dir = os.path.join(parent_dir, 'data', 'shuffled_dataset')
new_dataset_dict.save_to_disk(output_dir)

# Report where the dataset was written and the resulting split sizes.
print(f"New augmented dataset saved to {output_dir}")
print(f"Train size: {len(new_train_df)}, Validation size: {len(new_valid_df)}")


Saving the dataset (1/1 shards): 100%|██████████| 5547/5547 [00:00<00:00, 789715.36 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 617/617 [00:00<00:00, 205029.76 examples/s]

New augmented dataset saved to D:\boost-camp\07주_MRC\pj\level2-mrc-nlp-04\data\shuffled_dataset
Train size: 5547, Validation size: 617



