## 환경 세팅

dataset: https://www.kaggle.com/datasets/birdy654/cifake-real-and-ai-generated-synthetic-images

참조 (CIFAKE): https://www.youtube.com/watch?v=yf3rcJQG2Bg

In [None]:
import os
import shutil
from google.colab import userdata
from IPython.display import clear_output

# 1. Kaggle 인증 설정
os.environ['KAGGLE_USERNAME'] = userdata.get('auditAPI')
os.environ['KAGGLE_KEY'] = userdata.get('auditAPI_KEY')

# 2. 초기화
data_dir = '/content/cifake'
zip_file = 'cifake-real-and-ai-generated-synthetic-images.zip'

# 기존 압축 파일이 있다면 삭제
if os.path.exists(zip_file):
    os.remove(zip_file)
    print(f"Deleted existing zip file: {zip_file}")

# 기존 데이터 폴더가 있다면 전체 삭제 (초기화)
if os.path.exists(data_dir):
    shutil.rmtree(data_dir)
    print(f"Deleted existing directory: {data_dir}")

# 3. 데이터셋 다운로드 및 압축 해제
print("Downloading dataset...")
!pip install -q Kaggle
!kaggle datasets download -d birdy654/cifake-real-and-ai-generated-synthetic-images

print("Unzipping dataset...")
!unzip -q {zip_file} -d {data_dir}

# 4. 결과 정리
clear_output()
print('Kaggle Authentication and Dataset Setup Successful.')
print(f'Data is ready at: {data_dir}')

## Data Extraction and Cleanup
train set에서 REAL, FAKE 클래스별로 10000장씩 무작위로 옮겨 validation set을 구성


In [None]:
import os
import random
import shutil

DATA_ROOT = '/content/cifake'
train_path = os.path.join(DATA_ROOT, 'train')
validation_path = os.path.join(DATA_ROOT, 'validation')
test_path = os.path.join(DATA_ROOT, 'test')

VALIDATION_SAMPLE_PER_CLASS = 10000
# Fixed seed so the same images land in the validation set on every fresh run.
SPLIT_SEED = 42


def _move_to_validation(source_dir, dest_dir, target_count, seed=SPLIT_SEED):
    """Move randomly chosen files from source_dir until dest_dir holds target_count.

    Files already present in dest_dir count toward the target, so re-running
    the cell does not drain additional images from the training set.

    Args:
        source_dir: Directory to take files from.
        dest_dir: Directory to move files into; created if missing.
        target_count: Desired number of files in dest_dir.
        seed: Seed for the private random generator used for the shuffle.

    Returns:
        The list of file names that were moved (empty if nothing was needed).
    """
    os.makedirs(dest_dir, exist_ok=True)

    missing = target_count - len(os.listdir(dest_dir))
    if missing <= 0:
        return []

    # os.listdir returns entries in arbitrary order; sorting first makes the
    # seeded shuffle deterministic across machines and runs.
    candidates = sorted(os.listdir(source_dir))
    random.Random(seed).shuffle(candidates)

    chosen = candidates[:missing]
    for name in chosen:
        shutil.move(
            os.path.join(source_dir, name),
            os.path.join(dest_dir, name)
        )
    return chosen


# REAL
real_train_source = os.path.join(train_path, 'REAL')
real_validation_dest = os.path.join(validation_path, 'REAL')
images_to_move_real = _move_to_validation(
    real_train_source, real_validation_dest, VALIDATION_SAMPLE_PER_CLASS
)
print('Finish Validation set ( Real )')

# FAKE
fake_train_source = os.path.join(train_path, 'FAKE')
fake_validation_dest = os.path.join(validation_path, 'FAKE')
images_to_move_fake = _move_to_validation(
    fake_train_source, fake_validation_dest, VALIDATION_SAMPLE_PER_CLASS
)
print('Finish Validation set ( Fake )')

# Re-count files in every split to confirm the result
new_train_real_count = len(os.listdir(real_train_source))
new_train_fake_count = len(os.listdir(fake_train_source))
var_real_count = len(os.listdir(real_validation_dest))
var_fake_count = len(os.listdir(fake_validation_dest))

print('\nChecking...\n')
test_real_count = len(os.listdir(os.path.join(test_path, 'REAL')))
test_fake_count = len(os.listdir(os.path.join(test_path, "FAKE")))

print(f'Train (Real): {new_train_real_count}')
print(f'Train (Fake): {new_train_fake_count}')
print(f'\nValidation (Real): {var_real_count}')
print(f'Validation (Fake): {var_fake_count}')
print(f'\nTest (Real): {test_real_count}')
print(f'Test (Fake): {test_fake_count}')