<a href="https://colab.research.google.com/github/yunyoungwoo/2024S-Ajou-ML-FP/blob/main/data_preprocessing0524.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import shutil
import random
import json
from PIL import Image
import pandas as pd
import torch
import torchvision.transforms as transforms
from torchvision import models
import pickle

In [2]:
# Colab 환경에 드라이브 마운트
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 이미지와 라벨을 처리하는 부분
def process(image_source_dir, image_dest_dir, label_source_dir, label_dest_dir, num_images):
    # 중복 없는 랜덤 샘플링
    image_files = random.sample(os.listdir(image_source_dir), num_images)

    # 이미지 파일 옮기기
    for file in image_files:
        shutil.move(os.path.join(image_source_dir, file), os.path.join(image_dest_dir, file))

    # 해당 이미지에 대응하는 라벨 이동
    label_files = [file.replace('.jpg', '.json') for file in image_files]
    for label_file in label_files:
        shutil.move(os.path.join(label_source_dir, label_file), os.path.join(label_dest_dir, label_file))

In [None]:
# GPU 사용을 위해 device 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 이미지 resize하는 함수
def resize_images(dir, size=(224, 224)):
    # 이미지 전처리를 위한 transforms 정의
    preprocess = transforms.Compose([
        transforms.Resize(size),
        transforms.ToTensor(),
    ])
    for filename in os.listdir(dir):
        # 이미지 불러오기 및 GPU에 전달
        image = Image.open(os.path.join(dir, filename)).convert('RGB')
        image_tensor = preprocess(image).unsqueeze(0).to(device)

        # 이미지 저장
        output_image = transforms.ToPILImage()(image_tensor.squeeze(0).cpu())
        output_image.save(os.path.join(dir, filename))

In [None]:
# 데이터프레임 생성 및 one-hot encoding 수행
def create_dataframe(image_dir, label_dir, pickle_path):
    data = []
    # 문제가 발생한 파일 리스트
    problematic_files = []

    for label in os.listdir(label_dir):
        try:
            with open(os.path.join(label_dir, label), 'r') as f:
                label_data = json.load(f)
                # JSON 파일에서 value_6 열 추출
                value_6 = label_data['value_6']
        except Exception as e:
            print(f"Error reading JSON file: {os.path.join(label_dir, label)}")
            problematic_files.append(label)
            continue

        # 이미지 경로와 value_6 열을 데이터 리스트에 추가
        image_path = os.path.join(image_dir, label.replace('.json', '.jpg'))
        data.append([image_path, value_6])

    # DataFrame 생성
    df = pd.DataFrame(data, columns=['image_path', 'value_6'])

    # One-hot encoding 수행
    one_hot_encoded = pd.get_dummies(df['value_6'], prefix='class')
    df = df.drop('value_6', axis=1)
    df = pd.concat([df, one_hot_encoded], axis=1)

    # DataFrame 저장
    with open('/content/drive/MyDrive/Colab Notebooks/FP' + pickle_path, 'wb') as f:
        pickle.dump(df, f)

    # 문제가 발생한 파일들 삭제
    for file in problematic_files:
        image_path = os.path.join(image_dir, file.replace('.json', '.jpg'))
        json_path = os.path.join(label_dir, file)
        if os.path.exists(image_path):
            os.remove(image_path)
        if os.path.exists(json_path):
            os.remove(json_path)

In [5]:
# 데이터 증강을 수행하는 함수
def augment_images(original_df, image_dir, num_augmented_images_per_class):
    # 데이터 증강 설정
    transform = transforms.Compose([
        transforms.RandomRotation(20),
        transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(),
        transforms.ToTensor()
    ])

    augmented_images = []

    # 클래스 별로 증강된 이미지 수를 유지하기 위해 각 클래스별로 랜덤하게 선택
    for class_label in ['class_0', 'class_1', 'class_2', 'class_3']:
        selected_df = original_df[original_df[class_label] == 1].sample(n=num_augmented_images_per_class, random_state=42)

        for index, row in selected_df.iterrows():
            img_path = row['image_path']
            img = Image.open(img_path).convert('RGB')
            img_tensor = transform(img).unsqueeze(0).to(device)

            # 증강된 이미지를 저장
            augmented_img = transforms.ToPILImage()(img_tensor.squeeze(0).cpu())
            save_path = os.path.join(image_dir, f'aug_{index}.jpg')
            augmented_img.save(save_path)

            # 원본 이미지 정보와 동일한 클래스 정보를 유지
            augmented_image_info = {'image_path': save_path}
            for col in ['class_0', 'class_1', 'class_2', 'class_3']:
                augmented_image_info[col] = row[col]

            augmented_images.append(augmented_image_info)

    # 증강된 이미지 데이터프레임 생성
    augmented_df = pd.DataFrame(augmented_images)

    # 증강된 데이터프레임을 원본 데이터프레임에 추가
    combined_df = pd.concat([original_df, augmented_df], ignore_index=True)

    return combined_df

In [None]:
# Training 폴더 경로 설정
training_image_dir0 = '/content/drive/MyDrive/Colab Notebooks/FP/Training/class0_image'
training_image_dir1 = '/content/drive/MyDrive/Colab Notebooks/FP/Training/class1_image'
training_image_dir2 = '/content/drive/MyDrive/Colab Notebooks/FP/Training/class2_image'
training_image_dir3 = '/content/drive/MyDrive/Colab Notebooks/FP/Training/class3_image'
training_label_dir = '/content/drive/MyDrive/Colab Notebooks/FP/Training/class_label'

training_target_image_dir = '/content/drive/MyDrive/Colab Notebooks/FP/Training/image'
training_target_label_dir = '/content/drive/MyDrive/Colab Notebooks/FP/Training/label'

In [None]:
# Validation 폴더 경로 설정
validation_image_dir0 = '/content/drive/MyDrive/Colab Notebooks/FP/Validation/class0_image'
validation_image_dir1 = '/content/drive/MyDrive/Colab Notebooks/FP/Validation/class1_image'
validation_image_dir2 = '/content/drive/MyDrive/Colab Notebooks/FP/Validation/class2_image'
validation_image_dir3 = '/content/drive/MyDrive/Colab Notebooks/FP/Validation/class3_image'
validation_label_dir = '/content/drive/MyDrive/Colab Notebooks/FP/Validation/class_label'

validation_target_image_dir = '/content/drive/MyDrive/Colab Notebooks/FP/Validation/image'
validation_target_label_dir = '/content/drive/MyDrive/Colab Notebooks/FP/Validation/label'

In [None]:
# Test 폴더 경로 설정
test_image_dir0 = '/content/drive/MyDrive/Colab Notebooks/FP/Validation/class0_image'
test_image_dir1 = '/content/drive/MyDrive/Colab Notebooks/FP/Validation/class1_image'
test_image_dir2 = '/content/drive/MyDrive/Colab Notebooks/FP/Validation/class2_image'
test_image_dir3 = '/content/drive/MyDrive/Colab Notebooks/FP/Validation/class3_image'
test_label_dir = '/content/drive/MyDrive/Colab Notebooks/FP/Validation/class_label'

test_target_image_dir = '/content/drive/MyDrive/Colab Notebooks/FP/Test/image'
test_target_label_dir = '/content/drive/MyDrive/Colab Notebooks/FP/Test/label'

In [None]:
# training 이미지 및 라벨 처리
process(training_image_dir0, training_target_image_dir, training_label_dir, training_target_label_dir, 500)
process(training_image_dir1, training_target_image_dir, training_label_dir, training_target_label_dir, 500)
process(training_image_dir2, training_target_image_dir, training_label_dir, training_target_label_dir, 500)
process(training_image_dir3, training_target_image_dir, training_label_dir, training_target_label_dir, 500)

In [None]:
# Validation 이미지 및 라벨 처리
process(validation_image_dir0, validation_target_image_dir, validation_label_dir, validation_target_label_dir, 75)
process(validation_image_dir1, validation_target_image_dir, validation_label_dir, validation_target_label_dir, 75)
process(validation_image_dir2, validation_target_image_dir, validation_label_dir, validation_target_label_dir, 75)
process(validation_image_dir3, validation_target_image_dir, validation_label_dir, validation_target_label_dir, 75)

In [None]:
# Validation 이미지 및 라벨 처리
process(test_image_dir0, test_target_image_dir, test_label_dir, test_target_label_dir, 75)
process(test_image_dir1, test_target_image_dir, test_label_dir, test_target_label_dir, 75)
process(test_image_dir2, test_target_image_dir, test_label_dir, test_target_label_dir, 75)
process(test_image_dir3, test_target_image_dir, test_label_dir, test_target_label_dir, 75)

In [None]:
# 이미지 resize
resize_images(training_target_image_dir)
resize_images(validation_target_image_dir)
resize_images(test_target_image_dir)

In [None]:
# 데이터프레임 생성 및 저장
create_dataframe(training_target_image_dir, training_target_label_dir, 'training_df.pkl')
create_dataframe(validation_target_image_dir, validation_target_label_dir, 'validation_df.pkl')
create_dataframe(test_target_image_dir, test_target_label_dir, 'test_df.pkl')

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 저장된 데이터프레임 불러오기
with open('/content/drive/MyDrive/Colab Notebooks/FP/training_df.pkl', 'rb') as f:
    training_df = pickle.load(f)

augmented_df = augment_images(training_df, '/content/drive/MyDrive/Colab Notebooks/FP/Training/image', 500)

# 증강된 데이터프레임 저장
with open('/content/drive/MyDrive/Colab Notebooks/FP/augmented1_training.pkl', 'wb') as f:
    pickle.dump(augmented_df, f)

print(f'총 {len(augmented_df)}장의 증강된 이미지를 생성했습니다.')

총 4000장의 증강된 이미지를 생성했습니다.


In [None]:
# 저장된 데이터프레임 불러오기
with open('/content/drive/MyDrive/Colab Notebooks/FP/augmented_training.pkl', 'rb') as f:
    training_df = pickle.load(f)

In [None]:
# 클래스 별 샘플 수 확인
class_counts = training_df.iloc[:, 1:].sum(axis=0)  # 이미지 경로를 제외하고 원핫 인코딩된 클래스 열들만 선택하여 각 클래스 별 합을 계산
print(class_counts)
training_df.head()

class_0    575
class_1    575
class_2    575
class_3    575
dtype: int64


Unnamed: 0,image_path,class_0,class_1,class_2,class_3
0,/content/drive/MyDrive/Colab Notebooks/FP/Trai...,False,False,False,True
1,/content/drive/MyDrive/Colab Notebooks/FP/Trai...,False,False,False,True
2,/content/drive/MyDrive/Colab Notebooks/FP/Trai...,False,False,False,True
3,/content/drive/MyDrive/Colab Notebooks/FP/Trai...,False,False,False,True
4,/content/drive/MyDrive/Colab Notebooks/FP/Trai...,False,False,False,True
