<a href="https://colab.research.google.com/github/yunyoungwoo/2024S-Ajou-ML-FP/blob/main/temp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.preprocessing import OneHotEncoder

# Dataset root directories
training_directory = '/content/drive/MyDrive/Colab Notebooks/dataset/Training'
validation_directory = '/content/drive/MyDrive/Colab Notebooks/dataset/Validation'

# Image and label sub-folders of each dataset root
training_image_directory = os.path.join(training_directory, 'image')
training_label_directory = os.path.join(training_directory, 'label')
validation_image_directory = os.path.join(validation_directory, 'image')
validation_label_directory = os.path.join(validation_directory, 'label')

# File-name lists for images and labels.
# os.listdir() returns entries in arbitrary order, but these lists are paired
# position-by-position further down (train_test_split here, zip() inside
# create_dataframe). Sorting makes the order deterministic, which also makes the
# seeded split below reproducible across runs.
# NOTE(review): this assumes an image and its label share the same file stem so
# that sorted order lines them up -- confirm against the dataset layout.
training_image_list = sorted(os.listdir(training_image_directory))
training_label_list = sorted(os.listdir(training_label_directory))
validation_image_list = sorted(os.listdir(validation_image_directory))
validation_label_list = sorted(os.listdir(validation_label_directory))

# Split the Validation folder in half: one half stays validation, the other
# half becomes the test set. Images and labels are shuffled together so their
# positional pairing is preserved.
val_images, test_images, val_labels, test_labels = train_test_split(
    validation_image_list, validation_label_list, test_size=0.5, random_state=42
)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/path_to_json_file.json'

In [None]:
def create_dataframe(image_dir, label_dir, image_list, label_list):
    """Pair each image path with the 'value6' field of its JSON label file.

    Args:
        image_dir: Directory that contains the image files.
        label_dir: Directory that contains the JSON label files.
        image_list: Image file names, relative to ``image_dir``.
        label_list: Label file names, relative to ``label_dir``. Paired with
            ``image_list`` by position; if the lists differ in length the
            surplus entries of the longer one are ignored (``zip`` semantics).

    Returns:
        pandas.DataFrame with one row per (image, label) pair and the columns
        'image_file_name' (joined image path) and 'value6' (the label's
        'value6' entry, or None when the key is absent). The columns are
        present even when the input lists are empty.

    Raises:
        OSError: If a label file cannot be opened.
        json.JSONDecodeError: If a label file is not valid JSON.
    """
    data = []
    for image_file, label_file in zip(image_list, label_list):
        image_path = os.path.join(image_dir, image_file)
        label_path = os.path.join(label_dir, label_file)

        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default locale encoding, so non-ASCII label text reads the same
        # on every machine.
        with open(label_path, 'r', encoding='utf-8') as f:
            label_data = json.load(f)

        # Keep only the image path and the 'value6' entry of the label.
        value6 = label_data.get('value6', None)
        data.append({'image_file_name': image_path, 'value6': value6})

    # Naming the columns keeps the schema stable for empty input as well.
    return pd.DataFrame(data, columns=['image_file_name', 'value6'])

In [None]:
# Build one DataFrame per split. Validation and test both read from the
# Validation folders; they differ only in the file names assigned to them.
training_df = create_dataframe(
    training_image_directory,
    training_label_directory,
    training_image_list,
    training_label_list,
)
validation_df = create_dataframe(
    validation_image_directory,
    validation_label_directory,
    val_images,
    val_labels,
)
test_df = create_dataframe(
    validation_image_directory,
    validation_label_directory,
    test_images,
    test_labels,
)

In [None]:
# Sanity check: print the row count of every split first,
# then the per-class counts of the 'value6' label for each split.
for length_label, frame in (
    ("Training DataFrame length:", training_df),
    ("Validation DataFrame length:", validation_df),
    ("Test DataFrame length:", test_df),
):
    print(length_label, len(frame))

for heading, frame in (
    ("\nTraining DataFrame class distribution:", training_df),
    ("\nValidation DataFrame class distribution:", validation_df),
    ("\nTest DataFrame class distribution:", test_df),
):
    print(heading)
    print(frame['value6'].value_counts())

In [None]:
# Augmentation settings, collected in one place and handed to the generator.
augmentation_options = {
    'rotation_range': 20,
    'width_shift_range': 0.2,
    'height_shift_range': 0.2,
    'shear_range': 0.2,
    'zoom_range': 0.2,
    'horizontal_flip': True,
    'fill_mode': 'nearest',
}
datagen = ImageDataGenerator(**augmentation_options)

In [None]:
# Data augmentation helper
def augment_data(df, datagen, class_id, target_count, output_dir=None):
    """Write ``target_count`` augmented images for one class and list them.

    The class-level quota is spread as evenly as possible over the class's
    source images. Every augmented image is saved to disk so that each path in
    the returned frame refers to an existing file.

    Args:
        df: DataFrame with the columns 'image_file_name' and 'value6'.
        datagen: Generator object whose ``flow(batch, batch_size=1)`` yields
            augmented batches (an ``ImageDataGenerator`` in this notebook).
        class_id: Value of 'value6' selecting the class to augment.
        target_count: Total number of augmented images to create for the class.
        output_dir: Optional directory for the augmented files; created if
            missing. When None, each file is written next to its source image
            as ``<source stem>_aug_<n>.jpg``.
            NOTE(review): with the default, augmented files land in the source
            image folder and will be picked up by a later ``os.listdir`` of
            that folder -- pass ``output_dir`` to keep them separate. With
            ``output_dir`` set, sources sharing a base name would overwrite
            each other's augmented files.

    Returns:
        pandas.DataFrame with the columns 'image_file_name' (path of a written
        augmented image) and 'value6' (``class_id``). Empty, but with both
        columns, when the class has no rows or ``target_count`` is not
        positive.
    """
    columns = ['image_file_name', 'value6']
    class_df = df[df['value6'] == class_id]
    source_paths = class_df['image_file_name'].tolist()
    if not source_paths or target_count <= 0:
        return pd.DataFrame(columns=columns)

    # Imported here, after the cheap early exit, because the image utilities
    # are only needed once there is real work to do. They replace the
    # original's `plt.imread`, which referenced a name that is never imported.
    from tensorflow.keras.preprocessing.image import img_to_array, load_img, save_img

    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)

    # Spread the class-level quota over the source images: every image gets
    # `base` copies and the first `extra` images get one more.
    base, extra = divmod(int(target_count), len(source_paths))

    augmented_images = []
    for position, image_path in enumerate(source_paths):
        quota = base + (1 if position < extra else 0)
        if quota == 0:
            # Images past the first `extra` all share the same zero quota.
            break

        # load_img decodes to RGB by default, so the array is always
        # (height, width, 3); the leading axis turns it into a batch of one.
        image = img_to_array(load_img(image_path))
        image = image.reshape((1,) + image.shape)

        stem = os.path.splitext(image_path)[0]
        if output_dir is not None:
            stem = os.path.join(output_dir, os.path.basename(stem))

        count = 0
        # flow() yields batches indefinitely; stop once the quota is reached.
        for batch in datagen.flow(image, batch_size=1):
            new_image_path = f"{stem}_aug_{count}.jpg"
            # scale=False keeps the 0-255 pixel values instead of stretching
            # them to the full range.
            save_img(new_image_path, batch[0], scale=False)
            augmented_images.append({'image_file_name': new_image_path, 'value6': class_id})
            count += 1
            if count >= quota:
                break

    return pd.DataFrame(augmented_images, columns=columns)

In [None]:
# Augment under-represented classes to reduce class imbalance.
# The per-class counts are computed once. Iterating over them (instead of over
# unique()) also skips rows whose 'value6' is missing: value_counts() drops
# those, so looking them up by label would raise a KeyError.
class_counts = training_df['value6'].value_counts()
max_class_count = class_counts.max()

# One augmented frame per class that has fewer rows than the largest class;
# the difference to the largest class is passed as the augmentation target.
augmented_data = [
    augment_data(training_df, datagen, class_id, max_class_count - class_count)
    for class_id, class_count in class_counts.items()
    if class_count < max_class_count
]

if augmented_data:
    augmented_df = pd.concat(augmented_data)
    training_df = pd.concat([training_df, augmented_df]).reset_index(drop=True)

In [None]:
# One-hot encode the 'value6' labels.
# The encoder is fitted on the training labels only and then applied to all
# three splits, so every split shares the same category order.
one_hot_encoder = OneHotEncoder()
one_hot_encoder.fit(training_df[['value6']])

for split_df in (training_df, validation_df, test_df):
    encoded = one_hot_encoder.transform(split_df[['value6']]).toarray()
    # `encoded` is 2-D (n_rows, n_classes) and cannot be assigned to a single
    # column as-is; store each row's one-hot vector as one list-valued cell.
    # Rebuild the matrix later with np.array(df['value6'].tolist()).
    split_df['value6'] = encoded.tolist()

In [None]:
# Preview the first rows of each split after the preceding processing steps.
for heading, frame in (
    ("\nTraining DataFrame after augmentation and one-hot encoding:", training_df),
    ("\nValidation DataFrame after one-hot encoding:", validation_df),
    ("\nTest DataFrame after one-hot encoding:", test_df),
):
    print(heading)
    print(frame.head())

In [None]:
import pickle

# Folder that receives the pickled DataFrames
save_path = '/content/drive/MyDrive/'

# Serialize training_df, validation_df and test_df, one pickle file each.
for file_name, frame in (
    ('training_df.pickle', training_df),
    ('validation_df.pickle', validation_df),
    ('test_df.pickle', test_df),
):
    with open(save_path + file_name, 'wb') as f:
        pickle.dump(frame, f)

In [None]:
import pickle

# Restore the previously pickled DataFrames
load_path = '/content/drive/MyDrive/'

# Read the three files in a fixed order, then bind them to their names.
loaded_frames = []
for file_name in ('training_df.pickle', 'validation_df.pickle', 'test_df.pickle'):
    with open(load_path + file_name, 'rb') as f:
        loaded_frames.append(pickle.load(f))

training_df, validation_df, test_df = loaded_frames