# Image_Preprocessing_RGB
- Input shape: (64, 64, 3)
- Input Data: 200 x 200 사진, 23708장
- Output Data: NP Array file

In [1]:
import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm

## I. Google Colab Mount

In [2]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


## II. Image Load

In [3]:
%%time

path = '/content/drive/MyDrive/Colab Notebooks/dataset/UTKFace(23708)'
files = os.listdir(path)
size = len(files)
print("Total samples:",size)
print(files[0])

Total samples: 23708
56_0_2_20170119180737540.jpg.chip.jpg
CPU times: user 380 ms, sys: 65.5 ms, total: 446 ms
Wall time: 1min 39s


In [4]:
files_new = []

for i in tqdm(files):
    split_var = i.split('_')
    if 0 < int(split_var[0]) <=8 or 20 <= int(split_var[0]):
      files_new.append(i)

100%|██████████| 23708/23708 [00:00<00:00, 569936.66it/s]


## III. Numpy Array 변환

- Numpy Array 변환

In [6]:
images = []
ages = []
genders = []

for i in tqdm(range(0, len(files_new))):
    image = cv2.imread(path+'/'+files_new[i],0)
    image = cv2.resize(image, (128,128), interpolation=cv2.INTER_AREA)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = image.reshape((image.shape[0], image.shape[1], 3))
    images.append(image)
    split_var = files_new[i].split('_')
    ages.append(split_var[0])
    genders.append(int(split_var[1]))

100%|██████████| 22011/22011 [1:23:04<00:00,  4.42it/s]


# Data Preprocessing

## I. Array 및 Ages, Genders DF 생성

In [7]:
images = pd.Series(list(images), name = 'Images')
ages = pd.Series(list(ages), name = 'Ages')
genders = pd.Series(list(genders), name = 'Genders')

DF = pd.concat([images, ages, genders], axis=1) # DF 합치기. axis = 1: 좌우로

DF

Unnamed: 0,Images,Ages,Genders
0,"[[[50, 50, 50], [53, 53, 53], [69, 69, 69], [9...",56,0
1,"[[[50, 50, 50], [53, 53, 53], [69, 69, 69], [9...",56,0
2,"[[[114, 114, 114], [114, 114, 114], [114, 114,...",56,0
3,"[[[110, 110, 110], [110, 110, 110], [109, 109,...",56,0
4,"[[[148, 148, 148], [150, 150, 150], [151, 151,...",56,0
...,...,...,...
22006,"[[[190, 190, 190], [200, 200, 200], [211, 211,...",1,1
22007,"[[[185, 185, 185], [187, 187, 187], [192, 192,...",1,0
22008,"[[[28, 28, 28], [16, 16, 16], [23, 23, 23], [4...",1,1
22009,"[[[133, 133, 133], [141, 141, 141], [129, 129,...",1,0


- Age Grouping function

In [8]:
# 14세 미만:0 / 14~2030: 1 / 4050: 2 / 실버: 3
def age_group(age):
    if 0 <= age < 14:
        return 0
    elif 14 <= age < 40:
        return 1
    elif 40<= age < 60:
        return 2
    else:
      return 3

In [9]:
target_age = np.zeros((len(images)),dtype='float32')

for i in range(len(ages)):
  target_age[i] = age_group(int(ages[i]))

- Image Array

In [23]:
images_new = []

for i in range(len(images)):
  images_new.append(images[i])

In [24]:
images_new = np.array(images_new)

In [25]:
images_new.shape

(22011, 128, 128, 3)

- DF

In [26]:
DF['age_group'] = target_age

DF

Unnamed: 0,Images,Ages,Genders,age_group
0,"[[[50, 50, 50], [53, 53, 53], [69, 69, 69], [9...",56,0,2.0
1,"[[[50, 50, 50], [53, 53, 53], [69, 69, 69], [9...",56,0,2.0
2,"[[[114, 114, 114], [114, 114, 114], [114, 114,...",56,0,2.0
3,"[[[110, 110, 110], [110, 110, 110], [109, 109,...",56,0,2.0
4,"[[[148, 148, 148], [150, 150, 150], [151, 151,...",56,0,2.0
...,...,...,...,...
22006,"[[[190, 190, 190], [200, 200, 200], [211, 211,...",1,1,0.0
22007,"[[[185, 185, 185], [187, 187, 187], [192, 192,...",1,0,0.0
22008,"[[[28, 28, 28], [16, 16, 16], [23, 23, 23], [4...",1,1,0.0
22009,"[[[133, 133, 133], [141, 141, 141], [129, 129,...",1,0,0.0


- Age Value 확인 -> 데이터 불균형

In [27]:
DF['age_group'].value_counts()

1.0    11881
2.0     4544
0.0     2896
3.0     2690
Name: age_group, dtype: int64

- Gender Value 확인 -> 균형

In [28]:
DF['Genders'].value_counts()

0    11574
1    10437
Name: Genders, dtype: int64

In [None]:
# 애기들 사진만 ImegeGenerator? SMOTE? 경계에서 오분류 자주 일어날 때는 Borderline-SMOTE:KNN 사용하여 Borderline에서 데이터 생성 / SVM-SMOTE: 
# 오버 샘플링 이유? 데이터 500개 -> 9000개 했을 때 동일 모델 accuracy 0.77 -> 0.84 증가. 따라서 데이터 수 줄이는 언더 샘플링은 모델 성능에 안좋을 것으로 예상
# 집단 간 동일하게 sampling -> stratify (O)
# train data -> Image Augmentation (O)
# val & test -> Normalization only (O)
# y 값 encoding (O)
# age는 categorical / gender는 binary (O)

# Train & Test Split

## I. Age

- Train & Test Split

In [50]:
from sklearn.model_selection import train_test_split

X_train_age, X_test_age, y_train_age, y_test_age = train_test_split(images_new, DF['age_group'],
                                                                  test_size = int(len(images_new)*0.2),
                                                                  stratify = DF['age_group'])

In [51]:
X_train_age, X_val_age, y_train_age, y_val_age = train_test_split(X_train_age, y_train_age,
                                                                  test_size = int(len(images_new)*0.2),
                                                                  stratify = y_train_age)

In [52]:
print(X_train_age.shape)
print(X_val_age.shape)
print(X_test_age.shape)
print(y_train_age.shape)
print(y_val_age.shape)
print(y_test_age.shape)

(13207, 128, 128, 3)
(4402, 128, 128, 3)
(4402, 128, 128, 3)
(13207,)
(4402,)
(4402,)


- Target Encoding

In [53]:
# age data만 Encoding 필요 / gender는 binary
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

def one_hot_encoding(x_label):
    # 라벨 인코더 생성
    encoder = LabelEncoder()
    # X_train데이터를 이용 피팅하고 라벨숫자로 변환한다
    encoder.fit(x_label)
    y = encoder.transform(x_label)
    y = to_categorical(y, 4)
    return y

In [54]:
y_train_age_OHE = one_hot_encoding(y_train_age)
y_val_age_OHE = one_hot_encoding(y_val_age)
y_test_age_OHE = one_hot_encoding(y_test_age)

In [55]:
print(X_train_age.shape)
print(X_val_age.shape)
print(X_test_age.shape)
print(y_train_age_OHE.shape)
print(y_val_age_OHE.shape)
print(y_test_age_OHE.shape)

(13207, 128, 128, 3)
(4402, 128, 128, 3)
(4402, 128, 128, 3)
(13207, 4)
(4402, 4)
(4402, 4)


- Numpy Array Save

In [56]:
np.save('/content/drive/MyDrive/Colab Notebooks/dataset/Numpy_Array/Original/Age/X_train_age_Original_128', X_train_age)
np.save('/content/drive/MyDrive/Colab Notebooks/dataset/Numpy_Array/Original/Age/X_val_age_Original_128', X_val_age)
np.save('/content/drive/MyDrive/Colab Notebooks/dataset/Numpy_Array/Original/Age/X_test_age_Original_128', X_test_age)
np.save('/content/drive/MyDrive/Colab Notebooks/dataset/Numpy_Array/Original/Age/y_train_age_OHE_Original_128', y_train_age_OHE)
np.save('/content/drive/MyDrive/Colab Notebooks/dataset/Numpy_Array/Original/Age/y_val_age_OHE_Original_128', y_val_age_OHE)
np.save('/content/drive/MyDrive/Colab Notebooks/dataset/Numpy_Array/Original/Age/y_test_age_OHE_Original_128', y_test_age_OHE)

## II. Gender

In [57]:
from sklearn.model_selection import train_test_split

X_train_gender, X_test_gender, y_train_gender, y_test_gender = train_test_split(images_new, DF['Genders'],
                                                                                test_size = int(len(images_new)*0.2),
                                                                                stratify = DF['Genders']) # stratify

In [58]:
X_train_gender, X_val_gender, y_train_gender, y_val_gender = train_test_split(X_train_gender, y_train_gender,
                                                                              test_size = int(len(images_new)*0.2),
                                                                              stratify = y_train_gender)

In [59]:
y_test_gender = np.array(y_test_gender, dtype = 'float32')
y_train_gender = np.array(y_train_gender, dtype = 'float32')
y_val_gender = np.array(y_val_gender, dtype = 'float32')

In [61]:
np.save('/content/drive/MyDrive/Colab Notebooks/dataset/Numpy_Array/Original/Gender/X_train_gender_Original_128', X_train_gender)
np.save('/content/drive/MyDrive/Colab Notebooks/dataset/Numpy_Array/Original/Gender/X_val_gender_Original_128', X_val_gender)
np.save('/content/drive/MyDrive/Colab Notebooks/dataset/Numpy_Array/Original/Gender/X_test_gender_Original_128', X_test_gender)
np.save('/content/drive/MyDrive/Colab Notebooks/dataset/Numpy_Array/Original/Gender/y_train_gender_Original_128', y_train_gender)
np.save('/content/drive/MyDrive/Colab Notebooks/dataset/Numpy_Array/Original/Gender/y_val_gender_Original_128', y_val_gender)
np.save('/content/drive/MyDrive/Colab Notebooks/dataset/Numpy_Array/Original/Gender/y_test_gender_Original_128', y_test_gender)

In [60]:
print(X_train_gender.shape)
print(X_val_gender.shape)
print(X_test_gender.shape)
print(y_train_gender.shape)
print(y_val_gender.shape)
print(y_test_gender.shape)

(13207, 128, 128, 3)
(4402, 128, 128, 3)
(4402, 128, 128, 3)
(13207,)
(4402,)
(4402,)
