## **Binary classification : Image classification**

     (참고) https://www.kaggle.com/devm2024/transfer-learning-with-vgg-16-cnn-aug-lb-0-1712

해당 코드는 이유한님이 정리해주신 kaggle-korea 커널 커리큘럼 중 **"Binary classification: Image classification"**의 첫번째 파트인 Statoil/C-CORE Iceberg Classifier Challenge 파트를 필사한 내용입니다. 

그 중 Transfer Learning with VGG-16 CNN+AUG LB 0.1712 글을 참고했으며 출처는 별도로 표기해두었습니다. 

코드를 필사하면서 개인적인 코드 해석을 추가해서 업데이트할 예정입니다. 

**목표: 매일 1개 커밋 업로드 및 코드 구현능력 향상**

### **Transfer Learning with VGG-16 CNN+AUG LB**

In [None]:
import numpy as np 
import pandas as pd

from subprocess import check_output

In [None]:
# 패키지 import

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from os.path import join as opj
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import pylab
plt.rcParams['figure.figsize'] = 10, 10
%matplotlib inline

In [None]:
from google.colab import drive # Link Google Colab with Google Drive

# Mounting prints a URL; the link is complete once that URL has been authorized
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Load the train and test JSON files from the mounted Google Drive
train = pd.read_json('/content/gdrive/My Drive/dataset/ice berg/train.json')
test = pd.read_json('/content/gdrive/My Drive/dataset/ice berg/test.json')

In [None]:
# Keep the 'is_iceberg' column of the training frame as the label vector
target_train = train['is_iceberg']

In [None]:
# is_iceberg holds binary values, as the first rows show
target_train.head()

0    0
1    0
2    1
3    0
4    0
Name: is_iceberg, dtype: int64

In [None]:
# errors='coerce' turns values that cannot be parsed as numbers into NaN
# instead of raising an error.
train['inc_angle'] = pd.to_numeric(train['inc_angle'], errors = 'coerce')
test['inc_angle'] = pd.to_numeric(test['inc_angle'], errors = 'coerce')

# Forward-fill missing training angles with the previous valid value
# (ffill() is the non-deprecated spelling of fillna(method='pad')).
# The trailing bfill() only matters when the very first rows are missing,
# which a forward fill alone would leave as NaN.
train['inc_angle'] = train['inc_angle'].ffill().bfill()
X_angle = train['inc_angle']
X_test_angle = test['inc_angle']

# Generate the training data: each band is reshaped to a 75x75 float32 image,
# and the third channel is the mean of the two bands.
X_band_1 = np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in train['band_1']])
X_band_2 = np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in train['band_2']])
X_band_3 = (X_band_1 + X_band_2)/2
X_train = np.concatenate([X_band_1[:, :, :, np.newaxis], X_band_2[:, :, :, np.newaxis], X_band_3[:, :, :, np.newaxis]], axis = -1)

In [None]:
# Inspect the shape of the stacked band_1 array
X_band_1.shape

(1604, 75, 75)

In [None]:
# Build the test tensor the same way as the training tensor: two 75x75
# float32 bands plus their mean, stacked along a trailing channel axis.
X_band_test_1 = np.array([
    np.array(pixels).astype(np.float32).reshape(75, 75)
    for pixels in test["band_1"]
])
X_band_test_2 = np.array([
    np.array(pixels).astype(np.float32).reshape(75, 75)
    for pixels in test["band_2"]
])
X_band_test_3 = (X_band_test_1 + X_band_test_2)/2
X_test = np.stack([X_band_test_1, X_band_test_2, X_band_test_3], axis = -1)

In [None]:
# import keras
from matplotlib import pyplot
from keras.optimizers import RMSprop
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Input, Flatten, Activation
from keras.layers import GlobalAveragePooling2D
from keras.layers.normalization import BatchNormalization
from keras.layers.merge import Concatenate
from keras.models import Model
from keras import initializers
from keras.optimizers import Adam
from keras.optimizers import RMSprop
from keras.layers.advanced_activations import LeakyReLU, PReLU
from keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping

from keras.datasets import cifar10
from keras.applications.inception_v3 import InceptionV3
from keras.applications.vgg16 import VGG16
from keras.applications.mobilenet import MobileNet
from keras.applications.vgg19 import VGG19
from keras.layers import Concatenate, Dense, LSTM, Input, concatenate
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input

In [None]:
# Data augmentation for the multi-input model.
# The generator is configured through options such as flips, shifts, zoom
# and rotation; the shift ranges are left at zero here.

from keras.preprocessing.image import ImageDataGenerator
batch_size = 64
gen = ImageDataGenerator(
    rotation_range = 10,
    zoom_range = 0.2,
    width_shift_range = 0.0,
    height_shift_range = 0.0,
    channel_shift_range = 0.0,
    horizontal_flip = True,
    vertical_flip = True,
)

In [None]:
# return: 반환 즉시 함수를 마침
# yield: 잠시 함수 바깥의 코드가 실행되도록 양보해 값을 가져가게 한 뒤 제너레이터 내의 코드 실행 
def gen_flow_for_two_inputs(X1, X2, y):
  """Yield augmented batches for a model that takes two inputs.

  Two flows are built over the same images ``X1`` with the same seed and
  batch size: the first carries ``y`` in the label slot, the second carries
  ``X2`` there (presumably so that ``X2`` stays aligned with the shuffled
  images). Each step takes the images and ``y`` from the first flow and the
  ``X2`` values from the second.

  Relies on the module-level ``gen`` and ``batch_size``.

  Yields:
    ``([image_batch, X2_batch], y_batch)``, without end.
  """
  flow_options = dict(batch_size = batch_size, seed = 5)
  label_flow = gen.flow(X1, y, **flow_options)
  second_flow = gen.flow(X1, X2, **flow_options)
  while True:
    image_batch, label_batch = next(label_flow)
    _, second_batch = next(second_flow)
    yield [image_batch, second_batch], label_batch

In [None]:
# Build the training callbacks (early stopping + best-model checkpoint)
def get_callbacks(filepath, patience = 2):
  """Return the callbacks used while fitting a model.

  Args:
    filepath: where ModelCheckpoint writes the model; with
      ``save_best_only=True`` only the best model so far is kept.
    patience: number of epochs without ``val_loss`` improvement after which
      EarlyStopping stops training. The value was previously accepted but
      ignored in favour of a hard-coded 10.

  Returns:
    ``[EarlyStopping, ModelCheckpoint]``.
  """
  es = EarlyStopping('val_loss', patience = patience, mode = "min")
  msave = ModelCheckpoint(filepath, save_best_only=True)
  return [es, msave]

In [None]:
def getVggAngleModel():
  """Build and compile the two-input VGG16 + angle classifier.

  The image branch is VGG16 with ImageNet weights and no top, sized from the
  module-level ``X_train``; its ``block5_pool`` output is globally
  average-pooled. The angle branch is a single-unit Dense layer on a scalar
  input named "angle". Both are concatenated and passed through two
  512-unit ReLU layers (each followed by 0.3 dropout) into a sigmoid unit.

  Returns:
    The model compiled with SGD, binary cross-entropy loss and accuracy.
  """
  angle_input = Input(shape = [1], name = "angle")
  angle_feature = Dense(1, )(angle_input)

  vgg = VGG16(weights = "imagenet", include_top = False, input_shape = X_train.shape[1:], classes = 1)
  # Global average pooling (GAP) outputs the mean of each feature map.
  image_feature = GlobalAveragePooling2D()(vgg.get_layer("block5_pool").output)

  head = concatenate([image_feature, angle_feature])
  for layer_name in ("fc2", "fc3"):
    head = Dense(512, activation = "relu", name = layer_name)(head)
    head = Dropout(0.3)(head)
  predictions = Dense(1, activation = "sigmoid")(head)

  model = Model([vgg.input, angle_input], predictions)
  model.compile(
      loss = "binary_crossentropy",
      optimizer = SGD(lr = 1e-3, decay = 1e-6, momentum = 0.9, nesterov = True),
      metrics = ['accuracy'])
  return model

In [None]:
# TypeError: ('Keyword argument not understood:', 'inputs') 오류 해결

In [None]:
def myAngleCV(X_train, X_angle, X_test, n_splits = 3, epochs = 2, steps_per_epoch = 2):
  """Cross-validate the VGG16 + angle model and average its test predictions.

  For each stratified fold a fresh model from ``getVggAngleModel`` is trained
  on augmented batches, the checkpoint written by the callbacks is reloaded,
  and predictions are collected for the held-out fold, the full training set
  and the test set.

  Besides its arguments, this function reads the module-level
  ``target_train`` (labels) and ``X_test_angle`` (test angles).

  Args:
    X_train: training images, indexable with an integer index array.
    X_angle: one angle per training image.
    X_test: test images.
    n_splits: number of stratified folds.
    epochs: training epochs per fold.
    steps_per_epoch: generator batches drawn per epoch.

  Returns:
    Test-set predictions averaged over the folds.
  """
  # StratifiedKFold yields positional indices, so work on plain arrays rather
  # than relying on label-based pandas indexing matching positions.
  labels = np.asarray(target_train)
  angles = np.asarray(X_angle)
  test_angles = np.asarray(X_test_angle)

  folds = list(StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = 16).split(X_train, labels))
  y_test_pred_log = 0
  y_train_pred_log = 0
  y_valid_pred_log = np.zeros(len(labels), dtype = np.float64)
  for j, (train_idx, test_idx) in enumerate(folds):
    print("\n============FOLD====", j)
    X_train_cv = X_train[train_idx]
    y_train_cv = labels[train_idx]
    X_holdout = X_train[test_idx]
    Y_holdout = labels[test_idx]

    # angle
    X_angle_cv = angles[train_idx]
    X_angle_hold = angles[test_idx]

    # Define the checkpoint path and build the callbacks;
    # one weights file is saved per fold index j.
    file_path = "%s_aug_model_weights.hdf5"%j
    callbacks = get_callbacks(filepath = file_path, patience=5)
    gen_flow = gen_flow_for_two_inputs(X_train_cv, X_angle_cv, y_train_cv)
    galaxyModel = getVggAngleModel()
    galaxyModel.fit_generator(
        gen_flow,
        steps_per_epoch = steps_per_epoch,
        epochs = epochs,
        shuffle = True,
        verbose = 1,
        validation_data = ([X_holdout, X_angle_hold], Y_holdout),
        callbacks = callbacks)

    # Reload the checkpointed weights before scoring.
    galaxyModel.load_weights(filepath = file_path)
    # This score is computed on the training part of the fold, so both
    # numbers are training metrics.
    score = galaxyModel.evaluate([X_train_cv, X_angle_cv], y_train_cv, verbose = 0)
    print("Train loss: ", score[0])
    print("Train accuracy: ", score[1])

    # Out-of-fold predictions for the held-out samples
    pred_valid = galaxyModel.predict([X_holdout, X_angle_hold])
    y_valid_pred_log[test_idx] = pred_valid.reshape(pred_valid.shape[0])

    # Accumulate test-set predictions
    temp_test = galaxyModel.predict([X_test, test_angles])
    y_test_pred_log += temp_test.reshape(temp_test.shape[0])

    # Accumulate predictions on the full training set
    temp_train = galaxyModel.predict([X_train, angles])
    y_train_pred_log += temp_train.reshape(temp_train.shape[0])

  y_test_pred_log = y_test_pred_log/n_splits
  y_train_pred_log = y_train_pred_log/n_splits

  # First line: fold-averaged predictions on the whole training set.
  # Second line: out-of-fold predictions, despite the "Test" wording.
  print("\n Train Log Loss Validation = ", log_loss(labels, y_train_pred_log))
  print(" Test Log Loss Validation = ", log_loss(labels, y_valid_pred_log))
  return y_test_pred_log

In [None]:
# steps_per_epoch, epochs and K (the number of folds) were lowered so the output could be checked, which is why the scores are low.
# Raising those parameters would likely improve performance.
preds = myAngleCV(X_train, X_angle, X_test)






Epoch 1/2
Epoch 2/2
Train loss:  0.6446225047111511
Test accuracy:  0.6164639592170715

Epoch 1/2
Epoch 2/2
Train loss:  0.6396046876907349
Test accuracy:  0.579045832157135

Epoch 1/2
Epoch 2/2
Train loss:  0.6639807820320129
Test accuracy:  0.5953270792961121

 Train Log Logg Validation =  0.6378442079869292
 Test Log Loss Validation =  0.6493596159501152


In [None]:
# Assemble the submission from the test ids and the predictions, then write it as CSV
submission = pd.DataFrame({'id': test['id'], 'is_iceberg': preds})
submission.to_csv('/content/submission.csv', index = False)