In [1]:
%cd E:\kaggle\iceberg

E:\kaggle\iceberg


In [2]:
import pandas as pd
import numpy as np
import cv2
SEED = 1234
np.random.seed(SEED) 

from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.layers import Conv2D, MaxPooling2D
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.preprocessing.image import ImageDataGenerator
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam

from sklearn.model_selection import StratifiedKFold

Using TensorFlow backend.


In [3]:
def get_scaled_imgs(df):
    imgs = []
    
    for i, row in df.iterrows():
        #make 75x75 image
        band_1 = np.array(row['band_1']).reshape(75, 75)
        band_2 = np.array(row['band_2']).reshape(75, 75)
        band_3 = band_1 + band_2 # plus since log(x*y) = log(x) + log(y)
        
        # Rescale
        a = (band_1 - band_1.mean()) / (band_1.max() - band_1.min())
        b = (band_2 - band_2.mean()) / (band_2.max() - band_2.min())
        c = (band_3 - band_3.mean()) / (band_3.max() - band_3.min())

        imgs.append(np.dstack((a, b, c)))

    return np.array(imgs)

def get_more_images(imgs):
    more_images = []
    vert_flip_imgs = []
    hori_flip_imgs = []
    vh_flip_imgs = []
      
    for i in range(0,imgs.shape[0]):
        vert_flip_imgs.append(cv2.flip(imgs[i], 1))
        hori_flip_imgs.append(cv2.flip(imgs[i], 0))
        vh_flip_imgs.append(cv2.flip(imgs[i], -1))
      
    v = np.array(vert_flip_imgs)
    h = np.array(hori_flip_imgs)
    vh = np.array(vh_flip_imgs)
       
    more_images = np.concatenate((imgs,v,h, vh))
    
    return more_images

In [4]:
use_custom_augmentation = True
if use_custom_augmentation:
    df_train = pd.read_json('E:/kaggle/iceberg/train.json/data/processed/train.json')
    df_test = pd.read_json('E:/kaggle/iceberg/test.json/data/processed/test.json')
    Xtrain = get_scaled_imgs(df_train)
    Xtest = get_scaled_imgs(df_test)
    Ytrain = np.array(df_train['is_iceberg'])
    
    df_train["inc_angle"] = df_train["inc_angle"].replace('na',0)
    df_test["inc_angle"] = df_test["inc_angle"].replace('na',0)
    idx_tr = np.where(df_train["inc_angle"]>0)
    Xtrain = Xtrain[idx_tr[0]]
    Ytrain = Ytrain[idx_tr[0]]
    
    Xtrain = get_more_images(Xtrain) 
    Ytrain = np.concatenate((Ytrain,Ytrain,Ytrain, Ytrain))

In [5]:
if not use_custom_augmentation:
    df_train = pd.read_json('E:/kaggle/iceberg/train.json/data/processed/train.json')
    Xtrain_band_1 = np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in df_train["band_1"]])
    Xtrain_band_2 = np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in df_train["band_2"]])
    Xtrain_band_3 = Xtrain_band_1 + Xtrain_band_2

    Xtrain = np.concatenate([Xtrain_band_1[:, :, :, np.newaxis], Xtrain_band_2[:, :, :, np.newaxis], Xtrain_band_3[:, :, :, np.newaxis]], axis=-1)
    Ytrain = df_train["is_iceberg"].values

    df_train["inc_angle"] = df_train["inc_angle"].replace('na',0)
    idx_tr = np.where(df_train["inc_angle"]>0)

    Xtrain = Xtrain[idx_tr[0]]
    Ytrain = Ytrain[idx_tr[0]]

    df_test = pd.read_json('E:/kaggle/iceberg/test.json/data/processed/test.json')
    Xtest_band_1 = np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in df_test["band_1"]])
    Xtest_band_2 = np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in df_test["band_2"]])
    Xtest_band_3 = Xtest_band_1 + Xtest_band_2

    Xtest = np.concatenate([Xtest_band_1[:, :, :, np.newaxis], Xtest_band_2[:, :, :, np.newaxis], Xtest_band_3[:, :, :, np.newaxis]], axis=-1)

    df_test["inc_angle"] = df_test["inc_angle"].replace('na',0)

In [6]:
def getModel():
    #Build keras model
    
    model=Sequential()
    
    # CNN 1
    model.add(Conv2D(64, kernel_size=(3, 3),activation='relu', input_shape=(75, 75, 3)))
    model.add(MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))
    model.add(Dropout(0.2))

    # CNN 2
    model.add(Conv2D(128, kernel_size=(3, 3), activation='relu' ))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    model.add(Dropout(0.2))

    # CNN 3
    model.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    model.add(Dropout(0.3))

    #CNN 4
    model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    model.add(Dropout(0.3))

    # You must flatten the data for the dense layers
    model.add(Flatten())

    #Dense 1
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))

    #Dense 2
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.2))

    # Output 
    model.add(Dense(1, activation="sigmoid"))

    optimizer = Adam(lr=0.001, decay=0.0)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    return model

In [7]:
model = getModel()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 73, 73, 64)        1792      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 36, 36, 64)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 36, 36, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 34, 34, 128)       73856     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 17, 17, 128)       0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 17, 17, 128)       0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 15, 15, 128)       147584    
__________

In [8]:
batch_size = 32
earlyStopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='min')
reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=7, verbose=1, epsilon=1e-4, mode='min')
if not use_custom_augmentation:
    datagen = ImageDataGenerator(horizontal_flip = True,
                             vertical_flip = True,
                             samplewise_center = True,
                             samplewise_std_normalization = True,
                             width_shift_range = 0.,
                             height_shift_range = 0.,
                             channel_shift_range=0,
                             zoom_range = 0.2,
                             rotation_range = 10)
    datagen.fit(Xtrain)

In [11]:
K=3
Kfolds = list(StratifiedKFold(n_splits=K, shuffle=True, random_state=SEED).split(Xtrain, Ytrain))
y_test_pred_log = 0
for j, (train_idx, test_idx) in enumerate(Kfolds):
    print('\n===================FOLD=',j)
    Xtrain_cv = Xtrain[train_idx]
    Ytrain_cv = Ytrain[train_idx]
    Xtrain_val = Xtrain[test_idx]
    Ytrain_val = Ytrain[test_idx]
    
    model_file = 'model_%s.hdf5' % j
    
    mcp_save = ModelCheckpoint(model_file, save_best_only=True, monitor='val_loss', mode='min')
    model = getModel()
    if not use_custom_augmentation:
        model.fit_generator(
            datagen.flow(Xtrain_cv, Ytrain_cv, batch_size=batch_size, seed=SEED), 
            steps_per_epoch=len(Xtrain_cv) / batch_size,
            epochs=100, 
            verbose=1,
            callbacks=[earlyStopping, mcp_save, reduce_lr_loss], 
            validation_data=(Xtrain_val, Ytrain_val)
        )
    else:
        model.fit(Xtrain_cv, Ytrain_cv, batch_size=batch_size, epochs=50, verbose=1, callbacks=[earlyStopping, mcp_save, reduce_lr_loss], validation_data=(Xtrain_val, Ytrain_val))

    model.load_weights(filepath = model_file)    
    
    score = model.evaluate(Xtrain, Ytrain, verbose=1)
    print('Train score:', score[0])
    print('Train accuracy:', score[1])
    y_test_pred_log += model.predict(Xtest).reshape(Xtest.shape[0])
    
y_test_pred_log /= K


Train on 3922 samples, validate on 1962 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 00030: reducing learning rate to 0.00010000000474974513.
Epoch 32/50
Epoch 33/50
Epoch 34/50
Train accuracy: 0.968048946295

Train on 3923 samples, validate on 1961 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 00023: reducing learning rate to 0.00010000000474974513.
Epoch 25/50
Epoch 26/50
Epoch 27/50
Train accuracy: 0.956152277362


In [12]:
submission = pd.DataFrame({'id': df_test["id"], 'is_iceberg': y_test_pred_log.reshape(y_test_pred_log.shape[0])})
print(submission.head(10))
print(submission.count(), Xtest.shape[0])

submission.to_csv('submission-cnn-custom.csv', index=False)

         id  is_iceberg
0  5941774d    0.011371
1  4023181e    0.992145
2  b20200e4    0.010635
3  e7f018bb    0.999868
4  4371c8c3    0.984800
5  a8d9b1fd    0.969786
6  29e7727e    0.055787
7  92a51ffb    0.999586
8  c769ac97    0.000039
9  aee0547d    0.000062
id            8424
is_iceberg    8424
dtype: int64 8424
