<a href="https://colab.research.google.com/github/taeyoonnoh/Kaggle-Competition/blob/main/Iceberg_Classifier_Challenge/Best_Statoil_C_CORE_Iceberg_Classifier_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# ! pip install -q kaggle

# from google.colab import files

# files.upload()

# ! mkdir ~/.kaggle

# ! cp kaggle.json ~/.kaggle/

# ! chmod 600 ~/.kaggle/kaggle.json


In [5]:
# !kaggle competitions download -c statoil-iceberg-classifier-challenge

In [6]:
# !7z e train.json.7z
# !7z e test.json.7z
# !7z e sample_submission.csv.7z

In [7]:
import pandas as pd 
import numpy as np 
import cv2 # Used to manipulated the images 
np.random.seed(1337) # The seed I used - pick your own or comment out for a random seed. A constant seed allows for better comparisons though

# Import Keras 
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.layers import Conv2D, MaxPooling2D
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.layers import BatchNormalization
from tensorflow.keras.optimizers import Adam

In [8]:
df_train = pd.read_json('train.json') # this is a dataframe

In [9]:
from sklearn.model_selection import StratifiedShuffleSplit, KFold
from scipy.ndimage.filters import uniform_filter
from scipy.ndimage.measurements import variance
from sklearn.model_selection import train_test_split

In [10]:
def lee_filter(img, size):
    
    img_mean = uniform_filter(img, (size, size))
    img_sqr_mean = uniform_filter(img**2, (size, size))
    img_variance = img_sqr_mean - img_mean**2

    overall_variance = variance(img)

    img_weights = img_variance**2 / (img_variance**2 + overall_variance**2)
    img_output = img_mean + img_weights * (img - img_mean)

    return img_output

In [11]:
def get_scaled_imgs(df):
    imgs = []
    
    for i, row in df.iterrows():
        #make 75x75 image
        band_1 = np.array(row['band_1']).reshape(75, 75)
        band_2 = np.array(row['band_2']).reshape(75, 75)
        band_3 = band_1 + band_2 # plus since log(x*y) = log(x) + log(y)

        # use a lee filter to help with speckling
        band_1 = lee_filter(band_1,4)
        band_2 = lee_filter(band_2,4)
        band_3 = lee_filter(band_3,4)

        # Rescale
        a = (band_1 - band_1.mean()) / (band_1.max() - band_1.min())
        b = (band_2 - band_2.mean()) / (band_2.max() - band_2.min())
        c = (band_3 - band_3.mean()) / (band_3.max() - band_3.min())

        imgs.append(np.dstack((a, b, c)))

    return np.array(imgs)

In [12]:
Xtrain = get_scaled_imgs(df_train)

In [13]:
Ytrain = np.array(df_train['is_iceberg'])


In [14]:
df_train.inc_angle = df_train.inc_angle.replace('na',0)
idx_tr = np.where(df_train.inc_angle>0)

idx_tr

(array([   0,    1,    2, ..., 1506, 1507, 1508]),)

In [15]:
Ytrain = Ytrain[idx_tr[0]]
Xtrain = Xtrain[idx_tr[0],...]

In [16]:
def get_more_images(imgs):
    
    more_images = []
    vert_flip_imgs = []
    hori_flip_imgs = []
      
    for i in range(0,imgs.shape[0]):
        a=imgs[i,:,:,0]
        b=imgs[i,:,:,1]
        c=imgs[i,:,:,2]
        
        av=cv2.flip(a,1)
        ah=cv2.flip(a,0)
        bv=cv2.flip(b,1)
        bh=cv2.flip(b,0)
        cv=cv2.flip(c,1)
        ch=cv2.flip(c,0)
        
        vert_flip_imgs.append(np.dstack((av, bv, cv)))
        hori_flip_imgs.append(np.dstack((ah, bh, ch)))
      
    v = np.array(vert_flip_imgs)
    h = np.array(hori_flip_imgs)
       
    more_images = np.concatenate((imgs,v,h))
    
    return more_images

In [17]:
Xtr_more = get_more_images(Xtrain) 


In [18]:
Ytr_more = np.concatenate((Ytrain,Ytrain,Ytrain))


In [19]:
#define our model
def getModel():
    #Building the model
    gmodel=Sequential()
    #Conv Layer 1
    gmodel.add(Conv2D(64, kernel_size=(3, 3),activation='relu', input_shape=(75, 75, 3)))
    gmodel.add(MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))
    gmodel.add(Dropout(0.2))

    #Conv Layer 2
    gmodel.add(Conv2D(128, kernel_size=(3, 3), activation='relu' ))
    gmodel.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    gmodel.add(Dropout(0.2))

    #Conv Layer 3
    gmodel.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
    gmodel.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    gmodel.add(Dropout(0.2))

    #Conv Layer 4
    gmodel.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
    gmodel.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    gmodel.add(Dropout(0.2))

    #Flatten the data for upcoming dense layers
    gmodel.add(Flatten())

    #Dense Layers
    gmodel.add(Dense(512))
    gmodel.add(Activation('relu'))
    gmodel.add(Dropout(0.2))

    #Dense Layer 2
    gmodel.add(Dense(256))
    gmodel.add(Activation('relu'))
    gmodel.add(Dropout(0.2))

    #Sigmoid Layer
    #Adam 은 모델을 최적화 시켜주는 기능을 한다, 
    gmodel.add(Dense(1))
    gmodel.add(Activation('sigmoid'))

    mypotim=Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    gmodel.compile(loss='binary_crossentropy',
                  optimizer=mypotim,
                  metrics=['accuracy'])
    gmodel.summary()
    return gmodel


def get_callbacks(filepath, patience=2):
    es = EarlyStopping('val_loss', patience=patience, mode="min")
    msave = ModelCheckpoint(filepath, save_best_only=True)
    return [es, msave]
file_path = ".model_weights.hdf5"
callbacks = get_callbacks(filepath=file_path, patience=5)

In [20]:

X_train, X_valid, y_train, y_valid = train_test_split(Xtr_more,
                                                      Ytr_more,
                                                      random_state=42,
                                                      train_size = 0.8,
                                                      stratify=Ytr_more)

In [21]:
#Without denoising, core features.
gmodel=getModel()
gmodel.fit(X_train, y_train,
          batch_size=32,
          epochs=50,
          verbose=1,
          validation_data=(X_valid, y_valid),
          callbacks=callbacks)

  "The `lr` argument is deprecated, use `learning_rate` instead.")


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 73, 73, 64)        1792      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 36, 36, 64)        0         
_________________________________________________________________
dropout (Dropout)            (None, 36, 36, 64)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 34, 34, 128)       73856     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 17, 17, 128)       0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 17, 17, 128)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 15, 15, 128)       1

<keras.callbacks.History at 0x7f4cc2503c90>

In [22]:
gmodel.load_weights(filepath=file_path)
score = gmodel.evaluate(X_valid, y_valid, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.19577473402023315
Test accuracy: 0.9241223335266113


In [23]:
df_test = pd.read_json('test.json')
df_test.inc_angle = df_test.inc_angle.replace('na',0)
Xtest = (get_scaled_imgs(df_test))
pred_test = gmodel.predict(Xtest)

submission = pd.DataFrame({'id': df_test["id"], 'is_iceberg': pred_test.reshape((pred_test.shape[0]))})
print(submission.head(10))

submission.to_csv('submission.csv', index=False)

         id  is_iceberg
0  5941774d    0.009097
1  4023181e    0.950651
2  b20200e4    0.019480
3  e7f018bb    0.998447
4  4371c8c3    0.979289
5  a8d9b1fd    0.281676
6  29e7727e    0.075297
7  92a51ffb    0.995088
8  c769ac97    0.000023
9  aee0547d    0.000001


In [24]:
!kaggle competitions submit -c statoil-iceberg-classifier-challenge -f submission.csv -m "2021-08-26 third commit Remove_NaN+Sum_Channel+Lee_Filter+Augmentation+CNN+"

  0% 0.00/165k [00:00<?, ?B/s]100% 165k/165k [00:00<00:00, 787kB/s]
Successfully submitted to Statoil/C-CORE Iceberg Classifier Challenge

# Pseudo Labeling

In [27]:
new_X = np.concatenate([Xtr_more,Xtest])
add_target = np.array([1 if i[0]>=0.5 else 0 for i in pred_test])
new_y = np.concatenate([Ytr_more,add_target])

new_X.shape,new_y.shape

((12837, 75, 75, 3), (12837,))

In [28]:

X_train, X_valid, y_train, y_valid = train_test_split(new_X,
                                                      new_y,
                                                      random_state=42,
                                                      train_size = 0.8,
                                                      stratify=new_y)

In [29]:
#Without denoising, core features.
gmodel=getModel()
gmodel.fit(X_train, y_train,
          batch_size=32,
          epochs=50,
          verbose=1,
          validation_data=(X_valid, y_valid),
          callbacks=callbacks)

  "The `lr` argument is deprecated, use `learning_rate` instead.")


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_4 (Conv2D)            (None, 73, 73, 64)        1792      
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 36, 36, 64)        0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 36, 36, 64)        0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 34, 34, 128)       73856     
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 17, 17, 128)       0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 17, 17, 128)       0         
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 15, 15, 128)      

<keras.callbacks.History at 0x7f4cae005a90>

In [30]:
gmodel.load_weights(filepath=file_path)
score = gmodel.evaluate(X_valid, y_valid, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.16344022750854492
Test accuracy: 0.9295171499252319


In [31]:
df_test = pd.read_json('test.json')
df_test.inc_angle = df_test.inc_angle.replace('na',0)
Xtest = (get_scaled_imgs(df_test))
pred_test = gmodel.predict(Xtest)

submission = pd.DataFrame({'id': df_test["id"], 'is_iceberg': pred_test.reshape((pred_test.shape[0]))})
print(submission.head(10))

submission.to_csv('submission.csv', index=False)

         id  is_iceberg
0  5941774d    0.002428
1  4023181e    0.999233
2  b20200e4    0.000063
3  e7f018bb    0.999142
4  4371c8c3    0.999994
5  a8d9b1fd    0.226824
6  29e7727e    0.023768
7  92a51ffb    0.999609
8  c769ac97    0.000004
9  aee0547d    0.000002


In [32]:
!kaggle competitions submit -c statoil-iceberg-classifier-challenge -f submission.csv -m "2021-08-26 forth commit Remove_NaN+Sum_Channel+Lee_Filter+Augmentation+CNN+Pseudo_Labeling"

100% 166k/166k [00:00<00:00, 793kB/s]
Successfully submitted to Statoil/C-CORE Iceberg Classifier Challenge