In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np 
import pandas as pd 

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from os.path import join as opj
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import pylab
plt.rcParams['figure.figsize'] = 10, 10
%matplotlib inline

In [3]:
# Load the training set from its JSON file; later cells read the
# "band_1", "band_2" and "is_iceberg" columns from it.
train = pd.read_json("train/processed/train.json")

In [4]:
# Load the test set; later cells read its "band_1", "band_2" and "id" columns
# to build the submission.
test = pd.read_json("test/processed/test.json")

# Intro about the Data

The Sentinel-1 satellite orbits at about 680 km above the Earth. It sends pulses of signal at a particular angle of incidence and then records what comes back. Those reflected signals are called backscatter. The data we have been given is the backscatter coefficient, in its conventional form, given by:

$\sigma^{o}\,(\mathrm{dB}) = \beta^{o}\,(\mathrm{dB}) + 10\log_{10}\left[\dfrac{\sin(i_p)}{\sin(i_c)}\right]$

where
1. $i_p$ is the angle of incidence for a particular pixel.
2. $i_c$ is the angle of incidence for the center of the image.
3. $K$ is a constant.

We have been given $\sigma^{o}$ directly in the data.

### Now coming to the features of $\sigma^{o}$
$\sigma^{o}$ varies with the surface from which the signal is scattered. For example, for a particular angle of incidence, it varies like this:

| Polarization | Water | Settlements | Agriculture | Barren |
|---|---|---|---|---|
| **HH** | -27.001 | 2.70252 | -12.7952 | -17.25790909 |
| **HV** | -28.035 | -20.2665 | -21.4471 | -20.019 |

As you can see, the HH component varies a lot but HV doesn't.
**I don't have the data for scatter from a ship, but being a metal object, it should vary differently compared to an ice object.**



Ok, so this Sentinel satellite is equivalent to RISAT (an Indian remote sensing satellite), and they only transmit pings in H polarization, **AND NOT IN V polarization**. Those H-pings get scattered, objects change their polarization, and the signal returns as a mix of H and V.
**Since Sentinel has only an H-transmitter, return signals are of the form HH and HV only**. Don't ask why VV is not given (it is because Sentinel doesn't have a V-ping transmitter).

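Now coming to features: for the purpose of this demo code, I am extracting both bands and taking their average as a 3rd channel to create a 3-channel, RGB-equivalent image. (Note: the cell below currently feeds only band 2 to the model as a single channel; the 3-channel version is left commented out.)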


In [5]:
# Generate the training data.
# Every entry of band_1 / band_2 is reshaped into a 75x75 float32 image
# (presumably HH and HV respectively, following the order used in the
# 3-channel variant below -- confirm against the dataset description).
X_band_1 = np.array([np.asarray(img, dtype=np.float32).reshape(75, 75) for img in train["band_1"]])
X_band_2 = np.array([np.asarray(img, dtype=np.float32).reshape(75, 75) for img in train["band_2"]])
# Only band_2 is fed to the model here, with a trailing channel axis added: (N, 75, 75, 1).
X_train = np.expand_dims(X_band_2, axis=-1)
# Alternative 3-channel input (band_1, band_2, mean of both), currently disabled:
# X_train = np.concatenate([X_band_1[:, :, :, np.newaxis], X_band_2[:, :, :, np.newaxis],((X_band_1+X_band_2)/2)[:, :, :, np.newaxis]], axis=-1)

In [6]:
#Import Keras.
from matplotlib import pyplot
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Input, Flatten, Activation
from keras.layers import GlobalMaxPooling2D
from keras.layers.normalization import BatchNormalization
from keras.layers.merge import Concatenate
from keras.models import Model
from keras import initializers
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping

Using TensorFlow backend.


In [7]:
#define our model
def getModel(input_shape=(75, 75, 1)):
    """Build, compile and return the CNN classifier.

    Architecture: four Conv2D(3x3, relu) -> MaxPooling2D(3x3, stride 2) ->
    Dropout(0.2) blocks with 32, 64, 64 and 32 filters, followed by
    Dense(512) and Dense(256) relu layers (each with Dropout(0.2)) and a
    single sigmoid output unit.

    Parameters
    ----------
    input_shape : tuple, optional
        Shape of one input image as (height, width, channels). Defaults to
        (75, 75, 1), the single-band input this notebook trains on; pass
        (75, 75, 3) to use a 3-channel stack instead.

    Returns
    -------
    keras.models.Sequential
        The model compiled with binary cross-entropy loss, the Adam
        optimizer (lr=0.001) and the accuracy metric. The layer summary is
        printed as a side effect.
    """
    gmodel = Sequential()

    # Convolutional feature extractor: the four blocks differ only in their
    # filter count, so build them in a loop.
    conv_filters = (32, 64, 64, 32)
    for block_idx, n_filters in enumerate(conv_filters):
        # Only the first layer of a Sequential model declares the input shape.
        first_layer_kwargs = {'input_shape': input_shape} if block_idx == 0 else {}
        gmodel.add(Conv2D(n_filters, kernel_size=(3, 3), activation='relu',
                          **first_layer_kwargs))
        gmodel.add(MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))
        gmodel.add(Dropout(0.2))

    # Flatten the feature maps for the upcoming dense layers.
    gmodel.add(Flatten())

    # Fully connected head; Dense and Activation stay separate layers so the
    # layer list matches the original model.
    for n_units in (512, 256):
        gmodel.add(Dense(n_units))
        gmodel.add(Activation('relu'))
        gmodel.add(Dropout(0.2))

    # Single sigmoid unit producing the output probability.
    gmodel.add(Dense(1))
    gmodel.add(Activation('sigmoid'))

    optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    gmodel.compile(loss='binary_crossentropy',
                   optimizer=optimizer,
                   metrics=['accuracy'])
    gmodel.summary()
    return gmodel


def get_callbacks(filepath, patience=2):
    """Return the Keras callbacks used during training.

    Parameters
    ----------
    filepath : str
        Where the checkpoint callback writes the model.
    patience : int, optional
        Value forwarded to EarlyStopping's ``patience`` argument.

    Returns
    -------
    list
        ``[EarlyStopping, ModelCheckpoint]``. Early stopping monitors
        ``val_loss`` (mode "min"); the checkpoint is created with
        ``save_best_only=True`` and Keras' default monitored quantity.
    """
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience, mode="min")
    checkpoint = ModelCheckpoint(filepath, save_best_only=True)
    return [early_stopping, checkpoint]
# Path the checkpoint callback writes to; the same variable is passed to
# load_weights() after training.
file_path = ".model_weights.hdf5"
# Callbacks handed to fit(): early stopping with patience=5 plus checkpointing.
callbacks = get_callbacks(filepath=file_path, patience=5)


In [8]:
# Labels come from the "is_iceberg" column; 75% of the samples go to training
# and the rest to validation, with a fixed seed so the split is repeatable.
target_train = train['is_iceberg']
X_train_cv, X_valid, y_train_cv, y_valid = train_test_split(
    X_train,
    target_train,
    train_size=0.75,
    random_state=1,
)

In [9]:
# Train on the raw band values (no denoising), validating on the hold-out
# split; the callbacks handle early stopping and checkpointing.
import os
gmodel = getModel()
gmodel.fit(
    x=X_train_cv,
    y=y_train_cv,
    batch_size=24,
    epochs=40,
    verbose=1,
    validation_data=(X_valid, y_valid),
    callbacks=callbacks,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 73, 73, 32)        320       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 36, 36, 32)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 36, 36, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 34, 34, 64)        18496     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 16, 16, 64)        0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 16, 16, 64)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 14, 14, 64)        36928     
__________

<keras.callbacks.History at 0x185ae6dd8>

In [10]:
# Restore the checkpoint written during training (saved with
# save_best_only=True) and score it on the hold-out split.
gmodel.load_weights(filepath=file_path)
score = gmodel.evaluate(X_valid, y_valid, verbose=1)
# These metrics are computed on the validation split (X_valid / y_valid),
# not on the test set, so label them accordingly.
print('Validation loss:', score[0])
print('Validation accuracy:', score[1])

Test loss: 0.389256348782
Test accuracy: 0.830423941041


In [11]:
# Build the test tensors exactly like the training tensors: each band entry
# becomes a 75x75 float32 image.
X_band_test_1=np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in test["band_1"]])
X_band_test_2=np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in test["band_2"]])
# Only band_2 is used, with a trailing channel axis, matching the training input.
X_test = X_band_test_2[:, :, :, np.newaxis]
# Alternative 3-channel input (band_1, band_2, mean of both), currently disabled:
# X_test = np.concatenate([X_band_test_1[:, :, :, np.newaxis]
#                           , X_band_test_2[:, :, :, np.newaxis]
#                          , ((X_band_test_1+X_band_test_2)/2)[:, :, :, np.newaxis]], axis=-1)
# predict() returns the sigmoid outputs directly; predict_proba() was only a
# deprecated alias for it on Sequential models and is gone from newer Keras.
predicted_test=gmodel.predict(X_test)



In [13]:
# Assemble the submission: one row per test image, pairing its id with the
# model output flattened to one value per row, then write it to CSV.
submission = pd.DataFrame({
    'id': test['id'],
    'is_iceberg': predicted_test.reshape(predicted_test.shape[0]),
})
submission.to_csv('sub.csv', index=False)
print(submission)

            id    is_iceberg
0     5941774d  6.110536e-01
1     4023181e  7.467964e-01
2     b20200e4  1.967285e-38
3     e7f018bb  9.286130e-01
4     4371c8c3  2.829859e-01
5     a8d9b1fd  4.365160e-06
6     29e7727e  4.224674e-01
7     92a51ffb  6.968638e-01
8     c769ac97  1.511893e-02
9     aee0547d  2.142574e-32
10    565b28ac  8.295904e-09
11    e04e9775  7.796412e-01
12    8e8161d1  5.097590e-03
13    4cf4d256  5.555335e-01
14    139e5324  3.324844e-07
15    f156976f  7.534784e-02
16    68a117cc  3.733841e-03
17    d9aa7a56  2.207665e-01
18    9005b143  1.161478e-04
19    5f6d3988  6.300321e-01
20    9ad70954  4.832850e-01
21    b9087b9e  6.553949e-01
22    a39a1427  4.938548e-01
23    82fbe8ed  6.380972e-01
24    1fae4879  1.072418e-15
25    6dd8f13d  1.433158e-26
26    bbad5958  4.724115e-01
27    54527583  2.615817e-01
28    be8fa29c  5.988660e-01
29    81a3328f  9.599079e-01
...        ...           ...
8394  8ae30ce6  3.144685e-08
8395  de27ed88  2.404891e-33
8396  66d5196f