In [1]:
# import necessary modules we need, all these code running on Kaggle online notebooks, which offers us free GPU
import numpy as np 
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/sample_submission.csv
/kaggle/input/train_relationships.csv
/kaggle/input/test/face00857.jpg
/kaggle/input/test/face01342.jpg
/kaggle/input/test/face02898.jpg
/kaggle/input/test/face02732.jpg
/kaggle/input/test/face05963.jpg
/kaggle/input/test/face00353.jpg
/kaggle/input/test/face06083.jpg
/kaggle/input/test/face01782.jpg
/kaggle/input/test/face01938.jpg
/kaggle/input/test/face04963.jpg
/kaggle/input/test/face00383.jpg
/kaggle/input/test/face02668.jpg
/kaggle/input/test/face04394.jpg
/kaggle/input/test/face01741.jpg
/kaggle/input/test/face01621.jpg
/kaggle/input/test/face05207.jpg
/kaggle/input/test/face04239.jpg
/kaggle/input/test/face06171.jpg
/kaggle/input/test/face05276.jpg
/kaggle/input/test/face00767.jpg
/kaggle/input/test/face01474.jpg
/kaggle/input/test/face02813.jpg
/kaggle/input/test/face01751.jpg
/kaggle/input/test/face05902.jpg
/kaggle/input/test/face03187.jpg
/kaggle/input/test/face01201.jpg
/kaggle/input/test/face05790.jpg
/kaggle/input/test/face03997.jpg
/k

In [2]:
from collections import defaultdict
from glob import glob
from random import choice, sample

import cv2
import numpy as np
import pandas as pd
from keras.applications.resnet50 import ResNet50, preprocess_input
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.layers import Input, Dense, GlobalMaxPool2D, GlobalAvgPool2D, Concatenate, Multiply, Dropout, Subtract
from keras.models import Model
from keras.optimizers import Adam

Using TensorFlow backend.


In [3]:
train_file_path = "../input/train_relationships.csv"  # training file which gives us kinship relationship in the training dataset
train_folders_path = "/kaggle/input/train/" # training dataset  
val_famillies = "F09" # vlidation data, we will use it while training the model

In [4]:
all_images = glob(train_folders_path + "*/*/*.jpg")  # get all images in training set

train_images = [x for x in all_images if val_famillies not in x] # training images 
val_images = [x for x in all_images if val_famillies in x] # validatoin images

ppl = [x.split("/")[-3] + "/" + x.split("/")[-2] for x in all_images] # all persons in training set

In [5]:
# mapping pictures into training and validation set
train_person_to_images_map = defaultdict(list)  
for x in train_images:
    train_person_to_images_map[x.split("/")[-3] + "/" + x.split("/")[-2]].append(x)

val_person_to_images_map = defaultdict(list)

for x in val_images:
    val_person_to_images_map[x.split("/")[-3] + "/" + x.split("/")[-2]].append(x)

In [6]:
# transform training labels into variables so that we can use it suring training
relationships = pd.read_csv(train_file_path)
relationships = list(zip(relationships.p1.values, relationships.p2.values))
relationships = [x for x in relationships if x[0] in ppl and x[1] in ppl]

In [7]:
# get training sets and validation sets
train = [x for x in relationships if val_famillies not in x[0]]
val = [x for x in relationships if val_famillies in x[0]]

In [8]:
# function to read images
def read_img(path):
    img = cv2.imread(path)
    return preprocess_input(img)

In [9]:
# generate input pairs
def gen(list_tuples, person_to_images_map, batch_size=16):
    ppl = list(person_to_images_map.keys())
    while True:
        batch_tuples = sample(list_tuples, batch_size // 2) # choose batch_size/2 pairs of relationships randomly from training set(having relaionship),which outputs 1
        labels = [1] * len(batch_tuples)
        while len(batch_tuples) < batch_size: # choose batch_size/2 pairs of relationships randomly(not having relationships), which outputs 0
            p1 = choice(ppl)
            p2 = choice(ppl)

            if p1 != p2 and (p1, p2) not in list_tuples and (p2, p1) not in list_tuples: 
                batch_tuples.append((p1, p2))
                labels.append(0)

        for x in batch_tuples: # if it's null, print it
            if not len(person_to_images_map[x[0]]):
                print(x[0])

        # generate input pairs with labels, which we will use in model.fit_generator()
        X1 = [choice(person_to_images_map[x[0]]) for x in batch_tuples]
        X1 = np.array([read_img(x) for x in X1])

        X2 = [choice(person_to_images_map[x[1]]) for x in batch_tuples]
        X2 = np.array([read_img(x) for x in X2])

        yield [X1, X2], labels

In [10]:
# structure of siamese network 
def baseline_model():
    input_1 = Input(shape=(224, 224, 3))
    input_2 = Input(shape=(224, 224, 3))

    base_model = ResNet50(weights='imagenet', include_top=False)

    for x in base_model.layers[:]:
        x.trainable = True

    x1 = base_model(input_1)
    x2 = base_model(input_2) # here is two RestNet 50 deep learning model

    x1 = Concatenate(axis=-1)([GlobalMaxPool2D()(x1), GlobalAvgPool2D()(x1)])
    x2 = Concatenate(axis=-1)([GlobalMaxPool2D()(x2), GlobalAvgPool2D()(x2)])

    x3 = Subtract()([x1, x2])
    x3 = Multiply()([x3, x3])

    x = Multiply()([x1, x2]) # x is distance difference

    x = Concatenate(axis=-1)([x, x3])

    # here is the third deep learning model, input is distance difference, output is 0/1
    x = Dense(100, activation="relu")(x)
    x = Dropout(0.01)(x)
    out = Dense(1, activation="sigmoid")(x)

    model = Model([input_1, input_2], out)

    model.compile(loss="binary_crossentropy", metrics=['acc'], optimizer=Adam(0.00001))

    model.summary()

    return model

In [11]:
# Transfer learning here, baseline.h5 is the ResNet50 weights download from Internet.
file_path = "baseline.h5"

checkpoint = ModelCheckpoint(file_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

reduce_on_plateau = ReduceLROnPlateau(monitor="val_acc", mode="max", factor=0.1, patience=20, verbose=1)

callbacks_list = [checkpoint, reduce_on_plateau]

model = baseline_model()
# start to train the whole model
model.fit_generator(gen(train, train_person_to_images_map, batch_size=16), use_multiprocessing=True,
                    validation_data=gen(val, val_person_to_images_map, batch_size=16), epochs=100, verbose=2,
                    workers=4, callbacks=callbacks_list, steps_per_epoch=200, validation_steps=100)

test_path = "/kaggle/input/test/"



Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
resnet50 (Model)                multiple             23587712    input_1[0][0]                    
                                                                 input_2[0][0]                    
____________________________________________



 - 143s - loss: 12.4057 - acc: 0.4956 - val_loss: 9.9277 - val_acc: 0.5206

Epoch 00001: val_acc improved from -inf to 0.52062, saving model to baseline.h5
Epoch 2/100
 - 92s - loss: 7.3968 - acc: 0.5362 - val_loss: 7.4468 - val_acc: 0.5231

Epoch 00002: val_acc improved from 0.52062 to 0.52312, saving model to baseline.h5
Epoch 3/100
 - 92s - loss: 5.2469 - acc: 0.5509 - val_loss: 4.7993 - val_acc: 0.5200

Epoch 00003: val_acc did not improve from 0.52312
Epoch 4/100
 - 93s - loss: 3.6917 - acc: 0.5562 - val_loss: 2.6100 - val_acc: 0.5406

Epoch 00004: val_acc improved from 0.52312 to 0.54062, saving model to baseline.h5
Epoch 5/100
 - 92s - loss: 2.6764 - acc: 0.5616 - val_loss: 3.5156 - val_acc: 0.5394

Epoch 00005: val_acc did not improve from 0.54062
Epoch 6/100
 - 93s - loss: 1.9144 - acc: 0.5656 - val_loss: 1.4335 - val_acc: 0.5356

Epoch 00006: val_acc did not improve from 0.54062
Epoch 7/100
 - 92s - loss: 1.5187 - acc: 0.5644 - val_loss: 0.8385 - val_acc: 0.5475

Epoch 00007:

In [12]:
# seperate testing datasets
def chunker(seq, size=32):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

In [13]:
# start to test relationship in sample_submission.csv
from tqdm import tqdm

submission = pd.read_csv('../input/sample_submission.csv')

predictions = []

for batch in tqdm(chunker(submission.img_pair.values)):
    X1 = [x.split("-")[0] for x in batch]
    X1 = np.array([read_img(test_path + x) for x in X1])

    X2 = [x.split("-")[1] for x in batch]
    X2 = np.array([read_img(test_path + x) for x in X2])

    pred = model.predict([X1, X2]).ravel().tolist()
    predictions += pred

166it [01:13,  2.26it/s]


In [14]:
# We get the final result baseline.csv. After uploading to the Kaggle, we get the accuracy: 75.2
submission['is_related'] = predictions

submission.to_csv("baseline.csv", index=False)