In [65]:
import pandas as pd
import numpy as np

import tensorflow as tf
from annoy import AnnoyIndex

import sys
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import cv2

sys.path.append('../src')

## Import data

In [66]:
images = pd.read_csv("../data/pareto_images/images.csv")
images['product_id'] = images.product_id.astype(str)

for i, image in tqdm(images.iterrows(), total=len(images)):
    fname = image.filename
    split = fname.split('.')
    if len(split) != 2 or split[-1] != 'jpg':
        new_fname = f"{split[0]}.jpg"
        images.loc[i, "filename"] = new_fname

100%|██████████| 13631/13631 [00:00<00:00, 15034.31it/s]


In [67]:
products = pd.read_csv('../data/pareto_training.csv')
products = products.dropna(subset=['id'])
products['id'] = products.id.astype(int).astype(str)
products = products[products.apply(lambda x: len(images[images.product_id == x.id]) > 0, axis=1)]

In [68]:
products['image_filename'] = products.apply(lambda x: images[images.product_id == x.id].iloc[0].filename, axis=1) # select first occurence of image
products['image_media_url'] = products.apply(lambda x: images[images.product_id == x.id].iloc[0].media_url, axis=1)
products['image_id'] = products.apply(lambda x: images[images.product_id == x.id].iloc[0].id, axis=1)

In [69]:
matching_products = products.dropna(subset=['master_product'])
matching_products = matching_products.reset_index(drop=True)
master_products = matching_products.master_product_fill.unique()
matching_products['master_product_fill'] = matching_products.apply(lambda x: np.where(x.master_product_fill == master_products)[0][0], axis=1)

In [8]:
images = pd.read_csv('../data/pareto_images/results-20210629-142420.csv')

In [41]:
images = images.rename(columns={'id': 'product_id'})
images['id'] = images.index

In [42]:
images['filename'] = images.apply(lambda x: x.media_url.split('/')[-1], axis=1)

In [43]:
for i, image in tqdm(images.iterrows(), total=len(images)):
    fname = image.filename
    split = fname.split('.')
    if len(split) != 2 or split[-1] != 'jpg':
        new_fname = f"{split[0]}.jpg"
#         print(fname, new_fname)
        try:
#             os.rename(os.path.join("../data/pareto_images/images", fname), os.path.join("../data/pareto_images/images", new_fname))
            images.loc[i, "filename"] = new_fname
        except FileNotFoundError:
            pass

100%|██████████| 13631/13631 [00:03<00:00, 3856.01it/s]


In [88]:
images.to_csv("../data/pareto_images/images.csv")

## Preprocess
Create annoy index using feature extraction and find batch hard negative and hard positive

Train, test, split

Import images using keras ImageDataGenerator and flow_from_dataframe

In [126]:
# constants
EPOCHS = 10
BATCH_SIZE = 32
INPUT_SIZE = (224,224)

In [138]:
# Data augmentation
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.vgg19 import preprocess_input

aug = ImageDataGenerator(
    preprocessing_function=preprocess_input
)

In [139]:
# create data generator helper
def make_datagen(df, label='id'):
    image_gen = aug.flow_from_dataframe(
        dataframe=df,
        directory='../data/pareto_images/images',
        x_col="image_filename",
        y_col=label,
        batch_size=BATCH_SIZE,
        shuffle=False,
        target_size=INPUT_SIZE,
        class_mode='raw'
    )
    return image_gen

In [12]:
# only matching products
image_gen = make_datagen(matching_products)

Found 495 validated image filenames.


In [7]:
# all products
image_gen = make_datagen(products)

Found 13344 images belonging to 1 classes.


In [124]:
from tensorflow.keras.applications import VGG19
feature_model1 = VGG19(weights='imagenet')
feature_model2 = VGG19(weights='imagenet')
feature_model1._name = 'vgg1'
feature_model2._name = 'vgg2'

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels.h5


In [138]:
features = feature_model1.predict(image_gen, verbose=1)



In [75]:
index = AnnoyIndex(1000, 'angular')

In [146]:
id_mapping = id_mapping = dict(zip(range(len(products.id.values)), products.id.values))
for i in tqdm(range(features.shape[0])):
    vec = features[i]
    fname = image_gen.filenames[i]
#     img = images[images.filename == fname.split('/')[-1]].iloc[0]
    prod = products[products.image_filename == fname.split('/')[-1]].iloc[0]
    index.add_item(i, vec.tolist())
index.build(10)
index.save('match_images.ann')

100%|██████████| 495/495 [00:00<00:00, 986.29it/s]


True

In [76]:
index.load('match_images.ann')

True

In [141]:
def display_nns(i):
    nn_ids, distances = index.get_nns_by_item(i, 10, include_distances=True)
    print(nn_ids)
    for id, dis in zip(nn_ids, distances):
        print(dis)
        fname = images.loc[id].filename
        plt.imshow(cv2.imread(os.path.join("../data/pareto_images/images", fname)))
        plt.show()

In [80]:
%load_ext autoreload

In [86]:
%autoreload 2

In [185]:
from preprocessing.batch_selection import batch_selection
match_df = batch_selection(matching_products, index)

100%|██████████| 122265/122265 [02:10<00:00, 938.39it/s]


In [186]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(match_df, test_size=0.1)

In [249]:
matching_products.image_filename.value_counts()

0_065f0262-6c84-4b1e-a868-3a2e8004cc29_700_700.jpg            3
154339058443124_c9e93930-0cc4-41ac-84b1-6a6d6da637b1.jpg      2
1678750_7010d0e3-2c41-4752-a204-1fcbdf498fd9_1280_1280.jpg    2
0e9aba56-d9b6-4134-9a55-60bdc01251c3.jpg                      2
153740950832024_9ac41b6d-1dfc-4c2f-92e7-3473153c5517.jpg      2
                                                             ..
6151906_e1e61aaf-fef9-4e04-b501-1825930a6881_2048_1536.jpg    1
35c947ce-ca25-424a-8c9d-f9c61ecffb2b.jpg                      1
201070617_46706b3b-9cd8-43d9-87f7-dc35eb1b0e6e_538_638.jpg    1
208955_cd7b0af8-d76b-40db-8299-fe71c968c370.jpg               1
27039034_68d18d0b-0016-4091-9c47-f036fe858b3a_600_600.jpg     1
Name: image_filename, Length: 488, dtype: int64

In [255]:
train.rename(columns={'id1': 'id'}).merge(matching_products,  on='id', how='left')[['image_filename', 'id', 'id2']]

Unnamed: 0,image_filename,id,id2
0,159107627636699_e50c771d-0e6b-49e4-a85d-356478...,876155444,1026125401
1,288123413_dfc3a53f-f0bc-4e44-a3ed-353f8faef5b9...,1028348220,1071694571
2,a81dfa78-a2c3-4af0-9f7a-a85561c03839.jpg,985032307,554501984
3,45633585_750fe894-3842-499e-bddf-e47505bb4e63_...,665111185,1421953695
4,8126895_968b2cc2-5993-4cd3-8a5f-13ece33003c9_9...,940529067,784053564
...,...,...,...
18046,2cdb5785-5dfa-480e-93fe-881f99eb72dc.jpg,276739489,528284138
18047,b2be7dde-35f7-46b3-8b13-5617d983aaaa.jpg,869885807,1247753893
18048,1593958_18a170fd-dd8f-4da8-9725-aeeaa8469fe9.jpg,345149106,338983558
18049,6374c6b5-d7a0-40fd-8b4e-052eaaf55f14.jpg,1249485411,252926634


In [254]:
train.rename(columns={'id2': 'id'}).merge(matching_products,  on='id', how='left')[['image_filename', 'id']]

Unnamed: 0,image_filename,id
0,3110822_ecf34c8a-1d21-497c-b2a1-9adc0842398a_2...,1026125401
1,97a57f5a-2c25-4378-95ed-223b17c8e8fb.jpg,1071694571
2,272063472_a1b17c5c-4792-4a03-9297-3e97fc417831...,554501984
3,2405075_eb668abb-df4d-4518-84da-b74fe85b23e9_1...,1421953695
4,6997796_cd7fbf3f-142c-49d1-82f8-14390cdb69bc_1...,784053564
...,...,...
18046,2c80c03f-7b2a-4eae-87a9-aa81b2f7736c.jpg,528284138
18047,576806c2-bf3c-4e4f-a7e2-6d043a2e3d42.jpg,1247753893
18048,40545836_f5c43a07-4c4a-4f64-a1b7-69c670ec3848_...,338983558
18049,808047_e6066aec-0e83-4fce-8178-9f94b21a5278_64...,252926634


In [264]:
def make_matching_dataset(df):
    gen_1 = make_datagen(df.rename(columns={'id1': 'id'}).merge(matching_products,  on='id', how='left'), label='match')
    gen_2 = make_datagen(df.rename(columns={'id2': 'id'}).merge(matching_products,  on='id', how='left'))
    while True:
        X1, Y = gen_1.next()
        X2, _ = gen_2.next()
        yield ([X1, X2], Y)

In [265]:
train_dataset = make_matching_dataset(train)
# val_dataset = make_matching_dataset(val)
test_dataset = make_matching_dataset(test)

In [189]:
def display_matches(match_dataset):
    (img1, img2), match = next(match_dataset)
    for i in range(BATCH_SIZE):
        fig, axs = plt.subplots(1, 2)
        fig.suptitle(match[i])
        axs[0].imshow(img1[i])
        axs[1].imshow(img2[i])

In [281]:
train.merge(products.rename(columns={'id': 'id1', 'name': 'name1', 'image_id': 'image_id2'})[['id1', 'name1', 'image_id2']], on='id1', how='left').merge(products.rename(columns={'id': 'id2', 'name': 'name2', 'image_id': 'image_id2'})[['id2', 'name2', 'image_id2']], on='id2', how='left')[['name1_y', 'name2_y', 'match']][:10]


Unnamed: 0,name1_y,name2_y,match
0,Susu diamond fullcream UHT 1 liter,Susu diamond uht full cream 1000ml (1 carton),1
1,Susu Diamond Coklat / Straw / Skim 1lt | fresh...,Susu greenfields fresh milk 1 liter,1
2,Ultra Milk 1 liter - Coklat,Susu Diamond Coklat / Strawberry / non fat 1lt...,0
3,Susu UHT Ultra Full Cream 1 Liter / Plain Ecer...,Susu Diamond Pasteurised Milk - PLAIN,0
4,susu diamond full cream UHT,susu greenfield UHT full cream 1 liter PCS,1
5,Nestle Carnation Susu Minuman Evaporasi 405 gr,susu diamond fresh milk 1 karton,0
6,"Susu UHT Ultra Full Cream 1 Liter, 1 Karton is...",Diamond Fresh Milk,0
7,Hilo Belgian Chocolate Milk Isi 10 Pcs | Hi Lo...,ANLENE ACTIFIT 600 GR - COKLAT,0
8,[ Susu UHT ] Ultra Milk 1 Liter Full Cream,Ultra Milk Full Cream 200 ML,1
9,Ultra Milk 125 ml - Coklat,Susu Diamond 1 Liter - Full Cream [12 Liter],0


In [282]:
# display_matches(train_dataset)

In [283]:
import tensorflow.keras.backend as K

def cosine_distance(vests):
    x, y = vests
    x = K.l2_normalize(x, axis=-1)
    y = K.l2_normalize(y, axis=-1)
    return -K.mean(x * y, axis=-1, keepdims=True)

def contrastive_loss(y, preds, margin=1):
    y = tf.cast(y, preds.dtype)

    squaredPreds = K.square(preds)
    squaredMargin = K.square(K.maximum(margin - preds, 0))
    loss = K.mean(y * squaredPreds + (1 - y) * squaredMargin)

    return loss

def euclidean_distance(vectors):
    # unpack the vectors into separate lists
    (featsA, featsB) = vectors
    # compute the sum of squared distances between the vectors
    sumSquared = K.sum(K.square(featsA - featsB), axis=1,
        keepdims=True)
    # return the euclidean distance between the vectors
    return K.sqrt(K.maximum(sumSquared, K.epsilon()))

In [284]:
from tensorflow.keras.layers import Input, Lambda, Dense
from tensorflow.keras.models import Model

img1 = Input(shape=(INPUT_SIZE[0], INPUT_SIZE[1], 3))
img2 = Input(shape=(INPUT_SIZE[0], INPUT_SIZE[1], 3))

feature_model1 = feature_model1(img1)
feature_model2 = feature_model2(img2)

distance = Lambda(euclidean_distance)([feature_model1, feature_model2])
# outputs = Dense(1, activation="sigmoid")(distance)
model = Model(inputs=[img1, img2], outputs=distance)

TypeError: 'KerasTensor' object is not callable

In [285]:
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_27 (InputLayer)           [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
input_28 (InputLayer)           [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
vgg1 (Functional)               (None, 1000)         143667240   input_27[0][0]                   
__________________________________________________________________________________________________
vgg2 (Functional)               (None, 1000)         143667240   input_28[0][0]                   
____________________________________________________________________________________________

In [286]:
model.compile(loss=contrastive_loss, optimizer='adam', metrics=['accuracy'])

In [None]:
model.fit(train_dataset, steps_per_epoch=(len(train) // BATCH_SIZE) + 1, epochs=EPOCHS)

Epoch 1/10

In [60]:
model.evaluate(test_dataset,batch_size=BATCH_SIZE, steps_per_epoch=(len(test) // BATCH_SIZE) + 1)

In [None]:
from sklearn.metrics import classification_report 
y_pred = model.predict(test_dataset, batch_size=32, verbose=1, steps=(len(test) // BATCH_SIZE) + 1)
print(classification_report(test.match.values, np.argmax(y_pred, axis=1)))

In [None]:
np.unique(y_pred)

In [64]:
model.save('../models/image_siamese')



INFO:tensorflow:Assets written to: ../models/image_siamese/assets
