In [None]:
import tensorflow as tf
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Parameters

In [None]:
# --- Run mode -------------------------------------------------------------
# 'submission'    : embed the test set and write submission.csv
# 'get_threshold' : embed the training set so distance thresholds can be tuned
execution_mode = 'submission'

# --- Image pipeline -------------------------------------------------------
# Every image is resized to (img_height, img_width) before being fed to the CNN.
img_height = img_width = 75
# Number of images per batch when computing image embeddings.
batch_size = 64

# --- Title pipeline -------------------------------------------------------
# Tokenizer settings: vocabulary cap and the placeholder for unseen words.
num_words = 20000
oov_token = '<oov>'
# Titles are padded / truncated at the end to exactly max_len token ids.
max_len = 20
padding = 'post'
truncating = 'post'
# Width of one token vector when averaging title embeddings.
embedding_dim = 50

In [None]:
# Pre-trained networks saved as HDF5 files.
# base_network_cnn is called on image batches to produce image embeddings.
base_network_cnn = tf.keras.models.load_model(
    filepath='../input/shopee-base-network-cnn-final/base_network_cnn.h5'
)
# base_network_seq is only used for the weights of its layer at index 1,
# which get_title_embeddings reads as the token embedding matrix.
base_network_seq = tf.keras.models.load_model(
    filepath='../input/shopee-product-matching-seq-final/base_network_seq.h5'
)

# Data preparation

In [None]:
def create_sequence(data, col):
    """Turn a text column into a padded array of token ids.

    Parameters
    ----------
    data : mapping of column name -> pandas Series (e.g. a DataFrame)
        Table holding the text column.
    col : str
        Name of the text column to encode.

    Returns
    -------
    numpy.ndarray
        One row per text, each padded / truncated to ``max_len`` ids using
        the module-level ``padding`` and ``truncating`` settings.

    Relies on the module-level ``tokenizer`` having been fitted already.
    """
    texts = list(data[col].values)
    token_ids = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        token_ids,
        maxlen=max_len,
        truncating=truncating,
        padding=padding,
    )
    return np.array(padded)
# Deletion table for clean_data: every ASCII punctuation character and every
# decimal digit maps to "remove". Built once so each call is a single pass.
_CLEAN_DATA_DELETE_TABLE = str.maketrans(
    '', '',
    '!"#$%&\'()*+,-./'
    '0123456789'
    ':;<=>?@'
    '[\\]^_`'
    '{|}~',
)


def clean_data(x):
    """Strip ASCII punctuation and digits from a title string.

    Parameters
    ----------
    x : str
        Raw title text.

    Returns
    -------
    str
        ``x`` with the characters ``!"#$%&'()*+,-./0-9:;<=>?@[\\]^_`{|}~``
        removed. Whitespace, letters and non-ASCII characters are untouched;
        removed characters are not replaced by spaces, so neighbouring words
        may be joined and double spaces may remain.
    """
    return x.translate(_CLEAN_DATA_DELETE_TABLE)

# Learn the tokenizer vocabulary from the cleaned training titles.
train_data = pd.read_csv('../input/shopee-product-matching/train.csv')
train_data['title'] = train_data['title'].apply(clean_data)
# Pull the column out directly; iterating rows with iterrows() just to copy
# one column is far slower and yields the same list.
titles = train_data['title'].tolist()
tokenizer = Tokenizer(oov_token=oov_token, num_words=num_words)
tokenizer.fit_on_texts(titles)

# Get Embeddings

In [None]:
def _load_image(image_dir, image):
    """Read one JPEG from ``image_dir`` and return it resized as float32.

    Parameters
    ----------
    image_dir : string tensor
        Directory prefix, including the trailing slash.
    image : string tensor
        File name of the image inside ``image_dir``.

    Returns
    -------
    dict
        ``{'image': tensor}`` where the tensor has been converted to
        ``tf.float32`` and resized to ``[img_height, img_width]``.
    """
    path = tf.strings.join([image_dir, image])
    raw = tf.io.read_file(path)
    # NOTE(review): decode_jpeg is called without `channels`, so the channel
    # count follows the file. A grayscale JPEG would produce a differently
    # shaped tensor than an RGB one - confirm all inputs are RGB.
    img = tf.image.decode_jpeg(raw)
    img = tf.image.convert_image_dtype(img, tf.float32)
    img = tf.image.resize(img, [img_height, img_width])
    return {'image': img}


def process_test_images(image):
    """Load and preprocess one image from the competition's test_images folder.

    ``image`` is the file name; returns ``{'image': preprocessed tensor}``.
    """
    return _load_image(
        tf.constant('/kaggle/input/shopee-product-matching/test_images/'), image
    )


def process_train_images(image):
    """Load and preprocess one image from the competition's train_images folder.

    ``image`` is the file name; returns ``{'image': preprocessed tensor}``.
    """
    return _load_image(
        tf.constant('/kaggle/input/shopee-product-matching/train_images/'), image
    )
def get_image_embeddings(data):
    """Compute CNN embeddings for every image listed in ``data['image']``.

    Parameters
    ----------
    data : mapping with an ``'image'`` entry (e.g. a DataFrame)
        Image file names. They are resolved against the test image folder
        when ``execution_mode == 'submission'`` and against the train image
        folder otherwise.

    Returns
    -------
    numpy.ndarray
        Predictions of ``base_network_cnn`` for all images, stacked along
        axis 0 in input order. An empty array is returned when the dataset
        yields no batches.
    """
    if execution_mode == 'submission':
        process_fn = process_test_images
    else:
        process_fn = process_train_images

    pred_dataset = (
        tf.data.Dataset.from_tensor_slices(data['image'])
        .map(process_fn, num_parallel_calls=tf.data.AUTOTUNE)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )

    # Collect per-batch predictions and concatenate once at the end;
    # concatenating inside the loop re-copies everything on every batch.
    batch_vecs = [
        base_network_cnn.predict(single_batch['image'])
        for single_batch in pred_dataset
    ]
    if not batch_vecs:
        return np.array(batch_vecs)
    return np.concatenate(batch_vecs, axis=0)

def get_title_embeddings(data):
    """Embed each title as the mean of its token vectors.

    Token vectors are rows of the first weight array of
    ``base_network_seq.layers[1]``. Token id 0 (padding) is ignored.

    Parameters
    ----------
    data : mapping with a ``'title'`` entry (e.g. a DataFrame)
        Cleaned titles; they are tokenised via ``create_sequence``.

    Returns
    -------
    numpy.ndarray
        One float64 vector per title. A title with no non-padding tokens
        gets an all-zero vector instead of NaN (the previous 0/0 result),
        because NaN rows cannot be indexed by a nearest-neighbour search.
        Note that all such titles therefore share the same embedding.
    """
    weight_matrix = base_network_seq.layers[1].get_weights()[0]
    sequences = create_sequence(data, 'title')
    pred_vecs = []
    for seq in sequences:
        token_ids = seq[seq != 0]
        # Summing zero rows yields a zero vector of the right width, so the
        # width always follows the weight matrix itself.
        emb = weight_matrix[token_ids].sum(axis=0, dtype=np.float64)
        if token_ids.size:
            emb = emb / token_ids.size
        pred_vecs.append(emb)
    return np.array(pred_vecs)

# Find threshold

In [None]:
def get_true_matches(train_data):
    """Build the ground-truth match string for every row.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain ``'posting_id'`` (strings) and ``'label_group'``.

    Returns
    -------
    list of [str]
        For each row, in row order, a one-element list holding the
        space-separated posting ids of all rows sharing its label group
        (the row itself included), in frame order.
    """
    # Join the ids once per group instead of re-filtering the whole frame
    # for every row (which is quadratic in the number of rows).
    joined_by_group = train_data.groupby('label_group')['posting_id'].agg(' '.join)
    # Rows with a missing label match nothing, so they map to an empty string.
    per_row = train_data['label_group'].map(joined_by_group).fillna('')
    return [[matches] for matches in per_row]
def get_f1_score(actual, predicted):
    """F1 score between a true and a predicted set of posting ids.

    Parameters
    ----------
    actual : sequence whose first element is a str
        ``actual[0]`` is the whitespace-separated list of true ids.
    predicted : str
        Whitespace-separated list of predicted ids.

    Returns
    -------
    float or int
        Harmonic mean of precision and recall over the de-duplicated ids,
        or ``0`` when the two sets do not overlap.
    """
    truth = set(actual[0].split())
    guess = set(predicted.split())
    hits = len(truth & guess)

    # No overlap: precision and recall are both zero.
    if not hits > 0:
        return 0

    false_pos = len(guess) - hits
    false_neg = len(truth) - hits
    precision = float(hits) / (hits + false_pos)
    recall = float(hits) / (hits + false_neg)
    return 2 * ((precision * recall) / (precision + recall))

if execution_mode == 'get_threshold':
    # Embeddings and neighbour indexes are built over the FULL training set;
    # only the scoring loop in get_thresold restricts itself to the leading
    # rows (it stops after index 4001).
    train_data = pd.read_csv('../input/shopee-product-matching/train.csv')
    train_data['title'] = train_data['title'].apply(clean_data)
    image_embeddings = get_image_embeddings(train_data)
    title_embeddings = get_title_embeddings(train_data)
    # Ground-truth match strings, aligned with the rows of train_data
    y_true_all = get_true_matches(train_data)
    all_posting_id = train_data['posting_id'].values
    # Never request more neighbours than there are rows (same clamp as the
    # submission branch); kneighbors raises otherwise.
    k = min(50, image_embeddings.shape[0])
    nbrs_image = NearestNeighbors(n_neighbors=k, algorithm='auto').fit(image_embeddings)
    nbrs_title = NearestNeighbors(n_neighbors=k, algorithm='auto').fit(title_embeddings)

In [None]:
def get_thresold(nbrs_image, nbrs_title, image_embeddings, title_embeddings, thrld_img, thrld_seq):
    """Mean F1 obtained with a given pair of distance thresholds.

    For each evaluated row, neighbours closer than ``thrld_img`` in image
    space and closer than ``thrld_seq`` in title space are merged into one
    prediction, which is scored against ``y_true_all`` with
    ``get_f1_score``. Only rows 0..4001 are evaluated.

    Parameters
    ----------
    nbrs_image, nbrs_title : fitted neighbour indexes exposing ``kneighbors``
    image_embeddings, title_embeddings : numpy.ndarray
        Row-aligned embeddings to query with.
    thrld_img, thrld_seq : float
        Strict upper bounds on neighbour distance for images / titles.

    Returns
    -------
    float
        Mean of the per-row F1 scores.

    Reads the module-level ``all_posting_id`` and ``y_true_all``.
    """
    def _ids_within(nbrs, vector, limit):
        # Query a single row and keep the ids of neighbours under the limit.
        dist, nbr_idx = nbrs.kneighbors(np.expand_dims(vector, axis=0))
        keep = np.where(dist[0] < limit)[0]
        return all_posting_id[nbr_idx[0, keep]]

    # The evaluation is capped at the first 4002 rows.
    n_eval = min(image_embeddings.shape[0], 4002)
    scores = []
    for row in range(n_eval):
        # Matches from image embeddings
        img_ids = _ids_within(nbrs_image, image_embeddings[row], thrld_img)
        # Matches from title embeddings
        title_ids = _ids_within(nbrs_title, title_embeddings[row], thrld_seq)
        # Union of both predictions
        merged = set(np.concatenate((img_ids, title_ids), axis=0))
        scores.append(get_f1_score(y_true_all[row], ' '.join(merged)))
    return np.mean(scores)

In [None]:
#score = get_thresold(nbrs_image, nbrs_title, image_embeddings, title_embeddings, 0.1, 0.05)
#print(score)

# Submission

In [None]:
def make_prediction(nbrs_image, nbrs_title, image_embeddings, title_embeddings, thrld_img, thrld_seq):
    """Predict the matching posting ids for every row.

    For each row, neighbours closer than ``thrld_img`` in image space and
    closer than ``thrld_seq`` in title space are merged into one
    space-separated string.

    Parameters
    ----------
    nbrs_image, nbrs_title : fitted neighbour indexes exposing ``kneighbors``
    image_embeddings, title_embeddings : numpy.ndarray
        Row-aligned embeddings to query with.
    thrld_img, thrld_seq : float
        Strict upper bounds on neighbour distance for images / titles.

    Returns
    -------
    pandas.DataFrame
        Columns ``'posting_id'`` and ``'matches'``, one row per embedding.
        Ids inside ``matches`` are unique and ordered deterministically:
        image matches first, then title matches not already listed, each in
        the order returned by ``kneighbors``.

    Reads the module-level ``all_posting_id``.
    """
    def _ids_within(nbrs, vector, limit):
        # Query a single row and keep the ids of neighbours under the limit.
        dist, nbr_idx = nbrs.kneighbors(np.expand_dims(vector, axis=0))
        keep = np.where(dist[0] < limit)[0]
        return all_posting_id[nbr_idx[0, keep]]

    all_matches = []
    for row in range(image_embeddings.shape[0]):
        # Matches from image embeddings
        img_ids = _ids_within(nbrs_image, image_embeddings[row], thrld_img)
        # Matches from title embeddings
        title_ids = _ids_within(nbrs_title, title_embeddings[row], thrld_seq)
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # output no longer depends on string hash ordering like a set does.
        merged = dict.fromkeys(np.concatenate((img_ids, title_ids), axis=0))
        all_matches.append([all_posting_id[row], ' '.join(merged)])
    return pd.DataFrame(data=all_matches, columns=['posting_id', 'matches'])

In [None]:
if execution_mode == 'submission':
    # Load and clean the test titles the same way the training titles were cleaned.
    test_data = pd.read_csv('../input/shopee-product-matching/test.csv')
    test_data['title'] = test_data['title'].apply(clean_data)

    # Embed both modalities; rows stay aligned with test_data.
    image_embeddings = get_image_embeddings(test_data)
    title_embeddings = get_title_embeddings(test_data)
    all_posting_id = test_data['posting_id'].values

    # Ask for at most 50 neighbours, fewer when the test set is smaller.
    k = min(50, image_embeddings.shape[0])
    nbrs_image = NearestNeighbors(n_neighbors=k, algorithm='auto').fit(image_embeddings)
    nbrs_title = NearestNeighbors(n_neighbors=k, algorithm='auto').fit(title_embeddings)

    # Distance thresholds: 0.1 for images, 0.05 for titles.
    submission_df = make_prediction(
        nbrs_image, nbrs_title, image_embeddings, title_embeddings, 0.1, 0.05
    )
    submission_df.to_csv('./submission.csv', index=False)
    #submission_df.head()

In [None]:
#submission_df.head()