In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Parameters

In [None]:
# --- Image pipeline --------------------------------------------------------
# Target size every image is resized to, and the number of images per batch.
img_height, img_width = 75, 75
batch_size = 64

# --- Tokenizer -------------------------------------------------------------
# Vocabulary cap and the placeholder token used for out-of-vocabulary words.
num_words = 20000
oov_token = '<oov>'

# --- Sequence padding ------------------------------------------------------
# Every title is padded/truncated at the end to exactly `max_len` token ids.
max_len = 20
padding = 'post'
truncating = 'post'

# --- Embedding -------------------------------------------------------------
# Width of the word-embedding vectors.
embedding_dim = 50

# Data Preparation

In [None]:
def create_sequence(data, col):
    """Turn a text column into a fixed-width array of token ids.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column.
    col : str
        Name of the column whose values are tokenized.

    Returns
    -------
    numpy.ndarray
        One row per input text, produced by the module-level ``tokenizer``
        and padded/truncated with the module-level ``max_len``, ``padding``
        and ``truncating`` settings.
    """
    texts = list(data[col].values)
    token_ids = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        token_ids,
        maxlen=max_len,
        padding=padding,
        truncating=truncating,
    )
    return np.array(padded)
# Deletion table for clean_data: every ASCII punctuation mark and every digit.
# Built once so each call is a single pass over the string.
_CLEAN_DATA_TABLE = str.maketrans(
    '', '', '!"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~'
)


def clean_data(x):
    """Strip ASCII punctuation and digits from a string.

    The characters are deleted, not replaced by a space, so ``"t-shirt"``
    becomes ``"tshirt"``. Whitespace, letters and non-ASCII characters are
    left untouched.

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        ``x`` with the listed characters removed.
    """
    return x.translate(_CLEAN_DATA_TABLE)

# Learn the tokenizer vocabulary from the cleaned training titles.
train_data = pd.read_csv('../input/shopee-product-matching/train.csv')
train_data['title'] = train_data['title'].apply(clean_data)
# Plain list of titles; taken straight from the column instead of looping
# over rows with iterrows().
titles = train_data['title'].tolist()
tokenizer = Tokenizer(oov_token=oov_token, num_words=num_words)
tokenizer.fit_on_texts(titles)

# Load model

In [None]:
# Load the two saved Keras models from the attached input datasets.
# base_network_cnn: used by make_prediction_cnn to turn image batches into vectors.
base_network_cnn = tf.keras.models.load_model('../input/shopee-base-network-cnn-final/base_network_cnn.h5')
# base_network_seq: only the weights of its layer at index 1 are read
# (in make_prediction_seq) -- presumably the embedding layer; confirm against the saved model.
base_network_seq = tf.keras.models.load_model('../input/shopee-product-matching-seq-final/base_network_seq.h5')

# Make Prediction

In [None]:
def process_test_images(image, image_dir='/kaggle/input/shopee-product-matching/test_images/'):
    """Read one image file and convert it to a resized float tensor.

    Parameters
    ----------
    image : str or scalar string tensor
        File name of the image, relative to ``image_dir``.
    image_dir : str, optional
        Directory prefix the file name is appended to. The two strings are
        concatenated as-is, so it must end with a path separator. Defaults to
        the competition's test-image folder; pass another folder (for example
        via ``functools.partial``) to process images stored elsewhere.

    Returns
    -------
    dict
        ``{'image': tensor}`` where the tensor is float32, resized to
        ``(img_height, img_width)`` with three channels.
    """
    img_path = tf.strings.join([tf.constant(image_dir), image])
    raw_bytes = tf.io.read_file(img_path)
    # Force three channels. Without `channels`, the decoded tensor has as many
    # channels as the file itself, so a single grayscale or CMYK JPEG would
    # yield a different shape and break batching.
    # NOTE(review): assumes base_network_cnn expects 3-channel input -- confirm.
    img = tf.image.decode_jpeg(raw_bytes, channels=3)
    # uint8 -> float32 conversion also rescales pixel values to [0, 1].
    img = tf.image.convert_image_dtype(img, tf.float32)
    img = tf.image.resize(img, [img_height, img_width])
    return {'image': img}
def make_prediction_cnn(data, thld):
    """Match postings by the distance between their CNN image vectors.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``image`` column (file names handed to
        ``process_test_images``) and a ``posting_id`` column of strings.
    thld : float
        Euclidean-distance threshold; two postings match when the distance
        between their vectors is strictly below it.

    Returns
    -------
    pandas.DataFrame
        Columns ``posting_id`` and ``matches``; ``matches`` is a
        space-separated string of matching posting ids in input order.
        Empty (but with both columns) when ``data`` has no rows.
    """
    pred_dataset = tf.data.Dataset.from_tensor_slices((data['image']))
    pred_dataset = pred_dataset.map(process_test_images, num_parallel_calls=tf.data.AUTOTUNE)
    pred_dataset = pred_dataset.batch(batch_size).prefetch(1)

    # Collect per-batch outputs and concatenate once; concatenating inside the
    # loop re-copies everything gathered so far on every batch.
    batch_vecs = [base_network_cnn.predict(batch['image']) for batch in pred_dataset]
    if not batch_vecs:
        return pd.DataFrame(data=[], columns=['posting_id', 'matches'])
    # Assumes the model emits one flat vector per image, i.e. shape (n, dim).
    pred_vecs = np.concatenate(batch_vecs, axis=0)

    all_posting_ids = data['posting_id'].values
    all_matches = []
    for row, vec in enumerate(pred_vecs):
        # Euclidean distance from this posting's vector to every vector.
        dist = np.linalg.norm(pred_vecs - vec, axis=1)
        match_indices = np.where(dist < thld)[0]
        matches = ' '.join(all_posting_ids[i] for i in match_indices)
        all_matches.append([all_posting_ids[row], matches])
    return pd.DataFrame(data=all_matches, columns=['posting_id', 'matches'])

def make_prediction_seq(data, thld):
    """Match postings by the distance between their averaged title embeddings.

    Each title is tokenized with ``create_sequence``; the weight rows of its
    non-padding token ids are averaged into one vector per posting, and
    postings whose vectors are closer than ``thld`` are reported as matches.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``title`` column and a ``posting_id`` column of strings.
    thld : float
        Euclidean-distance threshold; two postings match when the distance
        between their vectors is strictly below it.

    Returns
    -------
    pandas.DataFrame
        Columns ``posting_id`` and ``matches``; ``matches`` is a
        space-separated string of matching posting ids in input order.

    Notes
    -----
    A title with no tokens (all padding) has no embedding. It is given a NaN
    vector so that it matches no other posting, and its self-distance is
    forced to zero so that it still matches itself. Previously this case
    divided by zero and the posting ended up with an empty match list.
    """
    # Weights of the layer at index 1 -- presumably the embedding matrix,
    # one row per token id; confirm against the saved model.
    weight_metrix = base_network_seq.layers[1].get_weights()[0]
    sequences = create_sequence(data, 'title')

    # Width comes from the weights themselves so the two can never disagree.
    pred_vecs = np.full((len(sequences), weight_metrix.shape[1]), np.nan, dtype=np.float64)
    for row, seq in enumerate(sequences):
        seq = np.asarray(seq)
        token_ids = seq[seq != 0]  # id 0 is padding
        if token_ids.size:
            pred_vecs[row] = weight_metrix[token_ids].mean(axis=0, dtype=np.float64)

    all_posting_ids = data['posting_id'].values
    all_matches = []
    for row, vec in enumerate(pred_vecs):
        # Euclidean distance from this posting's vector to every vector.
        dist = np.linalg.norm(pred_vecs - vec, axis=1)
        # Self-distance is 0 by definition; set it explicitly so the NaN
        # vectors of token-less titles still match themselves.
        dist[row] = 0.0
        match_indices = np.where(dist < thld)[0]
        matches = ' '.join(all_posting_ids[i] for i in match_indices)
        all_matches.append([all_posting_ids[row], matches])
    return pd.DataFrame(data=all_matches, columns=['posting_id', 'matches'])

def merge_prediction(cnn_prediction, seq_prediction):
    """Union the image-based and title-based matches of every posting.

    Parameters
    ----------
    cnn_prediction : pandas.DataFrame
        Columns ``posting_id`` and ``matches`` (space-separated ids). Its row
        order determines the row order of the result.
    seq_prediction : pandas.DataFrame
        Same layout. A posting absent from this frame contributes no extra
        matches.

    Returns
    -------
    pandas.DataFrame
        Columns ``posting_id`` and ``matches``. Each ``matches`` string lists
        the distinct ids in a deterministic order: image matches first, then
        any additional title matches.
    """
    # One dictionary lookup per posting instead of filtering the whole frame
    # for every row. drop_duplicates keeps the first row for a repeated id.
    seq_lookup = (
        seq_prediction.drop_duplicates('posting_id')
        .set_index('posting_id')['matches']
        .to_dict()
    )

    final_result = []
    for posting_id, cnn_matches in zip(cnn_prediction['posting_id'], cnn_prediction['matches']):
        # split() without an argument drops empty tokens, so an empty match
        # string does not inject a blank id into the result.
        candidates = cnn_matches.split() + seq_lookup.get(posting_id, '').split()
        # dict.fromkeys de-duplicates while preserving first-seen order.
        merged = list(dict.fromkeys(candidates))
        final_result.append([posting_id, ' '.join(merged)])
    return pd.DataFrame(data=final_result, columns=['posting_id', 'matches'])

In [None]:
test_data = pd.read_csv('/kaggle/input/shopee-product-matching/test.csv')
# The tokenizer was fit on titles passed through clean_data, so the test
# titles get the same cleaning before they are tokenized; otherwise words
# containing digits or punctuation would not map to the learned vocabulary.
test_data['title'] = test_data['title'].apply(clean_data)
# Distance thresholds for the image model and the title model respectively.
cnn_prediction = make_prediction_cnn(test_data, 0.08)
seq_prediction = make_prediction_seq(test_data, 0.02)
submission_df = merge_prediction(cnn_prediction, seq_prediction)
submission_df.to_csv('./submission.csv', index=False)
#submission_df.head()

# Validation

In [None]:
def get_true_matches(train_data):
    """Build the ground-truth match list of every posting from its label group.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain ``posting_id`` (strings) and ``label_group`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``posting_id`` and ``matches``; ``matches`` is the
        space-separated list of all posting ids sharing the row's
        ``label_group`` (the posting itself included), in input order.
    """
    posting_ids = train_data['posting_id'].tolist()
    labels = train_data['label_group'].tolist()

    # Group the ids by label in one pass instead of filtering the whole frame
    # once per row.
    members = {}
    for posting_id, label in zip(posting_ids, labels):
        members.setdefault(label, []).append(posting_id)

    all_matches = [
        [posting_id, ' '.join(members[label])]
        for posting_id, label in zip(posting_ids, labels)
    ]
    return pd.DataFrame(data=all_matches, columns=['posting_id', 'matches'])

def get_f1_score(actual, predicted):
    """Mean per-posting F1 score between true and predicted match lists.

    Parameters
    ----------
    actual : pandas.DataFrame
        Columns ``posting_id`` and ``matches`` (space-separated true ids).
    predicted : pandas.DataFrame
        Same layout with the predicted ids. A posting missing from this frame
        is scored as an empty prediction (F1 = 0).

    Returns
    -------
    float
        Average of the per-posting F1 scores over the rows of ``actual``;
        ``0.0`` when ``actual`` has no rows.
    """
    # One dictionary lookup per posting instead of filtering the whole frame
    # for every row. drop_duplicates keeps the first row for a repeated id.
    predicted_lookup = (
        predicted.drop_duplicates('posting_id')
        .set_index('posting_id')['matches']
        .to_dict()
    )

    scores = []
    for posting_id, true_matches in zip(actual['posting_id'], actual['matches']):
        tags = set(true_matches.split())
        pred = set(predicted_lookup.get(posting_id, '').split())
        tp = len(tags & pred)
        # F1 = 2*P*R / (P + R) reduces to 2*tp / (|true| + |pred|).
        # tp > 0 guarantees the denominator is non-zero.
        scores.append(2.0 * tp / (len(tags) + len(pred)) if tp > 0 else 0.0)

    if not scores:
        return 0.0
    return float(np.mean(scores))

In [None]:
# Disabled validation run, kept inside a string literal so it does not execute.
# It holds out 10% of train_data, gathers every training row whose label_group
# appears in the hold-out, and runs both predictors plus the merge on that frame.
# NOTE(review): train_test_split is called without random_state, so the split
# changes on every run.
# NOTE(review): val_df rows come from train.csv, while the image loader resolves
# file names against the test_images folder by default -- confirm the image
# path before re-enabling.
'''
training_data, testing_data = train_test_split(train_data, test_size=0.1)
id1 = testing_data['label_group'].unique()
val_df = pd.DataFrame(data=id1, columns=['lkp_label_group'])
val_df = pd.merge(train_data, val_df, left_on=['label_group'], right_on=['lkp_label_group'], 
                      how='inner')
cnn_prediction = make_prediction_cnn(val_df, 0.14)
seq_prediction = make_prediction_seq(val_df, 0.055)
prediction = merge_prediction(cnn_prediction, seq_prediction)
'''

In [None]:
#true_matches = get_true_matches(val_df)
#print("Validation F1 Score: {}".format(get_f1_score(true_matches, prediction)))

In [None]:
# Recorded validation result: thresholds 0.11 / 0.055 (presumably cnn / seq) -> F1 score 0.6119074216242151