In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import random
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.python.keras.utils.vis_utils import plot_model
from tensorflow.keras.utils import Progbar
from datetime import datetime
import time

# Parameters

In [None]:
# --- Run configuration ------------------------------------------------------
# 'train-test' and 'train' enable the training cells; any other value
# (such as 'submission') skips training and loads a previously saved network.
execution_mode = 'submission'

# --- Tokenisation / padding -------------------------------------------------
num_words = 20000       # vocabulary cap handed to the Tokenizer
oov_token = '<oov>'     # token used for out-of-vocabulary words
max_len = 20            # length every title sequence is padded/truncated to
padding = 'post'        # side on which pad_sequences pads
truncating = 'post'     # side on which pad_sequences truncates

# --- Model / optimisation ---------------------------------------------------
embedding_dim = 50      # output width of the Embedding layer
learning_rate = 0.0001  # learning rate given to Adam
batch_size = 64         # rows per tf.data batch
epochs = 5              # number of passes made by the custom training loop

# Data cleaning

In [None]:
# Deletion table covering every ASCII punctuation character and decimal digit.
# Built once so each call is a single pass over the text.
_CLEAN_DATA_TABLE = str.maketrans('', '', '!"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~')


def clean_data(x):
    """Return ``x`` with all ASCII punctuation and digits deleted.

    Characters are removed outright (not replaced by a space), so words that
    were separated only by punctuation end up joined. Letters, whitespace and
    non-ASCII characters are left untouched.
    """
    return x.translate(_CLEAN_DATA_TABLE)

In [None]:
def get_all_char(df, col_name):
    """Collect the distinct characters that occur in column ``col_name``.

    Every value of the column is iterated character by character and the
    union of all characters seen is returned as a ``set``.
    """
    seen = set()
    for text in df[col_name]:
        seen.update(text)
    return seen

In [None]:
# Load the training postings and run every title through clean_data
train_data = pd.read_csv('../input/shopee-product-matching/train.csv').assign(
    title=lambda frame: frame['title'].apply(clean_data)
)
#train_data.head()

# Data prep

In [None]:
def create_pair(train_data):
    """Build labelled posting pairs for siamese training.

    For every ``label_group``:

    * positive pairs (label 1) chain consecutive postings of the group:
      (p0, p1), (p1, p2), ...
    * negative pairs (label 0) match the group's postings position by position
      with the postings of one other, randomly chosen group; the shorter of
      the two groups determines how many pairs are produced.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain the columns ``posting_id``, ``label_group``, ``image``
        and ``title``.

    Returns
    -------
    pandas.DataFrame
        Columns ``posting_id1, posting_id2, image1, title1, image2, title2,
        label``. When only one label group exists no negative pair can be
        formed, so only positive pairs are returned.
    """
    # Group the posting ids once instead of re-filtering the frame per label.
    grouped = train_data.groupby('label_group', sort=False)['posting_id']
    members = [ids.values for _, ids in grouped]
    n_groups = len(members)

    pos_pair = []
    neg_pair = []
    for position, pos_list in enumerate(members):
        # Positive pairs: consecutive postings of the same group
        for first_id, second_id in zip(pos_list[:-1], pos_list[1:]):
            pos_pair.append([first_id, second_id, 1])

        # Negative pairs need at least one other group to draw from
        if n_groups < 2:
            continue
        # Uniform draw over the *other* groups: pick from n-1 slots and skip
        # over the current one. (random.sample on a set, used previously, is
        # rejected by Python 3.11+.)
        other = random.randrange(n_groups - 1)
        if other >= position:
            other += 1
        # zip stops at the shorter group, i.e. min(len(pos), len(neg)) pairs
        for first_id, second_id in zip(pos_list, members[other]):
            neg_pair.append([first_id, second_id, 0])

    # Merge positive and negative pairs into one frame
    all_pairs = pd.DataFrame(data=pos_pair + neg_pair,
                             columns=['posting_id1', 'posting_id2', 'label'])

    # Attach image/title of both sides of each pair
    lookup = train_data[['posting_id', 'image', 'title']]
    first_side = lookup.rename(
        columns={'posting_id': 'posting_id1', 'image': 'image1', 'title': 'title1'})
    second_side = lookup.rename(
        columns={'posting_id': 'posting_id2', 'image': 'image2', 'title': 'title2'})
    df_all_pairs = (
        all_pairs
        .merge(first_side, on='posting_id1', how='inner')
        .merge(second_side, on='posting_id2', how='inner')
    )
    return df_all_pairs[['posting_id1', 'posting_id2', 'image1', 'title1',
                         'image2', 'title2', 'label']]

def get_true_matches(train_data):
    """Build the ground-truth match list for every posting.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain ``posting_id`` (strings) and ``label_group``.

    Returns
    -------
    pandas.DataFrame
        One row per input row, columns ``posting_id`` and ``matches``.
        ``matches`` is the space-separated list of all posting ids that share
        the row's ``label_group`` (the posting itself included), in the order
        they appear in ``train_data``.
    """
    result_columns = ['posting_id', 'matches']
    if len(train_data) == 0:
        return pd.DataFrame(columns=result_columns)

    # Join each group's ids once, then broadcast the string back to its rows;
    # this replaces a full-frame filter per row (quadratic in the row count).
    joined_by_group = train_data.groupby('label_group')['posting_id'].agg(' '.join)
    # Rows whose label is missing belong to no group -> empty match string
    matches = train_data['label_group'].map(joined_by_group).fillna('')
    return pd.DataFrame(
        {'posting_id': train_data['posting_id'].values, 'matches': matches.values},
        columns=result_columns,
    )

def get_f1_score(actual, predicted):
    """Mean per-posting F1 between true and predicted match lists.

    Parameters
    ----------
    actual : pandas.DataFrame
        Columns ``posting_id`` and ``matches`` (space-separated posting ids).
    predicted : pandas.DataFrame
        Same layout as ``actual``. If a posting id occurs more than once the
        first occurrence is used. A posting that has no prediction at all is
        scored 0 instead of raising.

    Returns
    -------
    numpy.float64
        Average of the per-row F1 scores over all rows of ``actual``.
    """
    # One dictionary lookup per row instead of filtering `predicted` each time
    first_predictions = predicted.drop_duplicates(subset='posting_id')
    predicted_by_id = dict(zip(first_predictions['posting_id'],
                               first_predictions['matches']))

    f_score = []
    for posting_id, true_matches in zip(actual['posting_id'], actual['matches']):
        tags = set(true_matches.split())
        pred = set(predicted_by_id.get(posting_id, '').split())
        tp = len(tags & pred)
        if tp > 0:
            # tp + fp == len(pred) and tp + fn == len(tags)
            precision = tp / len(pred)
            recall = tp / len(tags)
            f_score.append(2 * ((precision * recall) / (precision + recall)))
        else:
            f_score.append(0)
    return np.array(f_score).mean()

def create_sequence(data, col):
    """Turn the text column ``col`` of ``data`` into padded token-id rows.

    Uses the notebook-level ``tokenizer`` together with the ``max_len``,
    ``truncating`` and ``padding`` settings, and returns the result as a
    NumPy array with one row per input row.
    """
    texts = list(data[col].values)
    token_ids = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(token_ids, maxlen=max_len,
                           truncating=truncating, padding=padding)
    return np.array(padded)

# Fit the Tokenizer vocabulary on every training title
titles = train_data['title'].tolist()
tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
tokenizer.fit_on_texts(titles)

# Batched Dataset

In [None]:
def _build_pair_dataset(pairs):
    """Batched tf.data pipeline of (title1 ids, title2 ids, label) for a pair frame."""
    tensors = (create_sequence(pairs, 'title1'),
               create_sequence(pairs, 'title2'),
               pairs['label'])
    return tf.data.Dataset.from_tensor_slices(tensors).batch(batch_size).prefetch(1)


# 'train-test' holds out 10% of the pairs for validation; 'train' uses all of
# them. Any other mode builds no dataset at all.
if execution_mode == 'train-test':
    paired_train_data = create_pair(train_data)
    training_data, testing_data = train_test_split(paired_train_data, test_size=0.1)
    print("Total number of Training records: {}".format(len(training_data['posting_id1'].values)))
    print("Total number of validation records: {}".format(len(testing_data['posting_id1'].values)))
    # Training data
    dataset = _build_pair_dataset(training_data)
    # Validation data
    val_dataset = _build_pair_dataset(testing_data)
elif execution_mode == 'train':
    paired_train_data = create_pair(train_data)
    training_data = paired_train_data
    print("Total number of Training records: {}".format(len(training_data['posting_id1'].values)))
    # Training data
    dataset = _build_pair_dataset(training_data)

# Model

In [None]:
def base_model():
    """Build the shared title encoder used by both arms of the siamese network.

    Architecture: token ids of length ``max_len`` -> Embedding
    (``num_words + 1`` x ``embedding_dim``) -> GRU(256, full sequence)
    -> SimpleRNN(128) -> Dense(128, relu).

    Returns
    -------
    tensorflow.keras.Model
        Maps a ``(batch, max_len)`` batch of token ids to a
        ``(batch, 128)`` batch of title vectors.
    """
    # Text-sequence input. The shape must be a tuple: `(max_len)` is just the
    # int `max_len`, which newer Keras releases refuse as a shape.
    input_seq = tf.keras.layers.Input(shape=(max_len,), name='base_seq_input')
    x = tf.keras.layers.Embedding(input_dim=num_words+1, output_dim=embedding_dim,input_length=max_len)(input_seq)
    x = tf.keras.layers.GRU(256, return_sequences=True)(x)
    x = tf.keras.layers.SimpleRNN(128)(x)
    #x = tf.keras.layers.GlobalAveragePooling1D()(x)
    #x = tf.keras.layers.Dense(32, activation='relu')(x)
    #x = tf.keras.layers.Dropout(0.2)(x)
    #x = tf.keras.layers.Dense(64, activation='relu')(x)
    #x = tf.keras.layers.Dropout(0.2)(x)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    #define model
    model = Model(inputs=input_seq, outputs=x)
    return model
def euclidean_distance(vects):
    """Row-wise Euclidean distance between the two tensors in ``vects``.

    ``vects`` is a pair ``(x, y)``. The squared differences are summed over
    axis 1 (kept as a size-1 axis) and floored at ``K.epsilon()`` before the
    square root is taken.
    """
    left, right = vects
    squared_gap = K.square(left - right)
    summed = K.sum(squared_gap, axis=1, keepdims=True)
    floored = K.maximum(summed, K.epsilon())
    return K.sqrt(floored)
def eucl_dist_output_shape(shapes):
    """Output shape of the distance layer: first input's leading dim, then 1.

    ``shapes`` must hold exactly two shapes; only the first one is consulted.
    """
    first_shape, _ = shapes
    leading_dim = first_shape[0]
    return (leading_dim, 1)
def contrastive_loss_with_margin(margin):
    """Return a contrastive loss function bound to ``margin``.

    The returned ``contrastive_loss(y_true, y_pred)`` averages
    ``y_true * y_pred**2 + (1 - y_true) * max(margin - y_pred, 0)**2``,
    where ``y_pred`` is a distance and ``y_true`` is 1 for matching pairs
    and 0 otherwise.
    """
    def contrastive_loss(y_true, y_pred):
        match_term = y_true * K.square(y_pred)
        shortfall = K.maximum(margin - y_pred, 0)
        mismatch_term = (1 - y_true) * K.square(shortfall)
        return K.mean(match_term + mismatch_term)
    return contrastive_loss
# Siamese model: both title inputs go through the same base network (shared
# weights) and the model outputs the Euclidean distance between the two vectors.
base_network = base_model()

# Shapes are passed as tuples; `(max_len)` is a plain int, which newer Keras
# releases refuse as a shape.
input_a = tf.keras.layers.Input(shape=(max_len,), name='input_a')
vec_output_a = base_network(input_a)

input_b = tf.keras.layers.Input(shape=(max_len,), name='input_b')
vec_output_b = base_network(input_b)
output = tf.keras.layers.Lambda(euclidean_distance, name='output_layer', 
                                    output_shape=eucl_dist_output_shape)([vec_output_a, vec_output_b])
my_model = Model(inputs=[input_a, input_b], outputs=output)
#plot_model(base_network, show_shapes=True, show_layer_names=True, to_file='base-model.png')

# Optimizer and loss

In [None]:
# Contrastive loss with a margin of 1, optimised with Adam
loss = contrastive_loss_with_margin(margin=1)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
my_model.compile(loss=loss, optimizer=optimizer)

In [None]:
@tf.function
def train_model(my_model, dataset, optimizer, loss):
    """Run one optimisation step on a single batch and return its loss.

    Parameters
    ----------
    my_model : model called as ``my_model([input_a, input_b])``
    dataset : one batch, indexable as ``(input_a, input_b, label)``
    optimizer : optimizer whose ``apply_gradients`` updates the weights
    loss : callable ``loss(y_true, y_pred)``

    Returns
    -------
    The loss value computed for this batch.
    """
    input_a = dataset[0]
    input_b = dataset[1]
    # Labels arrive as a flat (batch,) vector while the model emits (batch, 1).
    # Add the new axis LAST so the two line up element by element; expanding
    # on axis 0 gives (1, batch), which broadcasts against (batch, 1) into a
    # (batch, batch) grid that pairs every distance with every label.
    label = tf.expand_dims(tf.cast(dataset[2], tf.float32), axis=-1)
    with tf.GradientTape() as tape:
        output = my_model([input_a, input_b])
        loss_value = loss(label, output)
    gradients = tape.gradient(loss_value, my_model.trainable_weights)
    optimizer.apply_gradients(zip(gradients, my_model.trainable_weights))
    return loss_value

def train_data_for_one_epoch(dataset, my_model, optimizer, loss):
    """Train on every batch of ``dataset`` once and return the mean batch loss.

    Progress is reported through the notebook-level progress bar ``pb_i``,
    which must have been created before this function is called.
    """
    losses = []
    for single_batch in dataset:
        loss_value = train_model(my_model, single_batch, optimizer, loss)
        losses.append(loss_value.numpy())
        # Advance by the real number of rows so the final, possibly smaller,
        # batch does not push the bar past its target. The previous fixed
        # 0.3 s sleep per batch only slowed training down and was removed.
        pb_i.add(int(single_batch[0].shape[0]))
    return np.mean(losses)

def perform_validation(my_model, val_dataset):
    """Return the mean loss of ``my_model`` over all batches of ``val_dataset``.

    Uses the notebook-level ``loss`` function. Each batch is indexable as
    ``(input_a, input_b, label)``.
    """
    val_loss = []
    for single_batch in val_dataset:
        input_a = single_batch[0]
        input_b = single_batch[1]
        # New axis goes last so labels are (batch, 1) like the predictions;
        # axis 0 would give (1, batch) and broadcast the loss to (batch, batch).
        y_true = tf.expand_dims(tf.cast(single_batch[2], tf.float32), axis=-1)
        y_pred = my_model([input_a, input_b])
        val_loss.append(loss(y_true, y_pred).numpy())
    return np.mean(val_loss)

# Custom training loop
if execution_mode in ('train-test', 'train'):
    num_training_samples = training_data['title1'].count()
    # A validation split only exists in 'train-test' mode; in 'train' mode
    # `val_dataset` is never created, so validating would raise NameError.
    run_validation = execution_mode == 'train-test'
    for epc in range(1, epochs + 1):
        # Fresh progress bar per epoch; train_data_for_one_epoch updates it
        pb_i = Progbar(num_training_samples)
        start_time = datetime.now()
        # Train model for one epoch
        losses = train_data_for_one_epoch(dataset, my_model, optimizer, loss)
        if run_validation:
            val_loss = perform_validation(my_model, val_dataset)
        else:
            val_loss = float('nan')  # nothing to validate against
        # Train end time for one epoch
        end_time = datetime.now()
        time_taken_for_one_epoch = (end_time-start_time).total_seconds()
        print('\n Epoch %s/%s time taken: %.4f: Train loss: %.4f  Validation Loss: %.4f,' % \
          (epc,epochs, float(time_taken_for_one_epoch), float(losses), float(val_loss)))

# Save model

In [None]:
# Persist the shared title encoder whenever this run trained it
if execution_mode in ('train-test', 'train'):
    base_network.save('base_network_seq.h5')

# Evaluation

In [None]:
# Read the encoder for inference: the file written by this run when training
# took place, otherwise the copy shipped as a separate input dataset.
if execution_mode not in ('train-test', 'train'):
    base_network_path = '../input/shopee-product-matching-seq-final/base_network_seq.h5'
else:
    base_network_path = 'base_network_seq.h5'
base_network = tf.keras.models.load_model(base_network_path)

In [None]:
def make_prediction(data, thld):
    """Predict, for every posting, the postings whose title vector is close.

    Each title is represented by the mean of the embedding vectors of its
    non-padding tokens; the vectors come from the weights of
    ``base_network.layers[1]`` (the Embedding layer in ``base_model``;
    presumably the same for a model loaded from disk - verify if the saved
    architecture changes). Two postings match when the Euclidean distance
    between their title vectors is below ``thld``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``posting_id`` (strings) and ``title``.
    thld : float
        Distance threshold; strictly smaller distances count as a match.

    Returns
    -------
    pandas.DataFrame
        Columns ``posting_id`` and ``matches`` (space-separated posting ids).
        Every posting is always listed among its own matches.
    """
    embedding_weights = np.asarray(base_network.layers[1].get_weights()[0])
    sequences = create_sequence(data, 'title')

    # Mean embedding per title, ignoring padding (token id 0). A title with
    # no tokens keeps the zero vector instead of dividing by zero into NaNs.
    title_vecs = np.zeros((len(sequences), embedding_weights.shape[1]), dtype=np.float64)
    for row, seq in enumerate(sequences):
        token_ids = seq[seq != 0]
        if len(token_ids) > 0:
            summed = embedding_weights[token_ids].sum(axis=0, dtype=np.float64)
            title_vecs[row] = summed / len(token_ids)

    all_posting_ids = data['posting_id'].values
    all_matches = []
    # The vectors already live in one array, so each row is compared against
    # all others without rebuilding an array from a Python list every time.
    for row, vec in enumerate(title_vecs):
        dist = np.sqrt(np.square(title_vecs - vec).sum(axis=1))
        is_match = dist < thld
        is_match[row] = True  # a posting always matches itself
        matches = ' '.join(all_posting_ids[is_match])
        all_matches.append([all_posting_ids[row], matches])
    return pd.DataFrame(data=all_matches, columns=['posting_id', 'matches'])

In [None]:
# Validation F1 needs the held-out pairs, which only exist in 'train-test'
# mode ('train' mode never creates `testing_data`).
if execution_mode == 'train-test':
    # Every posting that appears on either side of a held-out pair
    val_ids = np.concatenate((testing_data['posting_id1'].unique(),
                              testing_data['posting_id2'].unique()), axis=None)
    # Expand to the complete label groups of those postings so that the true
    # match lists are not cut short
    val_groups = train_data.loc[train_data['posting_id'].isin(val_ids), 'label_group'].unique()
    val_df = train_data[train_data['label_group'].isin(val_groups)].reset_index(drop=True)
    prediction = make_prediction(val_df, .057)
    true_matches = get_true_matches(val_df)
    print("Validation F1 Score: {}".format(get_f1_score(true_matches, prediction)))

In [None]:
# Threshold sweep: distance threshold -> validation F1 score
#
# learning rate = default
#   0.08  -> 0.5173479313729755
#
# learning rate = 0.0001
#   0.03  -> 0.45473600943616843
#   0.04  -> 0.48570234724580486
#   0.05  -> 0.5267134612449069
#   0.055 -> 0.5336425349313603
#   0.057 -> 0.5282102495100531
#   0.06  -> 0.5101598238049183
#   0.08  -> 0.22476086841316661
#   0.1   -> 0.05536571180914155

# Prediction

In [None]:
test_data = pd.read_csv('/kaggle/input/shopee-product-matching/test.csv')
# Clean test titles exactly like the training titles: the tokenizer was fitted
# on cleaned text, so raw punctuation/digits would tokenize differently here.
test_data['title'] = test_data['title'].apply(clean_data)
submission_df = make_prediction(test_data, 0.055)
submission_df.to_csv('./submission.csv', index=False)
#submission_df.head()