# Ground Truth Creation for Retailrocket Embeddings


### Imports     

In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import lil_matrix
from scipy.sparse import save_npz
from scipy.sparse import load_npz
import math

### Load dataset

In [3]:
# Root folder of the preprocessed Retailrocket files; later cells reuse it
# to build the ground-truth output path.
basepath = '../../datasets/preprocessed_datasets/retailrocket/'

# NOTE(review): read_pickle can execute arbitrary code while unpickling, so
# only load files that come from a trusted source.
# The index is reset so that row labels are the consecutive integers 0..n-1.
product_data = pd.read_pickle(basepath + 'item_data_extracted_10k.pkl').reset_index(drop=True)

display(product_data)

Unnamed: 0,itemid,categoryid,available,properties,property_values
0,309858,799,0,"[1066, 112, 159, 202, 227, 283, 364, 384, 400,...","[n1260.000 424566, 679677, 519769, 125782 n144..."
1,89393,491,0,"[0, 1036, 112, 159, 202, 208, 225, 227, 283, 3...",[[n24.000 266119 768453 119932 754228 98606 63...
2,88277,1576,0,"[1010, 1036, 1064, 112, 159, 202, 227, 252, 28...","[769062, 575349, 664227 504513 79652, 679677, ..."
3,90911,179,0,"[112, 159, 19, 202, 213, 227, 243, 28, 283, 36...","[679677, 519769, n36.000 908104, 1071703 49150..."
4,72133,1694,0,"[1032, 1037, 1079, 1090, 112, 120, 159, 186, 1...","[769062, 769062, 769062, 769062, 679677, 76906..."
...,...,...,...,...,...
9995,171847,508,0,"[112, 159, 19, 202, 227, 243, 28, 283, 364, 38...","[679677, 519769, n216.000 309206, 535929, 6851..."
9996,271852,624,0,"[112, 119, 122, 141, 142, 159, 176, 202, 227, ...","[679677, 150169 924913, 769062, n12.000, 76906..."
9997,206063,352,0,"[1036, 112, 159, 202, 227, 283, 364, 6, 678, 6...","[431619, 679677, 519769, 1202958, 404632, 4046..."
9998,99270,1355,0,"[112, 159, 202, 227, 28, 283, 293, 30, 348, 36...","[679677, 519769, 15252 n216.000 1043928 110382..."


### Create Ground Truth

In [4]:
def convert_properties(prop_str):
    """Parse a bracketed, comma-separated string of integers into a set.

    Args:
        prop_str: Text such as ``'[1066, 112, 159]'``. Surrounding whitespace
            and any leading/trailing ``[`` / ``]`` characters are ignored.

    Returns:
        A set with the integer value of every non-blank token. An empty list
        literal (``'[]'``) or an empty string yields an empty set.

    Raises:
        ValueError: If a non-blank token is not a valid integer literal.
    """
    inner = prop_str.strip().strip('[]')
    # Skip blank tokens so '[]', '' and trailing commas do not hit int('').
    return {int(token) for token in inner.split(',') if token.strip()}

def calculate_properties_similarity(prop1, prop2, print_results=False):
    """Return the overlap between two property collections as a ratio.

    The score is the number of distinct shared elements divided by the mean
    number of distinct elements of the two inputs, i.e.
    ``|A & B| / ((|A| + |B|) / 2)``.

    Args:
        prop1: Iterable of hashable property identifiers.
        prop2: Iterable of hashable property identifiers.
        print_results: When true, show the intersection size and the mean
            set size via ``display`` before returning.

    Returns:
        A float in ``[0.0, 1.0]``. When both inputs are empty there is nothing
        to compare, so ``0.0`` is returned instead of dividing by zero.
    """
    set1, set2 = set(prop1), set(prop2)
    shared = len(set1 & set2)
    avg_len = (len(set1) + len(set2)) / 2
    if print_results:
        display(shared, avg_len)
    if avg_len == 0:
        # Both collections are empty: avoid ZeroDivisionError.
        return 0.0
    return shared / avg_len

def combinations(n, r):
    """Return the number of ways to choose ``r`` items out of ``n``.

    Delegates to ``math.comb`` rather than dividing factorials, which avoids
    building very large intermediate integers.

    Args:
        n: Non-negative integer, size of the pool.
        r: Non-negative integer, number of items chosen.

    Returns:
        The binomial coefficient as an int; ``0`` when ``r > n``.

    Raises:
        ValueError: If ``n`` or ``r`` is negative.
        TypeError: If ``n`` or ``r`` is not an integer.
    """
    return math.comb(n, r)

def weighted_average(value1, weight1, value2, weight2):
    """Return the weighted mean of two values.

    Each value is scaled by its weight, the two products are added, and the
    sum is divided by the total weight.
    """
    weighted_sum = value1 * weight1 + value2 * weight2
    total_weight = weight1 + weight2
    return weighted_sum / total_weight

In [5]:
# Spot-check the property similarity for one hand-picked pair of rows.
search_1_idx = 5
search_2_idx = 3
props_1 = product_data.loc[search_1_idx, 'properties']
props_2 = product_data.loc[search_2_idx, 'properties']

# Similarity score first, then the raw property counts of both rows.
display(calculate_properties_similarity(props_1, props_2, False))
display(len(props_1), len(props_2))

0.8727272727272727

24

31

In [6]:

# Build the boolean ground-truth matrix: cell (i, j) with i < j is True when
# the combined similarity of rows i and j reaches the cut-off.
# NOTE(review): the misspelled name `treshold` is kept on purpose because
# cells outside this view may reference it.
treshold = 0.7
df = product_data.copy()
print_count = 1000
n_items = df.shape[0]
similarity_matrix = lil_matrix((n_items, n_items), dtype=bool)  # Sparse matrix

# Pull both columns out of the frame once: per-pair df.loc lookups and set
# construction inside the O(n^2) loop are by far the most expensive part.
# Positional access matches the original label-based access because the
# frame's index was reset to 0..n-1 when the data was loaded.
category_ids = df['categoryid'].tolist()
property_sets = [set(props) for props in df['properties']]

for i in range(n_items):
    for j in range(i + 1, n_items):
        same_category = 1.0 if category_ids[i] == category_ids[j] else 0
        property_sim = calculate_properties_similarity(property_sets[i], property_sets[j])
        # Property overlap weighs 0.8 and category equality 0.2. Only True
        # cells are written; untouched cells of the sparse matrix stay False.
        if weighted_average(property_sim, 0.8, same_category, 0.2) >= treshold:
            similarity_matrix[i, j] = True
    # Parentheses are required: `i+1 % print_count` parses as
    # `i + (1 % print_count)`, which is never 0, so progress never printed.
    if (i + 1) % print_count == 0:
        print("done: " + str(i+1) + "/" + str(df.shape[0]))
print(str(similarity_matrix.count_nonzero()) + " true pairs from possible " + str(combinations(df.shape[0], 2)))

2040299 true pairs from possible 49995000


In [7]:
# print(similarity_matrix)

In [8]:
# Convert the incrementally built LIL matrix to CSR; the CSR copy is what
# gets handed to save_npz in the following cell.
similarity_matrix_csr = similarity_matrix.tocsr()

In [9]:
# Write the CSR ground-truth matrix to an .npz file below `basepath`.
# `filename` is reused afterwards to read the file back.
filename = 'ground_truth/ground_truth_10k.npz'
save_npz(file=basepath + filename, matrix=similarity_matrix_csr)

In [13]:
# Read the saved matrix back from disk and show a single cell as a quick
# check that the file can be loaded.
loaded_csr = load_npz(file=basepath + filename)
display(loaded_csr[0, 1])

False