In [1]:
import pandas as pd
import random as rd
import json
import numpy as np
from itertools import combinations
import sympy as sy
from sklearn.cluster import AgglomerativeClustering
import re

In [2]:
#Opening the .json which is read as a dictionary by python
with open('sample_data/TVs-all-merged.json') as f:
    data = json.load(f)
print(type(data))

FileNotFoundError: ignored

In [None]:
data['UN46ES6580']

[{'featuresMap': {'ASIN:': 'B007B9PMCO',
   'Brand Name': 'Samsung',
   'Date first available at Amazon.com': 'February 20, 2012',
   'Display Size': '46 inches',
   'Display Technology': 'LED-lit',
   'Image Aspect Ratio': '16:09',
   'Product Dimensions:': '52 x 6.8 x 28.7 inches ; 33.3 pounds',
   'Shipping Weight': '41 pounds (',
   'Shipping:': 'Currently, item can be shipped only within the U.S.'},
  'modelID': 'UN46ES6580',
  'shop': 'amazon.com',
  'title': 'Samsung UN46ES6580 46-Inch 1080p 120Hz 3D Slim LED HDTV (Black)',
  'url': 'http://www.amazon.com/Samsung-UN46ES6580-46-Inch-1080p-120Hz/dp/B007B9PMCO/ref=sr_1_95/182-5884791-9374669?ie=UTF8&qid=1379502010&s=electronics&sr=1-95'},
 {'featuresMap': {'3DTV': 'Yes',
   'Aspect Ratio': '16:9',
   'Brand': 'Samsung',
   'Cabinet Color': 'Black',
   'Component Video': '1 In',
   'Composite A/V': '2 In',
   'Digital Audio': '1 Optical Out',
   'Dimensions With Stand': '42.1" x 28.3" x 10.9" (W x H x D)',
   'Dimensions Without Sta

In [None]:
#Looping through the dataset and counting the amount of descriptions per key (Key is model number)
amount_of_descriptions = 0
for key in data.keys():
    amount_of_descriptions += len(data[key])
print(amount_of_descriptions)

1624


In [None]:
#Making the descriptions seperate elements instead of storing them as a single element.
#(I think this is preferable for making the algorithm for the paper)
new_data = {}
i = 1
for key in data.keys():
    for description in data[key]:
        new_data[i] = description
        i+=1
print(len(new_data.keys()))

1624


In [None]:
#Check if each product has an element 'shop'
shop_elements = 0
for key in new_data.keys():
    if 'shop' in new_data[key].keys():
        shop_elements += 1

if shop_elements == len(new_data):
    print('True: each product has a shop element')

print(shop_elements)

True: each product has a shop element
1624


In [None]:
#Double checking if the claim about their only being maximum of 4 for the same model is true. (it is)
highest_amount = 1
for key in data.keys():
    if len(data[key]) > highest_amount:
        highest_amount = len(data[key])
print(highest_amount)

4


In [None]:
#create set of titles 
def createTitleSet(data):
  """Returns list of models words from the titles
  param: data, dictionary with all product information."""

  set_titles = {}
  for key in data.keys():
      set_titles[key] = data[key]['title'] 
    
  #Replace and make seperate elements
  regex = '[a-zA-Z0-9.]*[0-9]+[a-zA-Z0-9.]*'
  new_title= set_titles.copy()
  for key in new_title.keys():
      new_title[key] = new_title[key].replace('"','inch')
      new_title[key] = new_title[key].lower()
      new_title[key] = re.sub("[^a-zA-Z0-9\s\.]","",new_title[key])
      new_title[key] = re.findall(regex, new_title[key])
    
  return new_title

In [None]:
def createBinaryMatrix(data):
    """
    param: data, dictionary, length=|P| 
    param: set_titles, dictionary, lenght=|P|
    Returns binary values, list, shape=(|MW|, |P|)
    """
    
    #Create model word list
    data_conc = sum(data.values(), [])
    MW = sorted(set(data_conc), key=data_conc.index) #extracts unique model words 
    
    #create binary vector 
    #values are set to 1 if model word is present in title
    binary_matrix = np.zeros((len(MW),len(data)),dtype=int)
    p=0
    for value in data.values():
        w=0
        for word in MW:
            if word in value:
                binary_matrix[w,p]=1
            w= w+1
        p=p+1
    return binary_matrix

In [None]:
#MinHash algorithm
def createHashValues(lengthBinary, numHashes):
  mylist = []
  hashval = list(range(1,lengthBinary+1))
  hashind = list(range(1,lengthBinary+1))
  for i in range(numHashes):
    rd.shuffle(hashind)
    dicto = dict(zip(hashind,hashval))
    mylist.append(dicto)
  return mylist
  
def createSignature(binary_matrix,hashValues,numHashes):
  signatures=[]
  for product in range(binary_matrix.shape[1]):
    sig_val = []
    for i in range(numHashes): #N signature values
      for hash_val in range(len(binary_matrix)): #Find minHashValue for i-th signature value, goes from 0 to 1299
        #print(product,i,hash_val)
        index =  hashValues[i].get(hash_val+1) #add 1 to hash value as we go from 1 to 1300 and not 0 to 1299
        if (binary_matrix[index-1,product]) == 1:
          sig_val.append(hash_val+1)
          break
    signatures.append(sig_val)
  return signatures

def minHashing(binary_matrix, numHashes):
  hashValues = createHashValues(len(binary_matrix),numHashes)
  signatures = createSignature(binary_matrix,hashValues,numHashes)
  return signatures

In [None]:
#LSH
def hashSignaturesToBuckets(data, signatures, b):
    """
    Hashes column bands of each signature to a buckets of that band 
    : param signatures: array-like, shape=(|P|, n)
    returns dictionary with buckets for each band 
    """
    n = len(signatures[0])
    assert n % b == 0  
    r = int(n/b)   #number of rows in each band
    product_keys = list(data.keys()) #to get right product key with index
    
    bucket_bands =[]
    for i in range(b): #construct for each band a empty list of buckets 
        bucket_bands.append({})

    buckets=[]
    sign_idx=0

    #goes trough all product signatures
    for signature in signatures: 
        bands= signatureToBands(signature, r)

        #goes trough each band i of a product signature
        for i, band in enumerate(bands.astype(str)): 
            band = ','.join(band)
            if band not in bucket_bands[i].keys():
                bucket_bands[i][band]=[]
            bucket_bands[i][band].append(product_keys[sign_idx])
        sign_idx +=1
    return bucket_bands


def signatureToBands(signature, r):
    """
    Creates bands of length r for each signature vector
    : param signature: array-like, shape=(1, |P|)
    b: number of bands 
    returns list of subvectors for signature
    """
    bands=[]
    for i in range(0,len(signature), r):#step size r 
        bands.append(signature[i:i+r])
    return np.stack(bands)


def candidates(bucket_bands):
    """
    Returns list of candidate neigbors
    """
    candidates = []
    for bucket_band in bucket_bands:
        for bucket in bucket_band.keys():
            products_in_bucket = bucket_band[bucket]
            if len(products_in_bucket) > 1:
                candidates.extend(combinations(products_in_bucket, 2))
    return list(set(candidates)) #returns unique pairs 

In [None]:
def cleanAllLabels(clean_data):
    for product in clean_data:
        for key in clean_data[product]['featuresMap'].keys():
            clean_key = cleanData(key)
            clean_data[product]['featuresMap'][clean_key] = clean_data[product]['featuresMap'].pop(key)
            clean_data[product]['featuresMap'][clean_key] = cleanData(clean_data[product]['featuresMap'].get(clean_key))
            clean_data[product]['title'] = cleanData(clean_data[product]['title'])
    return clean_data

def cleanData(data):
    dataOut = data.replace('" ','inch')
    dataOut = dataOut.replace('"','inch')
    dataOut = dataOut.replace('diag.','diagonal')
    dataOut = dataOut.replace('newegg.com','')
    dataOut = dataOut.replace('bestbuy','')
    dataOut = dataOut.replace('best buy','')
    dataOut = dataOut.lower()
    dataOut = re.sub("[^a-zA-Z0-9\s\.]","",dataOut)
    return dataOut

from difflib import SequenceMatcher
def similarity_score(cleaned_data,new_title,prod1, prod2,w_title,gamma):
    w_KVP = 1-w_title
    sim = 0
    m = 0
    w = 0
    avgSimKVP = 0

    for key1 in cleaned_data[prod1]['featuresMap'].keys():
        for key2 in cleaned_data[prod2]['featuresMap'].keys():
            keysim = SequenceMatcher(None, key1, key2).ratio()
            if keysim > gamma:
                valsim = SequenceMatcher(None,cleaned_data[prod1]['featuresMap'][key1],cleaned_data[prod2]['featuresMap'][key2]).ratio()
                weight = keysim 
                sim = sim + weight*valsim
                m = m+1
                w = w + weight
    if w > 0:
        avgSimKVP = sim / w
        
    simTitle = len(list(set(new_title[prod1]).intersection(set(new_title[prod2]))))/len(list(set(new_title[prod1]).union(set(new_title[prod2]))))

    similarity_score = w_KVP * avgSimKVP + w_title * simTitle
    return similarity_score

#True pairs list
def findTruePairs(pairs):
    true_pairs = []
    false_pairs = []
    for i in range(len(pairs)):
        if new_data[pairs[i][0]]['modelID']==new_data[pairs[i][1]]['modelID']:
            true_pairs.append(pairs[i])
        else:
            false_pairs.append(pairs[i])
    return (true_pairs,false_pairs)

In [None]:
#CALCULATE SIMILARITIES OF CANDIDATE_PAIRS, RETURN FOUND_PAIRS AS EXPECTED DUPLICATES BASED ON SIMILARITY

def MSM(new_data, candidate_pairs, new_title):
    #Clean labels as well
    cleaned_labels = new_data.copy()
    cleaned_labels = cleanAllLabels(cleaned_labels)

    #From candidates find true expected duplicates
    #Q and GAMMA CAN BE ADJUSTED FOR STRING COMPARISON for better results
    w_title = 0.75
    gamma = 0.5
    sim_tres = 0.35

    found_pairs = [];
    for i in range(len(candidate_pairs)):
        prod1 = candidate_pairs[i][0]
        prod2 = candidate_pairs[i][1]
        simscore = similarity_score(cleaned_labels,new_title,prod1,prod2,w_title,gamma)
        if simscore > sim_tres:
            found_pairs.append(candidate_pairs[i])
    return found_pairs

In [None]:
true_pairs, false_pairs = findTruePairs(candidate_pairs)
print(len(true_pairs),len(false_pairs))
true_pairs, false_pairs = findTruePairs(found_pairs)
print(len(true_pairs),len(false_pairs))

NameError: ignored

In [None]:
def sampleData(new_data, n_samples):
  """Samples data
  param: new_data, dictionary with all product information, dictionary
  param: n_samples, number of samples to draw, integer 
  """
  keys = rd.sample(new_data.keys(), n_samples)
  sample_d = {k: new_data[k] for k in keys}
  return sample_d

def truePositive(candidate_pairs):
  """Return number of true postive pairs
  param: candidate_pairs, list of canidates pairs obtained from LSH"""
  TP=0
  for candidate_pair in candidate_pairs:
    if data[candidate_pair[0]]['modelID']==data[candidate_pair[1]]['modelID']:
      TP +=1
  return TP

def totalAmountDuplicates(data):
  """Returns number of duplicates in the data"""
  Df=0
  modelIDs=[]
  for i in range(len(new_data)):
      modelIDs.append(new_data[i+1]['modelID'])

  rev_dict = {} #dictionary with key: ModelID and value: {indices with that modelID}
  for index, modelID in enumerate(modelIDs):
      rev_dict.setdefault(modelID, set()).add(index)

  duplicates = []
  for value in rev_dict.values():
    if len(value) > 1:
      duplicates.extend(combinations(value, 2))

  return len(duplicates), duplicates


### Perform bootstrap and compute evaluation metrics


In [None]:
#bootstraps for LSH performance
bootstraps = 1 #5
iter = 0

n_samples = int(len(new_data)*0.6)

n = 540
settings =[]
for r in range(1,31):
  if (n % r) == 0:
    b = n/ r
    t = (1/b) ** (1/r)
    settings.append([t,b,r])

settings = settings[10:12] #dit is alleen voor testen

all_PQt = np.zeros((1,len(settings)))
all_PCt = np.zeros((1,len(settings)))
while iter != bootstraps:
    #data = sampleData(new_data, n_samples)
    #set_titles = createTitleSet(data)
    #binary_matrix = createBinaryMatrix(set_titles) 
    #signatures = minHashing(binary_matrix, n)
    
    Dn, duplicates  = totalAmountDuplicates(data)
    PQ_t  = [] #values for all t for 1 bootstrap
    PC_t = []
    for setting in settings :
        bands =  int(setting[1])
        bucket_bands = hashSignaturesToBuckets(data, signatures, bands)
        candidate_pairs= candidates(bucket_bands)
        found_pairs = MSM(data, candidate_pairs, set_titles)
        #print(found_pairs)
        Df= truePositive(found_pairs )
        Nc = len(found_pairs) #amount of duplicates found

        PQ = Df / Nc
        PC = Df / Dn

        PQ_t.append(PQ)
        PC_t.append(PC)
    
    all_PQt = all_PQt + np.array(PQ_t)
    all_PCt = all_PCt + np.array(PC_t)
    iter += 1

av_PQt =  np.array(all_PQt) / bootstraps
av_PCt =  np.array(all_PCt) / bootstraps

In [None]:
print(av_PQt )
print(av_PCt)
print(duplicates) #to do juiste ratio's vinden
#komen uit lsh de juiste paren
#welke missen? en FP?
#wat gebeurd er na MSM welke combi's maken ze dan? zelfde vragen als LSH
#F1 score erin zetten
#tunen
#plotten

[[0.03827751 0.03030303]]
[[0.02005013 0.01002506]]
[(9, 10), (12, 13), (16, 15), (19, 20), (26, 27), (29, 30), (32, 31), (36, 37), (44, 45), (48, 49), (48, 50), (49, 50), (52, 53), (56, 55), (61, 62), (65, 66), (68, 69), (73, 74), (75, 76), (77, 78), (80, 79), (86, 87), (89, 90), (92, 93), (94, 95), (110, 111), (113, 114), (117, 118), (120, 119), (133, 134), (136, 135), (140, 141), (146, 147), (150, 151), (156, 157), (160, 161), (160, 162), (161, 162), (163, 164), (168, 167), (171, 172), (184, 183), (188, 189), (192, 193), (194, 195), (197, 198), (201, 202), (206, 207), (211, 212), (222, 223), (224, 225), (230, 231), (232, 233), (236, 237), (236, 238), (237, 238), (242, 243), (250, 251), (252, 253), (266, 267), (272, 273), (272, 271), (273, 271), (276, 277), (276, 278), (277, 278), (280, 281), (282, 283), (285, 286), (285, 287), (286, 287), (288, 289), (296, 295), (299, 300), (307, 308), (309, 310), (316, 317), (324, 325), (326, 327), (329, 330), (329, 331), (330, 331), (337, 338), (3

In [None]:
settings = settings[10:12] #dit is alleen voor testen
settings = [1,2,3,4]
all_PQt = np.zeros((1, len(settings)))

all_PQt + np.array(settings)

array([[1., 2., 3., 4.]])

In [None]:
cleaned_labels = data.copy()
cleaned_labels = cleanAllLabels(cleaned_labels)
cleaned_labels.index(342)

AttributeError: ignored

In [None]:
max =0
n=1000
while n != 0:
  n_samples = int(len(new_data)*0.6)
  data = sampleData(new_data, n_samples)
  set_titles = createTitleSet(data)
  binary_matrix = createBinaryMatrix(set_titles)

  if max < len(binary_matrix):
    max = len(binary_matrix)
  n -=1

print(max)
    

928


In [None]:
signatures = minHashing(binary_matrix)

In [None]:
print(len(new_data))
print(len(binary_data))

1624
974


In [None]:
import math

In [None]:
n= 800
b = 400
r = n/b
print(r)

2.0


In [None]:
t = 

In [None]:
numHashes = round(0.5*(len(binary_matrix)+0.5))
numHashes

441

In [None]:
ini_dict = {1: 'LC-90LE657U',
 2: 'LC-90LE657U',
 3: '39PFL2908/F7',
 4: 'LC70LE550U',
 5: 'UN40F6350A',
 6: 'UN40F6350A', 
 7: 'UN40F6350A'}

# finding duplicate values
# from dictionary
# using a naive approach
rev_dict = {}
  
for index, product in ini_dict.items():
    rev_dict.setdefault(product, set()).add(index)

print(rev_dict)
duplicates = []
for value in rev_dict.values():
  if len(value) > 1:
    duplicates.extend(combinations(value, 2))
Df = len(duplicates)
print(Df)
    #Df += len(combinations(values, 2))

print(duplicates)
#result = [(key, values) for key, values in rev_dict.items() if len(values) > 1]
#result2 = 
  
# printing result
#print(result)
#print(result2)


{'LC-90LE657U': {1, 2}, '39PFL2908/F7': {3}, 'LC70LE550U': {4}, 'UN40F6350A': {5, 6, 7}}
4
[(1, 2), (5, 6), (5, 7), (6, 7)]


In [None]:
def generateRandomNumbers(n):
    """Returns a list of n random numbers"""
    randList = []

    while n > 0:
        # Get a random model word index
        randIndex = random.randint(0, len(MW)) 
  
        # Ensure that each random number is unique.
        while randIndex in randList:
            randIndex = random.randint(0, len(MW)) 
        # Add the random number to the list.
        randList.append(randIndex)
        n = n - 1
    return randList

In [None]:
#OLD VERSION OF MINHASHING
def minHashing(binary_matrix):
    """Hashes binary matrix to a signature matrix
    param: binary_matrix: binary values sorted by product, array-like, shape=(|MW|, |P|)
    param: prime: prime number
    returns signatures, list with dimensions (|P|, n)
    """
    n = int(0.5 * len(binary_matrix)) # number of hash functions=1/2*\MW|
    coeffA= generateRandomNumbers(n)
    coeffB= generateRandomNumbers(n)
    prime = sy.nextprime(len(binary_matrix))
   
    
    signatures=[]
    # For each product in binary_matrix create n signatures  
    for product in range(binary_matrix.shape[1]):
        signature = []
        # For each of the permutation
        for i in range(n):
            hashCode = (coeffA[i] * product + coeffB[i]) % prime

            # Initialize 'minHashCode' to be greater than the maximum possible value output by the hash.
            minHashCode = prime
            #Determine smallest hash code value 
            for word in range(binary_matrix.shape[0]):
                if binary_matrix[word,product]==1:
                    if hashCode <= minHashCode:
                        minHashCode = hashCode 

            # Add smallest hash code value for hash function 'i' to the signature of product.
            signature.append(minHashCode)

        # Store the MinHash signature for each permutation.
        signatures.append(signature)
    return signatures

In [None]:
test_binary_matrix = np.array([[0, 0, 0, 1], [1, 0, 0, 1], [0, 1, 0, 1],[1, 0, 1, 1],[1, 0, 1, 1],[1, 0, 1, 1]])
test_binary_matrix

In [None]:
hashValues = createHashValues(len(test_binary_matrix),20)
hashValues

binary_matrix = np.array([[1,1,1],[0,1,1],[1,0,0],[0,0,1]])
print(binary_matrix)
n = 3
hashValues = createHashValues(binary_matrix,n)
print(hashValues)


In [None]:
binary_matrix= create_binary_matrix(new_title)
print(binary_matrix.shape)
print(np.max(np.sum(binary_matrix, axis=0))) #gek want een zin bestaat vgm niet uit         
#print(binary_matrix.shape)
#signatures=minHashing(binary_matrix, 947)


(1300, 1624)
8.0


KeyboardInterrupt: ignored

In [None]:
sign0 = [6,7,8,10,1,3,4,5, 1, 2,3,4]
sign1 = [1,3,4,5,7,7,8,10, 0, 2,3,4]
sign2 = [1,3,4,5,7,7,8,10, 1, 2,3,4]
sign3 = [1,3,4,5,7,7,8,10, 1, 2,3,1]
sign4 = [1,3,4,5,7,7,8,10, 1, 2,3,4]


signatures= np.array([sign0, sign1, sign2, sign3, sign4])

buckets= has_to_buckets(2,signatures)

print(buckets)

[{'6,7,8,10,1,3': [0], '1,3,4,5,7,7': [1, 2, 3, 4]}, {'4,5,1,2,3,4': [0], '8,10,0,2,3,4': [1], '8,10,1,2,3,4': [2, 4], '8,10,1,2,3,1': [3]}]


In [None]:
#OLD CODE FOR CLUSTERING

def create_upper_matrix(values, size):
    upper = np.zeros((size, size))
    upper[np.triu_indices(size, 0)] = values
    return(upper)

c = create_upper_matrix([x for x in range(1,15+1)], 5)
cd = c - np.identity(len(c)) * c
cd[3,4] = 10**99
cd[0,1] = 0.0
print(cd)

from scipy.cluster.hierarchy import dendrogram, linkage
cluster = linkage(cd, method='single', metric='euclidean', optimal_ordering=False)
cluster

#creating clusters
aggl = AgglomerativeClustering(n_clusters=3).fit_predict(cd)
print(aggl)

[[0.0e+00 0.0e+00 3.0e+00 4.0e+00 5.0e+00]
 [0.0e+00 0.0e+00 7.0e+00 8.0e+00 9.0e+00]
 [0.0e+00 0.0e+00 0.0e+00 1.1e+01 1.2e+01]
 [0.0e+00 0.0e+00 0.0e+00 0.0e+00 1.0e+99]
 [0.0e+00 0.0e+00 0.0e+00 0.0e+00 0.0e+00]]
