In [1]:
# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

from os import listdir
from os.path import isfile, join
from os.path import isdir, join
from os.path import basename

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from itertools import combinations
from itertools import product

import textdistance
import copy
import os
import math
import string
import tqdm

from multiprocessing import Pool
from itertools import repeat

import random
random.seed(42)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rpeeters/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the offer corpus (one JSON object per line), blank out missing values and
# index the frame by offer id while keeping 'id' available as a regular column.
data_df = (
    pd.read_json('../../../data/raw/wdc-lspc/corpus/offers_corpus_english_v2.json.gz', lines=True)
    .replace(np.nan, '')
    .set_index('id', drop=False)
)
data_df.head()

Unnamed: 0_level_0,brand,category,cluster_id,description,id,identifiers,keyValuePairs,price,specTableContent,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,rockwell automation,Other_Electronics,3184606,cable between pcmk and plc5 or slc5 04 in dhpl...,0,[{'/mpn': '[1784pcm5]'}],,,,cable between pcmk and plc5 or slc5 04 in dhpl...
1,,Jewelry,4865031,,1,[{'/sku': '[15009616]'}],,,,2pcs golden lock and key ear cuff earring free...
2,,Grocery_and_Gourmet_Food,4865878,,2,[{'/gtin13': '[7890557402128]'}],,,,havaianas flash hit rose 33 34 pod product ope...
3,,Other_Electronics,1028988,,3,"[{'/sku': '[34852050]'}, {'/mpn': '[f5e10c10m1...",,,,panduit lc keyed c green to 50 125 om2 1 6mm m...
4,icebreaker,Camera_and_Photo,11784767,descripci n icebreaker kids compass l s half z...,4,[{'/sku': '[135889120]'}],,,,icebreaker kids compass l s half zip boy compr...


In [4]:
def tokenize(s):
    """Turn a value into a list of lower-cased, punctuation-free tokens.

    A float NaN yields an empty list. Any other value is converted with
    ``str()``, a few HTML entity strings are removed, the text is lower-cased,
    every character in ``string.punctuation`` is deleted and the result is
    split on whitespace.
    """
    if isinstance(s, float) and s != s:  # NaN is the only float unequal to itself
        return []

    text = str(s)
    # NOTE(review): the '\t;' and '\n;' patterns look like typos for '\t' / '\n'.
    # They are kept as-is; split() below already treats tabs/newlines as separators.
    substitutions = (
        ('&amp;', ""),
        ('&reg;', ""),
        ('&quot;', ""),
        ('\t;', " "),
        ('\n;', " "),
    )
    for needle, replacement in substitutions:
        text = text.replace(needle, replacement)

    drop_punctuation = str.maketrans('', '', string.punctuation)
    return text.lower().translate(drop_punctuation).split()

def process_text(s, stop_words):
    """Tokenize ``s`` and return the tokens not in ``stop_words`` joined by spaces.

    A float NaN is returned unchanged; every other value goes through
    ``tokenize`` first.
    """
    is_nan = isinstance(s, float) and s != s
    if is_nan:
        return s
    kept_tokens = (token for token in tokenize(s) if token not in stop_words)
    return " ".join(kept_tokens)


# English stop words from NLTK, normalised the same way tokenize() normalises
# text (lower-cased, punctuation characters removed) so that they match its tokens.
stop_words_with_punct = copy.deepcopy(stopwords.words('english'))
stop_words = [
    word.lower().translate(str.maketrans('', '', string.punctuation))
    for word in stop_words_with_punct
]

# Overwrite the title column with its cleaned, stop-word-free version.
data_df['title'] = data_df['title'].apply(process_text, args=(stop_words,))

data_df.head()

Unnamed: 0_level_0,brand,category,cluster_id,description,id,identifiers,keyValuePairs,price,specTableContent,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,rockwell automation,Other_Electronics,3184606,cable between pcmk and plc5 or slc5 04 in dhpl...,0,[{'/mpn': '[1784pcm5]'}],,,,cable pcmk plc5 slc5 04 dhplus cp7 included 3d...
1,,Jewelry,4865031,,1,[{'/sku': '[15009616]'}],,,,2pcs golden lock key ear cuff earring free shi...
2,,Grocery_and_Gourmet_Food,4865878,,2,[{'/gtin13': '[7890557402128]'}],,,,havaianas flash hit rose 33 34 pod product ope...
3,,Other_Electronics,1028988,,3,"[{'/sku': '[34852050]'}, {'/mpn': '[f5e10c10m1...",,,,panduit lc keyed c green 50 125 om2 1 6mm mult...
4,icebreaker,Camera_and_Photo,11784767,descripci n icebreaker kids compass l s half z...,4,[{'/sku': '[135889120]'}],,,,icebreaker kids compass l half zip boy comprar...


In [5]:
def get_clusters_for_gs_pairs(df, pairs):
    """Load a gold standard and look up the offers and clusters it touches.

    Parameters
    ----------
    df : pandas.DataFrame
        Offers indexed by offer id, with a 'cluster_id' column.
    pairs : str
        Path (or other ``pd.read_json`` input) of a JSON-lines gold standard
        whose 'pair_id' values look like '<left id>#<right id>'.

    Returns
    -------
    clusters : numpy.ndarray
        Unique 'cluster_id' values of all offers referenced by the gold standard.
    all_pairs : list[list[int]]
        One ``[left_id, right_id]`` entry per gold-standard row, in file order.
    offers : pandas.DataFrame
        The rows of ``df`` for every referenced offer id.
    """
    gs_df = pd.read_json(pairs, lines=True)
    offer_ids = set()
    all_pairs = []

    for pair_id in gs_df['pair_id'].values:
        parts = pair_id.split('#')
        left_id, right_id = int(parts[0]), int(parts[1])
        offer_ids.add(left_id)
        offer_ids.add(right_id)
        all_pairs.append([left_id, right_id])

    # pandas >= 2.0 rejects a set as a .loc indexer (deprecated since 1.5),
    # so hand it a list; list(set) keeps the order older versions used.
    offers = df.loc[list(offer_ids)]
    clusters = offers['cluster_id'].unique()
    return clusters, all_pairs, offers
    

def build_positive_pairs(df, gs, samples, set_label, clu_percent=1.0):
    """Sample positive (same-cluster) offer pairs and write them to a text file.

    For every cluster touched by the gold standard, all unordered pairs of the
    cluster's usable offers are enumerated, pairs already present in the gold
    standard (in either orientation) are discarded, and up to ``samples`` pairs
    are drawn at random. Output lines have the form '<id1>#<id2>#1'.

    Parameters
    ----------
    df : pandas.DataFrame
        Offers indexed by offer id; the columns 'id', 'cluster_id', 'title'
        and 'description' are read.
    gs : str
        Path of the gold-standard file; its base name minus '_gs.json.gz'
        is used as the category name in output paths and messages.
    samples : int
        Maximum number of pairs to draw per cluster.
    set_label : str
        Training-set label used in the output directory and file name.
    clu_percent : float, default 1.0
        If different from 1.0, only a random share of that size of the
        clusters is processed.

    Returns
    -------
    None
        Results are written to disk and summary counts are printed.
    """
    # Category name = gold-standard file name without its '_gs.json.gz' suffix.
    cat = basename(gs).replace('_gs.json.gz', '')
    os.makedirs(os.path.dirname('../../../data/processed/build-training-sets/'+set_label+'/'+cat+'/positives/'), exist_ok=True)
    # Remove the result of a previous run; the file is re-created in append mode at the end.
    try:
        os.remove('../../../data/processed/build-training-sets/'+set_label+'/'+cat+'/positives/'+cat+'_train_positive'+'_'+set_label+'.txt')
    except OSError:
        pass
    clusters, gs_pairs, gs_offers = get_clusters_for_gs_pairs(df, gs)
    print('Clusters in '+cat+ ' ' + str(len(clusters)))
    
    # Optionally restrict processing to a random subset of the clusters.
    if clu_percent != 1.0:
        sample = random.sample(range(0, len(clusters)), k=int(round(clu_percent * len(clusters))))
        clusters_new = [clusters[x] for x in sample]
        clusters = clusters_new

    no_clean_pair_cluster = 0  # clusters where only the fallback pair could be used
    no_pair_at_all = 0         # clusters that contributed no pair
    overall_pairs = []
    
    for cluster in clusters:
        
        cluster_offers = df[df['cluster_id'] == cluster]
        cluster_offers = cluster_offers.dropna(subset=['title'])
        # De-duplicate by title: sort by (title, non-null field count) and keep the
        # last row per title, i.e. the one with the highest count.
        # NOTE(review): the notebook replaces NaN with '' when loading data_df, so for
        # that frame dropna/count presumably no longer discriminate - confirm.
        cluster_offers = cluster_offers.assign(counts=cluster_offers.count(axis=1)).sort_values(['title', 'counts']).drop_duplicates('title', keep='last').drop('counts', axis=1)
        cluster_offers['title+desc'] = (cluster_offers['title'] + ' ' + cluster_offers['description']).str.strip()
        # Keep only offers whose title + description has more than 5 whitespace-separated tokens.
        cluster_offers = cluster_offers[cluster_offers['title+desc'].str.split().map(len) > 5]
        
        # Titles of all gold-standard offers (does not depend on the current cluster).
        gs_titles = gs_offers['title'].tolist()
        
        cluster_offers_ids = cluster_offers['id']
        cluster_offers_ids = cluster_offers_ids.values
        all_pairs_dirty = list(combinations(cluster_offers_ids, 2))
        all_pairs = []
        # Discard candidate pairs that appear in the gold standard in either orientation.
        for pair in all_pairs_dirty:
            reversed_pair = pair[::-1]
            if list(pair) not in gs_pairs and list(reversed_pair) not in gs_pairs:
                all_pairs.append(pair)
        # Target number of pairs for this cluster: min(samples, available candidates).
        if samples > len(all_pairs):
            cluster_sample = len(all_pairs)
        else:
            cluster_sample = samples
            
        pairs = []
        # Untouched copy of the candidates, used by the fallback at the end of the loop.
        all_pairs_copy = copy.deepcopy(all_pairs)

        # Repeatedly draw from the shrinking candidate pool until the target is met
        # or the pool is exhausted.
        while True:
            sample_amount = cluster_sample
            counter = 0
            bad_pairs = []
            sampling_range = range(0, len(all_pairs))
            if len(sampling_range) == 0:
                break
            # Only draw as many pairs as are still missing.
            sample_amount -= len(pairs)
            assert sample_amount >= 0
            if sample_amount == 0:
                break
            if len(sampling_range) < sample_amount:
                sample_amount = len(sampling_range)
            sample = random.sample(sampling_range, k=sample_amount)
            pairs.extend([all_pairs[x] for x in sample])
            for pair in pairs:
                # Drawn pairs leave the candidate pool so they cannot be drawn again.
                if pair in all_pairs:
                    all_pairs.remove(pair)
                # A pair is "bad" when either offer's title also occurs among the gold-standard offers.
                if df.loc[pair[0]]['title'] in gs_titles or df.loc[pair[1]]['title'] in gs_titles:
                    bad_pairs.append(pair)
                    counter += 1
            # Allow at most half of the target to be bad pairs; drop the surplus at random.
            if counter > (cluster_sample / 2):
                remove = random.sample(range(0, len(bad_pairs)), math.ceil(counter - (cluster_sample / 2)))
                remove_pairs = [bad_pairs[x] for x in remove]
                for pair in remove_pairs:
                    pairs.remove(pair)

            if len(pairs) > cluster_sample:
                print('WARNING: More pos pairs sampled than should be!')
            if len(pairs) == cluster_sample:
                break
            # Nothing kept and nothing left to draw: fall back to one random pair
            # from the original candidates, even though it is a bad pair.
            if len(pairs) == 0 and len(all_pairs) == 0:
                sample = random.sample(range(0, len(all_pairs_copy)), k=1)
                pairs.extend([all_pairs_copy[x] for x in sample])
                no_clean_pair_cluster += 1
                break
        if len(pairs) == 0:
            no_pair_at_all +=1

        overall_pairs.extend(pairs)
            
    # One line per pair: '<id1>#<id2>#1' (label 1 = positive).
    with open ('../../../data/processed/build-training-sets/'+set_label+'/'+cat+'/positives/'+cat+'_train_positive'+'_'+set_label+'.txt', "a") as f:
        for item in overall_pairs:
            f.write(str(item[0])+'#'+str(item[1])+'#'+'1'+'\n')
    # Re-read the file to report the number of written lines.
    with open('../../../data/processed/build-training-sets/'+set_label+'/'+cat+'/positives/'+cat+'_train_positive'+'_'+set_label+'.txt', "r") as f:
        print(cat+ ' Positive Training samples: '+str(sum(1 for _ in f)))
        print('For '+str(no_clean_pair_cluster)+' clusters, there was no clean training pair!')
        print('For '+str(no_pair_at_all)+' cluster pairs, there was no training pair at all!')

def get_similarities_title(df, pairs):
    """Return the title similarity of every offer pair.

    ``pairs`` is an iterable of ``(left_id, right_id)`` entries that are looked
    up in the index of ``df``. For each pair the normalized Jaccard similarity
    (``textdistance.Jaccard(qval=None)``) of the two 'title' values is
    computed. The result is a list in the order of ``pairs``.
    """
    left_ids = [pair[0] for pair in pairs]
    right_ids = [pair[1] for pair in pairs]

    left_titles = df.loc[left_ids]['title'].values
    right_titles = df.loc[right_ids]['title'].values

    return [
        textdistance.Jaccard(qval=None).normalized_similarity(left_title, right_title)
        for left_title, right_title in zip(left_titles, right_titles)
    ]

def processInput(cluster_pair):
    """Compute the title similarity of two clusters.

    ``cluster_pair[0]`` holds the two cluster ids and ``cluster_pair[1]`` the
    offers DataFrame. The titles of each cluster are concatenated with
    ``str.cat()`` (no separator) and compared with the normalized Jaccard
    similarity from ``textdistance.Jaccard(qval=None)``.
    """
    cluster_ids = cluster_pair[0]
    df = cluster_pair[1]
    joined_titles = [
        df[df['cluster_id'] == cluster_ids[position]]['title'].str.cat()
        for position in (0, 1)
    ]
    return textdistance.Jaccard(qval=None).normalized_similarity(joined_titles[0], joined_titles[1])
    
def _clean_cluster_offer_ids(df, cluster_id):
    """Return the ids of the usable offers of one cluster.

    Offers are de-duplicated by title (per title the row with the highest
    non-null field count is kept) and only offers whose title + description
    has more than 5 whitespace-separated tokens are retained.
    """
    offers = df[df['cluster_id'] == cluster_id]
    offers = offers.dropna(subset=['title'])
    offers = (
        offers.assign(counts=offers.count(axis=1))
        .sort_values(['title', 'counts'])
        .drop_duplicates('title', keep='last')
        .drop('counts', axis=1)
    )
    offers['title+desc'] = (offers['title'] + ' ' + offers['description']).str.strip()
    offers = offers[offers['title+desc'].str.split().map(len) > 5]
    return offers['id'].values


def build_negative_pairs(df, gs, samples, sample_clusters, set_label, clu_percent=1.0):
    """Sample negative (different-cluster) offer pairs and write them to a text file.

    Every gold-standard cluster is paired with its ``sample_clusters`` most
    similar clusters (normalized Jaccard similarity of the concatenated
    titles). For each not yet seen cluster pair the cross product of the usable
    offers is built, gold-standard pairs are removed, and every cluster pair
    contributes the same number of candidates. Up to ``samples`` pairs are then
    drawn per cluster. Output lines have the form '<id1>#<id2>#0'.

    Side effects: writes the negatives file for ``set_label``/category and a
    'clustersim_info_<category>.csv' with mean/std similarity of the n-th most
    similar cluster (n = 1..50); prints summary counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Offers indexed by offer id; the columns 'id', 'cluster_id', 'title'
        and 'description' are read.
    gs : str
        Path of the gold-standard file; its base name minus '_gs.json.gz'
        is used as the category name.
    samples : int
        Maximum number of negative pairs to draw per cluster.
    sample_clusters : int
        Number of most similar clusters each cluster is paired with.
    set_label : str
        Training-set label used in the output directory and file name.
    clu_percent : float, default 1.0
        If different from 1.0, only a random share of that size of the
        clusters is processed.

    Returns
    -------
    None
    """
    cat = basename(gs).replace('_gs.json.gz', '')
    out_dir = '../../../data/processed/build-training-sets/'+set_label+'/'+cat+'/negatives/'
    out_path = out_dir+cat+'_train_negative'+'_'+set_label+'.txt'
    os.makedirs(os.path.dirname(out_dir), exist_ok=True)
    # Remove the result of a previous run; the file is re-created in append mode at the end.
    try:
        os.remove(out_path)
    except OSError:
        pass
    clusters, gs_pairs, gs_offers = get_clusters_for_gs_pairs(df, gs)

    # Optionally restrict processing to a random subset of the clusters.
    if clu_percent != 1.0:
        sample = random.sample(range(0, len(clusters)), k=int(round(clu_percent * len(clusters))))
        clusters = [clusters[x] for x in sample]

    no_clean_pair_cluster = 0
    all_cluster_pairs_already_seen = 0
    no_pair_at_all = 0
    overall_pairs = []

    # Loop-invariant lookups, built once with O(1) membership tests.
    gs_pair_set = {tuple(pair) for pair in gs_pairs}
    gs_titles = set(gs_offers['title'].tolist())

    # Only offers of the considered clusters are needed below; filtering keeps row order.
    relevant = df[df['cluster_id'].isin(clusters)]

    # Concatenate each cluster's titles once instead of re-filtering the whole
    # corpus twice for every one of the n*(n-1)/2 cluster pairs.
    # NOTE(review): str.cat() joins titles without a separator, so the last word of
    # one title fuses with the first word of the next; kept as in the original so
    # that similarities stay unchanged - confirm whether sep=' ' was intended.
    cluster_titles = {
        cluster: relevant.loc[relevant['cluster_id'] == cluster, 'title'].str.cat()
        for cluster in clusters
    }

    jaccard = textdistance.Jaccard(qval=None)
    cluster_sims = pd.DataFrame(index=clusters, columns=clusters, dtype=np.float64)
    for cluster1, cluster2 in tqdm.tqdm(list(combinations(clusters, 2))):
        sim = jaccard.normalized_similarity(cluster_titles[cluster1], cluster_titles[cluster2])
        cluster_sims.at[cluster1, cluster2] = sim
        cluster_sims.at[cluster2, cluster1] = sim

    seen_cluster_pairs = set()   # unordered cluster pairs that were already handled
    clean_ids_cache = {}         # cluster id -> ids of its usable offers
    sim_stats = [[] for _ in range(50)]

    for cluster in clusters:
        # The diagonal is NaN and therefore ignored by nlargest.
        high_sim_clusters = cluster_sims.loc[cluster, :].nlargest(50)
        for ix, value in enumerate(high_sim_clusters.values):
            sim_stats[ix].append(value)

        pairs_to_consider = []
        for other in high_sim_clusters[0:sample_clusters].index:
            key = frozenset((cluster, other))
            if key not in seen_cluster_pairs:
                seen_cluster_pairs.add(key)
                pairs_to_consider.append([cluster, other])

        if len(pairs_to_consider) == 0:
            all_cluster_pairs_already_seen += 1
            continue

        all_pairs_unnormalized = []
        for cluster_pair in pairs_to_consider:
            ids_per_side = []
            for cluster_id in cluster_pair:
                if cluster_id not in clean_ids_cache:
                    clean_ids_cache[cluster_id] = _clean_cluster_offer_ids(relevant, cluster_id)
                ids_per_side.append(clean_ids_cache[cluster_id])
            # Cross product of both clusters' offers minus gold-standard pairs (either orientation).
            candidate_pairs = [
                pair for pair in product(ids_per_side[0], ids_per_side[1])
                if pair not in gs_pair_set and pair[::-1] not in gs_pair_set
            ]
            all_pairs_unnormalized.append(candidate_pairs)

        # Let every cluster pair contribute equally many candidates.
        min_length = min(len(pair_list) for pair_list in all_pairs_unnormalized)
        all_pairs = []
        for pair_list in all_pairs_unnormalized:
            sample_min_length = random.sample(range(0, len(pair_list)), k=min_length)
            all_pairs.extend(pair_list[x] for x in sample_min_length)

        # Target number of pairs for this cluster: min(samples, available candidates).
        if samples > len(all_pairs):
            cluster_sample = len(all_pairs)
        else:
            cluster_sample = samples

        pairs = []
        # Untouched copy of the candidates, used by the fallback at the end of the loop.
        all_pairs_copy = copy.deepcopy(all_pairs)

        # Repeatedly draw from the shrinking candidate pool until the target is met
        # or the pool is exhausted.
        while True:
            sample_amount = cluster_sample
            counter = 0
            bad_pairs = []
            sampling_range = range(0, len(all_pairs))
            if len(sampling_range) == 0:
                break
            sample_amount -= len(pairs)
            assert sample_amount >= 0
            if sample_amount == 0:
                break
            if len(sampling_range) < sample_amount:
                sample_amount = len(sampling_range)
            sample = random.sample(sampling_range, k=sample_amount)
            pairs.extend([all_pairs[x] for x in sample])
            for pair in pairs:
                # Drawn pairs leave the candidate pool so they cannot be drawn again.
                if pair in all_pairs:
                    all_pairs.remove(pair)
                # A pair is "bad" when either offer's title also occurs among the gold-standard offers.
                if df.loc[pair[0]]['title'] in gs_titles or df.loc[pair[1]]['title'] in gs_titles:
                    bad_pairs.append(pair)
                    counter += 1
            # Allow at most half of the target to be bad pairs; drop the surplus at random.
            if counter > (cluster_sample / 2):
                remove = random.sample(range(0, len(bad_pairs)), math.ceil(counter - (cluster_sample / 2)))
                remove_pairs = [bad_pairs[x] for x in remove]
                for pair in remove_pairs:
                    pairs.remove(pair)

            if len(pairs) > cluster_sample:
                print('WARNING: More neg pairs sampled than should be!')
            if len(pairs) == cluster_sample:
                break
            # Nothing kept and nothing left to draw: fall back to one random pair
            # from the original candidates, even though it is a bad pair.
            if len(pairs) == 0 and len(all_pairs) == 0:
                sample = random.sample(range(0, len(all_pairs_copy)), k=1)
                pairs.extend([all_pairs_copy[x] for x in sample])
                no_clean_pair_cluster += 1
                break
        if len(pairs) == 0:
            no_pair_at_all += 1
        overall_pairs.extend(pairs)

    # Written once after all clusters were visited, so the statistics are complete
    # (previously the file was rewritten per cluster and skipped by `continue`).
    with open('../../../data/processed/build-training-sets/clustersim_info_'+cat+'.csv', "w") as f:
        f.write('n_most_similar_cluster'+','+'mean_sim'+','+'std_sim'+'\n')
        for rank, stat in enumerate(sim_stats, start=1):
            f.write(str(rank)+','+str(np.mean(stat))+','+str(np.std(stat))+'\n')

    # One line per pair: '<id1>#<id2>#0' (label 0 = negative).
    with open(out_path, "a") as f:
        for item in overall_pairs:
            f.write(str(item[0])+'#'+str(item[1])+'#'+'0'+'\n')

    # Re-read the file to report the number of written lines.
    with open(out_path, "r") as f:
        print(cat+' Negative Training samples: '+str(sum(1 for _ in f)))
        print('For '+str(all_cluster_pairs_already_seen)+' clusters, all '+str(sample_clusters)+' most similar pairs were already sampled!')
        print('For '+str(no_clean_pair_cluster)+' cluster pairs, there was no clean training pair!')
        print('For '+str(no_pair_at_all)+' cluster pairs, there was no training pair at all!')

def combine_sets(cat, set_label):
    """Concatenate the positive and negative pair files of one category.

    Reads the positives file, then the negatives file of ``cat`` for the
    training set ``set_label``, strips each line and writes all lines
    (positives first) to the '<cat>_train_combined_<set_label>.txt' file.
    """
    base_dir = '../../../data/processed/build-training-sets/'+set_label+'/'+cat+'/'
    source_files = (
        base_dir+'positives/'+cat+'_train_positive'+'_'+set_label+'.txt',
        base_dir+'negatives/'+cat+'_train_negative'+'_'+set_label+'.txt',
    )

    pairs = []
    for source_file in source_files:
        with open(source_file, 'r') as infile:
            pairs.extend(line.strip() for line in infile)

    with open(base_dir+cat+'_train_combined'+'_'+set_label+'.txt', 'w') as outfile:
        outfile.writelines(pair+'\n' for pair in pairs)

def double_check_gs_inclusion(cat, set_label):
    """Report how many gold-standard pairs leaked into a combined training set.

    Reads the combined training file of ``cat`` / ``set_label`` and the
    category's gold standard, then counts the gold-standard pairs that occur
    in the training set in either orientation ('a#b' or 'b#a'). The result is
    printed; nothing is returned.
    """
    train_pairs = set()
    with open('../../../data/processed/build-training-sets/'+set_label+'/'+cat+'/'+cat+'_train_combined'+'_'+set_label+'.txt', 'r') as ts_f:
        for line in ts_f:
            fields = line.strip().split('#')
            train_pairs.add((fields[0], fields[1]))

    gs = pd.read_json('../../../data/raw/wdc-lspc/gold-standards/'+cat+'_gs.json.gz', lines=True)

    counter = 0
    for pair_id in gs['pair_id'].values:
        fields = pair_id.split('#')
        pair = (fields[0], fields[1])
        # list.reverse() returns None, so the former `deepcopy(pair).reverse()`
        # never matched reversed pairs; slicing yields the actual reversed pair.
        if pair in train_pairs or pair[::-1] in train_pairs:
            counter += 1

    if counter == 0:
        print('No gs pairs are in the '+cat+' training set.')
    else:
        print('WARNING: Number of gs pairs in the '+cat+' training set: '+str(counter))
        
def get_ts_stats(cats, set_label, df=None):
    """Write per-category pair counts of a training set to a stats file.

    For every category in ``cats`` the positive and negative pair files of the
    training set ``set_label`` are read and one CSV row
    'category,positives,negatives,combined' is written to
    'stats_<set_label>.txt' (the file is recreated on every call).

    If ``df`` is a DataFrame indexed by offer id with a 'title' column, the
    number of negative pairs whose two offers have exactly the same title is
    additionally printed per category. With ``df=None`` that check is skipped.
    """
    print(f'Training set: {set_label}')
    base_dir = '../../../data/processed/build-training-sets/'+set_label+'/'

    # Mode 'w' truncates an existing file, replacing the former remove-then-append.
    with open(base_dir+'stats'+'_'+set_label+'.txt', 'w') as outfile:
        outfile.write('category,positives,negatives,combined\n')
        for cat in cats:
            with open(base_dir+cat+'/positives/'+cat+'_train_positive'+'_'+set_label+'.txt', 'r') as pf:
                pos = [line.strip() for line in pf]
            with open(base_dir+cat+'/negatives/'+cat+'_train_negative'+'_'+set_label+'.txt', 'r') as nf:
                neg = [line.strip() for line in nf]
            outfile.write(cat+','+str(len(pos))+','+str(len(neg))+','+str(len(pos)+len(neg))+'\n')

            # The title check (and its report) only makes sense when offers were
            # supplied; previously the report ran unconditionally and failed with
            # an unbound `counter` for df=None.
            if isinstance(df, pd.DataFrame):
                counter = 0
                for line in neg:
                    pair = line.split('#')[0:2]
                    if df.loc[int(pair[0])]['title'] == df.loc[int(pair[1])]['title']:
                        counter += 1
                print('For '+cat+' there were '+str(counter)+ ' negatives with exact same title.')

In [None]:
# Product categories for which a gold standard exists.
cats = ['computers', 'cameras', 'watches', 'shoes']

# Each label encodes four '-'-separated numbers. The first three are passed on as
# positives per cluster, negatives per cluster and number of most similar clusters;
# the fourth is parsed but not passed on, so clu_percent keeps its default of 1.0.
training_sets = ['1-3-10-1.0', '3-9-10-1.0', '15-45-10-1.0', '50-150-10-1.0']

for train in training_sets:
    params = [int(float(field)) for field in train.split('-')]

    for cat in cats:
        gs_path = '../../../data/raw/wdc-lspc/gold-standards/'+cat+'_gs.json.gz'
        build_positive_pairs(data_df, gs_path, params[0], train)
        build_negative_pairs(data_df, gs_path, params[1], params[2], train)
        combine_sets(cat, train)
        double_check_gs_inclusion(cat, train)

Clusters in computers 745


  0%|          | 0/277140 [00:00<?, ?it/s]

computers Positive Training samples: 722
For 148 clusters, there was no clean training pair!
For 23 cluster pairs, there was no training pair at all!


100%|██████████| 277140/277140 [3:28:25<00:00, 22.16it/s]  


computers Negative Training samples: 2107
For 11 clusters, all 5 most similar pairs were already sampled!
For 0 cluster pairs, there was no clean training pair!
For 0 cluster pairs, there was no training pair at all!
No gs pairs are in the computers training set.
Clusters in cameras 563


  0%|          | 0/158203 [00:00<?, ?it/s]

cameras Positive Training samples: 486
For 133 clusters, there was no clean training pair!
For 77 cluster pairs, there was no training pair at all!


100%|██████████| 158203/158203 [1:53:43<00:00, 23.18it/s]  


cameras Negative Training samples: 1421
For 6 clusters, all 5 most similar pairs were already sampled!
For 1 cluster pairs, there was no clean training pair!
For 7 cluster pairs, there was no training pair at all!
No gs pairs are in the cameras training set.
Clusters in watches 617


  0%|          | 0/190036 [00:00<?, ?it/s]

watches Positive Training samples: 580
For 118 clusters, there was no clean training pair!
For 37 cluster pairs, there was no training pair at all!


100%|██████████| 190036/190036 [2:11:02<00:00, 24.17it/s]  


watches Negative Training samples: 1671
For 11 clusters, all 5 most similar pairs were already sampled!
For 0 cluster pairs, there was no clean training pair!
For 3 cluster pairs, there was no training pair at all!
No gs pairs are in the watches training set.
Clusters in shoes 563


  0%|          | 0/158203 [00:00<?, ?it/s]

shoes Positive Training samples: 530
For 131 clusters, there was no clean training pair!
For 33 cluster pairs, there was no training pair at all!


100%|██████████| 158203/158203 [1:50:32<00:00, 23.85it/s]  


shoes Negative Training samples: 1529
For 5 clusters, all 5 most similar pairs were already sampled!
For 0 cluster pairs, there was no clean training pair!
For 1 cluster pairs, there was no training pair at all!
No gs pairs are in the shoes training set.
Clusters in computers 745


  0%|          | 0/277140 [00:00<?, ?it/s]

computers Positive Training samples: 1762
For 97 clusters, there was no clean training pair!
For 23 cluster pairs, there was no training pair at all!


100%|██████████| 277140/277140 [3:14:10<00:00, 23.79it/s]  


computers Negative Training samples: 6321
For 11 clusters, all 5 most similar pairs were already sampled!
For 0 cluster pairs, there was no clean training pair!
For 0 cluster pairs, there was no training pair at all!
No gs pairs are in the computers training set.
Clusters in cameras 563


  0%|          | 0/158203 [00:00<?, ?it/s]

cameras Positive Training samples: 1108
For 65 clusters, there was no clean training pair!
For 77 cluster pairs, there was no training pair at all!


100%|██████████| 158203/158203 [1:47:43<00:00, 24.48it/s]  


cameras Negative Training samples: 4198
For 6 clusters, all 5 most similar pairs were already sampled!
For 1 cluster pairs, there was no clean training pair!
For 7 cluster pairs, there was no training pair at all!
No gs pairs are in the cameras training set.
Clusters in watches 617


  0%|          | 0/190036 [00:00<?, ?it/s]

watches Positive Training samples: 1418
For 64 clusters, there was no clean training pair!
For 37 cluster pairs, there was no training pair at all!


100%|██████████| 190036/190036 [2:11:49<00:00, 24.02it/s]  


watches Negative Training samples: 4979
For 11 clusters, all 5 most similar pairs were already sampled!
For 0 cluster pairs, there was no clean training pair!
For 3 cluster pairs, there was no training pair at all!
No gs pairs are in the watches training set.
Clusters in shoes 563


  0%|          | 0/158203 [00:00<?, ?it/s]

shoes Positive Training samples: 1214
For 44 clusters, there was no clean training pair!
For 33 cluster pairs, there was no training pair at all!


100%|██████████| 158203/158203 [1:47:44<00:00, 24.47it/s] 


shoes Negative Training samples: 4575
For 5 clusters, all 5 most similar pairs were already sampled!
For 0 cluster pairs, there was no clean training pair!
For 1 cluster pairs, there was no training pair at all!
No gs pairs are in the shoes training set.
Clusters in computers 745


  0%|          | 0/277140 [00:00<?, ?it/s]

computers Positive Training samples: 6146
For 97 clusters, there was no clean training pair!
For 23 cluster pairs, there was no training pair at all!


 14%|█▍        | 38737/277140 [26:03<3:40:12, 18.04it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100%|██████████| 277140/277140 [3:08:53<00:00, 24.45it/s]  


In [None]:
# Categories and training-set labels to summarize (same values as in the build cell).
cats = ['computers', 'cameras', 'watches', 'shoes']

training_sets = ['1-3-10-1.0', '3-9-10-1.0', '15-45-10-1.0', '50-150-10-1.0']

# Write the stats file of every training set; passing data_df additionally
# reports negatives whose two offers share exactly the same title.
for train in training_sets:
    get_ts_stats(cats, train, data_df)