In [13]:
from pathlib import Path
import pandas as pd

In [29]:
from sentence_transformers import SentenceTransformer
import torch

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# SBERT_MODEL = 'ai-forever/FRIDA'
# BATCH_SIZE = 512

# SBERT_MODEL = 'sergeyzh/BERTA'
# BATCH_SIZE = 256

SBERT_MODEL = 'ai-forever/ru-en-RoSBERTa'
BATCH_SIZE = 512

# SBERT_MODEL = 'all-distilroberta-v1'
# BATCH_SIZE = 512

# BATCH_SIZE = 128
# SBERT_MODEL = 'sentence-transformers/gtr-t5-base'

sbert = SentenceTransformer(SBERT_MODEL, device=DEVICE)

# Numbers experiments

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

# Generate numbers 0-100 and their string representations
numbers = list(range(101))
number_strs = [str(n) for n in numbers]

# Get embeddings for all numbers
number_embeddings = sbert.encode(number_strs, batch_size=BATCH_SIZE, show_progress_bar=True)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [31]:
# Calculate pairwise distances between embeddings
emb_distances = 1 - cosine_similarity(number_embeddings)

# Calculate actual euclidean distances between numbers
import numpy as np
from scipy.stats import spearmanr

actual_distances = np.abs(np.subtract.outer(numbers, numbers))

# For each number, calculate rank correlation between embedding distances and actual distances
correlations = []
for i in range(len(numbers)):
    # Get distances from current number to all others
    emb_dists_i = emb_distances[i]
    actual_dists_i = actual_distances[i]
    
    # Calculate rank correlation
    correlation, _ = spearmanr(emb_dists_i, actual_dists_i)
    correlations.append(correlation)

# Create results dataframe
import pandas as pd
results_df = pd.DataFrame({
    'number': numbers,
    'rank_correlation': correlations
})

print("\nPairwise embedding distances matrix:")
display(pd.DataFrame(emb_distances))


Pairwise embedding distances matrix:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,-1.192093e-07,0.367056,4.505723e-01,0.395144,4.216557e-01,0.425685,0.461766,0.429378,0.423899,0.487719,...,0.583975,0.635324,0.539602,0.558611,0.532653,5.584481e-01,5.647708e-01,5.565794e-01,5.309169e-01,4.769994e-01
1,3.670558e-01,0.000000,3.052719e-01,0.305720,4.073814e-01,0.374939,0.439182,0.385875,0.461539,0.475265,...,0.389733,0.600887,0.531681,0.580726,0.563707,6.053404e-01,5.508413e-01,5.861101e-01,5.622566e-01,5.190470e-01
2,4.505723e-01,0.305272,1.192093e-07,0.224064,3.304458e-01,0.347690,0.382022,0.396302,0.392333,0.436748,...,0.516973,0.394448,0.453453,0.537968,0.543912,5.539652e-01,5.330527e-01,5.570374e-01,5.947318e-01,5.788243e-01
3,3.951436e-01,0.305720,2.240641e-01,0.000000,2.202495e-01,0.274150,0.349375,0.330737,0.370005,0.334440,...,0.478348,0.485124,0.288720,0.448288,0.506345,5.768511e-01,5.127805e-01,4.663749e-01,5.229679e-01,5.226910e-01
4,4.216557e-01,0.407381,3.304458e-01,0.220250,-2.384186e-07,0.271072,0.328272,0.385380,0.350168,0.411046,...,0.558882,0.554137,0.458712,0.329490,0.516931,5.517978e-01,6.043406e-01,5.407463e-01,5.725387e-01,5.686412e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,5.584481e-01,0.605340,5.539652e-01,0.576851,5.517978e-01,0.517420,0.383461,0.539388,0.489304,0.426590,...,0.335973,0.268910,0.315025,0.236932,0.200151,5.960464e-08,1.791719e-01,2.465696e-01,3.127927e-01,3.932495e-01
97,5.647708e-01,0.550841,5.330527e-01,0.512781,6.043406e-01,0.517139,0.538382,0.365633,0.453192,0.380745,...,0.335010,0.296332,0.281057,0.296621,0.194265,1.791719e-01,-2.384186e-07,1.944420e-01,2.251213e-01,4.001524e-01
98,5.565794e-01,0.586110,5.570374e-01,0.466375,5.407463e-01,0.540958,0.525898,0.513730,0.349448,0.298640,...,0.359552,0.324793,0.286140,0.278661,0.284995,2.465696e-01,1.944420e-01,-1.192093e-07,1.796831e-01,3.677030e-01
99,5.309169e-01,0.562257,5.947318e-01,0.522968,5.725387e-01,0.532205,0.608103,0.524896,0.464704,0.253665,...,0.379440,0.408977,0.327636,0.302374,0.266963,3.127927e-01,2.251213e-01,1.796831e-01,5.960464e-08,2.848255e-01


In [32]:
print("\nRank correlations per number:")
display(results_df)
print("\nAverage rank correlation:", np.mean(correlations))


Rank correlations per number:


Unnamed: 0,number,rank_correlation
0,0,0.597135
1,1,0.645453
2,2,0.642654
3,3,0.567902
4,4,0.574975
...,...,...
96,96,0.548480
97,97,0.603133
98,98,0.542618
99,99,0.552430



Average rank correlation: 0.5273763478897343


In [33]:
# Create width strings with format "ширина: {width}"
widths = list(range(0, 501, 20))
width_strs = [f'ширина: {w}' for w in widths]

# Get embeddings for width strings
width_embeddings = sbert.encode(width_strs, batch_size=BATCH_SIZE, show_progress_bar=True)

# Calculate pairwise distances between embeddings
width_emb_distances = 1 - cosine_similarity(width_embeddings)

# Calculate actual euclidean distances between widths
width_actual_distances = np.abs(np.subtract.outer(widths, widths))

# For each width, calculate rank correlation between embedding distances and actual distances
width_correlations = []
for i in range(len(widths)):
    # Get distances from current width to all others
    emb_dists_i = width_emb_distances[i]
    actual_dists_i = width_actual_distances[i]
    
    # Calculate rank correlation
    correlation, _ = spearmanr(emb_dists_i, actual_dists_i)
    width_correlations.append(correlation)

# Create results dataframe
width_results_df = pd.DataFrame({
    'width': widths,
    'rank_correlation': width_correlations
})

print("\nPairwise width embedding distances matrix:")
display(pd.DataFrame(width_emb_distances))

print("\nRank correlations per width:")
display(width_results_df)
print("\nAverage width rank correlation:", np.mean(width_correlations))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Pairwise width embedding distances matrix:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.0,0.601118,0.5782273,0.6600982,0.6058856,0.5136678,0.5997036,0.5111964,0.5846296,0.5792635,...,0.5661821,0.5389955,0.4897271,0.553702,0.5409682,0.5498525,0.5587035,0.611117,0.5569532,0.5279526
1,0.601118,0.0,0.4696898,0.4614549,0.4423062,0.5541204,0.5814555,0.5871146,0.5473536,0.5582153,...,0.5899199,0.6152986,0.5906175,0.5883846,0.6517217,0.3340708,0.6160221,0.5741969,0.6108109,0.6900826
2,0.578227,0.46969,-2.384186e-07,0.3830299,0.3202419,0.4862989,0.4701555,0.4044428,0.5207425,0.5797158,...,0.561529,0.2254333,0.4855332,0.4767951,0.2655194,0.4420629,0.1749193,0.4073137,0.4172693,0.5101923
3,0.660098,0.461455,0.3830299,1.788139e-07,0.3583125,0.4400783,0.3017257,0.5528891,0.3303838,0.3675998,...,0.5380068,0.5405055,0.4228975,0.4926679,0.4891429,0.4927359,0.5073866,0.1531271,0.473183,0.4772182
4,0.605886,0.442306,0.3202419,0.3583125,-3.576279e-07,0.3896317,0.4022319,0.4206074,0.3677071,0.4451498,...,0.5453616,0.4930713,0.5720507,0.2712008,0.4982993,0.5366669,0.488884,0.42656,0.288336,0.5177785
5,0.513668,0.55412,0.4862989,0.4400783,0.3896317,-2.384186e-07,0.398409,0.4368169,0.4098654,0.4698252,...,0.5887937,0.5485118,0.5211869,0.5171058,0.431262,0.5911548,0.5730746,0.5224085,0.5381756,0.3509731
6,0.599704,0.581456,0.4701555,0.3017257,0.4022319,0.398409,-4.768372e-07,0.3439444,0.2934767,0.2826687,...,0.4564909,0.4678974,0.4055438,0.3866861,0.4655049,0.4634696,0.492298,0.2931998,0.3983608,0.4548314
7,0.511196,0.587115,0.4044428,0.5528891,0.4206074,0.4368169,0.3439444,-3.576279e-07,0.3569351,0.3871498,...,0.5376078,0.3647079,0.4349865,0.4099172,0.4035407,0.4348904,0.3755428,0.5031757,0.3752303,0.5461139
8,0.58463,0.547354,0.5207425,0.3303838,0.3677071,0.4098654,0.2934767,0.3569351,-4.768372e-07,0.2466263,...,0.3896822,0.4506288,0.4596954,0.3257531,0.4474627,0.3984737,0.4574753,0.236455,0.340515,0.4341301
9,0.579264,0.558215,0.5797158,0.3675998,0.4451498,0.4698252,0.2826687,0.3871498,0.2466263,-3.576279e-07,...,0.3981768,0.497834,0.3972323,0.3818812,0.5057881,0.4076159,0.5536017,0.3635902,0.4328726,0.5017614



Rank correlations per width:


Unnamed: 0,width,rank_correlation
0,0,-0.206154
1,20,0.568986
2,40,-0.067715
3,60,0.323243
4,80,0.388984
5,100,0.45509
6,120,0.36653
7,140,0.161903
8,160,0.337214
9,180,0.605377



Average width rank correlation: 0.4799494870726465


# Options experiments

In [None]:
MODEL_CKPT_DIR = 'model_params_big_test'
# MODEL_CKPT_DIR = 'res_balanced_accuracy'
# MODEL_CKPT_DIR = 'res_f1'

MODEL_DATA_FILE = Path(MODEL_CKPT_DIR) / 'data.csv'

df_model = pd.read_csv(MODEL_DATA_FILE)

In [None]:
# > AVERAGE FILES HERE
# DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500.csv'
# DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500_all_query_pairs.csv'
# DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500_top-5_query_pairs.csv'

# > GOOD FILES HERE
# DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500_top-5_query-23_nonquery-5539_pairs.csv'
# DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500_top-5_query-23_nonquery-5539_embedded.csv'
# DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500_top-20_query-23_nonquery-5539_embedded.csv'
DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500_top-50_query-23_nonquery-5539_embedded.csv'
# DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500_top-50_query-23_nonquery-5539_pairs.csv'

# > FILES W/INNER RUCLIP AS RANKER
# DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500_top-30_query-2_nonquery-6_embedded.csv'
# DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500_top-20query-23_nonquery-5539_embedded_sbert=all-distilroberta-v1_clip=siamese_contrastive.pt.csv'
# DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500_top-20_query-2_nonquery-6_embedded_sbert=all-distilroberta-v1_clip=siamese_contrastive.pt_final-embs.csv'
# DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500_top-50_query-23_nonquery-5539_embedded_sbert=all-distilroberta-v1_clip=siamese_contrastive.pt_final-embs.csv'

# df_all = pd.read_csv(DATA_FILE)
# df_all.columns.tolist()

In [None]:
pd.set_option('display.max_colwidth', None)

keywords = r'карта'
kw_mask = df_model.name_first.str.contains(keywords, case=False, regex=True)

df_model_domain = df_model[
    kw_mask

    # & (df_model.label == 1)
    # & (df_model.label == 0)
]

# for idx, row in df_model_domain.iterrows():
#     print(row.name_first)
#     print(row.sku_first)

#     print(row.name_second)
#     print(row.sku_second)
#     print('-' * 50)
#     print()

# pd.reset_option('display.max_colwidth')

In [None]:
# Define batch size for processing

# Extract options text
options_first = df_model_domain['options_first'].tolist()
options_second = df_model_domain['options_second'].tolist()

# Generate embeddings for options in batches
options_first_embeddings = sbert.encode(options_first, batch_size=BATCH_SIZE, show_progress_bar=True)
options_second_embeddings = sbert.encode(options_second, batch_size=BATCH_SIZE, show_progress_bar=True)

# Add embeddings to dataframe
df_model_domain['options_first_embedding'] = list(options_first_embeddings)
df_model_domain['options_second_embedding'] = list(options_second_embeddings)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [57]:
# Calculate cosine similarity between options embeddings
from sklearn.metrics.pairwise import cosine_similarity
options_similarities = [cosine_similarity([emb1], [emb2])[0][0] 
                        for emb1, emb2 in zip(options_first_embeddings, options_second_embeddings)]
# Fix SettingWithCopyWarning by using .loc
df_model_domain.loc[:, 'options_similarity'] = options_similarities

In [75]:
# drop identical pairs
df_model_domain = df_model_domain[df_model_domain['sku_first'] != df_model_domain['sku_second']]

In [76]:
# Extract height and width from options text
import re

# Function to extract height and width from options text
def extract_height_width(options_text):
    # Look for height pattern like "Высота предмета: 107 см"
    height_match = re.search(r'Высота.*?: (\d+)', options_text, re.IGNORECASE)
    height = int(height_match.group(1)) if height_match else None
    
    # Look for width pattern like "Ширина предмета: 157 см"
    width_match = re.search(r'Ширина.*?: (\d+)', options_text, re.IGNORECASE)
    width = int(width_match.group(1)) if width_match else None

    # Look for combined pattern like "157x107"
    combined_match = re.search(r'(\d+)\s*[xх*]\s*(\d+)', options_text, re.IGNORECASE)
    if combined_match:
        width = int(combined_match.group(1))
        height = int(combined_match.group(2))

    return height, width


# Function to process each group
def process_group(group):
    # Extract height and width from first product options (reference)
    reference_options = group['options_first'].iloc[0]
    ref_height, ref_width = extract_height_width(reference_options)
    group['width_first'] = ref_width
    group['height_first'] = ref_height

    # Create new columns with height and width extraction results
    group.loc[:, 'height_second'] = group['options_second'].apply(
        lambda x: extract_height_width(x)[0]).astype('Int64')
    group.loc[:, 'width_second'] = group['options_second'].apply(
        lambda x: extract_height_width(x)[1]).astype('Int64')
    return group

# Apply the processing function to each group
grouped = df_model_domain.groupby('sku_first', group_keys=False)
processed_df = grouped.apply(process_group, include_groups=True).dropna()
processed_df.head(1)

  processed_df = grouped.apply(process_group, include_groups=True).dropna()


Unnamed: 0,balance_first,sales_first,rating_first,final_price_first,comments_first,description_first,name_first,options_first,sku_first,has_video_first,...,name_sim,img_sim,label,options_first_embedding,options_second_embedding,options_similarity,width_first,height_first,height_second,width_second
22,1,1,5,432,95,"Карта мира настенная ""Мир"" 1,0х0,7м, м-б 1:34 000 000. Физическая карта мира напечатана одним листом на плотном мелованном картоне, без ламинации. Карта настенная прекрасно впишется в любое помещение и станет приятным подарком и для ребенка и для взрослого. Небольшой размер карты так же позволяет поместить ее на рабочий стол под стекло или под любой другой прозрачный материал. Карта мира для детей без ламинации, а значит она станет идеальным фотофоном для семейных и интерьерных фотосъемок, поскольку не дает бликов. Так же издания без ламинации позволяют рисовать или делать пометки на карте, что поможет в работе или развитии творческих способностей - карта развития ребенка. Современное учебное пособие для изучения географии и окружающего мира, атлас мира для школьников, для детей младшего и среднего школьного возраста. Карта мира для детей на стену прекрасный подарок для мальчика или девочке. Универсальная карта на стену станет ярким элементом декора в оформлении квартиры или дома, полезной деталью дизайна детской комнаты ребенка или гостиной, познавательной зоной в офисе или рабочем кабинете, вызывающей интерес. Скретч карта мира украсит детскую комнату или любую стену. Карта мира - нужный и интересный подарок к школе на 1 сентября, на выпускной ребенку, любимой женщине и полезное развлечение для всей семьи. Этот постер станет отличным помощником в учебе и школьных буднях для вашего ребенка, а также идеально подойдет для подростков для учебы в школе, университете, вузе, колледже, для работы. Цветная карта будет полезной стеновой декорацией в любом другом учреждении образования. В нашем ассортименте вы найдете также другие карты, такие как карта мира физическая и политическая, новая большая карта России настенная для детей, карта звездного неба и созвездий, карта дорог России с городами для путешественников, карта животных, карта животный и растительный мир, картина на стену, карта земли, карта для школы, постер космическое пространство и детский атлас мира. Закажите уже сегодня и украсьте свою стену красивым и информативным изображением нашей Родины - настенной географической картой мира! Изучайте мир вместе с АТЛАС ПРИНТ. Цветом показан рельеф суши и морского дна. Обозначены основные населенные пункты. Карта настольная, плакат на стену, картина на стену, декор на стену","Географическая физическая карта Мира ""Мир"" 100х70 см",Высота предмета: 70 см. Ширина предмета: 100 см. Тип карты: Географическая; детские карты; настенная. Вид карты: физическая; настольная карта; Интерьерная. Масштаб: 1:34000000. Страна производства: Россия. Комплектация: Карта Мира настенная 70х100 см.,64354900,0,...,0.83,0.61,1,"[0.032793194, -0.030637827, -0.0064851665, 0.019776396, -0.0633618, -0.035394456, 0.02328478, -0.020472685, -0.044235308, 0.016656656, 0.034888934, -0.025758406, -0.004444477, 0.0013730659, 0.016152378, -0.046341356, -0.021061925, -0.01838166, 0.025725948, 0.03079035, 0.02375403, 0.079863995, -0.00379466, -0.0037534677, -0.02411043, -0.0019409277, -0.0014351468, -0.0060945717, -0.13618548, 0.052619815, -0.050256092, 0.019624474, -0.03792269, 0.008710592, -0.028665915, 0.026701672, 0.046598956, -0.031180745, -0.0092269275, 0.07243504, -0.011656352, 0.0117158275, 0.05128237, 0.00022830546, -0.001957942, 0.035116617, 0.0005101067, -0.05734167, 0.02815501, -0.0029705737, -0.011123582, -0.02172256, -0.027522175, 0.0059820563, -0.0045746122, 0.042848255, 0.007714739, -0.12709941, 0.05282511, 0.032889668, -0.01632066, -0.03742703, -0.038455207, 1.9617113e-05, 0.019173514, -0.0492984, 0.006510193, 0.053324174, -0.00033854725, -0.049684227, 0.012852187, -0.029684111, 0.012068731, 0.014603281, 0.0071551227, -0.028081007, -0.04592802, 0.036613807, -0.011041426, -0.04730771, 0.023643248, 0.048175216, 0.02264705, -0.037822124, 0.030752499, 0.040965818, -0.0017573332, -0.043430056, -0.0021040458, 0.024745658, 0.026557997, 0.010074647, -0.027270082, 0.0031908802, 0.008544485, -0.031177748, 0.026881132, 0.06932713, 0.008190405, 0.00044260133, ...]","[0.025375823, -0.03657312, -0.0073862183, 0.01619693, -0.054069813, -0.039896443, 0.02025447, -0.022977931, -0.044401344, 0.026535809, 0.029209722, -0.0066124555, 0.015548598, -0.0044737826, -0.022497123, -0.03595512, -0.032935947, 0.0014859501, 0.008354851, 0.044080473, 0.019021222, 0.09658968, 0.011145302, -0.0037561948, -0.021782698, 0.0006590658, -0.021864654, 0.011421028, -0.1296104, 0.02773565, -0.04761371, 0.015277754, -0.026719453, -0.0062626656, -0.039746013, 0.039081275, 0.030102268, -0.04009729, -0.017793013, 0.07793485, -0.011419759, -0.01193122, 0.060189, -0.011233648, -0.0036852956, 0.039520893, 0.010237513, -0.033171456, 0.05077601, -0.034975063, 0.004193409, -0.04569595, -0.02388133, 0.017178865, 0.008920047, 0.05357628, 0.004396396, -0.09544067, 0.051745743, 0.018033123, -0.012748121, -0.026736598, -0.028738817, 0.026946472, 0.0028335804, -0.04293689, 0.022302825, 0.047504038, -0.0005532359, -0.06607013, 0.04081544, -0.015315204, 0.01464152, -0.025648233, 0.021558795, -0.00849306, -0.056107726, 0.007499936, -0.013480753, -0.04401858, 0.011390801, 0.042701893, 0.020242978, -0.056645393, 0.022674164, 0.055672117, -0.0011616659, -0.0489748, -0.015803069, 0.009169765, 0.058655605, 0.005860909, -0.013715878, -0.017976904, 0.015265221, 0.0030384935, 0.0473745, 0.032432944, 0.005818617, 0.0005181484, ...]",0.91,70,100,70,101


In [77]:
# Get embeddings for height and width
import numpy as np
from tqdm import tqdm

# Use a single template for the second embeddings
HW_TEMPLATE = 'height: {height}, width: {width}'
HW_TEMPLATE = '[{height}, {width}]'

# Initialize lists to store concatenated embeddings
width_height_embeddings_first = []
width_height_embeddings_second = []

# Process the dataframe in batches
for start_idx in tqdm(range(0, len(processed_df), BATCH_SIZE), desc="Processing batches"):
    # Get the batch of rows
    batch = processed_df.iloc[start_idx:start_idx + BATCH_SIZE]

    # Extract height and width for _first and _second products
    heights_first = batch['height_first'].tolist()
    widths_first = batch['width_first'].tolist()
    heights_second = batch['height_second'].tolist()
    widths_second = batch['width_second'].tolist()

    # Compute embeddings for _first products using an inline f-string
    embeddings_first = sbert.encode(
        [f'[{height}, {width}]' for height, width in zip(heights_first, widths_first)], 
        batch_size=BATCH_SIZE, show_progress_bar=False
    )
    width_height_embeddings_first.extend(embeddings_first)

    # Compute embeddings for _second products using the template with keyword arguments
    embeddings_second = sbert.encode([
        HW_TEMPLATE.format(height=height, width=width) if not np.isnan(height) and not np.isnan(width)
        else '[nan, nan]'
        for height, width in zip(heights_second, widths_second)
    ], batch_size=BATCH_SIZE, show_progress_bar=False)
    width_height_embeddings_second.extend(embeddings_second)

# Add the computed embeddings as new columns in the dataframe
processed_df['width_height_embedding_first'] = width_height_embeddings_first
processed_df['width_height_embedding_second'] = width_height_embeddings_second

Processing batches: 100%|██████████| 1/1 [00:03<00:00,  3.40s/it]


In [78]:
# Compute separate height and width embeddings for first and second products,
# then compute an aggregated embedding (concatenation of height and width embeddings)

height_embeddings_first = []
width_embeddings_first = []
agg_embeddings_first = []

height_embeddings_second = []
width_embeddings_second = []
agg_embeddings_second = []

for start_idx in tqdm(range(0, len(processed_df), BATCH_SIZE), desc="Processing batches"):
    batch = processed_df.iloc[start_idx:start_idx + BATCH_SIZE]
    
    # Extract height and width values for first and second products
    heights_first = batch['height_first'].tolist()
    widths_first = batch['width_first'].tolist()
    heights_second = batch['height_second'].tolist()
    widths_second = batch['width_second'].tolist()
    
    # Create stringified inputs (using "nan" if missing)
    height_strs_first = [str(h) if pd.notnull(h) else "nan" for h in heights_first]
    width_strs_first = [str(w) if pd.notnull(w) else "nan" for w in widths_first]
    height_strs_second = [str(h) if pd.notnull(h) else "nan" for h in heights_second]
    width_strs_second = [str(w) if pd.notnull(w) else "nan" for w in widths_second]
    
    # Compute embeddings for first products
    emb_height_first = sbert.encode(height_strs_first, batch_size=BATCH_SIZE, show_progress_bar=False)
    emb_width_first = sbert.encode(width_strs_first, batch_size=BATCH_SIZE, show_progress_bar=False)
    emb_agg_first = [np.concatenate([h, w]) for h, w in zip(emb_height_first, emb_width_first)]
    
    height_embeddings_first.extend(emb_height_first)
    width_embeddings_first.extend(emb_width_first)
    agg_embeddings_first.extend(emb_agg_first)
    
    # Compute embeddings for second products
    emb_height_second = sbert.encode(height_strs_second, batch_size=BATCH_SIZE, show_progress_bar=False)
    emb_width_second = sbert.encode(width_strs_second, batch_size=BATCH_SIZE, show_progress_bar=False)
    emb_agg_second = [np.concatenate([h, w]) for h, w in zip(emb_height_second, emb_width_second)]
    
    height_embeddings_second.extend(emb_height_second)
    width_embeddings_second.extend(emb_width_second)
    agg_embeddings_second.extend(emb_agg_second)

# Add the computed embeddings as new dataframe columns
processed_df['height_embedding_first'] = height_embeddings_first
processed_df['width_embedding_first'] = width_embeddings_first
processed_df['agg_embedding_first'] = agg_embeddings_first

processed_df['height_embedding_second'] = height_embeddings_second
processed_df['width_embedding_second'] = width_embeddings_second
processed_df['agg_embedding_second'] = agg_embeddings_second

Processing batches: 100%|██████████| 1/1 [00:02<00:00,  2.91s/it]


In [113]:
# Apply extraction to the selected SKU group
SELECTED_SKU_IDX = 1

unique_skus = processed_df['sku_first'].unique()
selected_sku = unique_skus[SELECTED_SKU_IDX]
selected_group = processed_df.groupby('sku_first').get_group(selected_sku).copy()  # Use .copy() to avoid SettingWithCopyWarning
selected_group.shape, selected_group.label.unique()

((34, 49), array([0, 1]))

In [115]:
# Get final ranking table for similarities
from sklearn.metrics.pairwise import cosine_similarity

ref_height, ref_width = selected_group[['height_first', 'width_first']].iloc[0]

# Compute Euclidean distance for all items if reference dimensions exist
if ref_height and ref_width:
    print(f"\nReference product dimensions: Height {ref_height} cm, Width {ref_width} cm")
    
    # Calculate Euclidean distance to reference height and width for all rows
    selected_group.loc[:, 'euclidean_distance'] = selected_group.apply(
        lambda row: ((row['height_second'] - ref_height) ** 2 + 
                     (row['width_second'] - ref_width) ** 2) ** 0.5 
        if pd.notnull(row['height_second']) and pd.notnull(row['width_second']) else None, 
        axis=1
    )
    
    # Sort by Euclidean distance
    sorted_group = selected_group.sort_values('euclidean_distance')
    
    # Display sorted results
    print("\nProducts sorted by Euclidean distance to reference dimensions:")
else:
    print("\nReference dimensions are not available.")
    
# Compute ranks by Euclidean distance and similarity without ties
if 'euclidean_distance' in selected_group.columns:
    selected_group['rank_by_distance'] = selected_group['euclidean_distance'].rank(method='first', ascending=True)
else:
    selected_group['rank_by_distance'] = None

if 'options_similarity' in selected_group.columns:
    selected_group['rank_by_similarity'] = selected_group['options_similarity'].rank(method='first', ascending=False)
else:
    selected_group['rank_by_similarity'] = None

# Compute similarity for width/height embeddings
if 'width_height_embedding_first' in selected_group.columns and 'width_height_embedding_second' in selected_group.columns:
    selected_group['wh_emb_sim'] = selected_group.apply(
        lambda row: cosine_similarity(
            [row['width_height_embedding_first']], 
            [row['width_height_embedding_second']]
        )[0][0] if not pd.isnull(row['width_height_embedding_first']).any() and not pd.isnull(row['width_height_embedding_second']).any() else None,
        axis=1
    )
    # Compute ranks for width/height embedding similarity
    selected_group['rank_by_wh_emb_sim'] = selected_group['wh_emb_sim'].rank(method='first', ascending=False)
else:
    selected_group['wh_emb_sim'] = None
    selected_group['rank_by_wh_emb_sim'] = None

# Compute similarity for aggregated embeddings and their ranks
if 'agg_embedding_first' in selected_group.columns and 'agg_embedding_second' in selected_group.columns:
    selected_group['agg_emb_sim'] = selected_group.apply(
        lambda row: cosine_similarity(
            [row['agg_embedding_first']], 
            [row['agg_embedding_second']]
        )[0][0] if row['agg_embedding_first'] is not None and row['agg_embedding_second'] is not None else None,
        axis=1
    )
    selected_group['rank_by_agg_emb_sim'] = selected_group['agg_emb_sim'].rank(method='first', ascending=False)
else:
    selected_group['agg_emb_sim'] = None
    selected_group['rank_by_agg_emb_sim'] = None

# Compute and print rank correlation between different ranking metrics
if 'rank_by_distance' in selected_group.columns and 'rank_by_similarity' in selected_group.columns:
    rank_corr_distance_similarity = selected_group[['rank_by_distance', 'rank_by_similarity']].corr(method='spearman').iloc[0, 1]
    print(f"\nSpearman rank correlation between distance and similarity ranks: {rank_corr_distance_similarity:.4f}")
else:
    print("\nRank columns are not available for correlation computation between distance and similarity.")

if 'rank_by_distance' in selected_group.columns and 'rank_by_wh_emb_sim' in selected_group.columns:
    rank_corr_distance_emb_sim = selected_group[['rank_by_distance', 'rank_by_wh_emb_sim']].corr(method='spearman').iloc[0, 1]
    print(f"Spearman rank correlation between distance and width/height embedding similarity ranks: {rank_corr_distance_emb_sim:.4f}")
else:
    print("\nRank columns are not available for correlation computation between distance and width/height embedding similarity.")

if 'rank_by_distance' in selected_group.columns and 'rank_by_agg_emb_sim' in selected_group.columns:
    rank_corr_distance_agg = selected_group[['rank_by_distance', 'rank_by_agg_emb_sim']].corr(method='spearman').iloc[0, 1]
    print(f"Spearman rank correlation between distance and aggregated embedding similarity ranks: {rank_corr_distance_agg:.4f}")
else:
    print("\nRank columns are not available for correlation computation between distance and aggregated embedding similarity.")

# Find similar dimensions (for demonstration)
dimension_df = selected_group[[
    'label',

    'sku_first', 'sku_second',

    # 'options_first', 'options_second',

    'width_first', 'height_first',
    'width_second', 'height_second',

    # 'balance_first',
    'balance_second',

    'sales_first',
    'sales_second',

    'rank_by_distance',
    'rank_by_similarity',
    # 'rank_by_wh_emb_sim',
    'rank_by_agg_emb_sim',

    # 'euclidean_distance',
    # 'options_similarity',
    # 'wh_emb_sim',
    # 'agg_emb_sim',
]].copy()

filtered_dimension_df = dimension_df

# Sort the filtered results by rank_by_similarity

# sort_rank_col = 'rank_by_similarity'
# sort_rank_col = 'rank_by_distance'
sort_rank_col = 'rank_by_agg_emb_sim'

filtered_dimension_df = filtered_dimension_df.sort_values(sort_rank_col, ascending=True)

pd.options.display.float_format = '{:.2f}'.format
filtered_dimension_df = filtered_dimension_df.reset_index(drop=True)

# Display the filtered and sorted results
pd.set_option('display.max_colwidth', None)
print("\nProducts with extracted dimensions, ranks, embedding similarity, and labels:")

display(filtered_dimension_df[filtered_dimension_df.label == 1])
display(filtered_dimension_df)


Reference product dimensions: Height 107 cm, Width 157 cm

Products sorted by Euclidean distance to reference dimensions:

Spearman rank correlation between distance and similarity ranks: 0.0640
Spearman rank correlation between distance and width/height embedding similarity ranks: 0.5135
Spearman rank correlation between distance and aggregated embedding similarity ranks: 0.9319

Products with extracted dimensions, ranks, embedding similarity, and labels:


Unnamed: 0,label,sku_first,sku_second,width_first,height_first,width_second,height_second,balance_second,sales_first,sales_second,rank_by_distance,rank_by_similarity,rank_by_agg_emb_sim
2,1,64354767,182755008,157,107,160,100,583,271,2117,2.0,28.0,3.0
3,1,64354767,182755012,157,107,160,100,203,271,899,3.0,29.0,4.0
4,1,64354767,182755006,157,107,160,100,0,271,320,4.0,30.0,5.0


Unnamed: 0,label,sku_first,sku_second,width_first,height_first,width_second,height_second,balance_second,sales_first,sales_second,rank_by_distance,rank_by_similarity,rank_by_agg_emb_sim
0,0,64354767,72587452,157,107,160,90,76,271,701,5.0,24.0,1.0
1,0,64354767,147848541,157,107,160,100,145,271,2354,1.0,27.0,2.0
2,1,64354767,182755008,157,107,160,100,583,271,2117,2.0,28.0,3.0
3,1,64354767,182755012,157,107,160,100,203,271,899,3.0,29.0,4.0
4,1,64354767,182755006,157,107,160,100,0,271,320,4.0,30.0,5.0
5,0,64354767,78583225,157,107,160,85,1,271,1069,10.0,5.0,6.0
6,0,64354767,78583224,157,107,160,85,22,271,614,11.0,4.0,7.0
7,0,64354767,158564491,157,107,160,85,46,271,429,12.0,2.0,8.0
8,0,64354767,157384922,157,107,160,85,30,271,320,13.0,3.0,9.0
9,0,64354767,16863976,157,107,150,90,47,271,1077,6.0,17.0,10.0
