# Filter Siamese model candidates

In [26]:
import pandas as pd
from pathlib import Path

DATA_PATH = 'data'
# file_path = 'tables_OZ_geo_5500/processed/pairwise-rendered/test/num-rows=4476_limit-pos=None_pos-neg=1.0_hard-soft=0.5_seed=42/preds_fpr=0.05.parquet'
# file_path = 'tables_OZ_geo_5500/processed/pairwise-rendered/test/num-rows=4476_limit-pos=None_pos-neg=1.0_hard-soft=0.5_seed=42/preds_fpr=1.0.parquet'
file_path = 'tables_OZ_geo_5500/test_results/top-k/query-seller=ИНТЕРТРЕЙД/siamese_contrastive_soft-neg_epoch=1_val-f1=0.829_val-pos-acc=0.802_val-neg-acc=0.932_splitting-by-query_cc12m_rubert_tiny_ep_1.pt_best-f1-threshold=1.010.pt/top-None@thr_best.csv'
file_path = 'tables_OZ_geo_5500/processed/regex-pairwise-groups/regex-pairwise-groups_num-queries=20_patterns-dict-hash=a6223255f273e52a893ba7235e3c19b3/preds_fpr=0.01.parquet'
file_path = 'tables_OZ_geo_5500/processed/regex-pairwise-groups/regex-pairwise-groups_num-queries=20_patterns-dict-hash=a6223255f273e52a893ba7235e3c19b3/preds_fpr=0.36.parquet'

if file_path.endswith('parquet'):
    pairwise_df = pd.read_parquet(Path(DATA_PATH) / file_path)
elif file_path.endswith('csv'):
    pairwise_df = pd.read_csv(Path(DATA_PATH) / file_path)

print(pairwise_df.shape)
pairwise_df.columns.tolist()

(110780, 96)


['sku_first',
 'description_first',
 'image_url_first',
 'name_first',
 'category_first',
 'схема_first',
 'brand_first',
 'niche_first',
 'seller_first',
 'balance_fbo_first',
 'balance_fbs_first',
 'warehouses_count_first',
 'comments_first',
 'final_price_first',
 'max_price_first',
 'min_price_first',
 'average_price_first',
 'median_price_first',
 'membership_card_price_first',
 'sales_first',
 'revenue_first',
 'revenue_potential_first',
 'revenue_average_first',
 'lost_profit_first',
 'lost_profit_percent_first',
 'url_first',
 'thumb_first',
 'pics_count_first',
 'has_video_first',
 'first_date_first',
 'days_in_website_first',
 'days_in_stock_first',
 'days_with_sales_first',
 'average_if_in_stock_first',
 'rating_first',
 'fbs_first',
 'base_price_first',
 'category_position_first',
 'categories_last_count_first',
 'sales_per_day_average_first',
 'sales.1_first',
 'frozen_stocks_first',
 'frozen_stocks_cost_first',
 'frozen_stocks_percent_first',
 'balance_first',
 'image_nam

In [27]:
PRICE_MARGIN = 0.3
MIN_SALES = 0
MIN_FBO = 0
MIN_RATING = 4.3

# --- original filtering mask ---
orig_mask = (
    (pairwise_df['sales_second'] > MIN_SALES)
    & ((pairwise_df['balance_fbo_second'] > MIN_FBO) | (pairwise_df['balance_fbs_second'] > MIN_FBO))
    & (pairwise_df['rating_second'] > MIN_RATING)
)
filtered = pairwise_df[orig_mask]

# --- now per‐sku_first price‐ratio filtering ---
def within_margin(group):
    ratio = group['final_price_second'] / group['final_price_first']
    return group[ratio.between(1 - PRICE_MARGIN, 1 + PRICE_MARGIN)]

filtered_pairwise_df = (
    filtered
    .groupby('sku_first', group_keys=False)
    .apply(within_margin)
    .reset_index(drop=True)
)

print('Original size:', len(pairwise_df))
print('Filtered size:', len(filtered_pairwise_df))


Original size: 110780
Filtered size: 1149


  .apply(within_margin)


In [28]:
def construct_wide_table(
    df,
    label_col: str,
    top_k: int = None,
    positive_only: bool = True,
    include_urls: bool = False,
    layout: str = 'blocked',
    drop_na: bool = True,
):
    """
    Builds a wide-format table of top‐k matches per Query_SKU.
    """
    rows = []
    # Determine how many slots: either fixed top_k or max found per group
    if top_k is None:
        max_counts = (
            df[df[label_col] == 1]
            .groupby('sku_first')
            .size()
            .max()
        )
        n_slots = int(max_counts) if max_counts is not None and not pd.isna(max_counts) else 0
    else:
        n_slots = int(top_k)

    for query_sku, group in df.groupby('sku_first'):
        candidates = group[group[label_col] == 1] if positive_only else group

        # always reset index so positional iloc works
        candidates = candidates.reset_index(drop=True)

        # then sort by probability if available
        if 'proba' in candidates.columns:
            candidates = (
                candidates.sort_values('proba', ascending=False)
                          .reset_index(drop=True)
            )

        # If drop_na is True and there are no candidates, skip this row
        if drop_na and len(candidates) == 0:
            continue

        row = {'Query_SKU': int(query_sku)}

        for i in range(n_slots):
            if i < len(candidates):
                row[f'Top-{i+1}_SKU'] = int(candidates.iloc[i]['sku_second'])
                if include_urls and 'url_second' in candidates.columns:
                    row[f'Top-{i+1}_URL'] = candidates.iloc[i]['url_second']
                if 'proba' in candidates.columns:
                    row[f'Top-{i+1}_Proba'] = float(candidates.iloc[i]['proba'])
            else:
                row[f'Top-{i+1}_SKU'] = -1
                if include_urls:
                    row[f'Top-{i+1}_URL'] = ''
                if 'proba' in candidates.columns:
                    row[f'Top-{i+1}_Proba'] = 0.0

        rows.append(row)

    matches_wide_df = pd.DataFrame(rows)

    if layout == 'blocked':
        sku_cols   = [c for c in matches_wide_df if c.startswith('Top-') and c.endswith('_SKU')]
        url_cols   = [c for c in matches_wide_df if c.startswith('Top-') and c.endswith('_URL')]
        proba_cols = [c for c in matches_wide_df if c.startswith('Top-') and c.endswith('_Proba')]
        matches_wide_df = matches_wide_df[['Query_SKU'] + sku_cols + url_cols + proba_cols]

    return matches_wide_df


In [29]:
# --- Set your desired top_k (e.g. 5) and construct the wide table.

matches_wide_df = construct_wide_table(
    df=filtered_pairwise_df,

    # label_col = 'label',
    label_col = 'prediction',

    top_k = None,
    positive_only=True,

    include_urls=False,
    layout='blocked',
)

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
matches_wide_df

Unnamed: 0,Query_SKU,Top-1_SKU,Top-2_SKU,Top-3_SKU,Top-4_SKU,Top-5_SKU,Top-6_SKU,Top-7_SKU,Top-8_SKU,Top-9_SKU,Top-10_SKU,Top-11_SKU,Top-12_SKU,Top-13_SKU,Top-14_SKU,Top-15_SKU,Top-16_SKU,Top-17_SKU,Top-18_SKU,Top-19_SKU,Top-20_SKU,Top-21_SKU,Top-22_SKU,Top-23_SKU,Top-24_SKU,Top-25_SKU,Top-26_SKU,Top-27_SKU,Top-28_SKU,Top-29_SKU,Top-30_SKU,Top-31_SKU,Top-32_SKU,Top-33_SKU,Top-34_SKU,Top-35_SKU,Top-36_SKU,Top-37_SKU,Top-38_SKU,Top-39_SKU,Top-40_SKU,Top-41_SKU,Top-42_SKU,Top-43_SKU,Top-44_SKU,Top-45_SKU,Top-46_SKU,Top-47_SKU,Top-48_SKU,Top-49_SKU,Top-50_SKU,Top-51_SKU,Top-52_SKU,Top-53_SKU,Top-54_SKU,Top-55_SKU,Top-56_SKU,Top-57_SKU
0,491268805,1934870477,1912007512,1887527156,1840978191,1746327437,1743558896,1716671407,1672570414,1672563802,1663086943,1649963830,1649961933,1640637003,1600969688,1543106833,1500421705,1469038854,1345355836,1317729731,1312120043,1223935936,1079913513,1079902314,864566278,861606446,857515421,857108036,856647774,854685999,853831334,853830521,853804528,853784415,851623566,851295373,844770867,844750071,836151949,671211264,490461409,490461399,490461387,268682160,268682152,219077425,219077206,217513834,217489097,217475629,217473297,180358423,180358421,178733797,178724269,178711585,166584097,166584090
1,491270272,1873027006,1729352595,1703583975,1581328190,1567001565,1546437392,1414696452,1345370830,1345349994,1303657192,1294181996,1294181688,1155694731,974286048,974244048,899009330,861723214,856070472,847687475,847647035,805285669,804154679,536896417,322886997,166584096,147896030,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,491270369,1598283510,1756838762,1650167061,1650080585,1640617682,1629783179,1621519668,1607805052,1486407874,1333475299,1289030979,1152101160,147896030,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,491271284,1758403668,1703583975,1756838762,1729352595,1713036930,1629783179,1567001565,1414696452,1345370830,1345349994,1333475299,1169061464,974244048,861605997,857968654,856985388,854395161,805285669,601320601,590294661,564435026,322886997,147896030,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,491271320,1666035693,861593242,268682160,1743558896,1713036930,1711547819,1671095638,1649961933,1640637003,1613663117,1600969688,1497205767,1438527246,1438364142,1436451393,1436449667,1422663978,1190097076,1079902314,861605997,856647774,854395161,851295373,844750071,804154003,671211264,590294661,490461399,268682152,219077206,217489097,217475629,217473297,178724269,178711585,166584098,166584090,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
5,491271768,1934870477,1934863722,1912007512,1887527156,1840978191,1746327437,1743558896,1743520965,1716671407,1681032195,1672570414,1672563802,1663086943,1649963830,1649961933,1629023767,1600969688,1543106833,1500421705,1469038854,1456835651,1345355836,1317729731,1312120043,1230674601,1079913513,1079902314,942840362,900480076,864566278,861606446,857515421,857108036,854685999,853831334,853830521,853804528,853784415,851626799,836151949,590228312,490461387,324368126,268682134,219077425,219077206,217513834,217489097,217475629,180358423,178800177,178733797,178724269,166584097,166584090,-1,-1
6,491273438,1934870477,1934863722,1912007512,1887527156,1840978191,1746327437,1743558896,1743520965,1716671407,1681032195,1672570414,1672563802,1663086943,1650435961,1649963830,1649961933,1629023767,1543106833,1500421705,1469038854,1456835651,1345355836,1312120043,1230674601,942840362,900480076,864566278,861606446,857515421,857108036,854685999,853831334,853830521,853804528,853784415,851626799,836151949,590228312,490461387,324368126,268682134,219077425,219077206,217513834,217489097,180358423,180358420,178813265,178800177,178733797,166584097,166584090,-1,-1,-1,-1,-1
7,491273791,1607805052,1650167061,1497224885,1486407874,1152101160,588876919,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
8,491279127,1713026634,1737112217,1629783179,1613663117,1713036930,1497205767,1422663978,974244048,861605997,601320601,564435026,1857130100,1758403711,1758403668,1758403630,1756838762,1737112763,1729352595,1703583975,1567001565,1414696452,1345370830,1345349994,1333475299,1169061464,926003864,857968654,856985388,854395161,805285669,590294661,322886997,219077186,178726257,147896030,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
9,494562010,1043438100,1608590952,1085305049,1024760325,945062673,945062420,945062310,874681793,523060794,288794858,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [30]:
# Filter matches by query seller

QUERY_SELLER = 'ИНТЕРТРЕЙД'

# Get all query SKUs from filtered_pairwise_df
query_skus = filtered_pairwise_df[filtered_pairwise_df.seller_first == QUERY_SELLER]['sku_first'].unique()

# Filter matches_wide_df by Query_SKU being only from that list
query_matches_wide_df = matches_wide_df[matches_wide_df['Query_SKU'].isin(query_skus)]

query_matches_wide_df

Unnamed: 0,Query_SKU,Top-1_SKU,Top-2_SKU,Top-3_SKU,Top-4_SKU,Top-5_SKU,Top-6_SKU,Top-7_SKU,Top-8_SKU,Top-9_SKU,Top-10_SKU,Top-11_SKU,Top-12_SKU,Top-13_SKU,Top-14_SKU,Top-15_SKU,Top-16_SKU,Top-17_SKU,Top-18_SKU,Top-19_SKU,Top-20_SKU,Top-21_SKU,Top-22_SKU,Top-23_SKU,Top-24_SKU,Top-25_SKU,Top-26_SKU,Top-27_SKU,Top-28_SKU,Top-29_SKU,Top-30_SKU,Top-31_SKU,Top-32_SKU,Top-33_SKU,Top-34_SKU,Top-35_SKU,Top-36_SKU,Top-37_SKU,Top-38_SKU,Top-39_SKU,Top-40_SKU,Top-41_SKU,Top-42_SKU,Top-43_SKU,Top-44_SKU,Top-45_SKU,Top-46_SKU,Top-47_SKU,Top-48_SKU,Top-49_SKU,Top-50_SKU,Top-51_SKU,Top-52_SKU,Top-53_SKU,Top-54_SKU,Top-55_SKU,Top-56_SKU,Top-57_SKU
0,491268805,1934870477,1912007512,1887527156,1840978191,1746327437,1743558896,1716671407,1672570414,1672563802,1663086943,1649963830,1649961933,1640637003,1600969688,1543106833,1500421705,1469038854,1345355836,1317729731,1312120043,1223935936,1079913513,1079902314,864566278,861606446,857515421,857108036,856647774,854685999,853831334,853830521,853804528,853784415,851623566,851295373,844770867,844750071,836151949,671211264,490461409,490461399,490461387,268682160,268682152,219077425,219077206,217513834,217489097,217475629,217473297,180358423,180358421,178733797,178724269,178711585,166584097,166584090
1,491270272,1873027006,1729352595,1703583975,1581328190,1567001565,1546437392,1414696452,1345370830,1345349994,1303657192,1294181996,1294181688,1155694731,974286048,974244048,899009330,861723214,856070472,847687475,847647035,805285669,804154679,536896417,322886997,166584096,147896030,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,491270369,1598283510,1756838762,1650167061,1650080585,1640617682,1629783179,1621519668,1607805052,1486407874,1333475299,1289030979,1152101160,147896030,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,491271284,1758403668,1703583975,1756838762,1729352595,1713036930,1629783179,1567001565,1414696452,1345370830,1345349994,1333475299,1169061464,974244048,861605997,857968654,856985388,854395161,805285669,601320601,590294661,564435026,322886997,147896030,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,491271320,1666035693,861593242,268682160,1743558896,1713036930,1711547819,1671095638,1649961933,1640637003,1613663117,1600969688,1497205767,1438527246,1438364142,1436451393,1436449667,1422663978,1190097076,1079902314,861605997,856647774,854395161,851295373,844750071,804154003,671211264,590294661,490461399,268682152,219077206,217489097,217475629,217473297,178724269,178711585,166584098,166584090,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
5,491271768,1934870477,1934863722,1912007512,1887527156,1840978191,1746327437,1743558896,1743520965,1716671407,1681032195,1672570414,1672563802,1663086943,1649963830,1649961933,1629023767,1600969688,1543106833,1500421705,1469038854,1456835651,1345355836,1317729731,1312120043,1230674601,1079913513,1079902314,942840362,900480076,864566278,861606446,857515421,857108036,854685999,853831334,853830521,853804528,853784415,851626799,836151949,590228312,490461387,324368126,268682134,219077425,219077206,217513834,217489097,217475629,180358423,178800177,178733797,178724269,166584097,166584090,-1,-1
6,491273438,1934870477,1934863722,1912007512,1887527156,1840978191,1746327437,1743558896,1743520965,1716671407,1681032195,1672570414,1672563802,1663086943,1650435961,1649963830,1649961933,1629023767,1543106833,1500421705,1469038854,1456835651,1345355836,1312120043,1230674601,942840362,900480076,864566278,861606446,857515421,857108036,854685999,853831334,853830521,853804528,853784415,851626799,836151949,590228312,490461387,324368126,268682134,219077425,219077206,217513834,217489097,180358423,180358420,178813265,178800177,178733797,166584097,166584090,-1,-1,-1,-1,-1
7,491273791,1607805052,1650167061,1497224885,1486407874,1152101160,588876919,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
8,491279127,1713026634,1737112217,1629783179,1613663117,1713036930,1497205767,1422663978,974244048,861605997,601320601,564435026,1857130100,1758403711,1758403668,1758403630,1756838762,1737112763,1729352595,1703583975,1567001565,1414696452,1345370830,1345349994,1333475299,1169061464,926003864,857968654,856985388,854395161,805285669,590294661,322886997,219077186,178726257,147896030,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
9,494562010,1043438100,1608590952,1085305049,1024760325,945062673,945062420,945062310,874681793,523060794,288794858,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


# Inspect matches

In [32]:
QUERY_SKU_TO_INSPECT = 491270369

sorted_df = (
    filtered_pairwise_df
    .groupby('sku_first', group_keys=False)
    .apply(lambda group: group.sort_values('siam_l2_dist', ascending=True), include_groups=True)
)

cols_to_inspect = [
    'sku_first',
    'sku_second',
    'siam_l2_dist',
    'prediction',
    'label',
    'final_price_first',
    'final_price_second',
]

# if 'pair_type' in sorted_df.columns:
#     cols_to_inspect.append('pair_type')

sorted_df[sorted_df.sku_first == QUERY_SKU_TO_INSPECT][cols_to_inspect]

  .apply(lambda group: group.sort_values('siam_l2_dist', ascending=True), include_groups=True)


Unnamed: 0,sku_first,sku_second,siam_l2_dist,prediction,label,final_price_first,final_price_second
157,491270369,1621519668,0.92372,1,0,813,584
155,491270369,1640617682,1.023639,1,0,813,583
153,491270369,1650167061,1.175605,1,0,813,650
163,491270369,1486407874,1.435541,1,0,813,762
170,491270369,1152101160,1.436312,1,0,813,597
168,491270369,1289030979,1.654091,1,0,813,601
159,491270369,1607805052,2.052945,1,0,813,751
148,491270369,1756838762,2.199004,1,0,813,624
142,491270369,1598283510,2.415878,1,0,813,579
154,491270369,1650080585,2.524429,1,0,813,1003
