# Filter Siamese model candidates

In [58]:
import pandas as pd
from pathlib import Path

DATA_PATH = 'data'
# Predictions file to analyse, relative to DATA_PATH. The same directory was
# previously read with 'preds_fpr=0.05.parquet' as the file name.
file_path = (
    'tables_OZ_geo_5500/processed/pairwise-rendered/test/'
    'num-rows=4476_limit-pos=None_pos-neg=1.0_hard-soft=0.5_seed=42/'
    'preds_fpr=1.0.parquet'
)

pairwise_df = pd.read_parquet(Path(DATA_PATH, file_path))
# Remove the 'Unnamed: 0' column when present; errors='ignore' makes this a
# no-op when the column is absent.
pairwise_df = pairwise_df.drop(columns='Unnamed: 0', errors='ignore')
print(pairwise_df.shape)
pairwise_df.columns.tolist()

(4476, 97)


['description_first',
 'image_url_first',
 'name_first',
 'category_first',
 'схема_first',
 'brand_first',
 'niche_first',
 'seller_first',
 'balance_fbo_first',
 'balance_fbs_first',
 'warehouses_count_first',
 'comments_first',
 'final_price_first',
 'max_price_first',
 'min_price_first',
 'average_price_first',
 'median_price_first',
 'membership_card_price_first',
 'sales_first',
 'revenue_first',
 'revenue_potential_first',
 'revenue_average_first',
 'lost_profit_first',
 'lost_profit_percent_first',
 'url_first',
 'thumb_first',
 'pics_count_first',
 'has_video_first',
 'first_date_first',
 'days_in_website_first',
 'days_in_stock_first',
 'days_with_sales_first',
 'average_if_in_stock_first',
 'rating_first',
 'fbs_first',
 'base_price_first',
 'category_position_first',
 'categories_last_count_first',
 'sales_per_day_average_first',
 'sales.1_first',
 'frozen_stocks_first',
 'frozen_stocks_cost_first',
 'frozen_stocks_percent_first',
 'balance_first',
 'image_name_first',
 'de

In [59]:
# Filter thresholds. PRICE_MARGIN is consumed by the price-ratio filter below;
# the MIN_* values are strict lower bounds on the `*_second` columns.
PRICE_MARGIN = 0.3
MIN_SALES = 0
MIN_FBO = 0
MIN_RATING = 4.3

# --- original filtering mask ---
# A pair is kept when the second item's rating and sales exceed their
# thresholds and at least one of its FBO / FBS balances exceeds MIN_FBO.
orig_mask = (
    pairwise_df['rating_second'].gt(MIN_RATING)
    & pairwise_df['sales_second'].gt(MIN_SALES)
    & (
        pairwise_df['balance_fbo_second'].gt(MIN_FBO)
        | pairwise_df['balance_fbs_second'].gt(MIN_FBO)
    )
)
filtered = pairwise_df.loc[orig_mask]

# --- price-ratio filtering ---
def within_margin(group):
    """Return the rows of ``group`` whose second price is close to the first.

    A row is kept when ``final_price_second / final_price_first`` lies in
    ``[1 - PRICE_MARGIN, 1 + PRICE_MARGIN]`` (both bounds inclusive, the
    default of ``Series.between``). Rows whose ratio is NaN or infinite
    (e.g. a missing or zero first price) are dropped, since ``between``
    evaluates to False for them.

    Parameters
    ----------
    group : pandas.DataFrame
        Any frame with ``final_price_first`` and ``final_price_second``
        columns; it does not have to be a single-SKU group.

    Returns
    -------
    pandas.DataFrame
        The matching subset of ``group``, index preserved.
    """
    ratio = group['final_price_second'] / group['final_price_first']
    return group[ratio.between(1 - PRICE_MARGIN, 1 + PRICE_MARGIN)]

# The ratio test only compares values within a row, so it is applied to the
# whole frame at once instead of through groupby('sku_first').apply(...).
# That avoids the pandas deprecation warning about apply operating on the
# grouping columns and keeps 'sku_first' in the result, which later cells
# group on. The stable sort keeps rows grouped by query SKU with their
# relative order unchanged.
# NOTE(review): unlike groupby, this keeps rows whose sku_first is NaN (they
# sort last) - confirm sku_first is never missing.
filtered_pairwise_df = within_margin(filtered).sort_values('sku_first', kind='stable')

print('Original size:', len(pairwise_df))
print('Filtered size:', len(filtered_pairwise_df))


Original size: 4476
Filtered size: 166


  .apply(within_margin)


In [60]:
def construct_wide_table(
    df,
    label_col: str,
    top_k: int = None,
    positive_only: bool = True,
    include_urls: bool = False,
    layout: str = 'blocked',
):
    """
    Builds a wide-format table of top-k matches per Query_SKU.

    One output row is produced per distinct ``sku_first`` value in ``df``
    (in sorted key order). Each row has ``n_slots`` candidate slots; slots
    beyond the available candidates are padded with ``-1`` (SKU), ``''``
    (URL) and ``0.0`` (Proba).

    Parameters
    ----------
    df : pandas.DataFrame
        Pairwise table with at least ``sku_first``, ``sku_second`` and
        ``label_col``. If a ``proba`` column exists, candidates are ranked
        by it in descending order and ``Top-N_Proba`` columns are emitted.
    label_col : str
        Column whose value ``1`` marks a row as a match.
    top_k : int, optional
        Number of slots per query. When ``None``, the largest number of
        candidates found for any single query is used (0 if there are none).
        Negative values are treated as 0.
    positive_only : bool
        If True, only rows with ``df[label_col] == 1`` are candidates;
        otherwise every row of the group is a candidate.
    include_urls : bool
        If True and ``df`` has a ``url_second`` column, ``Top-N_URL``
        columns are emitted. Without that column no URL columns are
        produced, so filled and padded slots stay consistent.
    layout : str
        ``'blocked'`` orders columns as all SKUs, then all URLs, then all
        Probas; any other value keeps SKU/URL/Proba together per slot.

    Returns
    -------
    pandas.DataFrame
        Columns ``Query_SKU`` followed by the ``Top-N_*`` columns. An empty
        ``df`` yields an empty frame that still has these columns.
    """
    has_proba = 'proba' in df.columns
    has_url = include_urls and 'url_second' in df.columns

    # Determine how many slots: either fixed top_k or the max candidate count
    # per query. The count is taken over the same rows that will fill the
    # slots, so positive_only=False is sized by full group sizes.
    if top_k is None:
        pool = df[df[label_col] == 1] if positive_only else df
        counts = pool.groupby('sku_first').size()
        # An empty pool would make .max() NaN, which int() cannot convert.
        n_slots = int(counts.max()) if len(counts) else 0
    else:
        n_slots = max(int(top_k), 0)

    # Fix the column order up front so empty inputs still get a full header.
    kinds = ['SKU'] + (['URL'] if has_url else []) + (['Proba'] if has_proba else [])
    slot_numbers = range(1, n_slots + 1)
    if layout == 'blocked':
        slot_cols = [f'Top-{n}_{kind}' for kind in kinds for n in slot_numbers]
    else:
        slot_cols = [f'Top-{n}_{kind}' for n in slot_numbers for kind in kinds]
    columns = ['Query_SKU'] + slot_cols

    rows = []
    for query_sku, group in df.groupby('sku_first'):
        candidates = group[group[label_col] == 1] if positive_only else group

        # Rank by probability if available, then keep only what fits.
        if has_proba:
            candidates = candidates.sort_values('proba', ascending=False)
        candidates = candidates.iloc[:n_slots]

        # Pull each column once instead of doing a row lookup per cell.
        skus = candidates['sku_second'].tolist()
        urls = candidates['url_second'].tolist() if has_url else []
        probas = candidates['proba'].tolist() if has_proba else []

        row = {'Query_SKU': int(query_sku)}
        for i in range(n_slots):
            filled = i < len(skus)
            row[f'Top-{i+1}_SKU'] = int(skus[i]) if filled else -1
            if has_url:
                row[f'Top-{i+1}_URL'] = urls[i] if filled else ''
            if has_proba:
                row[f'Top-{i+1}_Proba'] = float(probas[i]) if filled else 0.0

        rows.append(row)

    return pd.DataFrame(rows, columns=columns)


In [61]:
# --- Build the wide table of matches per query SKU.
# top_k=None lets construct_wide_table derive the slot count from the data;
# pass an integer (e.g. 5) to fix it instead.

matches_wide_df = construct_wide_table(
    filtered_pairwise_df,
    label_col='prediction',  # alternative column: 'label'
    top_k=None,
    positive_only=True,
    include_urls=False,
    layout='blocked',
)

# Lift pandas' display limits so the table below renders in full.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
matches_wide_df

Unnamed: 0,Query_SKU,Top-1_SKU,Top-2_SKU,Top-3_SKU,Top-4_SKU,Top-5_SKU
0,289295263,1125087194,1098195593,-1,-1,-1
1,339485530,491270272,-1,-1,-1,-1
2,437089512,-1,-1,-1,-1,-1
3,467396304,-1,-1,-1,-1,-1
4,485355883,861605997,-1,-1,-1,-1
5,490461399,856647774,922231521,-1,-1,-1
6,491268805,-1,-1,-1,-1,-1
7,491271768,-1,-1,-1,-1,-1
8,491273438,-1,-1,-1,-1,-1
9,508611672,-1,-1,-1,-1,-1


In [67]:
# Restrict the wide table to queries that belong to a single seller

QUERY_SELLER = 'ИНТЕРТРЕЙД'

# Distinct query SKUs in filtered_pairwise_df whose seller_first is QUERY_SELLER
query_skus = filtered_pairwise_df.loc[
    filtered_pairwise_df['seller_first'].eq(QUERY_SELLER),
    'sku_first',
].unique()

# Keep only the rows of matches_wide_df whose Query_SKU is one of those SKUs
query_matches_wide_df = matches_wide_df.loc[
    matches_wide_df['Query_SKU'].isin(query_skus)
]

query_matches_wide_df

Unnamed: 0,Query_SKU,Top-1_SKU,Top-2_SKU,Top-3_SKU,Top-4_SKU,Top-5_SKU
6,491268805,-1,-1,-1,-1,-1
7,491271768,-1,-1,-1,-1,-1
8,491273438,-1,-1,-1,-1,-1
9,508611672,-1,-1,-1,-1,-1
20,824158517,-1,-1,-1,-1,-1
26,922229770,861593242,-1,-1,-1,-1
27,922231521,490461399,856647774,-1,-1,-1


# Inspect matches

In [56]:
QUERY_SKU_TO_INSPECT = 289295263

# Order the pairs by query SKU and, within each query, by Siamese L2 distance
# with the largest distance first. A two-key sort gives the same per-query
# ordering as groupby('sku_first').apply(sort_values) without relying on the
# `include_groups` argument, which only pandas 2.2.x accepts with True and
# which triggers a deprecation warning there.
# NOTE(review): unlike groupby, this keeps rows whose sku_first is NaN (they
# sort last); the selection below is unaffected.
sorted_df = filtered_pairwise_df.sort_values(
    ['sku_first', 'siam_l2_dist'],
    ascending=[True, False],
)

# Show the candidates of the inspected query with the columns needed to judge
# them: distance and both prices.
sorted_df.loc[
    sorted_df['sku_first'] == QUERY_SKU_TO_INSPECT,
    [
        'sku_first',
        'sku_second',
        'siam_l2_dist',
        'final_price_first',
        'final_price_second',
    ],
]

  .apply(lambda group: group.sort_values('siam_l2_dist', ascending=False), include_groups=True)


Unnamed: 0,sku_first,sku_second,siam_l2_dist,final_price_first,final_price_second
2085,289295263,974286048,1.59447,1574,1166
2089,289295263,1098195593,0.032001,1574,1824
2086,289295263,1125087194,0.023881,1574,1483
