In [122]:
import pandas as pd
import joblib
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score
from pathlib import Path

# Set the checkpoint directory.
# Exactly one MODEL_CKPT_DIR assignment should be active; the commented-out
# lines are alternative checkpoint directories.
# MODEL_CKPT_DIR = 'model_params_big_test'
MODEL_CKPT_DIR = 'res_balanced_accuracy'
# MODEL_CKPT_DIR = 'res_f1'

# Paths to the saved model and scaler (both live inside the checkpoint directory).
xgb_model_path = Path(MODEL_CKPT_DIR) / 'xgboost_model.json'
scaler_path = Path(MODEL_CKPT_DIR) / 'std_scaler.bin'

# Load the scaler and the pre-trained XGBoost model.
# NOTE(review): joblib.load deserializes arbitrary Python objects — only point
# scaler_path at files from a trusted source (see the scikit-learn model
# persistence link in the next cell).
scaler = joblib.load(scaler_path)
# The classifier is created empty and then populated from the saved JSON file.
model = xgb.XGBClassifier()
model.load_model(xgb_model_path)

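Security note: the scaler above is restored with `joblib.load`, so only load checkpoint files from a trusted source. See https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations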


In [123]:
# CSV stored inside the checkpoint directory; loaded below as df_model.
MODEL_DATA_FILE = Path(MODEL_CKPT_DIR) / 'data.csv'

# Exactly one DATA_FILE assignment should be active; the commented-out lines
# below are alternative input tables, grouped by the author's own headings.

# > AVERAGE FILES HERE
# DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500.csv'
# DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500_all_query_pairs.csv'
# DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500_top-5_query_pairs.csv'

# > GOOD FILES HERE
# DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500_top-5_query-23_nonquery-5539_pairs.csv'
# DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500_top-5_query-23_nonquery-5539_embedded.csv'
# DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500_top-20_query-23_nonquery-5539_embedded.csv'
DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500_top-50_query-23_nonquery-5539_embedded.csv'
# DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500_top-50_query-23_nonquery-5539_pairs.csv'

# > FILES W/INNER RUCLIP AS RANKER
# DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500_top-30_query-2_nonquery-6_embedded.csv'
# DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500_top-20query-23_nonquery-5539_embedded_sbert=all-distilroberta-v1_clip=siamese_contrastive.pt.csv'
# DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500_top-20_query-2_nonquery-6_embedded_sbert=all-distilroberta-v1_clip=siamese_contrastive.pt_final-embs.csv'
# DATA_FILE = 'data/tables_OZ_geo_5500/processed/tabular_OZ_geo_5500_top-50_query-23_nonquery-5539_embedded_sbert=all-distilroberta-v1_clip=siamese_contrastive.pt_final-embs.csv'

# df_all: the table selected via DATA_FILE; df_model: the table stored with the checkpoint.
df_all = pd.read_csv(DATA_FILE)
df_model = pd.read_csv(MODEL_DATA_FILE)
# df_all.columns.tolist()

In [206]:
# Summary statistics (count, mean, std, quantiles) of four numeric `*_second` columns.
df_all[['balance_second', 'sales_second', 'comments_second', 'rating_second']].describe()

Unnamed: 0,balance_second,sales_second,comments_second,rating_second
count,1150.0,1150.0,1150.0,1150.0
mean,234.71913,2.884348,82.755652,2.54487
std,401.364071,15.944028,356.843007,2.420845
min,0.0,0.0,0.0,0.0
25%,6.0,0.0,0.0,0.0
50%,44.0,0.0,2.0,4.2
75%,100.75,0.0,21.0,4.975
max,2056.0,236.0,3063.0,5.0


In [124]:
# Show full cell contents instead of truncating long strings.
# This is a global pandas option and stays active until it is reset
# (see the commented-out reset at the end of this cell).
pd.set_option('display.max_colwidth', None)

# Case-insensitive regular expression matched against the first product's name.
keywords = r'карта'
# na=False treats rows with a missing name as "no match". Without it the mask
# contains NaN for such rows and the boolean indexing below raises.
kw_mask = df_model.name_first.str.contains(keywords, case=False, regex=True, na=False)

# Domain subset of the checkpoint's table: rows whose first product name matches
# the keyword. The commented-out conditions optionally restrict it by label.
df_model_domain = df_model[
    kw_mask

    # & (df_model.label == 1)
    # & (df_model.label == 0)
]

# for idx, row in df_model_domain.iterrows():
#     print(row.name_first)
#     print(row.sku_first)

#     print(row.name_second)
#     print(row.sku_second)
#     print('-' * 50)
#     print()

# pd.reset_option('display.max_colwidth')

# Run inference

In [183]:
# Choose the dataset to use for inference.
# Exactly one df_chosen_all assignment should be active: the table from DATA_FILE,
# the table stored with the checkpoint, or its keyword-filtered subset.

df_chosen_all = df_all
# df_chosen_all = df_model
# df_chosen_all = df_model_domain

# When True (and the data has a 'label' column), the next cell keeps only
# rows with label == 1.
FILTER_POSITIVE_LABEL = True
# FILTER_POSITIVE_LABEL = False

In [184]:
N_SKU = 23

# Discard degenerate pairs in which a product is paired with itself.
self_pair_index = df_chosen_all.index[
    df_chosen_all['sku_first'] == df_chosen_all['sku_second']
]
df_chosen_all = df_chosen_all.drop(index=self_pair_index)

# Reproducibly sample at most N_SKU distinct query SKUs (fixed random_state).
all_sku = df_chosen_all.sku_first.unique()
subset_sku = (
    pd.Series(all_sku)
    .sample(min(N_SKU, len(all_sku)), random_state=42)
    .tolist()
)
df = df_chosen_all[df_chosen_all['sku_first'].isin(subset_sku)]

# Optionally keep only the pairs labelled as true matches.
if FILTER_POSITIVE_LABEL and 'label' in df.columns:
    df = df[df['label'] == 1]

n_query_selected = df.sku_first.nunique()
n_query_total = df_chosen_all.sku_first.nunique()
print(f'Total Query SKU: {n_query_selected}/{n_query_total}')
print(f'Total columns: {len(df.columns)}')

Total Query SKU: 23/23
Total columns: 35


In [185]:
# SCALE FEATURES

# Separate the true labels from the dataset.
# y_true is always (re)bound so that switching to an unlabeled dataset cannot
# leave a stale value from a previous run (or an undefined name) behind.
y_true = df['label'] if 'label' in df.columns else None

# Define the columns that were not used as features during training.
columns_to_drop = [
    'sku_first', 'sku_second',
    'name_first', 'description_first',
    'name_second', 'description_second',
    'options_first', 'options_second',
    'image_url_first', 'image_url_second',
    'image_id_first', 'image_id_second',
    'label'
]

# Create a DataFrame for scaling by dropping the extra columns.
# The original df remains unchanged; errors='ignore' tolerates datasets that
# lack some of the listed columns.
X = df.drop(columns=columns_to_drop, errors='ignore')

# Ensure the columns match exactly what the scaler was trained on.
# The scaler's attribute 'feature_names_in_' holds the expected column names;
# selecting by that list also fixes the column order and raises KeyError if a
# feature is missing from the data.
if hasattr(scaler, 'feature_names_in_'):
    expected_features = list(scaler.feature_names_in_)
    X_for_scaler = X[expected_features]
else:
    # No recorded feature names: fall back to every remaining column.
    X_for_scaler = X

# Store feature names for later use with SHAP
model.feature_names = X_for_scaler.columns.tolist()

# print("Columns used for scaling:", X_for_scaler.columns.tolist())

# Scale the features using the loaded scaler.
X_scaled = scaler.transform(X_for_scaler)

In [186]:
# Compute predictions

import pandas as pd

# Hard predictions and class probabilities from the pre-trained model.
predictions = model.predict(X_scaled)
predicted_probas = model.predict_proba(X_scaled)

# Ranking score: positive-class probability for a binary model,
# otherwise the highest probability across classes.
is_binary = predicted_probas.shape[1] == 2
sort_probas = predicted_probas[:, 1] if is_binary else predicted_probas.max(axis=1)

# Attach the predictions to the source rows, aligned on df's index.
results_df = pd.DataFrame(
    {'prediction': predictions, 'proba': sort_probas},
    index=df.index,
)
df_with_preds = pd.concat([df, results_df], axis=1)

# Order rows by query SKU, and within a query by descending probability.
df_grouped_sorted = df_with_preds.sort_values(
    by=['sku_first', 'proba'],
    ascending=[True, False],
)

print("Data grouped by sku_first and sorted within each group by probability:")
# Not the last expression of the cell, so this list is not displayed.
df_grouped_sorted.columns.tolist()

# Price differences between the two products of each pair:
# signed difference, and its magnitude relative to the first product's price.
matches_df = df_grouped_sorted.copy()
price_gap = matches_df['final_price_first'] - matches_df['final_price_second']
matches_df['price_delta_abs'] = price_gap
matches_df['price_delta_rel'] = (price_gap / matches_df['final_price_first']).abs()

# Keep only the pairs the model predicts as matches.
matches_df = matches_df[matches_df.prediction == 1]

# Compact view; URL columns are included only when the data provides them.
short_columns = ['sku_first', 'sku_second', 'prediction', 'proba']
if 'url_first' in matches_df.columns:
    short_columns = short_columns + ['url_first', 'url_second']
matches_df_short = matches_df[short_columns]

matches_df_short.head()

Data grouped by sku_first and sorted within each group by probability:


Unnamed: 0,sku_first,sku_second,prediction,proba,url_first,url_second
300,491268805,1573965314,1,0.808651,https://www.ozon.ru/context/detail/id/491268805/,https://www.ozon.ru/context/detail/id/1573965314/
309,491268805,804154003,1,0.699807,https://www.ozon.ru/context/detail/id/491268805/,https://www.ozon.ru/context/detail/id/804154003/
304,491268805,847684763,1,0.669368,https://www.ozon.ru/context/detail/id/491268805/,https://www.ozon.ru/context/detail/id/847684763/
302,491268805,1176719536,1,0.652677,https://www.ozon.ru/context/detail/id/491268805/,https://www.ozon.ru/context/detail/id/1176719536/
318,491268805,808465077,1,0.505918,https://www.ozon.ru/context/detail/id/491268805/,https://www.ozon.ru/context/detail/id/808465077/


In [187]:
import matplotlib.pyplot as plt

# Calculate accuracy metrics (only possible when ground-truth labels exist).
if 'label' in df.columns:
    from sklearn.metrics import classification_report, confusion_matrix
    import seaborn as sns

    # Evaluate on every scored pair. matches_df keeps only rows with
    # prediction == 1, so metrics computed on it are degenerate: the predicted
    # negative class is never present and "accuracy" collapses to precision.
    eval_df = df_with_preds

    if eval_df.empty:
        # An empty frame yields a NaN accuracy and makes the report raise.
        print("\nNo scored pairs to evaluate.")
    else:
        # Overall accuracy
        accuracy = (eval_df['prediction'] == eval_df['label']).mean()
        print(f"\nOverall accuracy: {accuracy:.4f}")

        # Precision, recall, f1-score
        print("\nDetailed classification report:")
        print(classification_report(eval_df['label'], eval_df['prediction']))

        # Plot confusion matrix (rows: true label, columns: predicted label)
        cm = confusion_matrix(eval_df['label'], eval_df['prediction'])
        plt.figure(figsize=(8,6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.show()


In [201]:
#@title print_samples
import shap
import matplotlib.pyplot as plt
import numpy as np

def print_samples(
    top_k=5,
    display_stats=False,
    display_name=False,
    display_shap=False,  # Flag to control SHAP explanation display
    agg_waterfall=True,  # Whether to show initial aggregated waterfall
    waterfalls=False,    # Whether to show individual waterfalls
    shap_values=False,   # Whether to print detailed top SHAP values report
    stats_fmt="table"    # Format for displaying stats: "text" or "table"
):
    """Print the predicted matches of every query SKU, best candidates first.

    Reads the notebook-level ``matches_df`` (iterated per ``sku_first`` group in
    its existing row order) and, when SHAP output is requested, the
    notebook-level ``model`` and ``scaler``.

    Args:
        top_k: Maximum number of candidates printed per query. ``None`` prints
            all of them; values below zero are treated as zero.
        display_stats: Also show price/balance/sales/comments/rating for the
            query and for each candidate.
        display_name: Also print product names.
        display_shap: Master switch for all SHAP output.
        agg_waterfall: With ``display_shap``, show one SHAP summary plot over a
            sample of up to 100 rows before the listing.
        waterfalls: With ``display_shap``, show a waterfall plot per candidate.
        shap_values: With ``display_shap``, print the five largest SHAP
            contributions per candidate together with the base value.
        stats_fmt: ``"text"`` prints stats as plain text; any other value
            renders them with ``display`` as a one-row table.

    Returns:
        None. Everything is printed or displayed.

    Note:
        Candidate labels are zero-based ("Top-0" is the best candidate).
    """
    explainer = None
    if display_shap:
        # Check booster type to determine which explainer to use
        booster_type = model.get_xgb_params().get("booster")

        if booster_type == "gblinear":
            # LinearExplainer needs a masker built from background data:
            # a scaled sample of up to 100 rows of the model's features.
            sample_size = min(100, len(matches_df))
            background_data = matches_df[model.feature_names].sample(n=sample_size, random_state=42)
            background_data_scaled = pd.DataFrame(
                scaler.transform(background_data),
                columns=model.feature_names  # Add feature names to the scaled data
            )
            masker = shap.maskers.Independent(background_data_scaled)
            explainer = shap.LinearExplainer(model, masker)
        else:
            # Use TreeExplainer for tree-based models
            explainer = shap.TreeExplainer(model)

        # Create an aggregated summary plot over a sample of the dataset
        if agg_waterfall:
            sample_size = min(100, len(matches_df))
            sample_df = matches_df.sample(n=sample_size, random_state=42)

            # The model was fed scaled features, so SHAP gets scaled features too.
            sample_features = sample_df[model.feature_names]
            scaled_sample_features = pd.DataFrame(
                scaler.transform(sample_features),
                columns=model.feature_names  # Add feature names to the scaled data
            )

            shap_values_agg = explainer.shap_values(scaled_sample_features)

            # A list means one array per class; take the positive class.
            if isinstance(shap_values_agg, list):
                shap_values_agg = shap_values_agg[1]

            plt.figure(figsize=(10, 8))
            shap.summary_plot(
                shap_values_agg,
                scaled_sample_features,
                feature_names=model.feature_names,
                show=False  # keep the figure open so the title can be added
            )
            plt.title("Aggregated Feature Importance")
            plt.tight_layout()
            plt.show()

    # Per-candidate SHAP values are only needed if something will show them.
    explain_rows = display_shap and (shap_values or waterfalls)

    for query_idx, (query_sku, query_df) in enumerate(matches_df.groupby('sku_first')):
        print(f'Query #{query_idx} '+'#' * 40)
        print(f"Query SKU: {query_sku}")
        if 'url_first' in query_df.columns:
            print(f'Query URL: {query_df["url_first"].iloc[0]}')

        if display_name:
            print(f"Query Name: {query_df['name_first'].iloc[0]}")
            # print(f"Query Description: {query_df['description_first'].iloc[0]}")

        if display_stats:
            # Query-side stats are identical on every row of the group; use the first.
            stats_data = query_df[[
                'final_price_first',
                'balance_first',
                'sales_first',
                'comments_first',
                'rating_first',
            ]].iloc[0]
            if stats_fmt == "text":
                print(stats_data.to_string())
            else:  # Default to "table"
                display(stats_data.to_frame().T)

        print('-' * 60)

        # Keep exactly the first top_k candidates. The previous
        # `top_k_idx == top_k-1: break` check stopped one candidate early.
        candidates = query_df.reset_index(drop=True)
        if top_k is not None:
            candidates = candidates.head(max(top_k, 0))

        for top_k_idx, row in candidates.iterrows():
            print(f"Top-{top_k_idx} SKU: {row['sku_second']}")
            if 'url_second' in row:
                print(f'Top-{top_k_idx} URL: {row["url_second"]}')
            print(f"Match probability: {row['proba']:.4f}")

            if display_name:
                print(f'Top-{top_k_idx} Name: {row["name_second"]}')
                # print(f'Top-{top_k_idx} Description: {row['description_second']}')

            if display_stats:
                stats_data = row[[
                    'final_price_second',
                    'balance_second',
                    'sales_second',
                    'comments_second',
                    'rating_second',
                ]]
                if stats_fmt == "text":
                    print(stats_data.to_string())
                else:  # Default to "table"
                    display(stats_data.to_frame().T)

            # SHAP explanation for this single prediction
            if explain_rows:
                # One-row frame with the feature values of this pair
                features = row[model.feature_names].to_frame().T

                # Scale features using the same scaler used during training
                scaled_features = pd.DataFrame(
                    scaler.transform(features),
                    columns=model.feature_names  # Add feature names to the scaled data
                )

                shap_values_row = explainer.shap_values(scaled_features)

                # A list means one array per class; take the positive class.
                if isinstance(shap_values_row, list):
                    shap_values_row = shap_values_row[1]

                # Base value of the explainer, reduced to a plain scalar
                expected_value = explainer.expected_value
                if isinstance(expected_value, list):
                    expected_value = expected_value[1]
                expected_value_float = (
                    expected_value.item() if isinstance(expected_value, np.ndarray) else expected_value
                )

                if shap_values:
                    # Five features with the largest absolute contribution
                    top_features = pd.DataFrame({
                        'Feature': model.feature_names,
                        'SHAP Value': shap_values_row[0],
                        'Feature Value': features.values[0],
                        'Scaled Value': scaled_features.values[0]
                    }).sort_values(by='SHAP Value', key=abs, ascending=False).head(5)

                    print("\nTop contributing features:")
                    for i, (_, feat) in enumerate(top_features.iterrows()):
                        impact = "+" if feat['SHAP Value'] > 0 else ""
                        print(f"  {i+1}. {feat['Feature']} = {feat['Feature Value']:.4f} (scaled: {feat['Scaled Value']:.4f}, impact: {impact}{feat['SHAP Value']:.4f})")

                    print(f"\nBase value: {expected_value_float:.4f}")
                    print(f"Sum of SHAP values: {shap_values_row[0].sum():.4f}")
                    print(f"Final prediction: {expected_value_float + shap_values_row[0].sum():.4f}")
                    print(f"Probability (sigmoid): {1/(1+np.exp(-(expected_value_float + shap_values_row[0].sum()))):.4f}")
                    # Print the model's own probability next to it for comparison
                    raw_proba = model.predict_proba(scaled_features)
                    print(f"\nModel predict_proba output: {raw_proba[0]}")
                    print(f"Positive class probability: {raw_proba[0][1]:.4f}")

                if waterfalls:
                    plt.figure(figsize=(10, 6))
                    shap.plots.waterfall(shap.Explanation(
                        values=shap_values_row[0],
                        base_values=expected_value_float,
                        data=scaled_features.values[0],
                        feature_names=model.feature_names
                    ), max_display=10, show=True)

            print()

        print()

    # TODO: exclude maps that are out of stock?

In [208]:
# Print the matches as plain text with product names and per-product stats.
# The SHAP-related options stay commented out, so they keep their defaults
# (display_shap defaults to False, i.e. no SHAP output).
print_samples(
    display_stats=True,
    stats_fmt='text', # or 'table'
    display_name=True,
    # display_shap=True,  # Flag to control SHAP explanation display
    # agg_waterfall=True,  # Whether to show initial aggregated waterfall
    # waterfalls=True,    # Whether to show individual waterfalls
    # shap_values=False    # Whether to print detailed top SHAP values report
)

Query #0 ########################################
Query SKU: 491268805
Query URL: https://www.ozon.ru/context/detail/id/491268805/
Query Name: Карта Москвы настенная,100х70 см "Москва современная с линиями метро"
final_price_first    416.0
balance_first        233.0
sales_first           67.0
comments_first       606.0
rating_first           4.8
------------------------------------------------------------
Top-0 SKU: 1573965314
Top-0 URL: https://www.ozon.ru/context/detail/id/1573965314/
Match probability: 0.8087
Top-0 Name: Настенная карта Москвы "Москва современная с линиями метро"/размер 102х143
final_price_second    1203
balance_second          39
sales_second             0
comments_second          4
rating_second          5.0

Top-1 SKU: 804154003
Top-1 URL: https://www.ozon.ru/context/detail/id/804154003/
Match probability: 0.6998
Top-1 Name: Атлас ПринтНастенная карта "Москва современная" 1:50/размер 70х100 см
final_price_second    515
balance_second         43
sales_second      

# Output files

In [193]:
def construct_wide_table(df, top_k):
    """
    Build a wide table with one row per unique query SKU (``sku_first``).

    Column order of the result:

      Query_SKU, Top-1_SKU, Top-2_SKU, ... Top-k_SKU,
      Query_URL, Top-1_URL, Top-2_URL, ... Top-k_URL,
      Top-1_proba, Top-2_proba, ... Top-k_proba

    Only candidates with a positive prediction (prediction == 1) are included,
    ranked by ``proba`` in descending order. If a query has fewer than top_k
    positive candidates, the remaining candidate columns are filled with None.

    The URL columns are optional in ``df``: when ``url_first`` / ``url_second``
    are missing, the corresponding output columns are still present but hold
    None instead of raising KeyError. The query URL is read from the query's
    own rows, so it is kept even when the query has no positive candidate.

    Args:
        df: Frame with ``sku_first``, ``sku_second``, ``prediction`` and
            ``proba`` columns, and optionally ``url_first`` / ``url_second``.
        top_k: Number of candidate slots per query.

    Returns:
        pandas.DataFrame with one row per query SKU (empty if ``df`` has no rows).
    """
    has_query_url = 'url_first' in df.columns
    has_candidate_url = 'url_second' in df.columns

    wide_rows = []
    for query_sku, group in df.groupby('sku_first'):
        # Positive candidates only, best first, re-indexed 0..n-1 so that the
        # positional rank can be used as a label with .loc below.
        ranked = (
            group[group['prediction'] == 1]
            .sort_values(by='proba', ascending=False)
            .reset_index(drop=True)
        )
        n_found = len(ranked)

        # Insertion order of the dict defines the column order of the result.
        row = {'Query_SKU': query_sku}

        for i in range(top_k):
            row[f'Top-{i+1}_SKU'] = ranked.loc[i, 'sku_second'] if i < n_found else None

        # The query URL is assumed to be the same on every row of the group.
        row['Query_URL'] = group['url_first'].iloc[0] if has_query_url else None

        for i in range(top_k):
            row[f'Top-{i+1}_URL'] = (
                ranked.loc[i, 'url_second'] if has_candidate_url and i < n_found else None
            )

        for i in range(top_k):
            row[f'Top-{i+1}_proba'] = ranked.loc[i, 'proba'] if i < n_found else None

        wide_rows.append(row)
    return pd.DataFrame(wide_rows)


In [194]:
# # --- Set your desired top_k (e.g. 5) and construct the wide table.
# TOP_K = 5

# matches_wide_df = construct_wide_table(df_grouped_sorted, TOP_K)
# matches_wide_df = matches_wide_df.dropna()
# matches_wide_df

In [195]:
# subset_cols = [
#     'Query_SKU',
#     # 'Top-1_SKU', 'Top-2_SKU', 'Top-3_SKU', 'Top-4_SKU', 'Top-5_SKU',
#     # 'Query_URL',
#     # 'Top-1_URL', 'Top-2_URL', 'Top-3_URL', 'Top-4_URL', 'Top-5_URL',
#     # 'Top-1_proba', 'Top-2_proba', 'Top-3_proba', 'Top-4_proba', 'Top-5_proba'
# ]

# output_file_path = (
#     Path(DATA_FILE).parent /
#     Path('test_results') / 
#     'result-errors-template.csv'
#     # (f'result-errors-template-{MODEL_CKPT_DIR}_' + Path(DATA_FILE).name)
# )
# matches_wide_df.to_clipboard(index=False, header=True, excel=True, columns=subset_cols)
# matches_wide_df.to_csv(output_file_path, index=False, header=True, columns=subset_cols)

In [196]:
# DATA_PATH = 'data'

# output_file_path = (
#     Path(DATA_FILE).parent /
#     Path('test_results') / 
#     (f'result-{MODEL_CKPT_DIR}_' + Path(DATA_FILE).name)
# )
# output_file_path.parent.mkdir(parents=True, exist_ok=True)
# # output_file_path

# matches_wide_df.to_csv(output_file_path, index=None)