In [104]:
import pandas as pd
import joblib
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score
from pathlib import Path

# Set the checkpoint directory.
# MODEL_CKPT_DIR = 'model_params_big_test'
MODEL_CKPT_DIR = 'res_balanced_accuracy'
# MODEL_CKPT_DIR = 'res_f1'

# Paths to the saved model and scaler.
xgb_model_path = Path(MODEL_CKPT_DIR) / 'xgboost_model.json'
scaler_path = Path(MODEL_CKPT_DIR) / 'std_scaler.bin'

# Load the scaler and the pre-trained XGBoost model.
scaler = joblib.load(scaler_path)
model = xgb.XGBClassifier()
model.load_model(xgb_model_path)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [129]:
DATA_FILE = Path(MODEL_CKPT_DIR) / 'data.csv'

# > AVERAGE FILES HERE
# DATA_FILE = 'data/tables_OZ_geo_5500/tabular_OZ_geo_5500.csv'
# DATA_FILE = 'data/tables_OZ_geo_5500/tabular_OZ_geo_5500_all_query_pairs.csv'
# DATA_FILE = 'data/tables_OZ_geo_5500/tabular_OZ_geo_5500_top-5_query_pairs.csv'

# > GOOD FILES HERE
# DATA_FILE = 'data/tables_OZ_geo_5500/tabular_OZ_geo_5500_top-5_query-23_nonquery-5539_pairs.csv'
# DATA_FILE = 'data/tables_OZ_geo_5500/tabular_OZ_geo_5500_top-5_query-23_nonquery-5539_embedded.csv'
DATA_FILE = 'data/tables_OZ_geo_5500/tabular_OZ_geo_5500_top-20_query-23_nonquery-5539_embedded.csv'


df_all = pd.read_csv(DATA_FILE)
# df_all.columns.tolist()

In [130]:
N_SKU = 23

subset_sku = df_all.sku_first.sample(N_SKU, random_state=42).tolist()
df = df_all[df_all['sku_first'].isin(subset_sku)]

print(f'Total columns: {len(df.columns)}')
df.columns.tolist()

Total columns: 35


['balance_first',
 'sales_first',
 'rating_first',
 'final_price_first',
 'comments_first',
 'description_first',
 'name_first',
 'options_first',
 'sku_first',
 'has_video_first',
 'photo_count_first',
 'balance_second',
 'sales_second',
 'rating_second',
 'final_price_second',
 'comments_second',
 'description_second',
 'name_second',
 'options_second',
 'sku_second',
 'has_video_second',
 'photo_count_second',
 'iseq_vendor',
 'iseq_color',
 'iseq_brand',
 'iseq_supp',
 'are_related',
 'image_id_first',
 'image_id_second',
 'url_first',
 'url_second',
 'desc_sim',
 'opt_sim',
 'name_sim',
 'img_sim']

In [131]:
# Separate the true labels from the dataset.
if 'label' in df.columns:
    y_true = df['label']

# Define the columns that were not used as features during training.
columns_to_drop = [
    'sku_first', 'sku_second',
    'name_first', 'description_first',
    'name_second', 'description_second',
    'options_first', 'options_second',
    'image_url_first', 'image_url_second',
    'image_id_first', 'image_id_second',
    'label'
]

# Create a DataFrame for scaling by dropping the extra columns.
# The original df remains unchanged.
X = df.drop(columns=columns_to_drop, errors='ignore')

# Ensure the columns match exactly what the scaler was trained on.
# The scaler's attribute 'feature_names_in_' holds the expected column names.
if hasattr(scaler, 'feature_names_in_'):
    expected_features = list(scaler.feature_names_in_)
    X_for_scaler = df[expected_features]
else:
    X_for_scaler = df

# print("Columns used for scaling:", X_for_scaler.columns.tolist())

# Scale the features using the loaded scaler.
X_scaled = scaler.transform(X_for_scaler)

In [132]:
import pandas as pd

# Compute predictions and probabilities (using your pre-trained model)
predictions = model.predict(X_scaled)
predicted_probas = model.predict_proba(X_scaled)

# For binary classification, use the positive class probability;
# for multiclass, use the highest probability of any class.
if predicted_probas.shape[1] == 2:
    sort_probas = predicted_probas[:, 1]
else:
    sort_probas = predicted_probas.max(axis=1)

# Create a DataFrame with the predictions and probabilities.
# We align by the original dataframe's index.
results_df = pd.DataFrame({
    'prediction': predictions,
    'proba': sort_probas
}, index=df.index)

# Concatenate the predictions to the original DataFrame.
df_with_preds = pd.concat([df, results_df], axis=1)

# Instead of using groupby.apply (with include_groups), sort directly:
df_grouped_sorted = df_with_preds.sort_values(
    by=['sku_first', 'proba'], ascending=[True, False]
)

print("Data grouped by sku_first and sorted within each group by probability:")
df_grouped_sorted.columns.tolist()

matches_df = df_grouped_sorted[df_grouped_sorted.prediction == 1]
matches_df_short = matches_df[[
    'sku_first', 'sku_second', 'prediction', 'proba',
    'url_first', 'url_second'
]]

matches_df_short

Data grouped by sku_first and sorted within each group by probability:


Unnamed: 0,sku_first,sku_second,prediction,proba,url_first,url_second
120,491268805,1573965314,1,0.510342,https://www.ozon.ru/context/detail/id/491268805/,https://www.ozon.ru/context/detail/id/1573965314/
39,491270369,856985388,1,0.880422,https://www.ozon.ru/context/detail/id/491270369/,https://www.ozon.ru/context/detail/id/856985388/
28,491270369,178726257,1,0.861245,https://www.ozon.ru/context/detail/id/491270369/,https://www.ozon.ru/context/detail/id/178726257/
429,491271320,1939659158,1,0.997213,https://www.ozon.ru/context/detail/id/491271320/,https://www.ozon.ru/context/detail/id/1939659158/
426,491271320,1499606550,1,0.986244,https://www.ozon.ru/context/detail/id/491271320/,https://www.ozon.ru/context/detail/id/1499606550/
427,491271320,1442586966,1,0.985693,https://www.ozon.ru/context/detail/id/491271320/,https://www.ozon.ru/context/detail/id/1442586966/
428,491271320,1454661086,1,0.985693,https://www.ozon.ru/context/detail/id/491271320/,https://www.ozon.ru/context/detail/id/1454661086/
437,491271320,1442552488,1,0.979956,https://www.ozon.ru/context/detail/id/491271320/,https://www.ozon.ru/context/detail/id/1442552488/
434,491271320,1902142540,1,0.977008,https://www.ozon.ru/context/detail/id/491271320/,https://www.ozon.ru/context/detail/id/1902142540/
435,491271320,1914627744,1,0.976657,https://www.ozon.ru/context/detail/id/491271320/,https://www.ozon.ru/context/detail/id/1914627744/


In [133]:
TOP_K = 5

for _, query_df in matches_df_short.groupby('sku_first'):
    print(f"Query SKU: {query_df['sku_first'].iloc[0].item()}")
    print(f'Query URL: {query_df['url_first'].iloc[0]}')
    print('-' * 60)
    for idx, row in query_df.reset_index(drop=True).iterrows():
        if idx == TOP_K-1:
            break
        print(f"Top-{idx+1} SKU: {row['sku_second']}")
        print(f'Query URL: {row['url_second']}\n')
        
    print('#' * 30)
    print()

# TODO: исключить карты, которые не в наличии?

Query SKU: 491268805
Query URL: https://www.ozon.ru/context/detail/id/491268805/
------------------------------------------------------------
Top-1 SKU: 1573965314
Query URL: https://www.ozon.ru/context/detail/id/1573965314/

##############################

Query SKU: 491270369
Query URL: https://www.ozon.ru/context/detail/id/491270369/
------------------------------------------------------------
Top-1 SKU: 856985388
Query URL: https://www.ozon.ru/context/detail/id/856985388/

Top-2 SKU: 178726257
Query URL: https://www.ozon.ru/context/detail/id/178726257/

##############################

Query SKU: 491271320
Query URL: https://www.ozon.ru/context/detail/id/491271320/
------------------------------------------------------------
Top-1 SKU: 1939659158
Query URL: https://www.ozon.ru/context/detail/id/1939659158/

Top-2 SKU: 1499606550
Query URL: https://www.ozon.ru/context/detail/id/1499606550/

Top-3 SKU: 1442586966
Query URL: https://www.ozon.ru/context/detail/id/1442586966/

Top-4 SKU:

In [None]:
# # Run inference using the pre-trained model.
# predictions = model.predict(X_scaled)

# # Calculate metrics.
# accuracy = accuracy_score(y_true, predictions)
# f1 = f1_score(y_true, predictions, average='weighted')

# # print("Predicted classes:", predictions)
# print("Accuracy:", accuracy)
# print("F1-score:", f1)

NameError: name 'y_true' is not defined