In [105]:
OPENAI_API_KEY = "sk-or-vv-ac07a84e938358eb29c38b15da3a139c55c49a467c221d885d8c20aad7c2e62f"

In [106]:
import pandas as pd
import joblib
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score
from pathlib import Path

# Set the checkpoint directory.
MODEL_CKPT_DIR = 'model_params_big_test'
# MODEL_CKPT_DIR = 'res_balanced_accuracy'
# MODEL_CKPT_DIR = 'res_f1'

MODEL_DATA_FILE = Path(MODEL_CKPT_DIR) / 'data.csv'

df_model = pd.read_csv(MODEL_DATA_FILE)
print(f'Total rows: {len(df_model)}')
print(f'Unique sku_first: {df_model.sku_first.nunique()}')
print(f'Unique sku_second: {df_model.sku_second.nunique()}')
df_model[['name_first']].head()

Total rows: 4974
Unique sku_first: 211
Unique sku_second: 2369


Unnamed: 0,name_first
0,Базовый свитшот
1,Базовый свитшот
2,Базовый свитшот
3,Базовый свитшот
4,Базовый свитшот


# Minimal baseline for subset

In [2]:
# Define the regex pattern
regex_pattern = r'карт|схем'

# Create a new column 'category_id_first' based on the regex match
df_model['category_id'] = df_model['name_first'].str.contains(regex_pattern, case=False, na=False).astype(int)

# Assign category_id values: 0 for matching, -1 for non-matching
df_model['category_id'] = df_model['category_id'].replace({1: 0, 0: -1})

# Display the updated dataframe
df_model[['name_first', 'category_id']].head()

Unnamed: 0,name_first,category_id
0,Базовый свитшот,-1
1,Базовый свитшот,-1
2,Базовый свитшот,-1
3,Базовый свитшот,-1
4,Базовый свитшот,-1


In [3]:
from sklearn.model_selection import train_test_split

# Split dev/test (80/20 split) stratifying by the combination of 'label' and 'category_id'
stratify_col = df_model[['label', 'category_id']].apply(lambda row: f"{row['label']}_{row['category_id']}", axis=1)
dev_df, test_df = train_test_split(df_model, test_size=0.2, random_state=42, stratify=stratify_col)
# Verify the proportions of 'label' and 'category_id' combinations in the dev and test sets.
print("Dev set distribution (proportions):")
dev_prop = dev_df.groupby(['label', 'category_id']).size() / len(dev_df)
print(dev_prop)

print("\nTest set distribution (proportions):")
test_prop = test_df.groupby(['label', 'category_id']).size() / len(test_df)
print(test_prop)

# Alternatively, view as crosstab percentages for each label:
print("\nDev set crosstab (row normalized):")
print(pd.crosstab(dev_df['label'], dev_df['category_id'], normalize='index'))

print("\nTest set crosstab (row normalized):")
print(pd.crosstab(test_df['label'], test_df['category_id'], normalize='index'))

Dev set distribution (proportions):
label  category_id
0      -1             0.601407
        0             0.062327
1      -1             0.326967
        0             0.009299
dtype: float64

Test set distribution (proportions):
label  category_id
0      -1             0.602010
        0             0.062312
1      -1             0.326633
        0             0.009045
dtype: float64

Dev set crosstab (row normalized):
category_id        -1         0
label                          
0            0.906096  0.093904
1            0.972347  0.027653

Test set crosstab (row normalized):
category_id        -1         0
label                          
0            0.906203  0.093797
1            0.973054  0.026946


In [4]:
output_file_path = (
    Path(MODEL_CKPT_DIR) / f'data_clustered_regex_classes={df_model.category_id.nunique()}.csv'
)
print(output_file_path)
df_model.to_csv(output_file_path, index=False)

model_params_big_test/data_clustered_regex_classes=2.csv


# Make clustreing via LLM

In [129]:
# SAMPLE_SKU_COUNT = 100
SAMPLE_SKU_COUNT = None

In [130]:
PRICE = 0.06

all_name_first = df_model.name_first.to_string(index=False)
total_symbols = len(all_name_first)
total_price = (total_symbols / 1000) * PRICE
total_price

15.220379999999999

In [131]:
# Sample random rows

unique_skus = df_model.drop_duplicates(subset=['sku_first'])
if SAMPLE_SKU_COUNT is not None:
    sampled_data = unique_skus.sample(SAMPLE_SKU_COUNT, random_state=42)
else:
    sampled_data = unique_skus.copy()

all_names = sampled_data[['sku_first', 'name_first', 'label']]

# all_names.index.name = 'index'
all_names = all_names.to_csv(index=False)
all_names[:100]

'sku_first,name_first,label\n13221213,Базовый свитшот,0\n157891430,Флешка 4 GB подарочная с гравировкой'

## Clustering

In [132]:
prompt_clustering = """
Perform topic modeling on product names.
Find groups of similar products. Make sure to include following categories:
- "Карта и путеводители (карты России, карты мира, путеводители, ...)"
Product names: \n{all_names}

Summarize resulting clusters with the following columns:
0. category_id (starting from 0)
1. category_name

Use double quotes for category names.
Do not include any other columns.
"""

prompt_mapping = """
For each input sku_first identify category of a given product by its name_first as given by the clustering.

Output following columns:
0. sku_first
1. category_id

Do not include any other columns.

Clustering: \n{response_clustering}

Products sku and names:\n{all_names}
"""

In [207]:
response_clustering = """
category_id,category_name
0,"Одежда женская (платья, юбки, блузки, кофты, брюки, шорты, костюмы, ...)"
1,"Одежда мужская (рубашки, футболки, брюки, шорты, костюмы, ...)"
2,"Одежда гимнастическая (гимнастическая форма, одежда для гимнастики, обувь для гимнастики, борцовка ...)"
3,"Одежда детская (футболки для мальчиков, футболки для девочек, платья для девочек, ...) [коме одежды для гимнастики]"
4,"Обувь (туфли, кроссовки, ботинки, сапоги, сандалии, ...)"
5,"Карта и путеводители (карты настенные, карты складные, путеводители, ...)"
6,"Товары для уборки (перчатки резиновые, швабры, тряпки, губки, салфетки, щетки, насадки, водосгоны, ведра и аксессуары, ...)"
7,"Товары для готовки (соевые соусы, ...)"
8,"Мебель (столы, стулья, диваны, кровати, ...)"
9,"Аксессуары для компьютеров (флешки с гравировкой, флешки гимнастические, USB-накопители, ...)"
10,"Всё остальное (игра настольная для детей, игрушки для детей, чуни, тапочки, маски ...)"
"""

prompt_mapping_hardcoded = f"""
For each input sku_first identify category of a given product by its name_first as given by the clustering.

Output following columns:
0. sku_first
1. category_id

Do not include any other columns.

Clustering: \n{response_clustering}

Products sku and names:\n{{all_names}}
"""

In [208]:
# ...existing code...
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain.chains import LLMChain
from pathlib import Path
import pandas as pd

model_name="openai/gpt-4.1-mini"
# model_name="openai/gpt-4.1-nano"

# 1) instantiate the chat client (make sure to pass openai_api_base, not base_url)
chat = ChatOpenAI(
    model_name=model_name,
    openai_api_key=OPENAI_API_KEY,
    openai_api_base="https://api.vsegpt.ru/v1",   # <— correct param name
    temperature=1
)

from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory

# 2) build a chat-style prompt
system = SystemMessagePromptTemplate.from_template(
    """
Your task is to assist in topic modeling on product names.
Answer in valid CSV format with a header.
Use Russian language.
"""
)

In [209]:
# # Create a human prompt for clustering
# human_clustering = HumanMessagePromptTemplate.from_template(
#     prompt_clustering
#     # prompt_clustering_paired
# )
# # Combine clustering and mapping prompts
# chat_prompt_clustering = ChatPromptTemplate.from_messages([system, human_clustering])

# chain_clustering = LLMChain(llm=chat, prompt=chat_prompt_clustering)
# # Run clustering first and capture its output
# response_clustering = chain_clustering.run(all_names=all_names)
# print("\nClustering Response:")
# print(response_clustering)

In [210]:
# # Create a human prompt for mapping
# human_mapping = HumanMessagePromptTemplate.from_template(
#     prompt_mapping
# )

# chat_prompt_mapping = ChatPromptTemplate.from_messages([system, human_mapping])

# # Create separate chains without using a conversation memory buffer
# chain_mapping = LLMChain(llm=chat, prompt=chat_prompt_mapping)


# # Run mapping next by explicitly passing the output of the clustering chain
# response_mapping = chain_mapping.run(
#     all_names=all_names,
#     response_clustering=response_clustering,
#     all_names_len=len(sampled_data)
# )
# print("\nMapping Response:")
# print(response_mapping)

In [285]:
# Create a human prompt for mapping
human_mapping = HumanMessagePromptTemplate.from_template(
    prompt_mapping_hardcoded
)

chat_prompt_mapping = ChatPromptTemplate.from_messages([system, human_mapping])

# Create separate chains without using a conversation memory buffer
chain_mapping = LLMChain(llm=chat, prompt=chat_prompt_mapping)


# Run mapping next by explicitly passing the output of the clustering chain
response_mapping = chain_mapping.run(
    all_names=all_names,
    all_names_len=len(sampled_data)
)
print("\nMapping Response:")
print(response_mapping)


Mapping Response:
sku_first,category_id
13221213,0
157891430,9
11881845,10
64354900,5
28889877,0
53574284,6
52711454,6
52711355,6
192878035,10
143443098,6
52711475,6
114404338,6
197553673,10
192015329,6
138787213,6
52712391,6
13309742,8
53607574,6
148537608,8
52718395,6
73059730,6
138782899,6
53667142,6
154594166,10
52712544,6
110800212,10
192755374,6
52711726,6
158954944,0
24789729,0
39078129,0
9878713,0
34586933,0
53666683,6
72826737,6
64354767,5
140906524,5
64354945,5
64354832,5
149447321,5
124227858,3
120702186,2
16116490,3
124019640,3
9611772,0
70705908,0
9611773,0
25943646,0
16347823,0
67057495,0
26399228,0
112412420,6
54772798,6
53620801,6
112529138,6
52835848,6
15638408,8
110278239,10
53583846,6
178159309,6
165869451,6
173010972,6
140909384,6
133886335,6
52711617,6
196492371,6
52711770,6
141524109,6
165869471,6
53620786,6
82245013,7
52711907,6
144139672,6
52712189,6
148825064,10
57991032,6
141524110,6
148969749,6
131619601,6
186903213,6
165869426,6
54765700,6
111657438,6
14927

In [286]:
# Merge mapping_df and clustering_df on 'category_id' only
from io import StringIO

mapping_df = pd.read_csv(StringIO(response_mapping))
mapping_df = mapping_df.drop_duplicates(subset=['sku_first'])
clustering_df = pd.read_csv(StringIO(response_clustering))
display(clustering_df)

merged_df = pd.merge(mapping_df, clustering_df, on='category_id', how='left')

print(f'Total entries: {len(merged_df)}')
merged_df[['sku_first', 'category_id', 'category_name']].sample(5)

Unnamed: 0,category_id,category_name
0,0,"Одежда женская (платья, юбки, блузки, кофты, б..."
1,1,"Одежда мужская (рубашки, футболки, брюки, шорт..."
2,2,"Одежда гимнастическая (гимнастическая форма, о..."
3,3,"Одежда детская (футболки для мальчиков, футбол..."
4,4,"Обувь (туфли, кроссовки, ботинки, сапоги, санд..."
5,5,"Карта и путеводители (карты настенные, карты с..."
6,6,"Товары для уборки (перчатки резиновые, швабры,..."
7,7,"Товары для готовки (соевые соусы, ...)"
8,8,"Мебель (столы, стулья, диваны, кровати, ...)"
9,9,Аксессуары для компьютеров (флешки с гравировк...


Total entries: 211


Unnamed: 0,sku_first,category_id,category_name
151,12073345,0,"Одежда женская (платья, юбки, блузки, кофты, б..."
128,120702196,2,"Одежда гимнастическая (гимнастическая форма, о..."
22,53667142,6,"Товары для уборки (перчатки резиновые, швабры,..."
183,9515803,0,"Одежда женская (платья, юбки, блузки, кофты, б..."
131,120702190,2,"Одежда гимнастическая (гимнастическая форма, о..."


In [288]:
DATA_PATH = 'data'

file_name = (
    f"data_clustering_sku={merged_df.sku_first.nunique()}_"
    f"model={Path(model_name).name}"
)
if SAMPLE_SKU_COUNT is not None:
    file_name += f'_samples={SAMPLE_SKU_COUNT}'

file_name += '.csv'
clustering_output_file_path = Path(DATA_PATH) / 'tables_labeled' / 'data_clustering' / file_name

print(clustering_output_file_path)
merged_df.to_csv(clustering_output_file_path, index=False)

data/tables_labeled/data_clustering/data_clustering_sku=211_model=gpt-4.1-mini.csv


In [289]:
final_merged_df = pd.merge(sampled_data, merged_df, on='sku_first', how='left')

sizes_per_category = final_merged_df.groupby(['label', 'category_id']).size()
labels = final_merged_df['label'].unique()
categories = final_merged_df['category_id'].unique()
idx = pd.MultiIndex.from_product([labels, categories], names=['label', 'category_id'])
sizes_per_category = sizes_per_category.reindex(idx, fill_value=0)
sizes_per_category = sizes_per_category.sort_index(level=['label', "category_id"])

pd.set_option('display.max_rows', None)
display(sizes_per_category)

label  category_id
0      0              59
       1               1
       2              10
       3               3
       5              13
       6              51
       7               1
       8               3
       9               1
       10              9
1      0              22
       1               1
       2               1
       3               1
       5               2
       6              28
       7               0
       8               0
       9               3
       10              2
dtype: int64

## Inspect final merged data

In [299]:
import pandas as pd
import joblib
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score
from pathlib import Path

# Set the checkpoint directory.
MODEL_CKPT_DIR = 'model_params_big_test'
MODEL_CKPT_DIR = 'res_balanced_accuracy'

MODEL_DATA_FILE = Path(MODEL_CKPT_DIR) / 'data.csv'

df_model = pd.read_csv(MODEL_DATA_FILE)
print(f'Total rows: {len(df_model)}')
print(f'Unique sku_first: {df_model.sku_first.nunique()}')
print(f'Unique sku_second: {df_model.sku_second.nunique()}')

Total rows: 4974
Unique sku_first: 211
Unique sku_second: 2369


In [300]:
# clustering_output_file_path = 'data/tables_labeled/data_clustering/data_clustering_sku=153_model=gpt-4.1-mini.csv'
print(clustering_output_file_path)

merged_df = pd.read_csv(clustering_output_file_path)
# merged_df['category_name'].nunique()
merged_df.shape

data/tables_labeled/data_clustering/data_clustering_sku=211_model=gpt-4.1-mini.csv


(211, 3)

In [301]:
# inspect samples per category
# final_merged_df[final_merged_df.category_id == 8].sample(10)[['sku_first', 'name_first', 'category_id', 'category_name']]   

In [302]:
final_merged_df = pd.merge(df_model, merged_df, on='sku_first', how='left')

# Fill NA category_id with -1
final_merged_df.category_id = final_merged_df.category_id.fillna(-1)

# Compute sizes per category
sizes_per_category = final_merged_df.groupby(['label', 'category_id']).size()
labels = final_merged_df['label'].unique()
categories = final_merged_df['category_id'].unique()
idx = pd.MultiIndex.from_product([labels, categories], names=['label', 'category_id'])
sizes_per_category = sizes_per_category.reindex(idx, fill_value=0)
sizes_per_category = sizes_per_category.sort_index(level=['label', "category_id"])

pd.set_option('display.max_rows', None)
display(sizes_per_category)

label  category_id
0      0              1263
       1                34
       2               146
       3                69
       5               280
       6              1296
       7                 3
       8                57
       9                 3
       10              151
1      0               755
       1                26
       2                48
       3                19
       5                46
       6               686
       7                10
       8                 0
       9                11
       10               71
dtype: int64

In [303]:
DATA_PATH = 'data'

file_name = (
    f"data_clustered_clusters={final_merged_df.category_id.nunique()}_"
    f"sku={final_merged_df.sku_first.nunique()}_"
    f"model={Path(model_name).name}"
)
if SAMPLE_SKU_COUNT is not None:
    output_file_path += f'_samples={SAMPLE_SKU_COUNT}'

file_name += '.csv'
output_file_path = Path(MODEL_DATA_FILE).parent / file_name

print(output_file_path)
final_merged_df.to_csv(output_file_path, index=False)

res_balanced_accuracy/data_clustered_clusters=10_sku=211_model=gpt-4.1-mini.csv
