imports

In [1]:
%load_ext autoreload
%autoreload 1
%aimport src.my, src.net, src.data, src.models, src.text_utils

import sys
import numpy as np
import pandas as pd

import os
import gc
import matplotlib.pyplot as plt
import importlib
import pickle

# import net, data
import src.text_utils as tu
import src.my as my
from src.my import p
from tqdm.notebook import tqdm

# Notebook-wide pandas display settings: row limit, column text width,
# default precision, and a three-decimal formatter for floats.
for _option, _value in (
    ("display.max_rows", 200),
    ("display.max_colwidth", 45),
    ("display.precision", 1),
    ("display.float_format", "{:.3f}".format),
):
    pd.set_option(_option, _value)
# pd.set_option("display.max_rows", 5)
# pd.reset_option("display.max_rows")

from sklearn.model_selection import train_test_split

# from pandarallel import pandarallel
# pandarallel.initialize(progress_bar=True)

# Directory prefixes. They are concatenated with file names as plain strings
# further down, so the trailing slash is significant.
dir_data, dir_out = 'data/', 'out/'
os.makedirs(dir_out, exist_ok=True)  # no error if the directory already exists

# Reproducibility and resource constants.
SEED = 34
N_CPU = os.cpu_count()

# Seed the legacy global NumPy RNG and also keep a dedicated Generator.
np.random.seed(SEED)
rng = np.random.default_rng(seed=SEED)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the prepared labelled frame and the prepared test frame from `dir_out`,
# then preview the first two labelled rows.
Xy = pd.read_parquet(f'{dir_out}prepared_df.pq')
X_test = pd.read_parquet(f'{dir_out}prepared_test.pq')
Xy.head(2)

Unnamed: 0,product_id,category_id,shop_id,category_name,fold,text
0,325286,251,493,электроника смартфоны телефоны аксессуары...,4,зарядный кабель borofone bx1 lightning ай...
1,888134,748,6081,одежда женская одежда белье купальники трусы,3,трусы sela трусы слипы эластичного бесшов...


In [3]:
# os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [3]:
# Distinct category ids in order of first appearance; the cell displays how many there are.
class_names = Xy['category_id'].drop_duplicates().tolist()
len(class_names)

874

Так выглядят эмбеддинги от BERT и EfficientNet:

In [4]:
# Load the pre-computed embedding tables for one fold: first the train/val
# tables for both models, then the matching test tables.
fold = 0

bert_train_val_embs = pd.read_parquet(f'out/bert_model/bert_train_val_embs_f{fold}.pq')
img_train_val_embs = pd.read_parquet(f'out/image_model/img_train_val_embs_f{fold}.pq')

bert_test_embs = pd.read_parquet(f'out/bert_model/bert_test_embs_f{fold}.pq')
img_test_embs = pd.read_parquet(f'out/image_model/img_test_embs_f{fold}.pq')

# Preview the first two rows of the BERT table.
bert_train_val_embs.head(2)

Unnamed: 0,product_id,bert0,bert1,bert2,bert3,bert4,bert5,bert6,bert7,bert8,...,bert758,bert759,bert760,bert761,bert762,bert763,bert764,bert765,bert766,bert767
0,325286,1.629,0.429,1.014,1.358,1.44,0.094,-1.702,1.529,-0.759,...,1.979,-0.461,0.37,1.162,2.046,0.587,1.47,0.95,0.53,2.958
1,888134,0.656,0.249,-1.058,1.961,-0.309,0.743,-1.077,-0.318,1.623,...,0.46,1.588,1.023,0.386,0.743,1.068,-1.224,0.858,-0.987,-0.238


In [5]:
# (rows, columns) of the BERT table followed by the image table.
(bert_train_val_embs.shape, img_train_val_embs.shape)

((91120, 769), (91120, 1281))

py-boost с PCA и UMAP: F1 ~ 0.875

In [None]:
from py_boost import GradientBoosting, TLPredictor, TLCompiledPredictor
from py_boost.cv import CrossValidation

from py_boost.gpu.losses.multiclass_metrics import MultiF1Score
import cupy as cp
from sklearn.metrics import f1_score

from sklearn.decomposition import PCA

class F1Weighted(MultiF1Score):
    """Support-weighted F1 score for multiclass predictions.

    The predicted class is the argmax over the score columns. Targets and
    predictions are copied to the host and scored with
    ``sklearn.metrics.f1_score(average='weighted')``.
    """
    alias = 'F1Weighted'

    def __call__(self, y_true, y_pred, sample_weight=None):
        """Return the weighted F1 of ``y_pred.argmax(axis=1)`` against ``y_true``.

        Args:
            y_true: device array of target labels (must support ``.get()``).
            y_pred: device array of per-class scores, one column per class.
            sample_weight: optional device array of per-row weights. It is
                forwarded to ``f1_score`` so weighted evaluation sets are
                scored with their weights instead of being silently ignored.

        Returns:
            The weighted-average F1 score as returned by ``f1_score``.
        """
        labels = y_true.get()
        # presumably py_boost hands targets over as a single-column 2-D array;
        # flatten it explicitly instead of relying on sklearn to do so — TODO confirm
        if labels.ndim == 2 and labels.shape[1] == 1:
            labels = labels[:, 0]

        predicted = y_pred.argmax(axis=1).get()

        weights = None
        if sample_weight is not None:
            # assumes weights are a device array like the targets — TODO confirm
            weights = sample_weight.get()
            if weights.ndim == 2 and weights.shape[1] == 1:
                weights = weights[:, 0]

        return f1_score(labels, predicted, average='weighted', sample_weight=weights)

# Train py-boost on a 50-component PCA projection of the concatenated
# BERT + image embeddings, validating on one held-out fold.
fold = 0

# Fixed random_state: for inputs this wide scikit-learn may pick the randomized
# SVD solver, and without a seed the projection would depend on global RNG state.
both_reducer = PCA(n_components=50, random_state=SEED)

# NOTE(review): the Xy preview earlier in this notebook lists `shop_id` but no
# `shop_title` column — confirm prepared_df.pq still provides `shop_title`,
# otherwise this selection raises KeyError.
meta_cols = ['product_id', 'shop_title', 'category_id']

Xy_train = Xy.loc[Xy.fold != fold, meta_cols]

# Validation rows whose category never occurs in the training folds are dropped.
cats_in_train = Xy_train['category_id'].unique()
Xy_val = Xy.loc[(Xy.fold == fold) & (Xy.category_id.isin(cats_in_train)), meta_cols]

bert_train_val_embs = pd.read_parquet(f'out/bert_model/bert_train_val_embs_f{fold}.pq')
img_train_val_embs = pd.read_parquet(f'out/image_model/img_train_val_embs_f{fold}.pq')

# Attach both embedding tables by product_id; every non-metadata column is a feature.
Xy_train = Xy_train.merge(bert_train_val_embs, on='product_id').merge(img_train_val_embs, on='product_id')
Xy_val = Xy_val.merge(bert_train_val_embs, on='product_id').merge(img_train_val_embs, on='product_id')

# Fit the reducer on training rows only and reuse it for validation.
# Features are selected by name rather than by column position.
train_umap = pd.DataFrame(both_reducer.fit_transform(Xy_train.drop(columns=meta_cols)))
# my.flat_cols is a project helper whose return value is unused here;
# presumably it renames the columns in place with the given prefix — TODO confirm
my.flat_cols(train_umap, 'both')

val_umap = pd.DataFrame(both_reducer.transform(Xy_val.drop(columns=meta_cols)))
my.flat_cols(val_umap, 'both')

# merge() returns a fresh 0..n-1 index, so the metadata lines up row-for-row
# with the default index of the reduced frames.
Xy_train = pd.concat([Xy_train[meta_cols], train_umap], axis=1)
Xy_val = pd.concat([Xy_val[meta_cols], val_umap], axis=1)

cols = ['product_id', 'category_id', 'shop_title']

X_train = Xy_train.drop(columns=cols).to_numpy()
y_train = Xy_train['category_id'].to_numpy()

X_val = Xy_val.drop(columns=cols).to_numpy()
y_val = Xy_val['category_id'].to_numpy()

# NOTE(review): targets are raw category_id values; confirm the crossentropy
# loss is meant to receive non-contiguous class ids.
model = GradientBoosting('crossentropy', metric=F1Weighted(), ntrees=10000, lr=0.03, verbose=5, es=50, lambda_l2=1, gd_steps=1,
                         subsample=1, colsample=1, min_data_in_leaf=10, use_hess=True,
                         max_bin=256, max_depth=6)

model.fit(X_train, y_train, eval_sets=[{'X': X_val, 'y': y_val}])

In [None]:
# pip install -U cupy-cuda11x py-boost
import umap
from py_boost import GradientBoosting, TLPredictor, TLCompiledPredictor
from py_boost.cv import CrossValidation

# Train py-boost on a 50-component UMAP projection of the concatenated
# BERT + image embeddings, validating on one held-out fold.
fold = 0

both_reducer = umap.UMAP(
    n_components=50,
    random_state=SEED,
    low_memory=False,
    metric='cosine',
    verbose=True,
)

# NOTE(review): the Xy preview earlier in this notebook lists `shop_id` but no
# `shop_title` column — confirm prepared_df.pq still provides `shop_title`.
id_cols = ['product_id', 'shop_title', 'category_id']
in_val_fold = Xy.fold == fold

Xy_train = Xy.loc[~in_val_fold, id_cols]

# Keep only validation rows whose category also occurs in the training folds.
cats_in_train = Xy_train['category_id'].unique()
Xy_val = Xy.loc[in_val_fold & Xy.category_id.isin(cats_in_train), id_cols]

bert_train_val_embs = pd.read_parquet(f'out/bert_model/bert_train_val_embs_f{fold}.pq')
img_train_val_embs = pd.read_parquet(f'out/image_model/img_train_val_embs_f{fold}.pq')

# Join both embedding tables onto each split by product_id.
Xy_train = (
    Xy_train
    .merge(bert_train_val_embs, on='product_id')
    .merge(img_train_val_embs, on='product_id')
)
Xy_val = (
    Xy_val
    .merge(bert_train_val_embs, on='product_id')
    .merge(img_train_val_embs, on='product_id')
)

# The reducer is fitted on the training features and only applied to validation.
train_features = Xy_train.drop(columns=id_cols)
train_umap = pd.DataFrame(both_reducer.fit_transform(train_features))
# project helper; return value unused, presumably renames columns in place — TODO confirm
my.flat_cols(train_umap, 'both')

val_features = Xy_val.drop(columns=id_cols)
val_umap = pd.DataFrame(both_reducer.transform(val_features))
my.flat_cols(val_umap, 'both')

# Put the three metadata columns back in front of the reduced features.
Xy_train = pd.concat([Xy_train[id_cols], train_umap], axis=1)
Xy_val = pd.concat([Xy_val[id_cols], val_umap], axis=1)

cols = ['product_id', 'category_id', 'shop_title']

X_train, y_train = Xy_train.drop(columns=cols).to_numpy(), Xy_train['category_id'].to_numpy()
X_val, y_val = Xy_val.drop(columns=cols).to_numpy(), Xy_val['category_id'].to_numpy()

# NOTE(review): the PCA cell above scores with F1Weighted(); here the string
# alias 'f1' is used — confirm py_boost resolves it and that the averaging matches.
model = GradientBoosting(
    'crossentropy',
    metric='f1',
    ntrees=10000, lr=0.03, verbose=5, es=50, lambda_l2=1, gd_steps=1,
    subsample=1, colsample=1, min_data_in_leaf=10, use_hess=True,
    max_bin=256, max_depth=6,
)

model.fit(X_train, y_train, eval_sets=[{'X': X_val, 'y': y_val}])

CatBoost: F1 ~ 0.87

In [None]:
from catboost import CatBoostClassifier, Pool, cv
from sklearn.decomposition import PCA

# Train CatBoost on a 100-component PCA projection of the concatenated
# BERT + image embeddings plus the shop title as a categorical feature.
fold = 0

# Fixed random_state: for inputs this wide scikit-learn may pick the randomized
# SVD solver, and without a seed the projection would depend on global RNG state.
both_reducer = PCA(n_components=100, random_state=SEED)

class_names = Xy['category_id'].unique().tolist()

# NOTE(review): the Xy preview earlier in this notebook lists `shop_id` but no
# `shop_title` column — confirm prepared_df.pq still provides `shop_title`,
# otherwise this selection raises KeyError.
meta_cols = ['product_id', 'shop_title', 'category_id']

Xy_train = Xy.loc[Xy.fold != fold, meta_cols]

# Validation rows whose category never occurs in the training folds are dropped.
cats_in_train = Xy_train['category_id'].unique()
Xy_val = Xy.loc[(Xy.fold == fold) & (Xy.category_id.isin(cats_in_train)), meta_cols]

bert_train_val_embs = pd.read_parquet(f'out/bert_model/bert_train_val_embs_f{fold}.pq')
img_train_val_embs = pd.read_parquet(f'out/image_model/img_train_val_embs_f{fold}.pq')

Xy_train = Xy_train.merge(bert_train_val_embs, on='product_id').merge(img_train_val_embs, on='product_id')
Xy_val = Xy_val.merge(bert_train_val_embs, on='product_id').merge(img_train_val_embs, on='product_id')

# Fit the reducer on training rows only and reuse it for validation.
# Features are selected by name rather than by column position.
train_umap = pd.DataFrame(both_reducer.fit_transform(Xy_train.drop(columns=meta_cols)))
# my.flat_cols is a project helper whose return value is unused here;
# presumably it renames the columns in place with the given prefix — TODO confirm
my.flat_cols(train_umap, 'both')

val_umap = pd.DataFrame(both_reducer.transform(Xy_val.drop(columns=meta_cols)))
my.flat_cols(val_umap, 'both')

# merge() returns a fresh 0..n-1 index, so the metadata lines up row-for-row
# with the default index of the reduced frames.
Xy_train = pd.concat([Xy_train[meta_cols], train_umap], axis=1)
Xy_val = pd.concat([Xy_val[meta_cols], val_umap], axis=1)

# shop_title stays in the feature frames; it is declared categorical below.
cols = ['product_id', 'category_id']

X_train = Xy_train.drop(columns=cols)
y_train = Xy_train['category_id']

X_val = Xy_val.drop(columns=cols)
y_val = Xy_val['category_id']

# Build the pools from the frames prepared above instead of recomputing the same drop.
train_pool = Pool(data=X_train, label=y_train, cat_features=['shop_title'])
val_pool = Pool(data=X_val, label=y_val, cat_features=['shop_title'])

# Release the wide intermediate frames before training.
del Xy_train, Xy_val, bert_train_val_embs, img_train_val_embs
gc.collect()

# None is forwarded unchanged as CatBoost's learning_rate argument.
LR = None

cb = CatBoostClassifier(iterations=10000, learning_rate=LR, random_seed=SEED, early_stopping_rounds=50, eval_metric='TotalF1:average=Weighted', class_names=class_names)

cb.fit(train_pool, eval_set=val_pool, verbose=1)
    # cb_models.append(cb)

In [None]:
from catboost import CatBoostClassifier, Pool, cv
import umap

# CatBoost (GPU) on a 50-component UMAP projection of the concatenated
# BERT + image embeddings. The loop is written over all folds but stops
# after the first one (see the `break` at the bottom).
both_reducer = umap.UMAP(
    n_components=50,
    random_state=SEED,
    low_memory=False,
    metric='cosine',
    verbose=True,
)

# None is forwarded unchanged as CatBoost's learning_rate argument.
LR = None

for fold in sorted(Xy['fold'].unique()):
    # NOTE(review): the Xy preview earlier in this notebook lists `shop_id` but
    # no `shop_title` column — confirm prepared_df.pq still provides it.
    id_cols = ['product_id', 'shop_title', 'category_id']
    in_val_fold = Xy.fold == fold

    Xy_train = Xy.loc[~in_val_fold, id_cols]

    # Keep only validation rows whose category also occurs in the training folds.
    cats_in_train = Xy_train['category_id'].unique()
    Xy_val = Xy.loc[in_val_fold & Xy.category_id.isin(cats_in_train), id_cols]

    bert_train_val_embs = pd.read_parquet(f'out/bert_model/bert_train_val_embs_f{fold}.pq')
    img_train_val_embs = pd.read_parquet(f'out/image_model/img_train_val_embs_f{fold}.pq')

    # Join both embedding tables onto each split by product_id.
    Xy_train = (
        Xy_train
        .merge(bert_train_val_embs, on='product_id')
        .merge(img_train_val_embs, on='product_id')
    )
    Xy_val = (
        Xy_val
        .merge(bert_train_val_embs, on='product_id')
        .merge(img_train_val_embs, on='product_id')
    )

    # fit_transform refits the shared reducer on this fold's training features;
    # validation features are only transformed.
    train_umap = pd.DataFrame(both_reducer.fit_transform(Xy_train.drop(columns=id_cols)))
    # project helper; return value unused, presumably renames columns in place — TODO confirm
    my.flat_cols(train_umap, 'both')

    val_umap = pd.DataFrame(both_reducer.transform(Xy_val.drop(columns=id_cols)))
    my.flat_cols(val_umap, 'both')

    # Put the three metadata columns back in front of the reduced features.
    Xy_train = pd.concat([Xy_train[id_cols], train_umap], axis=1)
    Xy_val = pd.concat([Xy_val[id_cols], val_umap], axis=1)

    # shop_title stays among the features and is declared categorical.
    cols = ['product_id', 'category_id']

    train_pool = Pool(
        data=Xy_train.drop(columns=cols),
        label=Xy_train['category_id'],
        cat_features=['shop_title'],
    )
    val_pool = Pool(
        data=Xy_val.drop(columns=cols),
        label=Xy_val['category_id'],
        cat_features=['shop_title'],
    )

    # Release the wide intermediate frames before training.
    del Xy_train, Xy_val, bert_train_val_embs, img_train_val_embs
    gc.collect()

    cb = CatBoostClassifier(
        iterations=10000,
        learning_rate=LR,
        random_seed=SEED,
        task_type="GPU",
        early_stopping_rounds=50,
        eval_metric='TotalF1:average=Weighted',
        gpu_ram_part=0.8,
        class_names=class_names,
        boosting_type='Plain',
    )

    cb.fit(train_pool, eval_set=val_pool, verbose=1)
    # cb_models.append(cb)

    # Only the first fold is trained.
    break

Отдельный reducer для каждого эмбеддинга не улучшал результат.

In [None]:
# Two separate, identically configured 10-component UMAP reducers:
# one for the BERT embeddings and one for the image embeddings.
_umap_params = dict(
    n_components=10,
    random_state=SEED,
    low_memory=False,
    metric='cosine',
    verbose=True,
)

bert_reducer = umap.UMAP(**_umap_params)
img_reducer = umap.UMAP(**_umap_params)