In [1]:
%load_ext autoreload
%autoreload 1
%aimport src.my, src.net, src.data, src.models, src.text_utils

import sys
import numpy as np
import pandas as pd

import os
import gc
import matplotlib.pyplot as plt
import importlib
import pickle

import src.huse as huse
import src.models as ms
import src.net as net
import src.text_utils as tu
import src.my as my
from src.my import p
from tqdm.notebook import tqdm

# Notebook-wide pandas display settings: show up to 200 rows, truncate cell
# text at 45 characters, and render floats with three decimals.
pd.options.display.max_rows = 200
pd.options.display.max_colwidth = 45
pd.options.display.precision = 1
pd.set_option("display.float_format", "{:.3f}".format)
# To go back to the default row limit: pd.reset_option("display.max_rows")

from sklearn.model_selection import train_test_split

# from pandarallel import pandarallel
# pandarallel.initialize(progress_bar=True)

# Reproducibility: one seed drives both the legacy global NumPy RNG and a
# dedicated Generator instance.
SEED = 34
np.random.seed(SEED)
rng = np.random.default_rng(seed=SEED)

# CPU count as reported by os.cpu_count().
N_CPU = os.cpu_count()

# Input / output locations, relative to the working directory. Keep the
# trailing slash: file paths are built by plain string concatenation.
dir_data, dir_out = 'data/', 'out/'
os.makedirs(dir_out, exist_ok=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


В этом ноутбуке DL-модели обучаются на извлечённых эмбеддингах.

In [2]:
# Load the prepared train and test frames stored under dir_out.
Xy = pd.read_parquet(f'{dir_out}prepared_df.pq')
X_test = pd.read_parquet(f'{dir_out}prepared_test.pq')
Xy.head(2)

Unnamed: 0,product_id,category_id,shop_id,category_name,fold,text
0,325286,251,493,электроника смартфоны телефоны аксессуары...,4,зарядный кабель borofone bx1 lightning ай...
1,888134,748,6081,одежда женская одежда белье купальники трусы,3,трусы sela трусы слипы эластичного бесшов...


In [3]:
# Directory that receives the model checkpoints; created on first use.
CHECKPOINT_DIR = f'{dir_out}ftt_model'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
CHECKPOINT_DIR

'out/ftt_model'

Можно обучать три модели — MLP, ResNet и FT-Transformer — переключая CFG.name ('mlp', 'resnet' или 'ftt'). F1 ≈ 0.86–0.88.

In [4]:
import torch
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, TQDMProgressBar, ModelCheckpoint
import src.net as net, src.data as data
from pytorch_lightning.plugins.precision import MixedPrecisionPlugin

# Allow reduced internal precision for float32 matmuls ('medium' instead of
# PyTorch's default 'highest'); see the PyTorch docs for the exact
# speed/precision trade-off on the target GPU.
torch.set_float32_matmul_precision('medium')

class CFG:
    """Hyper-parameters and run settings for the training cell.

    NOTE(review): in the visible training code only ``name``, ``batch_size``,
    ``epochs``, ``gradient_accumulation_steps`` and ``max_grad_norm`` are read;
    the remaining fields are presumably consumed elsewhere (or unused) —
    verify before relying on them.
    """

    # --- model ------------------------------------------------------------
    # Architecture name handed to net.FTTModule. Alternatives kept from the
    # original cell: 'mlp', 'ftt'.
    name: str = 'resnet'
    num_classes: int = 874

    # --- optimisation -----------------------------------------------------
    lr: float = 5e-4
    epochs: int = 30
    batch_size: int = 32
    gradient_accumulation_steps: int = 1
    max_grad_norm: int = 50
    gradient_checkpointing: bool = False
    precision: int = 16

    # --- learning-rate schedule -------------------------------------------
    scheduler: str = 'cosine'
    num_cycles: float = 0.5
    num_warmup_steps: int = 30

def train_fold(Xy:pd.DataFrame, Xy_test:pd.DataFrame, fold:int=0) -> float:
    """Train one model for a single fold and return its best ``val_f1``.

    Builds a ``data.FTTDataModule`` and a ``net.FTTModule`` (architecture taken
    from ``CFG.name``), fits them with a Lightning ``Trainer`` using early
    stopping and best-checkpoint saving on ``val_f1``, then releases the
    training objects.

    Args:
        Xy: Training frame, forwarded unchanged to ``data.FTTDataModule``.
        Xy_test: Test frame, forwarded unchanged to ``data.FTTDataModule``.
        fold: Fold index handed to the data module (presumably the held-out
            validation fold — confirm in ``src.data``). It also offsets the
            random seed and names the checkpoint file.

    Returns:
        The best monitored ``val_f1`` recorded by the checkpoint callback,
        as a Python number.
    """
    print('[TRAIN FOLD]:',fold)
    # The seed depends on the fold index: SEED + 10*fold.
    net.set_seed(SEED + 10*fold)
    ckpt_name = f'best_f{fold}'

    # val_bs and n_cpu are fixed here rather than taken from CFG.
    dm = data.FTTDataModule(
        Xy, Xy_test, fold=fold, batch_size=CFG.batch_size, val_bs=32, n_cpu=3
    )
    model = net.FTTModule(name=CFG.name)

    progress_bar = TQDMProgressBar(refresh_rate=5)
    # Stop once val_f1 has not improved by at least 0.001 for 5 validation runs.
    early_stop = EarlyStopping(
        'val_f1',
        min_delta=0.001,
        patience=5,
        verbose=True,
        mode='max',
        check_on_train_epoch_end=False,
    )
    # NOTE(review): if a checkpoint with this name already exists in
    # CHECKPOINT_DIR, Lightning presumably writes a versioned file
    # (e.g. '-v1') instead of overwriting it — confirm which file downstream
    # code loads after a re-run.
    checkpoint = ModelCheckpoint(
        dirpath=CHECKPOINT_DIR,
        filename=ckpt_name,
        monitor='val_f1',
        mode='max',
    )

    # Mixed precision is not enabled: CFG.precision is not passed to the Trainer.
    trainer = pl.Trainer(
        callbacks=[progress_bar, early_stop, checkpoint],
        max_epochs=CFG.epochs,
        deterministic=True,
        accelerator='auto',
        accumulate_grad_batches=CFG.gradient_accumulation_steps,
        gradient_clip_val=CFG.max_grad_norm,
        log_every_n_steps=50,
        # Print the model summary only once, for the first fold.
        enable_model_summary=bool(fold == 0),
    )

    trainer.fit(model, datamodule=dm)

    # Read the score before tearing anything down.
    best_score = checkpoint.best_model_score.cpu().item()

    # Drop every local that may still reference the trainer or the model, run
    # the garbage collector, and only then empty the CUDA cache:
    # empty_cache() cannot release memory still owned by live tensors.
    del trainer, model, dm, progress_bar, early_stop, checkpoint
    gc.collect()
    torch.cuda.empty_cache()

    return best_score

# Upper bound on how many folds are trained in this run:
# 1 = first fold only, None = every fold.
MAX_FOLDS = 1

res = []  # (fold, best val_f1) pairs, in fold order
for fold in sorted(Xy['fold'].unique())[:MAX_FOLDS]:
    res_fold = train_fold(Xy, X_test, fold=fold)
    res.append((fold,res_fold))

Global seed set to 34
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


[TRAIN FOLD]: 0


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type             | Params
---------------------------------------------
0 | model   | FTTModel         | 507 K 
1 | loss_fn | CrossEntropyLoss | 0     
---------------------------------------------
507 K     Trainable params
0         Non-trainable params
507 K     Total params
2.031     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_f1 improved. New best score: 0.880


Validation: 0it [00:00, ?it/s]

Metric val_f1 improved by 0.002 >= min_delta = 0.001. New best score: 0.882


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_f1 improved by 0.002 >= min_delta = 0.001. New best score: 0.884


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Monitored metric val_f1 did not improve in the last 5 records. Best score: 0.884. Signaling Trainer to stop.
