In [24]:
PRFX = "transformer_20210307D1"
PRFX_PRP = 'prep_20210307B1'
FREEZE_BERT = False

In [25]:
!nvidia-smi

Sun Mar  7 18:01:29 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.100      Driver Version: 440.100      CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:1E.0 Off |                    0 |
| N/A   53C    P0    54W / 300W |   1960MiB / 16160MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

- https://colab.research.google.com/drive/1F_RNcHzTfFuQf-LeKvSlud6x7jXYkG31#scrollTo=goRmGIRI5cfC

# setup

http://localhost:8080/notebooks/git/product-category/notebooks/prep_20210304A1.ipynb

In [26]:
HOME = "/data/git/product-category"
p_out = f'{HOME}/data/{PRFX}'
!mkdir -p {p_out}

In [27]:
sz = int(1e4)
DATA2USE = f'{HOME}/data/data_sample_{sz}__{PRFX_PRP}.csv'

# eda 

In [28]:
import pandas as pd
import numpy as np
from collections import Counter

In [29]:
%%time
# df = pd.read_csv(f'../data/data__{PRFX_PRP}.csv')
# df = pd.read_csv(f'{HOME}/data/data_sample__{PRFX_PRP}.csv')
df = pd.read_csv(DATA2USE, nrows=1000)
print(df.shape)
df.sample(3)

(1000, 3)
CPU times: user 4.18 ms, sys: 8.15 ms, total: 12.3 ms
Wall time: 11.5 ms


Unnamed: 0,category,title,is_validation
506,"Sports & Outdoors|Outdoor Recreation|Skates, S...",XBoard Illuminators :: Super Bright LED Ground...,0
545,Automotive|Motorcycle & Powersports|Parts|Elec...,Atlantis A2520 Flush Kit for JetSki and Ultra ...,0
677,Movies & TV|Genre for Featured Categories|Acti...,Legend Of The Dragonslayer Sword,0


In [30]:
df = pd.read_csv(DATA2USE, nrows=1000)


In [31]:
MIN_CNT = 50
cat2cnt = Counter((j for i in df.category.apply(lambda x: x.split('|')) for j in i))
i2cat = sorted(k for k,v in cat2cnt.items() if v>MIN_CNT)
cat2i = {v:k for k,v in enumerate(i2cat)}

print("len(i2cat)", len(i2cat))
print("|".join(i2cat))

len(i2cat) 10
Accessories|Automotive|Books|Clothing|Clothing, Shoes & Jewelry|Electronics|Home & Kitchen|Sports & Outdoors|Toys & Games|Women


In [32]:
cat2cnt.most_common(20)

[('Books', 213),
 ('Clothing, Shoes & Jewelry', 171),
 ('Women', 103),
 ('Home & Kitchen', 89),
 ('Clothing', 83),
 ('Sports & Outdoors', 64),
 ('Automotive', 64),
 ('Accessories', 63),
 ('Electronics', 61),
 ('Toys & Games', 52),
 ('Men', 47),
 ('Kindle Store', 41),
 ('Kindle eBooks', 40),
 ('Tools & Home Improvement', 40),
 ('Shoes', 40),
 ('Sports & Fitness', 35),
 ('Cell Phones & Accessories', 34),
 ('Replacement Parts', 33),
 ('Imported', 32),
 ('Patio, Lawn & Garden', 28)]

# dataset

In [33]:
from argparse import ArgumentParser
import pytorch_lightning as pl
from transformers.optimization import AdamW

from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoTokenizer
import torch

def mk_tensors(txt, tokenizer, max_seq_length):
    tok_res = tokenizer(
        txt, truncation=True, padding='max_length', max_length=max_seq_length
    )
    input_ids = tok_res["input_ids"]
    attention_mask = tok_res["attention_mask"]
    input_ids = torch.tensor(input_ids)
    attention_mask = torch.tensor(attention_mask)
    return input_ids, attention_mask

def mk_ds(txt, tokenizer, max_seq_length, ys):
    input_ids, attention_mask = mk_tensors(txt, tokenizer, max_seq_length)
    return TensorDataset(input_ids, 
                         attention_mask, 
                         torch.tensor(ys)) 

class PCDataModule(pl.LightningDataModule):
    def __init__(self, 
                 model_name_or_path, 
                 max_seq_length, 
                 min_products_for_category,
                 train_batch_size,
                 val_batch_size,
                 dataloader_num_workers,
                 data_file_path=None,
                 dataframe=None):
        super().__init__()
        self.data_file_path = data_file_path
        self.dataframe = dataframe
        self.min_products_for_category = min_products_for_category
        self.model_name_or_path = model_name_or_path
        self.max_seq_length = max_seq_length
        self.train_batch_size = train_batch_size
        self.val_batch_size = val_batch_size
        self.dataloader_num_workers = dataloader_num_workers
        self.num_classes = None
      
    def prepare_data(self):
        #prepare_data is called from a single process (e.g. GPU 0). Do not use it to assign state (self.x = y).
        _ = AutoTokenizer.from_pretrained(self.model_name_or_path)

    def setup(self, stage=None):
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
        if self.dataframe is None:
            self.dataframe = pd.read_csv(self.data_file_path)
            self.dataframe.dropna(inplace=True)

        
#         self.dataframe = self.dataframe[self.dataframe.title.notna()].copy()
        
        cats = self.dataframe.category.apply(lambda x: x.split('|'))
        cat2cnt = Counter((j for i in cats for j in i))
        i2cat = sorted(k for k,v in cat2cnt.items() if v>self.min_products_for_category)
        cat2i = {v:k for k,v in enumerate(i2cat)}
        self.num_classes = len(i2cat)
        self.i2cat, self.cat2i = i2cat, cat2i
        
        ys = np.zeros((len(self.dataframe), len(i2cat)))
        for i,cats in enumerate(self.dataframe.category):
            idx_pos = [cat2i[cat] for cat in cats.split('|') if cat in cat2i]
            ys[i,idx_pos] = 1
        
        msk_val = self.dataframe.is_validation==1
        self.df_trn = self.dataframe[~msk_val]
        self.df_val = self.dataframe[ msk_val]
        idx_trn = np.where(~msk_val)[0]
        idx_val = np.where( msk_val)[0]
        self.ys_trn, self.ys_val = ys[idx_trn], ys[idx_val]
        
        self.train_dataset = mk_ds(list(self.df_trn.title), self.tokenizer, self.max_seq_length, self.ys_trn)
        self.eval_dataset  = mk_ds(list(self.df_val.title), self.tokenizer, self.max_seq_length, self.ys_val)
        
    
    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.train_batch_size,
            num_workers=self.dataloader_num_workers,
        )

    def val_dataloader(self):
        return DataLoader(
            self.eval_dataset,
            batch_size=self.val_batch_size,
            num_workers=self.dataloader_num_workers,
        )

# model

In [34]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel


def getaccu(logits, ys):
    return ((logits>0.).int() == ys).float().mean()

class PCModel(pl.LightningModule):
    def __init__(self, model_name_or_path, freeze_bert, num_classes, learning_rate, adam_beta1, adam_beta2, adam_epsilon):
        super().__init__()
        self.save_hyperparameters()
        self.model_name_or_path = model_name_or_path
        self.bert = AutoModel.from_pretrained(self.model_name_or_path)
        self.freeze_bert = freeze_bert
        if freeze_bert==True:
            for param in self.bert.parameters():
                param.requires_grad = False
        self.num_classes = num_classes
        self.W = nn.Linear(self.bert.config.hidden_size, self.num_classes)

    def prepare_data(self):
        #prepare_data is called from a single process (e.g. GPU 0). Do not use it to assign state (self.x = y).
        _ = AutoModel.from_pretrained(self.model_name_or_path)

    def forward(self, input_ids, attention_mask):
        h = self.bert(input_ids, attention_mask)['last_hidden_state']
        h_cls = h[:, 0]
        return self.W(h_cls)
    
    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, ys = batch    
        logits = self(input_ids, attention_mask)
        loss = F.binary_cross_entropy_with_logits(logits, ys)
        accu = getaccu(logits, ys)
        self.log('train_loss', loss, on_epoch=True)
        self.log('train_accu', accu, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, ys = batch    
        logits = self(input_ids, attention_mask)
        loss = F.binary_cross_entropy_with_logits(logits, ys)
        accu = getaccu(logits, ys)
        self.log('valid_loss', loss, on_step=False, sync_dist=True)
        self.log('valid_accu', accu, on_step=False, sync_dist=True)
    
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(),
                          self.hparams.learning_rate,
                          betas=(self.hparams.adam_beta1,
                                 self.hparams.adam_beta2),
                          eps=self.hparams.adam_epsilon,)
        return optimizer    
    
    @staticmethod
    def add_model_specific_args(parent_parser):
        parser = ArgumentParser(parents=[parent_parser], add_help=False)
        parser.add_argument('--learning_rate', type=float, default=5e-5)
        parser.add_argument('--adam_beta1', type=float, default=0.9)
        parser.add_argument('--adam_beta2', type=float, default=0.999)
        parser.add_argument('--adam_epsilon', type=float, default=1e-8)
        return parser

# train

In [35]:
parser = ArgumentParser()

parser.add_argument('--model_name_or_path', type=str,
                    default="distilbert-base-cased")
parser.add_argument('--freeze_bert', action='store_true')
parser.add_argument('--max_seq_length', type=int, default=128)
parser.add_argument('--min_products_for_category', type=int, default=100)
parser.add_argument('--train_batch_size', type=int, default=128)
parser.add_argument('--val_batch_size', type=int, default=256)
parser.add_argument("--dataloader_num_workers", type=int, default=8)

parser = pl.Trainer.add_argparse_args(parser)
parser = PCModel.add_model_specific_args(parser)

args_list = [
    '--default_root_dir', p_out,
]
if FREEZE_BERT: args_list.append('--freeze_bert')


args = parser.parse_args(args_list)


data_module = PCDataModule(
    model_name_or_path=args.model_name_or_path,
#     data_file_path=f'{HOME}/data/data_sample__{PRFX_PRP}.csv',
    data_file_path=DATA2USE,
    min_products_for_category=args.min_products_for_category,
    max_seq_length=args.max_seq_length,
    train_batch_size=args.train_batch_size,
    val_batch_size=args.val_batch_size,
    dataloader_num_workers=args.dataloader_num_workers,
)

data_module.prepare_data()

In [36]:
%%time
data_module.setup()

CPU times: user 1.85 s, sys: 79 ms, total: 1.92 s
Wall time: 706 ms


In [37]:
pcmodel = PCModel(
    model_name_or_path=args.model_name_or_path,
    freeze_bert=args.freeze_bert,
    num_classes= data_module.num_classes,
    learning_rate=args.learning_rate,
    adam_beta1=args.adam_beta1,
    adam_beta2=args.adam_beta2,
    adam_epsilon=args.adam_epsilon,
)
pcmodel.prepare_data()

## trainer

In [38]:
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import CSVLogger, TensorBoardLogger

csv_logger = CSVLogger(p_out, name='csv')
tb_logger = TensorBoardLogger(p_out, name='tensorboard')

trainer = pl.Trainer.from_argparse_args(args, 
#                                         limit_train_batches=10, limit_val_batches=5, 
                                        fast_dev_run=True,
                                        max_epochs=10,
                                        callbacks=[EarlyStopping(monitor='valid_loss')],
                                        stochastic_weight_avg=True,
                                        log_gpu_memory=True, 
                                        gpus=1,
                                        logger=[tb_logger,csv_logger],
                                       )

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
Running in fast_dev_run mode: will run a full train, val and test loop using 1 batch(es).


## run

In [39]:
pl.seed_everything(1234)
trainer.fit(pcmodel, data_module)

Global seed set to 1234

  | Name | Type            | Params
-----------------------------------------
0 | bert | DistilBertModel | 65.2 M
1 | W    | Linear          | 37.7 K
-----------------------------------------
65.2 M    Trainable params
0         Non-trainable params
65.2 M    Total params
260.914   Total estimated model params size (MB)


HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

  value = torch.tensor(value, device=device, dtype=torch.float)





1

In [40]:
!find $p_out/

/data/git/product-category/data/transformer_20210307D1/
/data/git/product-category/data/transformer_20210307D1/tensorboard_csv
/data/git/product-category/data/transformer_20210307D1/tensorboard_csv/0_0
/data/git/product-category/data/transformer_20210307D1/tensorboard_csv/0_0/checkpoints
/data/git/product-category/data/transformer_20210307D1/tensorboard_csv/0_0/checkpoints/epoch=1-step=265.ckpt
/data/git/product-category/data/transformer_20210307D1/csv
/data/git/product-category/data/transformer_20210307D1/csv/version_0
/data/git/product-category/data/transformer_20210307D1/csv/version_0/metrics.csv
/data/git/product-category/data/transformer_20210307D1/csv/version_0/hparams.yaml
/data/git/product-category/data/transformer_20210307D1/tensorboard
/data/git/product-category/data/transformer_20210307D1/tensorboard/version_0
/data/git/product-category/data/transformer_20210307D1/tensorboard/version_0/hparams.yaml
/data/git/product-category/data/transformer_20210307D1/tensorboar

## tensorboard

In [41]:
!find $p_out/tensorboard/

/data/git/product-category/data/transformer_20210307D1/tensorboard/
/data/git/product-category/data/transformer_20210307D1/tensorboard/version_0
/data/git/product-category/data/transformer_20210307D1/tensorboard/version_0/hparams.yaml
/data/git/product-category/data/transformer_20210307D1/tensorboard/version_0/events.out.tfevents.1615139961.ip-10-0-3-91.26094.0


## load model

# eval

## csv results

In [42]:
!find $p_out/csv/

/data/git/product-category/data/transformer_20210307D1/csv/
/data/git/product-category/data/transformer_20210307D1/csv/version_0
/data/git/product-category/data/transformer_20210307D1/csv/version_0/metrics.csv
/data/git/product-category/data/transformer_20210307D1/csv/version_0/hparams.yaml


In [43]:
dfmtr = pd.read_csv(f"{p_out}/csv/version_0/metrics.csv")

dfmtr

Unnamed: 0,train_loss_step,train_accu_step,gpu_id: 0/memory.used (MB),epoch,step,train_loss_epoch,train_accu_epoch,valid_loss,valid_accu
0,0.174922,0.961416,5800.0,0,49,,,,
1,0.143278,0.96301,5800.0,0,99,,,,
2,,,5806.0,0,132,0.193434,0.955759,,
3,,,6190.0,0,132,,,0.12291,0.967532
4,0.11464,0.969707,6190.0,1,149,,,,
5,0.104589,0.970982,6190.0,1,199,,,,
6,0.088796,0.97449,6190.0,1,249,,,,
7,,,6190.0,1,265,0.106451,0.970692,,
8,,,6190.0,1,265,,,0.092555,0.973867
9,0.09062,0.971939,6190.0,2,299,,,,


## run model

## demo

In [44]:
def do_demo(df):
    i2cat = data_module.i2cat
    tokenizer = data_module.tokenizer
    max_seq_length = data_module.max_seq_length
    row = df.sample()
    display(row)
    txt = list(row.title.values)
    input_ids, attention_mask = mk_tensors(txt, tokenizer, max_seq_length)
    logits = pcmodel(input_ids, attention_mask)[0]
    print('Truth:', sorted(o for o in row.category.values[0].split('|') if o in i2cat))

    top_icats = np.argsort(-logits.detach().numpy())[:5]
    print('Top Preds:',[i2cat[i] for i in top_icats])
    print('Preds 0.5+:',[i2cat[i] for i in np.where(logits>0)[0]])


In [45]:
do_demo(data_module.df_val)

Unnamed: 0,category,title,is_validation
8536,Automotive|Motorcycle & Powersports|Accessorie...,Saddlemen Luggage Desperado Pillion Bag Universal,1


Truth: ['Accessories', 'Automotive']
Top Preds: ['Arts, Crafts & Sewing', '100% Cotton', 'Exterior Accessories', 'Novelty & More', 'Kindle Store']
Preds 0.5+: ['100% Cotton', 'Arts, Crafts & Sewing', 'Cases, Holsters & Sleeves', 'Cell Phones & Accessories', "Children's Books", 'Clothing, Shoes & Jewelry', 'Exterior Accessories', 'Fan Shop', 'Grocery & Gourmet Food', 'History', 'Home Dcor', 'Home Dcor Accents', 'Industrial & Scientific', 'Jewelry', 'Kindle Store', 'Literature & Fiction', 'Literature &amp; Fiction', 'Machine Wash', 'Novelty', 'Novelty & More', 'Office Products', 'Outdoor Recreation', 'Patio, Lawn & Garden', 'Pet Supplies', 'Replacement Parts', 'Sports & Outdoors', 'T-Shirts', 'Toys & Games', 'Women']


# fin

In [46]:
!nvidia-smi

Sun Mar  7 18:01:38 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.100      Driver Version: 440.100      CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:1E.0 Off |                    0 |
| N/A   53C    P0    55W / 300W |   2646MiB / 16160MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    