In [1]:
!nvidia-smi

Sun Mar  7 01:54:37 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.100      Driver Version: 440.100      CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:1E.0 Off |                    0 |
| N/A   43C    P0    40W / 300W |      0MiB / 16160MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

- https://colab.research.google.com/drive/1F_RNcHzTfFuQf-LeKvSlud6x7jXYkG31#scrollTo=goRmGIRI5cfC

# setup

http://localhost:8080/notebooks/git/product-category/notebooks/prep_20210304A1.ipynb

In [2]:
prfx_prp = 'prep_20210304A1'

In [3]:
HOME = "/data/git/product-category"
p_out = f'{HOME}/data/transformer_20210306D1'
!mkdir -p {p_out}

In [4]:
sz = int(1e4)
DATA2USE = f'{HOME}/data/data_sample_{sz}__{prfx_prp}.csv'

# eda 

In [5]:
import pandas as pd
import numpy as np
from collections import Counter

In [6]:
%%time
# df = pd.read_csv(f'../data/data__{prfx_prp}.csv')
# df = pd.read_csv(f'{HOME}/data/data_sample__{prfx_prp}.csv')
df = pd.read_csv(DATA2USE, nrows=1000)
print(df.shape)
df.sample(3)

(1000, 9)
CPU times: user 21.6 ms, sys: 54 µs, total: 21.7 ms
Wall time: 27.6 ms


Unnamed: 0,category,description,title,brand,feature,asin,domain,txt,is_validation
408,Home & Kitchen|Home Dcor|Home Dcor Accents|Orn...,This Round Sand Picture features a wavy design...,G.W. Schleidt SSR BW 6 in. Round Sand Picture ...,G W Schleidt,,B00DQB5O1K,Home_and_Kitche,G.W. Schleidt SSR BW 6 in. Round Sand Picture ...,0.0
169,CDs & Vinyl|Classical|Forms & Genres|Sonatas,Evgeny Kissin has made brave choices in select...,Schumann: Carnaval Op. 9 / Sonata No. 1 in F S...,Evgeny Kissin,,B0000665WU,CDs_and_Vinyl,Schumann: Carnaval Op. 9 / Sonata No. 1 in F S...,0.0
355,Automotive|Replacement Parts|Fuel System|Carbu...,Carburetor W/ Gasket Tecumseh 640338 640274 Carb,Carburetor W/ Gasket Tecumseh 640338 640274 Carb,KING,Free domestic shipping\nOne Year Warranty\nBra...,B00LKEJT0C,Automotive,Carburetor W/ Gasket Tecumseh 640338 640274 Ca...,0.0


In [7]:
df.txt.notna().mean()

0.938

In [8]:
MIN_CNT = 50
dmn2cnt = Counter(df.domain.value_counts().to_dict())
i2dmn = sorted(dmn2cnt.keys())
dmn2i = {v:k for k,v in enumerate(i2dmn)}
cat2cnt = Counter((j for i in df.category.apply(lambda x: x.split('|')) for j in i))
i2cat = sorted(k for k,v in cat2cnt.items() if v>50)
cat2i = {v:k for k,v in enumerate(i2cat)}

print("len(i2dmn), len(i2cat)", len(i2dmn), len(i2cat))
print("|".join(i2dmn))
print()
print("|".join(i2cat))

len(i2dmn), len(i2cat) 22 9
Appliance|Arts_Crafts_and_Sewi|Automotive|Book|CDs_and_Vinyl|Cell_Phones_and_Accessorie|Clothing_Shoes_and_Jewelry|Electronic|Grocery_and_Gourmet_Food|Home_and_Kitche|Industrial_and_Scientific|Kindle_Store|Movies_and_TV|Musical_Instrument|Office_Product|Patio_Lawn_and_Garde|Pet_Supplie|Software|Sports_and_Outdoor|Tools_and_Home_Improvement|Toys_and_Game|Video_Game

Automotive|Books|Clothing|Clothing, Shoes & Jewelry|Electronics|Home & Kitchen|Men|Sports & Outdoors|Women


# dataset

In [9]:
from argparse import ArgumentParser
import pytorch_lightning as pl
from transformers.optimization import AdamW

from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoTokenizer
import torch

def mk_tensors(txt, tokenizer, max_seq_length):
    tok_res = tokenizer(
        txt, truncation=True, padding='max_length', max_length=max_seq_length
    )
    input_ids = tok_res["input_ids"]
    attention_mask = tok_res["attention_mask"]
    input_ids = torch.tensor(input_ids)
    attention_mask = torch.tensor(attention_mask)
    return input_ids, attention_mask

def mk_ds(txt, tokenizer, max_seq_length, ys):
    input_ids, attention_mask = mk_tensors(txt, tokenizer, max_seq_length)
    return TensorDataset(input_ids, 
                         attention_mask, 
                         torch.tensor(ys)) 

class PCDataModule(pl.LightningDataModule):
    def __init__(self, 
                 model_name_or_path, 
                 max_seq_length, 
                 min_products_for_category,
                 train_batch_size,
                 val_batch_size,
                 dataloader_num_workers,
                 data_file_path=None,
                 dataframe=None):
        super().__init__()
        self.data_file_path = data_file_path
        self.dataframe = dataframe
        self.min_products_for_category = min_products_for_category
        self.model_name_or_path = model_name_or_path
        self.max_seq_length = max_seq_length
        self.train_batch_size = train_batch_size
        self.val_batch_size = val_batch_size
        self.dataloader_num_workers = dataloader_num_workers
        self.num_classes = None
      
    def prepare_data(self):
        #prepare_data is called from a single process (e.g. GPU 0). Do not use it to assign state (self.x = y).
        _ = AutoTokenizer.from_pretrained(self.model_name_or_path)

    def setup(self, stage=None):
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
        if self.dataframe is None:
            self.dataframe = pd.read_csv(self.data_file_path)
        
        self.dataframe = self.dataframe[self.dataframe.txt.notna()].copy()
        
        cats = self.dataframe.category.apply(lambda x: x.split('|'))
        cat2cnt = Counter((j for i in cats for j in i))
        i2cat = sorted(k for k,v in cat2cnt.items() if v>self.min_products_for_category)
        cat2i = {v:k for k,v in enumerate(i2cat)}
        self.num_classes = len(i2cat)
        self.i2cat, self.cat2i = i2cat, cat2i
        
        ys = np.zeros((len(self.dataframe), len(i2cat)))
        for i,cats in enumerate(self.dataframe.category):
            idx_pos = [cat2i[cat] for cat in cats.split('|') if cat in cat2i]
            ys[i,idx_pos] = 1
        
        msk_val = self.dataframe.is_validation==1
        self.df_trn = self.dataframe[~msk_val]
        self.df_val = self.dataframe[msk_val]
        idx_trn = np.where(~msk_val)[0]
        idx_val = np.where(msk_val)[0]
        self.ys_trn, self.ys_val = ys[idx_trn], ys[idx_val]
        
        txt = self.dataframe.txt.values
        self.train_dataset = mk_ds(list(self.df_trn.txt), self.tokenizer, self.max_seq_length, self.ys_trn)
        self.eval_dataset  = mk_ds(list(self.df_val.txt), self.tokenizer, self.max_seq_length, self.ys_val)
        
    
    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.train_batch_size,
            num_workers=self.dataloader_num_workers,
        )

    def val_dataloader(self):
        return DataLoader(
            self.eval_dataset,
            batch_size=self.val_batch_size,
            num_workers=self.dataloader_num_workers,
        )

# model

In [10]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel


def getaccu(logits, ys):
    return ((logits>0.).int() == ys).float().mean()

class PCModel(pl.LightningModule):
    def __init__(self, model_name_or_path, num_classes, learning_rate, adam_beta1, adam_beta2, adam_epsilon):
        super().__init__()
        self.save_hyperparameters()
        self.model_name_or_path = model_name_or_path
        self.bert = AutoModel.from_pretrained(self.model_name_or_path)
        self.num_classes = num_classes
        self.W = nn.Linear(self.bert.config.hidden_size, self.num_classes)

    def prepare_data(self):
        #prepare_data is called from a single process (e.g. GPU 0). Do not use it to assign state (self.x = y).
        _ = AutoModel.from_pretrained(self.model_name_or_path)

    def forward(self, input_ids, attention_mask):
        h = self.bert(input_ids, attention_mask)['last_hidden_state']
        h_cls = h[:, 0]
        return self.W(h_cls)
    
    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, ys = batch    
        logits = self(input_ids, attention_mask)
        loss = F.binary_cross_entropy_with_logits(logits, ys)
        accu = getaccu(logits, ys)
        self.log('train_loss', loss, on_epoch=True)
        self.log('train_accu', accu, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, ys = batch    
        logits = self(input_ids, attention_mask)
        loss = F.binary_cross_entropy_with_logits(logits, ys)
        accu = getaccu(logits, ys)
        self.log('valid_loss', loss, on_step=False, sync_dist=True)
        self.log('valid_accu', accu, on_step=False, sync_dist=True)
    
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(),
                          self.hparams.learning_rate,
                          betas=(self.hparams.adam_beta1,
                                 self.hparams.adam_beta2),
                          eps=self.hparams.adam_epsilon,)
        return optimizer    
    
    @staticmethod
    def add_model_specific_args(parent_parser):
        parser = ArgumentParser(parents=[parent_parser], add_help=False)
        parser.add_argument('--learning_rate', type=float, default=5e-5)
        parser.add_argument('--adam_beta1', type=float, default=0.9)
        parser.add_argument('--adam_beta2', type=float, default=0.999)
        parser.add_argument('--adam_epsilon', type=float, default=1e-8)
        return parser

# train

In [11]:
parser = ArgumentParser()

parser.add_argument('--model_name_or_path', type=str,
                    default="distilbert-base-cased")
parser.add_argument('--max_seq_length', type=int, default=512)
parser.add_argument('--min_products_for_category', type=int, default=100)
parser.add_argument('--train_batch_size', type=int, default=32)
parser.add_argument('--val_batch_size', type=int, default=64)
parser.add_argument("--dataloader_num_workers", type=int, default=8)

parser = pl.Trainer.add_argparse_args(parser)
parser = PCModel.add_model_specific_args(parser)

args = parser.parse_args([
    '--default_root_dir', p_out,
])


data_module = PCDataModule(
    model_name_or_path=args.model_name_or_path,
#     data_file_path=f'{HOME}/data/data_sample__{prfx_prp}.csv',
    data_file_path=DATA2USE,
    min_products_for_category=args.min_products_for_category,
    max_seq_length=args.max_seq_length,
    train_batch_size=args.train_batch_size,
    val_batch_size=args.val_batch_size,
    dataloader_num_workers=args.dataloader_num_workers,
)

data_module.prepare_data()

In [12]:
%%time
data_module.setup()

CPU times: user 8.86 s, sys: 450 ms, total: 9.31 s
Wall time: 2.42 s


In [13]:
pcmodel = PCModel(
    model_name_or_path=args.model_name_or_path,
    num_classes= data_module.num_classes,
    learning_rate=args.learning_rate,
    adam_beta1=args.adam_beta1,
    adam_beta2=args.adam_beta2,
    adam_epsilon=args.adam_epsilon,
)
pcmodel.prepare_data()

## run train

In [18]:
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import CSVLogger, TensorBoardLogger

In [29]:
csv_logger = CSVLogger(p_out, name='csv')
tb_logger = TensorBoardLogger(p_out, name='tensorboard')

In [30]:
pl.seed_everything(1234)
trainer = pl.Trainer.from_argparse_args(args, 
#                                         limit_train_batches=10, limit_val_batches=5, 
                                        max_epochs=1,
                                        callbacks=[EarlyStopping(monitor='valid_loss')],
#                                         fast_dev_run=True,
                                        stochastic_weight_avg=True,
                                        log_gpu_memory=True, 
                                        gpus=1,
                                        logger=[tb_logger,csv_logger],
                                       )

Global seed set to 1234
GPU available: True, used: True
TPU available: None, using: 0 TPU cores


In [31]:
trainer.fit(pcmodel, data_module)


  | Name | Type            | Params
-----------------------------------------
0 | bert | DistilBertModel | 65.2 M
1 | W    | Linear          | 37.7 K
-----------------------------------------
65.2 M    Trainable params
0         Non-trainable params
65.2 M    Total params
260.914   Total estimated model params size (MB)


HBox(children=(HTML(value='Validation sanity check'), FloatProgress(value=1.0, bar_style='info', layout=Layout…

HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…




1

## tensorboard

In [35]:
ls $p_out/tensorboard/version_0

events.out.tfevents.1615082428.ip-10-0-3-91.31684.1  hparams.yaml


## load model

In [49]:
!find $p_out/ | grep *ckpt

/data/git/product-category/data/transformer_20210306D1/
/data/git/product-category/data/transformer_20210306D1/default
/data/git/product-category/data/transformer_20210306D1/default/version_1
/data/git/product-category/data/transformer_20210306D1/default/version_1/metrics.csv
/data/git/product-category/data/transformer_20210306D1/default/version_1/hparams.yaml
/data/git/product-category/data/transformer_20210306D1/default/version_0
/data/git/product-category/data/transformer_20210306D1/default/version_0/hparams.yaml
/data/git/product-category/data/transformer_20210306D1/default/version_0/events.out.tfevents.1615082268.ip-10-0-3-91.31684.0
/data/git/product-category/data/transformer_20210306D1/tensorboard_csv
/data/git/product-category/data/transformer_20210306D1/tensorboard_csv/0_0
/data/git/product-category/data/transformer_20210306D1/tensorboard_csv/0_0/checkpoints
/data/git/product-category/data/transformer_20210306D1/tensorboard_csv/0_0/checkpoints/epoch=0-step=249.ckpt

# eval

## csv results

In [40]:
ls $p_out/csv/version_0

hparams.yaml  metrics.csv


In [43]:
dfmtr = pd.read_csv(f"{p_out}/csv/version_0/metrics.csv")

In [44]:
dfmtr

Unnamed: 0,train_loss_step,train_accu_step,gpu_id: 0/memory.used (MB),epoch,step,train_loss_epoch,train_accu_epoch,valid_loss,valid_accu
0,0.188815,0.955995,16018.0,0,49,,,,
1,0.147974,0.963648,16018.0,0,99,,,,
2,0.137017,0.959184,16018.0,0,149,,,,
3,0.102367,0.971301,16018.0,0,199,,,,
4,0.124449,0.97449,16018.0,0,249,,,,
5,,,16018.0,0,249,0.157963,0.959145,,
6,,,4784.0,0,249,,,0.099894,0.970332


## run model

In [38]:
ls $p_out/lightning_logs/version_1/

[0m[01;34mcheckpoints[0m/  events.out.tfevents.1615059909.ip-10-0-3-91.12209.0  hparams.yaml


## demo

In [31]:
def do_demo(df):
    i2cat = data_module.i2cat
    tokenizer = data_module.tokenizer
    max_seq_length = data_module.max_seq_length
    row = df.sample()
    display(row)
    txt = list(row.txt.values)
    input_ids, attention_mask = mk_tensors(txt, tokenizer, max_seq_length)
    logits = pcmodel(input_ids, attention_mask)[0]
    print('Truth:', sorted(o for o in row.category.values[0].split('|') if o in i2cat))

    top_icats = np.argsort(-logits.detach().numpy())[:5]
    print('Top Preds:',[i2cat[i] for i in top_icats])
    print('Preds 0.5+:',[i2cat[i] for i in np.where(logits>0)[0]])


In [32]:
do_demo(data_module.df_val)

Unnamed: 0,category,description,title,brand,feature,asin,domain,txt,is_validation
68562,Books|Children's Books|Animals,,Happy: The Cavalier King Charles Spaniel,Kathleen Mary Clancy,,1946300071,Book,Happy: The Cavalier King Charles Spaniel Kathl...,1.0


Truth: ['Animals', 'Books', "Children's Books"]
Top Preds: ['Books', 'Literature &amp; Fiction', 'Literature & Fiction', 'Contemporary', 'Action &amp; Adventure']
Preds 0.5+: ['Books']


## save to script

# fin

In [19]:
!nvidia-smi

Sat Mar  6 22:13:38 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.100      Driver Version: 440.100      CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:1E.0 Off |                    0 |
| N/A   62C    P0    60W / 300W |   1972MiB / 16160MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    