- https://colab.research.google.com/drive/1F_RNcHzTfFuQf-LeKvSlud6x7jXYkG31#scrollTo=goRmGIRI5cfC

# setup

In [1]:
# Root of the local project checkout; the data paths below are built from it.
HOME = "/data/git/product-category"
# Name prefix of the preprocessing run whose CSV this notebook reads.
# Presumably produced by the notebook at:
# http://localhost:8080/notebooks/git/product-category/notebooks/prep_20210304A1.ipynb
prfx_prp = 'prep_20210304A1'


# Output directory for this experiment; passed to the Trainer as --default_root_dir below.
p_out = f'{HOME}/data/transformer_20210305C1'
# Create the output directory (no error if it already exists).
!mkdir -p {p_out}

# eda 

In [2]:
import pandas as pd
import numpy as np
from collections import Counter

In [3]:
%%time
# Read the sample CSV written under the `prfx_prp` prefix.
# The commented-out line looks like the full-data variant -- TODO confirm.
# df = pd.read_csv(f'../data/data__{prfx_prp}.csv')
df = pd.read_csv(f'{HOME}/data/data_sample__{prfx_prp}.csv')

print(df.shape)
# Display three random rows as a quick look at the columns.
df.sample(3)

(10000, 9)
CPU times: user 137 ms, sys: 20.2 ms, total: 157 ms
Wall time: 156 ms


Unnamed: 0,category,description,title,brand,feature,asin,domain,txt,is_validation
8737,"Clothing, Shoes & Jewelry|Women|Clothing|Dress...",Laced detail dress perfect for any social occa...,Ark & Co.... Women's Lace Fit and Flare Dress,Ark & Co.,"55% Cotton, 45% Nylon\nImported\nDry Clean Onl...",B01FV0NWV2,Clothing_Shoes_and_Jewelry,Ark & Co.... Women's Lace Fit and Flare Dress ...,0
8393,Books|Arts &amp; Photography|History &amp; Cri...,The 1986 racial attack by white teenagers in H...,Incident at Howard Beach: The Case For Murder,Visit Amazon's Charles J. Hynes Page,,0399135006,Book,Incident at Howard Beach: The Case For Murder ...,0
5749,CDs & Vinyl|Classical,WEA 6154482; WEA ITALIANA - Italia; Classica v...,Britten: War Requiem,Benjamin Britten,,B00EAH3EAC,CDs_and_Vinyl,Britten: War Requiem Benjamin Britten WEA 6154...,0


In [4]:
# Build the domain and category vocabularies for a quick look at the label space.
# A category is kept only if it occurs on strictly more than MIN_CNT products.
MIN_CNT = 50

# Domains: one per row, so value_counts() already gives the per-domain frequency.
dmn2cnt = Counter(df.domain.value_counts().to_dict())
i2dmn = sorted(dmn2cnt.keys())
dmn2i = {v:k for k,v in enumerate(i2dmn)}

# Categories: each row holds a '|'-separated path; count every path element.
cat2cnt = Counter((j for i in df.category.apply(lambda x: x.split('|')) for j in i))
# Use the named threshold instead of repeating the literal 50.
i2cat = sorted(k for k,v in cat2cnt.items() if v>MIN_CNT)
cat2i = {v:k for k,v in enumerate(i2cat)}

print("len(i2dmn), len(i2cat)", len(i2dmn), len(i2cat))
print("|".join(i2dmn))
print()
print("|".join(i2cat))

len(i2dmn), len(i2cat) 23 111
Appliance|Arts_Crafts_and_Sewi|Automotive|Book|CDs_and_Vinyl|Cell_Phones_and_Accessorie|Clothing_Shoes_and_Jewelry|Electronic|Grocery_and_Gourmet_Food|Home_and_Kitche|Industrial_and_Scientific|Kindle_Store|Magazine_Subscripti|Movies_and_TV|Musical_Instrument|Office_Product|Patio_Lawn_and_Garde|Pet_Supplie|Software|Sports_and_Outdoor|Tools_and_Home_Improvement|Toys_and_Game|Video_Game

100% Cotton|100% Leather|100% Polyester|Accessories|Accessories & Supplies|Action Figures & Statues|Americas|Arts &amp; Photography|Arts, Crafts & Sewing|Athletic|Audio & Video Accessories|Automotive|Baby|Baby Girls|Bags, Cases & Sleeves|Basic Cases|Bedding|Biographies &amp; Memoirs|Body & Trim|Books|Boots|Boys|Bracelets|Bumper Stickers, Decals & Magnets|CDs & Vinyl|Camera & Photo|Cases, Holsters & Sleeves|Casual|Cell Phones & Accessories|Children's Books|Christian Books &amp; Bibles|Clothing|Clothing, Shoes & Jewelry|Computer Accessories & Peripherals|Computers & Accessories

# dataset

In [5]:
from argparse import ArgumentParser
import pytorch_lightning as pl
from transformers.optimization import AdamW

from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoTokenizer
import torch

def mk_tensors(txt, tokenizer, max_seq_length):
    """Tokenize `txt` and return `(input_ids, attention_mask)` as torch tensors.

    Args:
        txt: the text(s) handed to `tokenizer` unchanged.
        tokenizer: callable accepting `truncation`, `padding` and `max_length`
            keyword arguments and returning a mapping with "input_ids" and
            "attention_mask" entries.
        max_seq_length: forwarded as `max_length`; together with
            `padding='max_length'` and `truncation=True` it is the length the
            tokenizer is asked to pad/truncate every sequence to.

    Returns:
        A 2-tuple of tensors built from the two tokenizer outputs.
    """
    encoded = tokenizer(
        txt,
        truncation=True,
        padding='max_length',
        max_length=max_seq_length,
    )
    return tuple(
        torch.tensor(encoded[field]) for field in ("input_ids", "attention_mask")
    )

def mk_ds(txt, tokenizer, max_seq_length, ys):
    """Bundle tokenized text and labels into a `TensorDataset`.

    The dataset yields `(input_ids, attention_mask, labels)` triples, where the
    first two come from `mk_tensors` and `labels` is `torch.tensor(ys)`.
    """
    features = mk_tensors(txt, tokenizer, max_seq_length)
    labels = torch.tensor(ys)
    return TensorDataset(*features, labels)

class PCDataModule(pl.LightningDataModule):
    """Data module that turns the product table into multi-label training tensors.

    The source table (a DataFrame, or a CSV read from `data_file_path`) must
    provide the columns `category` ('|'-separated category path), `txt`
    (text fed to the tokenizer) and `is_validation` (1 marks validation rows).

    Args:
        model_name_or_path: identifier passed to `AutoTokenizer.from_pretrained`.
        max_seq_length: length every text is padded/truncated to.
        min_products_for_category: a category becomes a label only if it occurs
            on strictly more than this many rows.
        train_batch_size: batch size of the training loader.
        val_batch_size: batch size of the validation loader.
        data_file_path: CSV to read when `dataframe` is not given.
        dataframe: pre-loaded table; takes precedence over `data_file_path`.

    After `setup()` the module exposes `num_classes`, `i2cat`, `cat2i`,
    `df_trn`/`df_val`, `ys_trn`/`ys_val`, `train_dataset` and `eval_dataset`.
    """

    def __init__(self,
                 model_name_or_path,
                 max_seq_length,
                 min_products_for_category,
                 train_batch_size,
                 val_batch_size,
                 data_file_path=None,
                 dataframe=None):
        super().__init__()
        self.data_file_path = data_file_path
        self.dataframe = dataframe
        self.min_products_for_category = min_products_for_category
        self.model_name_or_path = model_name_or_path
        self.max_seq_length = max_seq_length
        self.train_batch_size = train_batch_size
        self.val_batch_size = val_batch_size
        # Filled in by setup(), once the label vocabulary is known.
        self.num_classes = None

    def prepare_data(self):
        #prepare_data is called from a single process (e.g. GPU 0). Do not use it to assign state (self.x = y).
        # The tokenizer is instantiated and discarded, presumably to warm the local cache.
        _ = AutoTokenizer.from_pretrained(self.model_name_or_path)

    def setup(self, stage=None):
        """Load the table, build the label vocabulary/matrix and the two datasets.

        Raises:
            ValueError: if neither `dataframe` nor `data_file_path` was supplied.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
        if self.dataframe is None:
            if self.data_file_path is None:
                raise ValueError(
                    "PCDataModule needs either `dataframe` or `data_file_path`."
                )
            self.dataframe = pd.read_csv(self.data_file_path)

        # Split every category path once and reuse the result for both the
        # vocabulary counts and the label matrix.
        cats_per_row = [cat_path.split('|') for cat_path in self.dataframe.category]

        cat2cnt = Counter(cat for row_cats in cats_per_row for cat in row_cats)
        i2cat = sorted(
            cat for cat, cnt in cat2cnt.items()
            if cnt > self.min_products_for_category
        )
        cat2i = {cat: idx for idx, cat in enumerate(i2cat)}
        self.num_classes = len(i2cat)
        self.i2cat, self.cat2i = i2cat, cat2i

        # Multi-hot label matrix. float32 matches the dtype of the model's
        # logits, which is what the BCE-with-logits loss is computed against.
        ys = np.zeros((len(self.dataframe), len(i2cat)), dtype=np.float32)
        # Iterate over the *split* category lists. Iterating the raw string
        # would test single characters against the vocabulary and leave the
        # labels (almost) entirely zero.
        for row_idx, row_cats in enumerate(cats_per_row):
            idx_pos = [cat2i[cat] for cat in row_cats if cat in cat2i]
            ys[row_idx, idx_pos] = 1

        # Positional boolean mask, so it indexes the DataFrame and `ys` alike
        # regardless of the DataFrame's index labels.
        msk_val = (self.dataframe.is_validation == 1).to_numpy()
        self.df_trn = self.dataframe[~msk_val]
        self.df_val = self.dataframe[msk_val]
        self.ys_trn, self.ys_val = ys[~msk_val], ys[msk_val]

        self.train_dataset = mk_ds(list(self.df_trn.txt), self.tokenizer, self.max_seq_length, self.ys_trn)
        self.eval_dataset  = mk_ds(list(self.df_val.txt), self.tokenizer, self.max_seq_length, self.ys_val)

    def train_dataloader(self):
        """Training loader; shuffled so batches are not tied to the file order."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.train_batch_size,
            shuffle=True,
        )

    def val_dataloader(self):
        """Validation loader; kept in file order."""
        return DataLoader(
            self.eval_dataset,
            batch_size=self.val_batch_size,
        )

# model

In [13]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel


def getaccu(logits, ys):
    """Fraction of individual label slots predicted correctly.

    A slot counts as predicted positive when `sigmoid(logit) > 0.5`; the
    result is the mean over every element of the comparison with `ys`.
    """
    probs = torch.sigmoid(logits)
    preds = (probs > 0.5).int()
    hits = preds == ys
    return hits.float().mean()

class PCModel(pl.LightningModule):
    """Multi-label product-category classifier: pretrained encoder + linear head.

    Args:
        model_name_or_path: identifier passed to `AutoModel.from_pretrained`.
        num_classes: number of output labels (width of the linear head).
        learning_rate, adam_beta1, adam_beta2, adam_epsilon: AdamW settings,
            stored through `save_hyperparameters()` and read back from
            `self.hparams` in `configure_optimizers`.
    """

    def __init__(self, model_name_or_path, num_classes, learning_rate, adam_beta1, adam_beta2, adam_epsilon):
        super().__init__()
        self.save_hyperparameters()
        self.model_name_or_path = model_name_or_path
        self.bert = AutoModel.from_pretrained(self.model_name_or_path)
        self.num_classes = num_classes
        self.W = nn.Linear(self.bert.config.hidden_size, self.num_classes)

    def prepare_data(self):
        #prepare_data is called from a single process (e.g. GPU 0). Do not use it to assign state (self.x = y).
        # The model is instantiated and discarded, presumably to warm the local cache.
        _ = AutoModel.from_pretrained(self.model_name_or_path)

    def forward(self, input_ids, attention_mask):
        """Return one logit per class for every sequence in the batch."""
        h = self.bert(input_ids, attention_mask)['last_hidden_state']
        # Use the hidden state at position 0 as the sequence representation.
        h_cls = h[:, 0]
        return self.W(h_cls)

    def _loss_and_accu(self, batch):
        """Shared forward pass: returns (BCE-with-logits loss, element-wise accuracy)."""
        input_ids, attention_mask, ys = batch
        logits = self(input_ids, attention_mask)
        loss = F.binary_cross_entropy_with_logits(logits, ys)
        return loss, getaccu(logits, ys)

    def training_step(self, batch, batch_idx):
        loss, accu = self._loss_and_accu(batch)
        # One log call per metric. Passing the second name/value positionally
        # binds them to `prog_bar`/`logger` instead of logging them.
        # prog_bar=True keeps the progress-bar display the original call
        # effectively enabled.
        self.log('train_loss', loss, prog_bar=True, on_epoch=True)
        self.log('train_accu', accu, prog_bar=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, accu = self._loss_and_accu(batch)
        self.log('valid_loss', loss, prog_bar=True, on_step=True, sync_dist=True)
        # Logged under its own name so it cannot be confused with the
        # training accuracy.
        self.log('valid_accu', accu, prog_bar=True, on_step=True, sync_dist=True)
        return {'val_loss': loss}

    def configure_optimizers(self):
        """AdamW over all parameters, configured from the saved hyperparameters."""
        optimizer = AdamW(self.parameters(),
                          self.hparams.learning_rate,
                          betas=(self.hparams.adam_beta1,
                                 self.hparams.adam_beta2),
                          eps=self.hparams.adam_epsilon,)
        return optimizer

    @staticmethod
    def add_model_specific_args(parent_parser):
        """Return a parser extending `parent_parser` with the optimizer flags."""
        parser = ArgumentParser(parents=[parent_parser], add_help=False)
        parser.add_argument('--learning_rate', type=float, default=5e-5)
        parser.add_argument('--adam_beta1', type=float, default=0.9)
        parser.add_argument('--adam_beta2', type=float, default=0.999)
        parser.add_argument('--adam_epsilon', type=float, default=1e-8)
        return parser

# train

In [14]:
# Collect every run setting in one argparse namespace: data-module options here,
# then the Trainer's own flags and the model's optimizer flags.
parser = ArgumentParser()

parser.add_argument('--model_name_or_path', type=str,
                    default="distilbert-base-cased")
parser.add_argument('--max_seq_length', type=int, default=32)
# A category becomes a label only if it appears on more than this many rows (see PCDataModule.setup).
parser.add_argument('--min_products_for_category', type=int, default=100)
parser.add_argument('--train_batch_size', type=int, default=16)
parser.add_argument('--val_batch_size', type=int, default=8)

# Adds the Trainer flags (e.g. --default_root_dir used below) and the four optimizer flags.
parser = pl.Trainer.add_argparse_args(parser)
parser = PCModel.add_model_specific_args(parser)

# Parse an explicit list rather than sys.argv; everything not listed keeps its default.
args = parser.parse_args([
    '--default_root_dir', p_out,
])


# Build the data module on the same sample CSV that the EDA section loads.
data_module = PCDataModule(
    model_name_or_path=args.model_name_or_path,
    data_file_path=f'{HOME}/data/data_sample__{prfx_prp}.csv',
    min_products_for_category=args.min_products_for_category,
    max_seq_length=args.max_seq_length,
    train_batch_size=args.train_batch_size,
    val_batch_size=args.val_batch_size,
)

# Instantiates the tokenizer once up front; no state is assigned by this call.
data_module.prepare_data()

In [15]:
%%time
# Reads the CSV, builds the label vocabulary/matrix and tokenizes both splits.
data_module.setup()

CPU times: user 9.39 s, sys: 107 ms, total: 9.5 s
Wall time: 1.81 s


In [16]:
# Instantiate the classifier: the label count comes from the data module
# (available after setup()), the optimizer settings from the parsed arguments.
pcmodel = PCModel(
    args.model_name_or_path,
    data_module.num_classes,
    **{
        hparam: getattr(args, hparam)
        for hparam in ('learning_rate', 'adam_beta1', 'adam_beta2', 'adam_epsilon')
    },
)
pcmodel.prepare_data()

In [17]:
# Smoke test: run a single training batch through the not-yet-trained model
# and report the element-wise accuracy using the same helper as the
# training/validation steps.
dl = data_module.train_dataloader()
dat = next(iter(dl))  # first batch only
input_ids, attention_mask, ys = dat

logits = pcmodel(input_ids, attention_mask)

accu = getaccu(logits, ys)
accu

tensor(0.3833)

In [21]:
# Re-display the accuracy computed in the smoke-test cell.
# NOTE(review): this cell's execution count (21) is later than the cells that follow it.
accu

tensor(0.3833)

## run train

In [18]:
# Seed the random number generators before training.
# NOTE(review): `pcmodel` was already constructed in an earlier cell, so the
# initialisation of its linear head is not covered by this seed.
pl.seed_everything(1234)
# Alternative Trainer configurations; only the fast_dev_run + single-GPU line is active.
# fast_dev_run=True runs one batch through the loops (see the log line printed below).
# trainer = pl.Trainer.from_argparse_args(args)
# trainer = pl.Trainer.from_argparse_args(args, fast_dev_run=True)
trainer = pl.Trainer.from_argparse_args(args, fast_dev_run=True, gpus=1)
# trainer = pl.Trainer.from_argparse_args(args, limit_train_batches=10, limit_val_batches=5, gpus=1)
# trainer = pl.Trainer.from_argparse_args(args, gpus=1)

Global seed set to 1234
GPU available: True, used: True
TPU available: None, using: 0 TPU cores
Running in fast_dev_run mode: will run a full train, val and test loop using 1 batch(es).


In [19]:
# Run the training/validation loops with the trainer configured above.
trainer.fit(pcmodel, data_module)


  | Name | Type            | Params
-----------------------------------------
0 | bert | DistilBertModel | 65.2 M
1 | W    | Linear          | 40.8 K
-----------------------------------------
65.2 M    Trainable params
0         Non-trainable params
65.2 M    Total params
260.927   Total estimated model params size (MB)


HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

  value = torch.tensor(value, device=device, dtype=torch.float)





1

## tensorboard

In [20]:
# List the logged runs under the output directory.
# NOTE(review): the captured output shows the directory was missing after the
# fast_dev_run session -- presumably that mode writes no logs; confirm.
ls {p_out}/lightning_logs/

ls: cannot access '/data/git/product-category/data/transformer_20210305C1/lightning_logs/': No such file or directory


In [None]:
# Start TensorBoard on the log directory; this call blocks the cell until it is interrupted.
!tensorboard --logdir {p_out}/lightning_logs/

# fin

In [None]:
# Show current GPU utilisation and memory use.
!nvidia-smi