# setup

In [1]:
# http://localhost:8080/notebooks/git/product-category/notebooks/prep_20210302B1.ipynb
# Prefix of the prep run; it is interpolated into the name of the data file loaded below.
prfx_prp = 'prep_20210302B1'

In [2]:
import ast
from collections import Counter

import numpy as np
import pandas as pd
import torch

In [3]:
# Count threshold (presumably the minimum number of products per category - TODO confirm).
MIN_CNT = 50

# prep

In [4]:
%%time
# Alternative input (disabled) - presumably the full, non-sampled dataset:
# df = pd.read_csv(f'../data/data__{prfx_prp}.csv')
# Load the sample CSV of this prep run; the path is relative to the working directory.
df = pd.read_csv(f'../data/data_sample__{prfx_prp}.csv')

CPU times: user 69.8 ms, sys: 26.2 ms, total: 96 ms
Wall time: 125 ms


In [5]:
%%time
def try2eval(x):
    """Parse the string form of a Python literal, e.g. ``"['a', 'b']"``.

    ``ast.literal_eval`` is used instead of ``eval`` so that text coming from
    the data file can only ever produce plain literals (lists, strings,
    numbers, ...) and can never execute code.

    Args:
        x: Text to parse.

    Returns:
        The parsed literal, or ``[]`` when ``x`` is empty or is not a valid
        Python literal.
    """
    try:
        return ast.literal_eval(x)
    except (SyntaxError, ValueError, TypeError):
        # SyntaxError: empty or garbled text.
        # ValueError / TypeError: parses as Python but is not a plain literal
        # (bare names, function calls, unhashable dict keys, ...).
        return []

# Turn every column into text; missing values become empty strings first.
df = df.fillna("").astype(str)
# The category column holds stringified values; parse each one with try2eval.
df['category'] = df['category'].map(try2eval)
print("df.shape:", df.shape)
# One text field per row: title, brand, description and feature joined by single spaces.
df['txt'] = df['title'] + " " + df['brand'] + " " + df['description'] + " " + df['feature']

df.shape: (10000, 7)
CPU times: user 169 ms, sys: 12 ms, total: 181 ms
Wall time: 190 ms


In [6]:
# Domain vocabulary: rows per domain, plus sorted index <-> name lookups.
dmn2cnt = Counter(df.domain.value_counts().to_dict())
i2dmn = sorted(dmn2cnt.keys())
dmn2i = {dmn: idx for idx, dmn in enumerate(i2dmn)}

# Category vocabulary: count every category occurrence over all rows.
cat2cnt = Counter(cat for cats in df.category for cat in cats)
# Keep only categories seen more than MIN_CNT times; the threshold comes from
# the configuration constant rather than a repeated literal.
i2cat = sorted(cat for cat, cnt in cat2cnt.items() if cnt > MIN_CNT)
cat2i = {cat: idx for idx, cat in enumerate(i2cat)}

len(i2dmn), len(i2cat)

(23, 111)

## split train val

In [7]:
# Fixed seed so the random split is the same on every run.
np.random.seed(101)
# One uniform draw per row; rows that draw above 0.85 go to validation.
msk_val = np.random.rand(len(df)) > 0.85
dftrn, dfval = df[~msk_val], df[msk_val]
dftrn.shape, dfval.shape

((8466, 8), (1534, 8))

## make ys

In [8]:
# Multi-hot label matrix: one row per row of df, one column per entry of i2cat.
ys = np.zeros((len(df), len(i2cat)))

for row, cats in enumerate(df.category):
    # Categories that are not in cat2i are skipped.
    cols = [cat2i[c] for c in cats if c in cat2i]
    ys[row, cols] = 1

# Split the labels with the validation mask so they line up with dftrn / dfval.
ys_trn, ys_val = ys[~msk_val], ys[msk_val]
ys_trn.shape, ys_val.shape

((8466, 111), (1534, 111))

# modeling setup

- https://colab.research.google.com/drive/1F_RNcHzTfFuQf-LeKvSlud6x7jXYkG31#scrollTo=goRmGIRI5cfC

In [12]:
from argparse import ArgumentParser
import pytorch_lightning as pl
import transformers as tfm
from transformers.optimization import AdamW

# dataset

In [30]:
from torch.utils.data.dataloader import Dataset, DataLoader
from transformers import AutoTokenizer

In [None]:
class PCDataset(Dataset):
    """Map-style dataset pairing each row's text with its label row.

    Items are returned untokenized as ``(text, labels)``; turning the text
    into model inputs is left to the consumer.

    Args:
        df: DataFrame with a ``txt`` column.
        ys: Array-like of labels with one entry per row of ``df``.

    Raises:
        ValueError: If ``df`` and ``ys`` have different lengths.
    """

    def __init__(self, df, ys):
        if len(df) != len(ys):
            raise ValueError(
                f"df has {len(df)} rows but ys has {len(ys)} entries")
        self.txt = df.txt
        self.ys = ys

    def __len__(self):
        """Number of examples."""
        return len(self.txt)

    def __getitem__(self, i):
        """Return the ``(text, labels)`` pair at position ``i``."""
        # Positional lookup: a frame produced by boolean masking keeps its
        # original index labels, so label-based ``self.txt[i]`` would pick the
        # wrong row or raise KeyError.
        return self.txt.iloc[i], self.ys[i]

In [32]:
class PCDataModule(pl.LightningDataModule):
    """Lightning data module holding the train/validation data and loader settings.

    Train and validation data can be passed as in-memory dataframes or as CSV
    paths that ``setup`` reads. Every attribute that ``setup`` and the
    dataloader methods read is initialised in ``__init__``.

    TODO: building the datasets from the dataframes (tokenisation, labels) is
    not implemented here; supply ``train_dataset`` / ``eval_dataset`` via the
    constructor or by assigning the attributes before the loaders are used.

    Args:
        model_name_or_path: Identifier handed to ``AutoTokenizer.from_pretrained``.
        max_seq_length: Stored on the module (not used by the visible code).
        train_batch_size: Batch size of the training dataloader.
        val_batch_size: Batch size of the validation dataloader.
        min_products_for_category: Stored on the module (not used by the visible code).
        train_dataframe: Optional in-memory training dataframe.
        validation_dataframe: Optional in-memory validation dataframe.
        train_file: Optional CSV path, read when ``train_dataframe`` is None.
        validation_file: Optional CSV path, read when ``validation_dataframe`` is None.
        dataloader_num_workers: ``num_workers`` passed to both dataloaders.
        train_dataset: Optional ready-made dataset for ``train_dataloader``.
        eval_dataset: Optional ready-made dataset for ``val_dataloader``.
    """

    def __init__(self, model_name_or_path,
                 max_seq_length,
                 train_batch_size, val_batch_size,
                 min_products_for_category,
                 train_dataframe=None, validation_dataframe=None,
                 train_file=None, validation_file=None,
                 dataloader_num_workers=0,
                 train_dataset=None, eval_dataset=None):
        super().__init__()
        self.model_name_or_path = model_name_or_path
        self.max_seq_length = max_seq_length
        self.train_batch_size = train_batch_size
        self.val_batch_size = val_batch_size
        self.min_products_for_category = min_products_for_category
        self.train_dataframe = train_dataframe
        self.validation_dataframe = validation_dataframe
        self.train_file = train_file
        self.validation_file = validation_file
        self.dataloader_num_workers = dataloader_num_workers
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.tokenizer = None

    def setup(self, stage=None):
        """Load the tokenizer and read any dataframe that was given as a CSV path."""
        # Kept on the instance so later dataset-building code can reuse it.
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
        if self.train_dataframe is None and self.train_file is not None:
            self.train_dataframe = pd.read_csv(self.train_file)
        if self.validation_dataframe is None and self.validation_file is not None:
            self.validation_dataframe = pd.read_csv(self.validation_file)

    def _make_loader(self, dataset, batch_size, attr_name):
        """Wrap ``dataset`` in a DataLoader, failing clearly when it is missing."""
        if dataset is None:
            raise RuntimeError(
                f"{attr_name} has not been set; assign a dataset to "
                f"self.{attr_name} before requesting its dataloader")
        return DataLoader(
            dataset,
            batch_size=batch_size,
            num_workers=self.dataloader_num_workers,
        )

    def train_dataloader(self):
        """DataLoader over ``self.train_dataset``."""
        return self._make_loader(
            self.train_dataset, self.train_batch_size, "train_dataset")

    def val_dataloader(self):
        """DataLoader over ``self.eval_dataset``."""
        return self._make_loader(
            self.eval_dataset, self.val_batch_size, "eval_dataset")

# model

In [23]:
class PCModel(pl.LightningModule):
    """Transformer sequence classifier trained with a multi-label loss.

    Assumes each batch is a dict of tokenizer outputs (``input_ids``,
    ``attention_mask``, ...) plus a ``"labels"`` entry holding a multi-hot
    matrix with one column per label - TODO confirm against the dataset that
    feeds this model.

    Args:
        model_name_or_path: Pretrained checkpoint name or path.
        num_labels: Width of the classification head; ``None`` keeps the
            checkpoint/library default. It must match the label matrix width.
        learning_rate: AdamW learning rate.
        adam_beta1: AdamW beta1.
        adam_beta2: AdamW beta2.
        adam_epsilon: AdamW epsilon.
    """

    def __init__(self, model_name_or_path, num_labels=None,
                 learning_rate=5e-5, adam_beta1=0.9, adam_beta2=0.999,
                 adam_epsilon=1e-8):
        super().__init__()
        # Stores the constructor arguments on self.hparams, which is where
        # configure_optimizers reads the optimizer settings from.
        self.save_hyperparameters()
        head_kwargs = {} if num_labels is None else {"num_labels": num_labels}
        # A model with a classification head is needed for `.logits` to exist;
        # a bare AutoModel only returns hidden states.
        self.model = tfm.AutoModelForSequenceClassification.from_pretrained(
            model_name_or_path, **head_kwargs)

    def forward(self, x):
        """Return the classification logits for input ids ``x``."""
        return self.model(x).logits

    def _multilabel_loss(self, batch):
        """Binary cross-entropy between the logits and the multi-hot labels."""
        inputs = {k: v for k, v in batch.items() if k != "labels"}
        logits = self.model(**inputs).logits
        # Cast the targets so float64/integer label matrices are accepted.
        targets = batch["labels"].to(dtype=logits.dtype)
        return torch.nn.functional.binary_cross_entropy_with_logits(
            logits, targets)

    def training_step(self, batch, batch_idx):
        """Compute, log and return the training loss for one batch."""
        loss = self._multilabel_loss(batch)
        self.log('train_loss', loss, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss = self._multilabel_loss(batch)
        self.log('valid_loss', loss, on_step=True, sync_dist=True)

    def configure_optimizers(self):
        """Build AdamW from the saved hyperparameters."""
        optimizer = AdamW(self.parameters(),
                          lr=self.hparams.learning_rate,
                          betas=(self.hparams.adam_beta1,
                                 self.hparams.adam_beta2),
                          eps=self.hparams.adam_epsilon,)
        return optimizer

    @staticmethod
    def add_model_specific_args(parent_parser):
        """Return a parser extending ``parent_parser`` with the optimizer options."""
        parser = ArgumentParser(parents=[parent_parser], add_help=False)
        parser.add_argument('--learning_rate', type=float, default=5e-5)
        parser.add_argument('--adam_beta1', type=float, default=0.9)
        parser.add_argument('--adam_beta2', type=float, default=0.999)
        parser.add_argument('--adam_epsilon', type=float, default=1e-8)
        return parser

In [None]:
# Pass every argument the PCDataModule constructor requires, using the
# in-memory train/validation split instead of args.train_file /
# args.validation_file, which the argument parser never defines.
# NOTE: this reads `args`, so the argument-parsing cell must be run first.
data_module = PCDataModule(
    model_name_or_path=args.model_name_or_path,
    max_seq_length=128,       # placeholder value - tune as needed
    train_batch_size=32,      # placeholder value - tune as needed
    val_batch_size=32,        # placeholder value - tune as needed
    min_products_for_category=MIN_CNT,
    train_dataframe=dftrn,
    validation_dataframe=dfval,
)

In [26]:
# Configuration is parsed from an empty argument list, so inside the notebook
# every option takes its default value.
parser = ArgumentParser()
parser.add_argument(
    '--model_name_or_path',
    type=str,
    default="distilbert-base-cased",
)
# The Trainer options are registered first, then the model-specific ones.
parser = PCModel.add_model_specific_args(pl.Trainer.add_argparse_args(parser))
args = parser.parse_args([])

In [27]:
# Instantiate the model from the checkpoint named in the parsed arguments.
pcmodel = PCModel(model_name_or_path=args.model_name_or_path)

In [28]:
# Set Lightning's global seed; the call returns the seed, which is why 1234 is echoed as the cell output.
pl.seed_everything(1234)


Global seed set to 1234


1234

In [29]:
# Build the Trainer from the parsed arguments.
trainer = pl.Trainer.from_argparse_args(args)
# Train pcmodel on the data module. `data_module` must already exist in the
# kernel; the recorded run failed here with NameError because it did not.
trainer.fit(pcmodel, data_module)


GPU available: True, used: False
TPU available: None, using: 0 TPU cores


NameError: name 'data_module' is not defined