In [1]:
import os
import json
import time
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from datasets import Dataset, load_dataset

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.processors import BertProcessing

from transformers import PreTrainedTokenizerFast, LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling, DataCollatorWithPadding
from transformers import BertConfig, BertForMaskedLM, BertForSequenceClassification, BertModel
from transformers import Trainer, TrainingArguments

## Data

In [2]:
data_path = '../../../data/train_sessions.jsonl'
  
train_sessions = pd.DataFrame()
chunks = pd.read_json(data_path, lines=True, chunksize=100_000)

for e, chunk in enumerate(chunks):
    event_dict = {
        'session': [],
        'aid': [],
        'ts': [],
        'type': [],
    }
    if e < 2:
        # train_sessions = pd.concat([train_sessions, chunk])
        for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
            for event in events:
                event_dict['session'].append(session)
                event_dict['aid'].append(event['aid'])
                event_dict['ts'].append(event['ts'])
                event_dict['type'].append(event['type'])
        chunk_session = pd.DataFrame(event_dict)
        train_sessions = pd.concat([train_sessions, chunk_session])
    else:
        break
        
train_sessions = train_sessions.reset_index(drop=True)
train_sessions.head()

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800025,clicks
1,0,1563459,1659304904511,clicks
2,0,1309446,1659367439426,clicks
3,0,16246,1659367719997,clicks
4,0,1781822,1659367871344,clicks


In [15]:
data_path = '../../../data/test_sessions.jsonl'
  
test_sessions = pd.DataFrame()
chunks = pd.read_json(data_path, lines=True, chunksize=100_000)

for e, chunk in enumerate(chunks):
    event_dict = {
        'session': [],
        'aid': [],
        'ts': [],
        'type': [],
    }
    if e < 2:
        # train_sessions = pd.concat([train_sessions, chunk])
        for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
            for event in events:
                event_dict['session'].append(session)
                event_dict['aid'].append(event['aid'])
                event_dict['ts'].append(event['ts'])
                event_dict['type'].append(event['type'])
        chunk_session = pd.DataFrame(event_dict)
        test_sessions = pd.concat([test_sessions, chunk_session])
    else:
        break
        
test_sessions = test_sessions.reset_index(drop=True)
test_sessions.head()

Unnamed: 0,session,aid,ts,type
0,12383433,1542913,1661551200081,clicks
1,12383434,8211,1661551200511,clicks
2,12383435,940546,1661551201055,carts
3,12383435,45443,1661551213043,clicks
4,12383435,1769360,1661551246239,clicks


## Bert (MLM)

* Nearly 2M items, the size is too large for softmax

### Training sequence

In [7]:
aid_seq = train_sessions.sort_values(["session", "ts"]).reset_index(drop=True)
aid_seq["aid_2"] = aid_seq.aid.shift(1)
aid_seq = aid_seq[aid_seq.aid != aid_seq.aid_2]
aid_seq = aid_seq[["session", "aid", "ts", "type"]]
aid_seq["aid"] = aid_seq["aid"].astype(str)
aid_seq = aid_seq.groupby(["session"]).agg(list)["aid"].reset_index()
aid_seq = aid_seq[(aid_seq.aid.apply(len) > 1)].reset_index(drop=True)
aid_seq.head()

Unnamed: 0,session,aid
0,0,"[1517085, 1563459, 1309446, 16246, 1781822, 11..."
1,1,"[424964, 1492293, 910862, 1491172, 424964, 151..."
2,2,"[763743, 137492, 504789, 137492, 795863, 37834..."
3,3,"[1425967, 1343406, 1425967, 1343406, 1815570, ..."
4,4,"[613619, 298827, 383828, 255379, 1838173, 1453..."


In [11]:
with open("../../../data/transformer_training/bert_mlm_seq.txt", 'w') as f:
    for aid_list in aid_seq.aid:
        print(" ".join(aid_list), file = f)

### Tokenizer

In [106]:
VOCAB_SIZE = 10_000_000

tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = WhitespaceSplit()
trainer = WordLevelTrainer(
    min_frequency = 0,
    vocab_size = VOCAB_SIZE,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
)
tokenizer.train(files=["../../../data/transformer_training/bert_mlm_seq.txt"], trainer=trainer)
tokenizer.post_processor = BertProcessing(
    ("[SEP]", 2),
    ("[CLS]", 1)
)
tokenizer = PreTrainedTokenizerFast(
    unk_token = "[UNK]",
    cls_token = "[CLS]",
    sep_token = "[SEP]",
    pad_token = "[PAD]",
    mask_token = "[MASK]",
    tokenizer_object=tokenizer
)

### Dataset

In [10]:
dataset = LineByLineTextDataset(
    tokenizer = tokenizer,
    file_path = ".../../../data/transformer_training/bert_mlm_seq.txt",
    block_size = 512  # maximum sequence length
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True , mlm_probability=0.15
)

Creating features from dataset file at ../data/bert_seq.txt


### Model

In [11]:
config= BertConfig(
    vocab_size = VOCAB_SIZE,
    hidden_size = 64,
    intermediate_size = 128,
    num_hidden_layers = 4, 
    num_attention_heads = 4,
    max_position_embeddings = 512
)
model = BertForMaskedLM(config)

### Training

In [13]:
training_args = TrainingArguments(
    output_dir = '../src/models',
    overwrite_output_dir = True,
    num_train_epochs = 1,
    per_device_train_batch_size = 64,
    save_steps = 10_000,
    save_total_limit = 2,
)

trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = dataset
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
%%time
trainer.train()

***** Running training *****
  Num examples = 191597
  Num Epochs = 1
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 2994
  Number of trainable parameters = 65171200
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


## Bert (binary classification)

### Training sequence

In [3]:
corpus = train_sessions.aid.unique()

tmp = train_sessions.sort_values(["session", "ts"]).reset_index(drop=True)
tmp = tmp.groupby(["session"]).agg(list)[["aid", "ts", "type"]].reset_index()

In [4]:
with open("../../../data/transformer_training/bert_binary_seq.txt", 'w') as f:
    print("label,text", file=f)
    for idx, (session, aids, ts, tp) in tqdm(tmp.iterrows()):
        clicked = []
        for i, aid in enumerate(aids): 
            if i >= 5 and ts[i] - ts[i-1] < 86400:
                neg = str(np.random.choice(corpus))
                print("0," + " ".join([str(x) for x in aids[:i]]) + " " + neg, file = f)
                print("1," + " ".join([str(x) for x in aids[:i+1]]), file=f)
            clicked.append(aid)

0it [00:00, ?it/s]

### Dataset

In [5]:
train_seq = pd.read_table("../../../data/transformer_training/bert_binary_seq.txt", delimiter=",")
train_data = Dataset.from_pandas(train_seq)

### Tokenizer

In [7]:
VOCAB_SIZE = 1_000_000

tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = WhitespaceSplit()
trainer = WordLevelTrainer(
    min_frequency = 0,
    vocab_size = VOCAB_SIZE,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
)
tokenizer.train(files=["../../../data/transformer_training/bert_mlm_seq.txt"], trainer=trainer)
tokenizer.post_processor = BertProcessing(
    ("[SEP]", 2),
    ("[CLS]", 1)
)
tokenizer = PreTrainedTokenizerFast(
    unk_token = "[UNK]",
    cls_token = "[CLS]",
    sep_token = "[SEP]",
    pad_token = "[PAD]",
    mask_token = "[MASK]",
    tokenizer_object=tokenizer
)

In [8]:
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer
)

train_tokenized = train_data.map(lambda x: tokenizer(x["text"]), batched=True)

  0%|          | 0/11220 [00:00<?, ?ba/s]

### model

In [9]:
config= BertConfig(
    vocab_size = VOCAB_SIZE,
    hidden_size = 64,
    intermediate_size = 128,
    num_hidden_layers = 4, 
    num_attention_heads = 4,
    max_position_embeddings = 512
)
model = BertForSequenceClassification(config)

### Training

In [10]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [27]:
training_args = TrainingArguments(
    output_dir = "'../../../src/models'",
    learning_rate=3e-4,
    per_device_train_batch_size=1024,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [28]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [29]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 11219554
  Num Epochs = 3
  Instantaneous batch size per device = 1024
  Total train batch size (w. parallel, distributed & accumulation) = 1024
  Gradient Accumulation steps = 1
  Total optimization steps = 32871
  Number of trainable parameters = 64171202


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 