In [1]:
%load_ext autoreload
%autoreload 1
%aimport my, data, net

import sys
import numpy as np
import pandas as pd

import os
import gc
import matplotlib.pyplot as plt
import importlib
import pickle

import my, data, net
from my import p
from tqdm.notebook import tqdm

# Notebook-wide pandas display settings.
# Floats are rendered through the "{:.3f}" formatter; the remaining options
# only control how much of a frame / cell is shown.
pd.set_option("display.float_format", "{:.3f}".format)
pd.set_option("display.precision", 1)
pd.set_option("max_colwidth", 45)
pd.set_option('display.max_rows', 200)
# Handy while debugging (kept disabled):
# pd.set_option("display.max_rows", 5)
# pd.reset_option("display.max_rows")

from sklearn.model_selection import train_test_split

# from pandarallel import pandarallel
# pandarallel.initialize(progress_bar=True)
     
# Run configuration: base random seed, CPU count and I/O folders.
SEED = 34
N_CPU = os.cpu_count()  # value reported by the OS; os.cpu_count() may return None
dir_out, dir_data = "out/", 'data/'

# Seed NumPy's global (legacy) generator so NumPy-based randomness is repeatable.
np.random.seed(seed=SEED)

In [None]:
import torch
import pytorch_lightning as pl

In [2]:
# Per-user target table; the preview below shows the target columns together
# with the fold assignment that the cross-validation loop relies on.
target_info = pd.read_parquet('out/info_targets.pq')
target_info.head(3)

Unnamed: 0,target_age,target_is_male,user_id,old_age,user_len,tel_price,user_len_bin,user_bin,fold,len_bucket
0,1,1,350459,31,178,5,5,50511,3,4
1,1,1,188276,35,111,7,5,50711,0,4
2,2,0,99002,41,639,6,7,70620,3,7


Embedding dimensions (category counts and suggested embedding sizes)

In [6]:
# Encoders fitted upstream, keyed by column name.
# NOTE(review): my.load_pickle presumably unpickles the file — only load
# artefacts produced by this project, and with matching library versions.
encoders = my.load_pickle('out/encs.pik')

# Cardinality per categorical feature: three hand-specified counts first,
# then one entry per encoder (size of its `classes_`). Each count is paired
# with the embedding width suggested by my.emb_sz_rule.
n_cats = {
    feature: {'number': n_levels, 'emb_dim': my.emb_sz_rule(n_levels)}
    for feature, n_levels in {
        'price': 12,
        'part_of_day': 4,
        'dayofweek': 7,
        **{col: len(enc.classes_) for col, enc in encoders.items()},
    }.items()
}
n_cats

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


{'price': {'number': 12, 'emb_dim': 6},
 'part_of_day': {'number': 4, 'emb_dim': 3},
 'dayofweek': {'number': 7, 'emb_dim': 5},
 'region_name': {'number': 81, 'emb_dim': 19},
 'city_name': {'number': 985, 'emb_dim': 76},
 'cpe_manufacturer_name': {'number': 37, 'emb_dim': 12},
 'cpe_model_name': {'number': 599, 'emb_dim': 57},
 'url_host': {'number': 199683, 'emb_dim': 512},
 'cpe_type_cd': {'number': 4, 'emb_dim': 3},
 'cpe_model_os_type': {'number': 3, 'emb_dim': 3}}

In [8]:
# Pre-built records shared by training and validation; the fold loop further
# down feeds them to the training dataset and carves validation subsets out of them.
# NOTE(review): the record layout comes from upstream preprocessing — not visible here.
all_plts = my.load_pickle('out/train_val_plts.pik')
len(all_plts)  # echoed as the cell output: number of loaded records

270000

TEST DATA

In [None]:
# The test pickle is a mapping holding the user ids ('test_ids') and the
# matching records ('test_plts'); unpack both without reusing one name for
# two different objects.
_test_bundle = my.load_pickle('out/test_plts.pik')
test_ids = _test_bundle['test_ids']
test_plts = _test_bundle['test_plts']
del _test_bundle  # leave only test_ids / test_plts in the namespace
len(test_ids)

In [None]:
# Store each test user's id in the record's 'y' slot (records and ids are
# paired by position). The test set has no label; 'y' is the column the
# dataset is later told to read as its target.
# NOTE(review): presumably a placeholder so the dataset can be built — confirm.
for i, user_id in enumerate(test_ids):
    test_plts[i]['y'] = user_id

In [76]:
# Test-set dataset and loader. Order is preserved (shuffle=False) so that
# predictions line up with `test_ids`.
test_ds = my.SeqToTargetDataset(data=test_plts, target_col_name='y')

test_dl = torch.utils.data.DataLoader(
    test_ds,
    batch_size=128,
    shuffle=False,
    num_workers=1,
    collate_fn=test_ds.collate_fn,  # batching logic supplied by the dataset itself
)

TRAIN

In [77]:
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder, Head
from torchmetrics.classification import MulticlassF1Score

from pytorch_lightning.callbacks import EarlyStopping, TQDMProgressBar, ModelCheckpoint

# Feature layout consumed by the encoders built in the training cell.
# Each categorical feature is listed as (name, number of categories, embedding
# width); the embedding table gets one extra row (`n + 1`).
# NOTE(review): the extra row is presumably reserved for padding/unknown — confirm.
feat_config = {
    # Categorical features that vary along the sequence.
    'time_depend_cats': {
        name: {'in': n_levels + 1, 'out': width}
        for name, n_levels, width in (
            ('url_host', 199683, 96),
            ('region_name', 81, 16),
            ('city_name', 985, 32),
            ('dayofweek', 7, 5),
            ('part_of_day', 4, 3),
        )
    },
    # Numeric sequence features, passed through with the 'identity' setting.
    'time_depend_float': dict.fromkeys(('request_cnt', 'diff_time'), 'identity'),
    # Per-user (constant) categorical features.
    'const_cats': {
        name: {'in': n_levels + 1, 'emb_dim': width}
        for name, n_levels, width in (
            ('price', 12, 5),
            ('cpe_manufacturer_name', 37, 12),
            ('cpe_model_name', 599, 24),
            ('cpe_type_cd', 4, 3),
            ('cpe_model_os_type', 3, 3),
        )
    },
}

# Name of the predicted target (used in artefact file names) and the folder
# that receives the per-fold checkpoints.
TARGET = 'age'
CHECKPOINT_DIR = 'ckpts/'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
CHECKPOINT_DIR

'ckpts/'

In [None]:
# !rm -rf ckpts

In [81]:
# K-fold training of the sequence classifier. Every fold trains a fresh model,
# keeps the checkpoint with the best validation F1 and uses that checkpoint to
# score the test set; the per-fold class probabilities are collected in `preds`
# and also saved to disk.
N_FOLDS = 5                          # number of folds iterated over (fold ids 0..N_FOLDS-1)
BATCH_SIZE = 128                     # batch size of the training sampler and the validation loader
N_CLASSES = 6                        # width of the classification head / classes seen by the F1 metric
MONITOR = 'val_MulticlassF1Score'    # metric driving early stopping and checkpoint selection

preds = []

for fold in range(N_FOLDS):
    print('FOLD: ', fold)
    # A distinct seed per fold, shared by the global seeding and the training dataset.
    fold_seed = SEED * (fold + 1)
    net.set_seed(fold_seed)

    # The target column is derived from TARGET so that the split always matches
    # the name baked into the checkpoint and prediction files.
    # NOTE(review): N_CLASSES and the 'y' field of the records remain
    # target-specific — adjust them too before switching TARGET.
    train_ids, val_ids = my.get_train_val_ids(target_info, fold=fold, target=f'target_{TARGET}')
    print('LENS: ', len(train_ids), len(val_ids))

    val_plts = my.get_val_plts(all_plts, val_ids)

    # The training dataset wraps *all* records; which of them are actually
    # drawn is decided by the batch sampler built from `train_ids` below.
    train_ds = my.SeqToTargetDataset(
        data=all_plts,
        target_col_name='y',
        real_len=len(train_ids),
        seed=fold_seed,
        aug=True,
    )
    val_ds = my.SeqToTargetDataset(data=val_plts, target_col_name='y', real_len=len(val_ids))

    # NOTE(review): the sampler keeps the same seed for every fold, unlike the
    # dataset above — confirm this is intended.
    train_sr = data.BucketBatchSampler(train_ids, batch_size=BATCH_SIZE, seed=SEED)

    train_dl = torch.utils.data.DataLoader(
        dataset=train_ds,
        collate_fn=train_ds.collate_fn,
        batch_sampler=train_sr,
        num_workers=1,
    )

    val_dl = torch.utils.data.DataLoader(
        dataset=val_ds,
        collate_fn=val_ds.collate_fn,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=1,
    )

    # Sequence encoder: embeddings / numeric pass-through per event, then a 2-layer RNN.
    seq_enc = RnnSeqEncoder(
        trx_encoder=TrxEncoder(
            embeddings=feat_config['time_depend_cats'],
            numeric_values=feat_config['time_depend_float'],
            embeddings_noise=0.001,
            # spatial_dropout=True,
            emb_dropout=0.1,
        ),
        hidden_size=256,
        # bidir=True,
        num_layers=2,
        dropout=0.2,
        trainable_starter=None,
    )

    # Combines the sequence encoder with embeddings of the per-user categorical features.
    user_enc = net.UserEncoder(seq_enc=seq_enc, cat_embs=feat_config['const_cats'])

    pl_model = net.mySequenceToTarget(
        seq_encoder=user_enc,
        head=Head(
            input_size=user_enc.output_size,
            hidden_layers_sizes=[10],
            drop_probs=[0.1],
            objective='classification',
            num_classes=N_CLASSES,
        ),
        # NLLLoss consumes log-probabilities, which is why predictions are
        # exponentiated after `trainer.predict` below.
        loss=torch.nn.NLLLoss(),
        metric_list=MulticlassF1Score(num_classes=N_CLASSES, average='weighted'),
        optimizer_partial=partial(torch.optim.Adam, lr=0.002),
        # NOTE(review): step_size=15 exceeds max_epochs=10; if the scheduler is
        # stepped once per epoch the learning rate never decays — confirm.
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=15, gamma=0.5),
    )

    tq = TQDMProgressBar(refresh_rate=10)

    es = EarlyStopping(
        MONITOR,
        min_delta=0.001,
        patience=4,
        verbose=True,
        mode='max',
        check_on_train_epoch_end=False,
    )

    chpt = ModelCheckpoint(
        dirpath=CHECKPOINT_DIR,
        filename=f'best_{TARGET}_f{fold}',
        monitor=MONITOR,
        mode='max',
    )

    trainer = pl.Trainer(
        callbacks=[tq, es, chpt],
        max_epochs=10,
        deterministic=True,
        # NOTE(review): `gpus=` is the legacy Trainer argument; newer Lightning
        # releases expect `accelerator=` / `devices=` instead.
        gpus=1 if torch.cuda.is_available() else 0,
        gradient_clip_val=10,
        # check_val_every_n_epoch=1
        # NOTE(review): validates at 99% of each training epoch rather than at
        # its end — presumably a deliberate workaround; confirm.
        val_check_interval=0.99,
        enable_model_summary=(fold == 0),  # print the model summary once only
    )

    trainer.fit(pl_model, train_dl, val_dl)

    # Score the test set with this fold's best checkpoint (not the last epoch).
    log_prob_batches = trainer.predict(dataloaders=test_dl, ckpt_path=chpt.best_model_path)

    # Batches -> one array; exp() turns log-probabilities into probabilities.
    fold_probs = np.exp(np.concatenate(log_prob_batches))
    preds.append(fold_probs)
    np.save(f'out/preds_{TARGET}_f{fold}', fold_probs)

Metric val_MulticlassF1Score improved by 0.009 >= min_delta = 0.001. New best score: 0.464


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Monitored metric val_MulticlassF1Score did not improve in the last 4 records. Best score: 0.464. Signaling Trainer to stop.
Restoring states from the checkpoint path at /home/ubuntu/PROJECTS/1_MTC/ckpts/best_age_f1.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at /home/ubuntu/PROJECTS/1_MTC/ckpts/best_age_f1.ckpt


Predicting: 1681it [00:00, ?it/s]

Global seed set to 102


FOLD:  2
LENS:  215999 54000


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_MulticlassF1Score improved. New best score: 0.446


Validation: 0it [00:00, ?it/s]

Metric val_MulticlassF1Score improved by 0.005 >= min_delta = 0.001. New best score: 0.451


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_MulticlassF1Score improved by 0.008 >= min_delta = 0.001. New best score: 0.460


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Monitored metric val_MulticlassF1Score did not improve in the last 4 records. Best score: 0.460. Signaling Trainer to stop.
Restoring states from the checkpoint path at /home/ubuntu/PROJECTS/1_MTC/ckpts/best_age_f2.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at /home/ubuntu/PROJECTS/1_MTC/ckpts/best_age_f2.ckpt


Predicting: 1681it [00:00, ?it/s]

Global seed set to 136


FOLD:  3
LENS:  215999 54000


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_MulticlassF1Score improved. New best score: 0.440


Validation: 0it [00:00, ?it/s]

Metric val_MulticlassF1Score improved by 0.015 >= min_delta = 0.001. New best score: 0.455


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_MulticlassF1Score improved by 0.008 >= min_delta = 0.001. New best score: 0.463


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Monitored metric val_MulticlassF1Score did not improve in the last 4 records. Best score: 0.463. Signaling Trainer to stop.
Restoring states from the checkpoint path at /home/ubuntu/PROJECTS/1_MTC/ckpts/best_age_f3.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at /home/ubuntu/PROJECTS/1_MTC/ckpts/best_age_f3.ckpt


Predicting: 1681it [00:00, ?it/s]

Global seed set to 170


FOLD:  4
LENS:  216000 53999


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_MulticlassF1Score improved. New best score: 0.446


Validation: 0it [00:00, ?it/s]

Metric val_MulticlassF1Score improved by 0.010 >= min_delta = 0.001. New best score: 0.456


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_MulticlassF1Score improved by 0.007 >= min_delta = 0.001. New best score: 0.463


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Monitored metric val_MulticlassF1Score did not improve in the last 4 records. Best score: 0.463. Signaling Trainer to stop.
Restoring states from the checkpoint path at /home/ubuntu/PROJECTS/1_MTC/ckpts/best_age_f4.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at /home/ubuntu/PROJECTS/1_MTC/ckpts/best_age_f4.ckpt


Predicting: 1681it [00:00, ?it/s]

In [83]:
# Ensemble the folds: stack the per-fold probability arrays along a new
# leading axis and average over it element-wise.
mean_fold_preds = np.stack(preds).mean(axis=0)
mean_fold_preds.shape

(144724, 6)

In [84]:
# Most probable class per row; argmax is 0-based, so shift by one to obtain
# the labels written to the submission.
# NOTE(review): assumes submission labels start at 1 — confirm against the task spec.
cats = mean_fold_preds.argmax(axis=1) + 1
cats

array([3, 3, 3, ..., 3, 2, 3])

In [85]:
# Submission table: test user ids next to the predicted label, stored in a
# column named after TARGET and written to CSV without the index.
sub = pd.DataFrame({'user_id': test_ids}).assign(**{TARGET: cats})
sub.to_csv(path_or_buf=f'out/sub1_{TARGET}.csv', index=False)
sub

Unnamed: 0,user_id,age
0,1868,3
1,1989,3
2,2019,3
3,2022,3
4,2042,3
...,...,...
144719,412539,1
144720,413915,5
144721,414243,3
144722,414537,2


In [86]:
# Distribution of predicted labels. The column is addressed through TARGET,
# the same name used when `sub` was built, so this check keeps working if
# TARGET changes.
sub[TARGET].value_counts()

3    53785
2    52648
4    18690
1    12363
5     7238
Name: age, dtype: int64