In [93]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

### Pretrain datasets

In [94]:
%%time

import tqdm, torch

df_trans = pq.read_table('data/trans_filtered_pretrained.parquet').to_pandas()

cols = ['url_host', 'request_cnt', 'part_of_day', 'event_time']

for col in tqdm.tqdm(cols):
    df_trans[col] = df_trans[col].apply(torch.tensor)

100%|██████████| 4/4 [00:21<00:00,  5.43s/it]

CPU times: user 30.6 s, sys: 16.8 s, total: 47.4 s
Wall time: 45.6 s





In [95]:
df_trans = df_trans.drop('price', axis =1)

In [96]:
from sklearn.model_selection import train_test_split
import numpy as np

import bisect

def age_bucket(x):
    return bisect.bisect_left([18,25,35,45,55,65], x)

df_public = pq.read_table('data/public_train.pqt').to_pandas().sort_values(by='user_id')

# Combined target age_gender
df_public['age'] = list(map(age_bucket, df_public['age'] ))
df_public = df_public[(df_public['age'] != 'NA') & (df_public['is_male'] != 'NA')]
df_public = df_public.dropna()
df_public['target'] = df_public['age'].astype(int) + 7 * df_public['is_male'].astype(int)

# Merge
df_supervised = df_trans.merge(df_public[['user_id', 'target']], on='user_id')

train_ft, valid_ft = train_test_split(df_supervised, test_size = 0.3, random_state = 42)

train_ft = train_ft.to_dict(orient='records')
valid_ft = valid_ft.to_dict(orient='records')

In [97]:
len(df_train_trans), len(df_valid_trans)

(237893, 26433)

In [98]:
import pandas as pd
embedding_matrix = pd.read_pickle('artifacts/url_host_96.pickle').embeddings
url_embeddings = torch.nn.Embedding.from_pretrained(torch.Tensor(embedding_matrix), freeze=True)

## Train COLES

### Model definition

In [136]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.frames.supervised import SeqToTargetDataset
from ptls.frames import PtlsDataModule
from ptls.data_load.iterable_processing import SeqLenFilter

from ptls.data_load.augmentations import AllTimeShuffle, DropoutTrx, RandomSlice
from  ptls.data_load.datasets import AugmentationDataset


def get_dataset(data, aug=False):
    ds = MemoryMapDataset(data=data, i_filters=[SeqLenFilter(max_seq_len=1000),])
    if aug:
        ds = AugmentationDataset(ds, f_augmentations = [RandomSlice(10, 500)])
    return SeqToTargetDataset(ds, target_col_name='target',)

supervised_dm = PtlsDataModule(
    train_data=get_dataset(train_ft, aug=True),
    valid_data=get_dataset(valid_ft),
    train_num_workers=4,
    train_batch_size=128,)

### Dataloader

In [137]:
import torch
import pytorch_lightning as pl
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
import ptls

trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={'request_cnt': 'identity'
    },
    embeddings={
        #'price': {'in': 10, 'out': 2},
        #'region_name': {'in': 81, 'out': 4},
        #'city_name': {'in': 985, 'out': 16},
        #'cpe_manufacturer_name': {'in': 37, 'out': 4},
        #'cpe_model_name': {'in': 599, 'out': 16},
        #'cpe_type_cd': {'in': 4, 'out': 2}, 
        #'cpe_model_os_type': {'in': 3, 'out': 2}, 
        'part_of_day': {'in': 4, 'out': 1},
        'url_host': url_embeddings
    }
)


seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=1024,
    num_layers=2,
    type='gru',
)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.01),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=10, gamma=0.9),
)

In [142]:
from functools import partial
import torch
import torchmetrics
from ptls.frames.supervised import SequenceToTarget
from ptls.nn import Head

model_supervised = SequenceToTarget(
    seq_encoder=model.seq_encoder,
    head=torch.nn.Sequential(torch.nn.Linear(1024, 512), 
                             torch.nn.ReLU(), 
                             torch.nn.Linear(512, 512), 
            Head(input_size=512,
        use_batch_norm=True,
        objective='classification',
        num_classes=14,)),
    loss=torch.nn.NLLLoss(),
    metric_list=torchmetrics.Accuracy(compute_on_step=False),
    optimizer_partial=partial(torch.optim.Adam, lr=0.01, weight_decay=1e-5),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=10, gamma=0.9),
)

### Trainer

In [143]:
import torch
import pytorch_lightning as pl

import logging

trainer = pl.Trainer(
    max_epochs=50,
    limit_val_batches=100,
    gpus=[0],
    enable_progress_bar=False,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [144]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [None]:
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model_supervised, supervised_dm)
print(trainer.logged_metrics)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type          | Params
------------------------------------------------
0 | seq_encoder   | RnnSeqEncoder | 28.9 M
1 | head          | Sequential    | 795 K 
2 | loss          | NLLLoss       | 0     
3 | train_metrics | ModuleDict    | 0     
4 | valid_metrics | ModuleDict    | 0     
5 | test_metrics  | ModuleDict    | 0     
------------------------------------------------
10.5 M    Trainable params
19.2 M    Non-trainable params
29.7 M    Total params
118.871   Total estimated model params size (MB)


logger.version = 26


In [108]:
torch.save(model.state_dict(), "coles-sup-emb_pretrained.pt")

# Inference

In [78]:
model.load_state_dict(torch.load("coles-sup-emb_pretrained.pt"))

<All keys matched successfully>

In [17]:
%%time

import tqdm, torch

df_trans = pq.read_table('data/trans_filtered_pretrained.parquet').to_pandas()

cols = ['url_host', 'request_cnt', 'part_of_day', 'event_time']

for col in tqdm.tqdm(cols):
    df_trans[col] = df_trans[col].apply(torch.tensor)

100%|██████████| 4/4 [00:18<00:00,  4.62s/it]

CPU times: user 29.4 s, sys: 17 s, total: 46.4 s
Wall time: 44.8 s





In [110]:
%%time
import tqdm
from ptls.data_load.datasets import inference_data_loader
import numpy as np

def pooling_inference(model, dl, device='cuda:0'):
    
    model.to(device)
    X = []
    for batch in tqdm.tqdm(dl):
        with torch.no_grad():
            x = model.seq_encoder.trx_encoder(batch.to(device)).payload
            out_max = torch.max(x, dim=1)[0]
            out_min = torch.min(x, dim=1)[0]
            out_mean = torch.mean(x, dim=1)
            out_std = torch.std(x, dim=1)
            features = torch.cat([out_max, out_min, out_mean, out_std], dim=1)      
            X += [features]
    return X

def embed_inference(model, dl, device='cuda:0'):
    
    model.to(device)
    X = []
    for batch in tqdm.tqdm(dl):
        with torch.no_grad():
            features = model.seq_encoder(batch.to(device))
            X += [features]
    return X

dl = inference_data_loader(valid_ft, num_workers=0, batch_size=128)
X_coles = torch.vstack(embed_inference(model, dl, )).cpu().numpy()
X_pool = torch.vstack(pooling_inference(model, dl, )).cpu().numpy()
X_embeds = np.concatenate([X_coles, X_pool], axis=1)


df_embeds = pd.DataFrame(X_embeds, columns=[f"embed_{e}" for e in range(X_embeds.shape[1])])
df_embeds['user_id'] = pd.DataFrame(valid_ft)['user_id']
df_embeds.to_csv('./data/coles_sup.csv', index=False)

620it [00:21, 28.54it/s]
620it [00:12, 50.46it/s]


CPU times: user 31min 52s, sys: 32.9 s, total: 32min 25s
Wall time: 53.9 s


## Downstream

In [112]:
%%time

import bisect
import numpy as np

df_embeds = pd.read_csv('./data/coles_sup.csv')
df_public = pq.read_table('data/public_train.pqt').to_pandas().sort_values(by='user_id')

def age_bucket(x):
    return bisect.bisect_left([18,25,35,45,55,65], x)

y_age = df_public['age']
y_age = np.array(list(map(age_bucket, y_age)))
y_gender = np.array(df_public['is_male'].loc[df_public["user_id"].isin(df_embeds["user_id"])])

X = df_public
X = X.merge(df_embeds, on="user_id", how='inner')
del X['user_id'], X['age']

CPU times: user 3.98 s, sys: 810 ms, total: 4.79 s
Wall time: 5.51 s


## Gender

In [113]:
# %%time

from catboost import CatBoostClassifier, metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
not_na_gender = (y_gender != 'NA') & (y_gender != None)
x_train, x_test_gender, y_train, y_test_gender = train_test_split(X.loc[X['is_male']!='NA'].drop('is_male', axis = 1), X['is_male'].loc[X['is_male']!='NA'], test_size = 0.1, random_state = 42)

clf_gender = CatBoostClassifier(
    iterations=1000,
    custom_metric=[metrics.AUC()],
    use_best_model=True,
    random_seed=42)
clf_gender.fit(x_train, y_train, metric_period=100, eval_set=(x_test_gender, y_test_gender))

Learning rate set to 0.090934
0:	learn: 0.6619827	test: 0.6612122	best: 0.6612122 (0)	total: 99.4ms	remaining: 1m 39s
100:	learn: 0.4848365	test: 0.4832841	best: 0.4832841 (100)	total: 8.79s	remaining: 1m 18s
200:	learn: 0.4619425	test: 0.4761688	best: 0.4761688 (200)	total: 18.1s	remaining: 1m 11s
300:	learn: 0.4421667	test: 0.4726993	best: 0.4726993 (300)	total: 27s	remaining: 1m 2s
400:	learn: 0.4254223	test: 0.4707501	best: 0.4707501 (400)	total: 35.4s	remaining: 52.9s
500:	learn: 0.4102390	test: 0.4699662	best: 0.4699662 (500)	total: 43.8s	remaining: 43.6s
600:	learn: 0.3963362	test: 0.4695110	best: 0.4695110 (600)	total: 52.3s	remaining: 34.7s
700:	learn: 0.3834829	test: 0.4689836	best: 0.4689836 (700)	total: 1m	remaining: 25.9s
800:	learn: 0.3711384	test: 0.4689268	best: 0.4689268 (800)	total: 1m 9s	remaining: 17.2s
900:	learn: 0.3588902	test: 0.4692740	best: 0.4689268 (800)	total: 1m 17s	remaining: 8.54s
999:	learn: 0.3481730	test: 0.4696775	best: 0.4689268 (800)	total: 1m 26s	

<catboost.core.CatBoostClassifier at 0x7f8cd8393eb0>

In [114]:
print(f'GINI по полу {2 * roc_auc_score(y_test_gender.to_numpy(), clf_gender.predict_proba(x_test_gender)[:,1]) - 1:2.3f}')

GINI по полу 0.717


# Age

In [146]:
%%time

from sklearn.metrics import classification_report

not_na_age = ~np.isnan(y_age)
x_train, x_test_age, y_train, y_test_age = train_test_split(X[not_na_age], y_age[not_na_age], test_size = 0.1, random_state = 42)

clf_age = CatBoostClassifier(iterations=1000,
    custom_metric=[metrics.Accuracy()],
    use_best_model=True,
    random_seed=42)
clf_age.fit(x_train, y_train, metric_period=100, eval_set=(x_test_age, y_test_age))

ValueError: Item wrong length 270000 instead of 79298.

In [None]:
print(classification_report(y_test_age, clf_age.predict(x_test_age), \
                            target_names = ['<18', '18-25','25-34', '35-44', '45-54', '55-65', '65+']))

In [None]:
0.752 + 2*0.46

In [None]:
0.79 + 2*0.49