In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

### Pretrain datasets

In [2]:
%%time

import tqdm, torch

df_trans = pq.read_table('data/trans_filtered_only_coles.parquet').to_pandas()

cols = ['url_host', 'request_cnt', 'part_of_day', 'event_time']

for col in tqdm.tqdm(cols):
    df_trans[col] = df_trans[col].apply(torch.tensor)


libgomp: Invalid value for environment variable OMP_NUM_THREADS
100%|██████████| 4/4 [00:09<00:00,  2.43s/it]

CPU times: user 20.5 s, sys: 13.1 s, total: 33.5 s
Wall time: 33.6 s





In [3]:
df_trans = df_trans.drop('price', axis =1)

In [4]:
from sklearn.model_selection import train_test_split

df_train_trans, df_valid_trans = train_test_split(df_trans, test_size = 0.1, random_state = 42)
df_train_trans = df_train_trans.to_dict(orient='records')
df_valid_trans = df_valid_trans.to_dict(orient='records')


libgomp: Invalid value for environment variable OMP_NUM_THREADS


In [5]:
!export OMP_NUM_THREADS=16

In [6]:
len(df_train_trans), len(df_valid_trans)

(373785, 41532)

## Prepare embeddings

In [7]:
import pandas as pd
embedding_matrix = pd.read_pickle('artifacts/url_host_96.pickle').embeddings

In [8]:
url_embeddings = torch.nn.Embedding.from_pretrained(torch.Tensor(embedding_matrix), freeze=True)

## Train COLES

### Model definition

In [6]:
import torch
import pytorch_lightning as pl
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
import ptls

trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={'request_cnt': 'identity'
    },
    embeddings={
        #'price': {'in': 10, 'out': 2},
        #'region_name': {'in': 81, 'out': 4},
        #'city_name': {'in': 985, 'out': 16},
        #'cpe_manufacturer_name': {'in': 37, 'out': 4},
        #'cpe_model_name': {'in': 599, 'out': 16},
        #'cpe_type_cd': {'in': 4, 'out': 2}, 
        #'cpe_model_os_type': {'in': 3, 'out': 2}, 
        'part_of_day': {'in': 4, 'out': 1},
        'url_host': {'in': 132025, 'out': 512}
    }
)


seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=1024,
    num_layers=2,
    type='gru',
)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=10, gamma=0.9),
    head = torch.nn.Sequential(torch.nn.Linear(1024, 512), torch.nn.ReLU(), torch.nn.Linear(512, 512), 
            ptls.nn.Head(use_norm_encoder = True))
)

### Dataloader

In [7]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing.feature_filter import FeatureFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule


train_ds = MemoryMapDataset(data=df_train_trans)
valid_ds = MemoryMapDataset(data=df_valid_trans)

train_data=ColesDataset(train_ds, splitter = SampleSlices(split_count=5, cnt_min=20, cnt_max=200))
valid_data=ColesDataset(valid_ds, splitter = SampleSlices(split_count=5, cnt_min=20, cnt_max=200))

dl = PtlsDataModule(
    train_data=train_data, train_num_workers=16, train_batch_size=256, 
    valid_data=valid_data, valid_num_workers=16, valid_batch_size=256
)

### Trainer

In [8]:
import torch
import pytorch_lightning as pl

import logging

trainer = pl.Trainer(
    max_epochs=50,
    limit_val_batches=100,
    gpus=[0],
    enable_progress_bar=False,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model, dl)
print(trainer.logged_metrics)

logger.version = 8


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 78.6 M
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Sequential      | 787 K 
-------------------------------------------------------
79.4 M    Trainable params
0         Non-trainable params
79.4 M    Total params
317.659   Total estimated model params size (MB)


In [None]:
torch.save(model.state_dict(), "only_coles.pt")

# Inference

In [20]:
model.load_state_dict(torch.load("only_coles.pt"))

<All keys matched successfully>

In [21]:
%%time

import tqdm, torch

df_trans = pq.read_table('data/trans_filtered_only_coles.parquet').to_pandas()

cols = ['url_host', 'request_cnt', 'part_of_day', 'event_time']

for col in tqdm.tqdm(cols):
    df_trans[col] = df_trans[col].apply(torch.tensor)

100%|██████████| 4/4 [00:24<00:00,  6.04s/it]

CPU times: user 26.7 s, sys: 17.9 s, total: 44.6 s
Wall time: 43.7 s





In [22]:
%%time
import tqdm
from ptls.data_load.datasets import inference_data_loader
import numpy as np

def pooling_inference(model, dl, device='cuda:0'):
    
    model.to(device)
    X = []
    for batch in tqdm.tqdm(dl):
        with torch.no_grad():
            x = model.seq_encoder.trx_encoder(batch.to(device)).payload
            out_max = torch.max(x, dim=1)[0]
            out_min = torch.min(x, dim=1)[0]
            out_mean = torch.mean(x, dim=1)
            out_std = torch.std(x, dim=1)
            features = torch.cat([out_max, out_min, out_mean, out_std], dim=1)      
            X += [features]
    return X

def embed_inference(model, dl, device='cuda:0'):
    
    model.to(device)
    X = []
    for batch in tqdm.tqdm(dl):
        with torch.no_grad():
            features = model.seq_encoder(batch.to(device))
            X += [features]
    return X

dl = inference_data_loader(df_trans.to_dict(orient='records'), num_workers=0, batch_size=64)
X_coles = torch.vstack(embed_inference(model, dl, )).cpu().numpy()
X_pool = torch.vstack(pooling_inference(model, dl, )).cpu().numpy()
X_embeds = np.concatenate([X_coles, X_pool], axis=1)


df_embeds = pd.DataFrame(X_embeds, columns=[f"embed_{e}" for e in range(X_embeds.shape[1])])
df_embeds['user_id'] = df_trans['user_id']

6490it [20:53,  5.18it/s]
6490it [07:06, 15.22it/s]


CPU times: user 4h 58min 18s, sys: 4min 23s, total: 5h 2min 41s
Wall time: 28min 6s


In [23]:
#df_embeds.to_csv('./data/coles_lstm.csv', index=False)

## Downstream

In [24]:
%%time

import bisect
import numpy as np

# df_embeds = pd.read_csv('./data/coles_lstm.csv')
df_public = pq.read_table('data/public_train.pqt').to_pandas().sort_values(by='user_id')

def age_bucket(x):
    return bisect.bisect_left([18,25,35,45,55,65], x)

y_age = df_public['age']
y_age = np.array(list(map(age_bucket, y_age)))
y_gender = np.array(df_public['is_male'])

X = df_public
X = X.merge(df_embeds, on="user_id", how='inner')
del X['user_id'], X['age'], X['is_male']

CPU times: user 3.55 s, sys: 851 ms, total: 4.4 s
Wall time: 4.39 s


## Gender

In [27]:
# %%time
from catboost import CatBoostClassifier, metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
not_na_gender = (y_gender != 'NA') & (y_gender != None)
x_train, x_test_gender, y_train, y_test_gender = train_test_split(X[not_na_gender], y_gender[not_na_gender], test_size = 0.1, random_state = 42)

clf_gender = CatBoostClassifier(
    iterations=1000,
    custom_metric=[metrics.AUC()],
    use_best_model=True,
    random_seed=42)
clf_gender.fit(x_train, y_train, metric_period=100, eval_set=(x_test_gender, y_test_gender))

Learning rate set to 0.122426
0:	learn: 0.6393034	test: 0.6393754	best: 0.6393754 (0)	total: 2.18s	remaining: 36m 20s
100:	learn: 0.4422040	test: 0.4452797	best: 0.4452797 (100)	total: 2m 35s	remaining: 23m 3s
200:	learn: 0.4287064	test: 0.4395790	best: 0.4395790 (200)	total: 5m 12s	remaining: 20m 40s
300:	learn: 0.4184682	test: 0.4376129	best: 0.4376129 (300)	total: 7m 45s	remaining: 18m 1s
400:	learn: 0.4090518	test: 0.4367126	best: 0.4367126 (400)	total: 10m 20s	remaining: 15m 27s
500:	learn: 0.4002822	test: 0.4360447	best: 0.4360447 (500)	total: 12m 57s	remaining: 12m 53s
600:	learn: 0.3921046	test: 0.4357301	best: 0.4357301 (600)	total: 15m 30s	remaining: 10m 17s
700:	learn: 0.3841785	test: 0.4353638	best: 0.4353638 (700)	total: 18m 4s	remaining: 7m 42s
800:	learn: 0.3764826	test: 0.4350842	best: 0.4350842 (800)	total: 20m 38s	remaining: 5m 7s
900:	learn: 0.3690239	test: 0.4347667	best: 0.4347667 (900)	total: 23m 11s	remaining: 2m 32s
999:	learn: 0.3619670	test: 0.4348991	best: 0.

<catboost.core.CatBoostClassifier at 0x7fbe5078b8e0>

In [28]:
roc_auc_score(y_test_gender, clf_gender.predict_proba(x_test_gender)[:,1])

0.8801127265591512

In [29]:
print(f'GINI по полу {2 * roc_auc_score(y_test_gender, clf_gender.predict_proba(x_test_gender)[:,1]) - 1:2.3f}')

GINI по полу 0.760


# Age

In [30]:
%%time

from sklearn.metrics import classification_report

not_na_age = ~np.isnan(y_age)
x_train, x_test_age, y_train, y_test_age = train_test_split(X[not_na_age], y_age[not_na_age], test_size = 0.1, random_state = 42)

clf_age = CatBoostClassifier(iterations=1000,
    custom_metric=[metrics.Accuracy()],
    use_best_model=True,
    random_seed=42)
clf_age.fit(x_train, y_train, metric_period=100, eval_set=(x_test_age, y_test_age))

Learning rate set to 0.120515
0:	learn: 1.8251087	test: 1.8238996	best: 1.8238996 (0)	total: 6.3s	remaining: 1h 44m 51s
100:	learn: 1.2650372	test: 1.2692289	best: 1.2692289 (100)	total: 8m 17s	remaining: 1h 13m 44s
200:	learn: 1.2322038	test: 1.2521927	best: 1.2521927 (200)	total: 15m 51s	remaining: 1h 3m 3s
300:	learn: 1.2109436	test: 1.2462008	best: 1.2462008 (300)	total: 23m 19s	remaining: 54m 10s
400:	learn: 1.1932141	test: 1.2425920	best: 1.2425920 (400)	total: 30m 52s	remaining: 46m 6s
500:	learn: 1.1769405	test: 1.2401486	best: 1.2401486 (500)	total: 38m 24s	remaining: 38m 15s
600:	learn: 1.1616836	test: 1.2384183	best: 1.2384183 (600)	total: 45m 51s	remaining: 30m 26s
700:	learn: 1.1477002	test: 1.2370752	best: 1.2370752 (700)	total: 53m 13s	remaining: 22m 42s
800:	learn: 1.1335268	test: 1.2358397	best: 1.2358397 (800)	total: 1h 43s	remaining: 15m 5s
900:	learn: 1.1202586	test: 1.2350700	best: 1.2350700 (900)	total: 1h 8m 13s	remaining: 7m 29s
999:	learn: 1.1072665	test: 1.234

<catboost.core.CatBoostClassifier at 0x7fb014734820>

In [31]:
print(classification_report(y_test_age, clf_age.predict(x_test_age), \
                            target_names = ['<18', '18-25','25-34', '35-44', '45-54', '55-65', '65+']))

              precision    recall  f1-score   support

         <18       0.00      0.00      0.00       109
       18-25       0.54      0.42      0.47      3238
       25-34       0.52      0.63      0.57      8863
       35-44       0.43      0.51      0.47      7773
       45-54       0.39      0.27      0.32      4218
       55-65       0.41      0.27      0.32      2254
         65+       0.41      0.03      0.05       545

    accuracy                           0.47     27000
   macro avg       0.39      0.31      0.32     27000
weighted avg       0.46      0.47      0.46     27000



In [None]:
0.760 + 2*0.46

In [None]:
0.79 + 2*0.49