In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import torch


libgomp: Invalid value for environment variable OMP_NUM_THREADS


In [2]:
import pandas as pd
embedding_matrix = pd.read_pickle('artifacts/url_host_96.pickle').embeddings
url_embeddings = torch.nn.Embedding.from_pretrained(torch.Tensor(embedding_matrix), freeze=True)

## Train COLES

### Model definition

In [38]:
import torch
import pytorch_lightning as pl
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
import ptls

trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={'request_cnt': 'identity'
    },
    embeddings={
        #'price': {'in': 10, 'out': 2},
        #'region_name': {'in': 81, 'out': 4},
        #'city_name': {'in': 985, 'out': 16},
        #'cpe_manufacturer_name': {'in': 37, 'out': 4},
        #'cpe_model_name': {'in': 599, 'out': 16},
        #'cpe_type_cd': {'in': 4, 'out': 2}, 
        #'cpe_model_os_type': {'in': 3, 'out': 2}, 
        'part_of_day': {'in': 4, 'out': 1},
        'url_host': url_embeddings
    }
)


seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=1024,
    num_layers=2,
    type='gru',
)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.0001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=10, gamma=0.9),
    head = torch.nn.Sequential(torch.nn.Linear(1024, 512), torch.nn.ReLU(), torch.nn.Linear(512, 512), 
            ptls.nn.Head(use_norm_encoder = True))
)


In [39]:

model.load_state_dict(torch.load("coles-emb-lstm.pt"))

<All keys matched successfully>

# Finetune

In [5]:
%%time

import tqdm, torch


df_trans = pq.read_table('data/trans_filtered_pretrained.parquet').to_pandas()

cols = ['url_host', 'request_cnt', 'part_of_day', 'event_time']
for col in tqdm.tqdm(cols):
    df_trans[col] = df_trans[col].apply(torch.tensor)

100%|██████████| 4/4 [00:12<00:00,  3.20s/it]

CPU times: user 23 s, sys: 15.5 s, total: 38.5 s
Wall time: 37.5 s





In [6]:
df_trans = df_trans.drop('price', axis =1)

In [7]:
SEED = 42  # todo 0, 1, 2, 3, 4 done 
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fd9ec2638d0>

In [24]:
from sklearn.model_selection import train_test_split
import numpy as np

import bisect

def age_bucket(x):
    return bisect.bisect_left([18,25,35,45,55,65], x)

df_public = pq.read_table('data/public_train.pqt').to_pandas().sort_values(by='user_id')

# Combined target age_gender
df_public['age'] = list(map(age_bucket, df_public['age'] ))
df_public = df_public[(df_public['age'] != 'NA') & (df_public['is_male'] != 'NA')]
df_public = df_public.dropna()
df_public['target'] = df_public['age'].astype(int) + 7 * df_public['is_male'].astype(int)

# Merge
df_finetune = df_trans.merge(df_public[['user_id', 'target']], on='user_id')

train_ft, valid_ft = train_test_split(df_finetune, test_size = 0.3, random_state = SEED)

train_ft = train_ft.to_dict(orient='records')
valid_ft = valid_ft.to_dict(orient='records')

In [40]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.frames.supervised import SeqToTargetDataset
from ptls.frames import PtlsDataModule
from ptls.data_load.iterable_processing import SeqLenFilter

from ptls.data_load.augmentations import AllTimeShuffle, DropoutTrx, RandomSlice
from  ptls.data_load.datasets import AugmentationDataset

def get_dataset(data, aug=False):
    ds = MemoryMapDataset(data=data, i_filters=[SeqLenFilter(max_seq_len=1000),])
    if aug:
        ds = AugmentationDataset(ds, f_augmentations = [DropoutTrx(trx_dropout=0.01), RandomSlice(30, 100)])
    return SeqToTargetDataset(ds, target_col_name='target',)

finetune_dm = PtlsDataModule(
    train_data=get_dataset(train_ft, aug=True),
    valid_data=get_dataset(valid_ft),
    train_num_workers=4,
    train_batch_size=128,)

In [41]:
from functools import partial
import torch
import torchmetrics
from ptls.frames.supervised import SequenceToTarget
from ptls.nn import Head

model_finetuned = SequenceToTarget(
    seq_encoder=model.seq_encoder,
    head=Head(
        input_size=model.seq_encoder.embedding_size,
        use_batch_norm=True,
        objective='classification',
        num_classes=14,
    ),
    loss=torch.nn.NLLLoss(),
    metric_list=torchmetrics.Accuracy(compute_on_step=False),
    pretrained_lr=0.0001,
    optimizer_partial=partial(torch.optim.Adam, lr=0.01, weight_decay=1e-5),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=10, gamma=0.9),
)

In [42]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [43]:
from pytorch_lightning.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor="valid/Accuracy",
    min_delta=0.001,
    patience=5,
    mode='max'
)

trainer_ft = pl.Trainer(
    max_epochs=10,
    gpus=[0],
    enable_progress_bar=False,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [44]:
print(f'logger.version = {trainer_ft.logger.version}')
trainer_ft.fit(model_finetuned, finetune_dm)
print(trainer_ft.logged_metrics)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]

  | Name          | Type          | Params
------------------------------------------------
0 | seq_encoder   | RnnSeqEncoder | 28.9 M
1 | head          | Head          | 16.4 K
2 | loss          | NLLLoss       | 0     
3 | train_metrics | ModuleDict    | 0     
4 | valid_metrics | ModuleDict    | 0     
5 | test_metrics  | ModuleDict    | 0     
------------------------------------------------
9.8 M     Trainable params
19.2 M    Non-trainable params
28.9 M    Total params
115.754   Total estimated model params size (MB)


logger.version = 33
{'loss': tensor(1.0725), 'seq_len': tensor(54.6667), 'y': tensor(5.6667), 'val_loss': tensor(1.8501), 'valid/Accuracy': tensor(0.3385), 'train/Accuracy': tensor(0.3524)}


# Infernece

In [17]:
%%time
import tqdm
from ptls.data_load.datasets import inference_data_loader
import numpy as np

def pooling_inference(model, dl, device='cuda:0'):
    
    model.to(device)
    X = []
    for batch in tqdm.tqdm(dl):
        with torch.no_grad():
            x = model.seq_encoder.trx_encoder(batch.to(device)).payload
            out_max = torch.max(x, dim=1)[0]
            out_min = torch.min(x, dim=1)[0]
            out_mean = torch.mean(x, dim=1)
            out_std = torch.std(x, dim=1)
            features = torch.cat([out_max, out_min, out_mean, out_std], dim=1)      
            X += [features]
    return X

def embed_inference(model, dl, device='cuda:0'):
    
    model.to(device)
    X = []
    for batch in tqdm.tqdm(dl):
        with torch.no_grad():
            features = model.seq_encoder(batch.to(device))
            #features = model(batch.to(device))
            X += [features]
    return X

dl = inference_data_loader(valid_ft, num_workers=0, batch_size=64)
X_coles = torch.vstack(embed_inference(model_finetuned, dl, )).cpu().numpy()
X_pool = torch.vstack(pooling_inference(model_finetuned, dl, )).cpu().numpy()
X_embeds = np.concatenate([X_coles, X_pool], axis=1)


df_embeds = pd.DataFrame(X_embeds, columns=[f"embed_{e}" for e in range(X_embeds.shape[1])])
df_embeds['user_id'] = pd.DataFrame(valid_ft)['user_id']
df_embeds.to_csv('./data/coles_sup.csv', index=False)

1240it [03:59,  5.19it/s]
1240it [00:13, 89.13it/s] 


CPU times: user 54min 8s, sys: 55.7 s, total: 55min 4s
Wall time: 5min 15s


In [18]:
#df_embeds = pd.DataFrame(np.argmax(X_coles, axis = 1)//7, columns = ['y_pred'])
#df_embeds['user_id'] = pd.DataFrame(valid_ft)['user_id']
#df_public = pq.read_table('data/public_train.pqt').to_pandas().sort_values(by='user_id')
#df = df_embeds.merge(df_public, on = 'user_id')
#df = df.loc[df['is_male']!='NA']

## Downstream

## Targets

In [46]:
%%time

import bisect
import numpy as np

df_embeds = pd.read_csv('./data/coles_sup.csv')
df_public = pq.read_table('data/public_train.pqt').to_pandas().sort_values(by='user_id')

def age_bucket(x):
    return bisect.bisect_left([18,25,35,45,55,65], x)

y_gender = np.array(df_public['is_male'].loc[df_public["user_id"].isin(df_embeds["user_id"])])

X = df_public
X = X.merge(df_embeds, on="user_id", how='inner')
del X['user_id']

CPU times: user 10.9 s, sys: 562 ms, total: 11.5 s
Wall time: 11.5 s


## Gender

In [20]:
# %%time

from catboost import CatBoostClassifier, metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
not_na_gender = (y_gender != 'NA') & (y_gender != None)
x_train, x_test_gender, y_train, y_test_gender = train_test_split(X.loc[X['is_male']!='NA'].drop('is_male', axis = 1), X['is_male'].loc[X['is_male']!='NA'], test_size = 0.1, random_state = 42)

clf_gender = CatBoostClassifier(
    iterations=1000,
    custom_metric=[metrics.AUC()],
    use_best_model=True,
    random_seed=42)
clf_gender.fit(x_train, y_train, metric_period=100, eval_set=(x_test_gender, y_test_gender))

Learning rate set to 0.090934
0:	learn: 0.6524280	test: 0.6516542	best: 0.6516542 (0)	total: 176ms	remaining: 2m 55s
100:	learn: 0.4270304	test: 0.4271746	best: 0.4271746 (100)	total: 16.7s	remaining: 2m 28s
200:	learn: 0.4052526	test: 0.4237997	best: 0.4237997 (200)	total: 33.3s	remaining: 2m 12s
300:	learn: 0.3854017	test: 0.4217245	best: 0.4217245 (300)	total: 49.7s	remaining: 1m 55s
400:	learn: 0.3672467	test: 0.4208100	best: 0.4208100 (400)	total: 1m 6s	remaining: 1m 39s
500:	learn: 0.3508429	test: 0.4206350	best: 0.4206350 (500)	total: 1m 23s	remaining: 1m 23s
600:	learn: 0.3351512	test: 0.4204208	best: 0.4204208 (600)	total: 1m 40s	remaining: 1m 6s
700:	learn: 0.3205854	test: 0.4204629	best: 0.4204208 (600)	total: 1m 58s	remaining: 50.7s
800:	learn: 0.3065195	test: 0.4211557	best: 0.4204208 (600)	total: 2m 16s	remaining: 34s
900:	learn: 0.2936782	test: 0.4220627	best: 0.4204208 (600)	total: 2m 35s	remaining: 17.1s
999:	learn: 0.2812450	test: 0.4222082	best: 0.4204208 (600)	total

<catboost.core.CatBoostClassifier at 0x7fda91341a90>

In [21]:
print(f'GINI по полу {2 * roc_auc_score(y_test_gender, clf_gender.predict_proba(x_test_gender)[:,1]) - 1:2.3f}')

GINI по полу 0.776


In [47]:
del X['is_male']

In [50]:
len(X)

79298

# Age

In [58]:
y_age = X['age']
y_age = np.array(list(map(age_bucket, y_age)))

In [61]:
not_na_age = ~np.isnan(y_age)

In [66]:
np.sum(not_na_age)

79298

In [65]:
X

Unnamed: 0,age,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,...,embed_1406,embed_1407,embed_1408,embed_1409,embed_1410,embed_1411,embed_1412,embed_1413,embed_1414,embed_1415
0,28.0,-0.350410,0.632318,-0.091818,-0.139895,-0.364928,-0.056713,0.316604,0.152586,0.361785,...,0.132889,0.258747,0.114576,0.168389,0.973048,0.266641,0.481007,0.148171,0.135358,0.377541
1,26.0,-0.618453,0.041251,-0.125465,0.288959,-0.376473,0.819247,0.747219,0.356705,0.318829,...,0.131841,0.349963,0.159134,0.231037,1.422003,0.343415,0.698219,0.180706,0.130586,0.389599
2,64.0,-0.550785,-0.130076,-0.009930,-0.172358,-0.431696,0.619458,0.129012,0.113982,0.499727,...,0.066438,0.176206,0.074067,0.111473,0.762464,0.190660,0.376643,0.069524,0.060303,0.249724
3,35.0,0.993353,-0.244377,-0.402240,-0.359734,-0.445476,0.855482,0.334890,0.401199,0.487183,...,0.274220,0.406721,0.254292,0.352025,1.582506,0.513740,0.849216,0.441074,0.246063,0.904952
4,53.0,-0.298454,0.923003,-0.091783,0.599111,-0.278931,-0.022032,0.225055,0.144129,-0.005188,...,0.298794,0.353357,0.268092,0.302239,1.416814,0.500933,0.744435,0.320501,0.268699,0.950382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79293,51.0,0.061581,-0.111805,-0.095579,-0.013955,-0.019694,0.020475,0.086698,-0.010747,0.038798,...,0.001480,0.009398,0.004048,0.006489,0.044710,0.011708,0.021264,0.004362,0.001059,0.013196
79294,43.0,0.061375,-0.051445,-0.095874,-0.011291,-0.026515,0.014542,0.086127,-0.010796,0.050994,...,0.002598,0.013941,0.002766,0.005044,0.048268,0.007690,0.027109,0.003564,0.001958,0.012952
79295,46.0,0.056445,-0.121738,-0.096878,-0.017490,-0.016458,0.016179,0.086451,-0.010834,-0.004027,...,0.006083,0.021648,0.006546,0.009354,0.085983,0.016298,0.042680,0.011954,0.005967,0.030698
79296,39.0,0.062650,-0.122971,-0.095426,-0.012222,-0.045224,-0.007861,0.086063,-0.010403,0.081159,...,0.003106,0.010327,0.004108,0.006131,0.042917,0.011026,0.019267,0.003577,0.000124,0.011799


In [68]:
def age_bucket(x):
    return bisect.bisect_left([18,25,35,45,55,65], x)

y_age = X['age'].loc[not_na_age]
y_age = np.array(list(map(age_bucket, y_age)))

In [None]:
x_test_age

In [70]:
%%time

from sklearn.metrics import classification_report

x_train, x_test_age, y_train, y_test_age = train_test_split(X.loc[not_na_age].drop('age', axis = 1), y_age, test_size = 0.1, random_state = 42)

clf_age = CatBoostClassifier(iterations=1000,
    custom_metric=[metrics.Accuracy()],
    use_best_model=True,
    random_seed=42)
clf_age.fit(x_train, y_train, metric_period=100, eval_set=(x_test_age, y_test_age))

Learning rate set to 0.117597
0:	learn: 1.8284115	test: 1.8296475	best: 1.8296475 (0)	total: 486ms	remaining: 8m 5s
100:	learn: 1.2161826	test: 1.2660904	best: 1.2660904 (100)	total: 38.7s	remaining: 5m 44s
200:	learn: 1.1622871	test: 1.2541235	best: 1.2541235 (200)	total: 1m 15s	remaining: 5m
300:	learn: 1.1178347	test: 1.2518849	best: 1.2518849 (300)	total: 1m 52s	remaining: 4m 21s
400:	learn: 1.0804902	test: 1.2501002	best: 1.2501002 (400)	total: 2m 29s	remaining: 3m 43s
500:	learn: 1.0450870	test: 1.2504674	best: 1.2501002 (400)	total: 3m 6s	remaining: 3m 5s
600:	learn: 1.0123656	test: 1.2502112	best: 1.2501002 (400)	total: 3m 43s	remaining: 2m 28s
700:	learn: 0.9812439	test: 1.2505977	best: 1.2501002 (400)	total: 4m 19s	remaining: 1m 50s
800:	learn: 0.9497614	test: 1.2525041	best: 1.2501002 (400)	total: 4m 56s	remaining: 1m 13s
900:	learn: 0.9209456	test: 1.2520867	best: 1.2501002 (400)	total: 5m 33s	remaining: 36.7s
999:	learn: 0.8949287	test: 1.2533028	best: 1.2501002 (400)	tota

<catboost.core.CatBoostClassifier at 0x7fda28af8d90>

In [71]:
print(classification_report(y_test_age, clf_age.predict(x_test_age), \
                            target_names = ['<18', '18-25','25-34', '35-44', '45-54', '55-65', '65+'], digits = 3))

              precision    recall  f1-score   support

         <18      0.000     0.000     0.000        25
       18-25      0.553     0.377     0.448       953
       25-34      0.531     0.623     0.573      2641
       35-44      0.416     0.546     0.472      2183
       45-54      0.410     0.282     0.334      1277
       55-65      0.398     0.256     0.311       673
         65+      0.800     0.022     0.044       178

    accuracy                          0.471      7930
   macro avg      0.444     0.301     0.312      7930
weighted avg      0.475     0.471     0.456      7930



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
0.781 + 2*0.48

1.741