In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
import numpy as np
# Scratch check of Series.nunique(): the literal list holds four distinct values.
pd.Series([1,2,3,3,4]).nunique()

4

# Prepare data

In [3]:
%%time
# Only these columns are used by the rest of the notebook.
needed_columns = ['user_id', 'url_host', 'request_cnt', 'part_of_day', 'date', 'price']

# Ask the Parquet reader for just the needed columns instead of materialising the
# whole table and selecting afterwards: Parquet is columnar, so columns that are
# not requested never have to be read or decoded.
# The trailing .select() is a cheap guard that pins the column order to
# `needed_columns`, exactly as the previous read-then-select form did.
df = (
    pq.read_table('data/competition_data_final_pqt', columns=needed_columns)
    .select(needed_columns)
    .to_pandas()
)

CPU times: user 1min 14s, sys: 8min 25s, total: 9min 40s
Wall time: 52.6 s


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(322899435, 6)

In [5]:
# Column names of the loaded frame as a plain Python list.
df.columns.tolist()

['user_id', 'url_host', 'request_cnt', 'part_of_day', 'date', 'price']

In [6]:
import tqdm

# Cardinality of every column except the user key.
# nunique(dropna=False) counts a missing value as one distinct value,
# which is the same number len(Series.unique()) reports.
print("Unique values in column.")
feature_columns = [name for name in df.columns if name != 'user_id']
for name in feature_columns:
    distinct = df[name].nunique(dropna=False)
    print(name, ":", distinct)

Unique values in column.
url_host : 199683
request_cnt : 15
part_of_day : 4
date : 396
price : 29323


In [7]:
# Row count per part-of-day label, most frequent first.
df['part_of_day'].value_counts()

day        107328399
evening     96239286
morning     85236015
night       34095735
Name: part_of_day, dtype: int64

### Impute price and build `event_time` from `date` + `part_of_day`

In [8]:
# Missing prices are imputed with 0, then the column is replaced by its decile
# index (labels=False returns integer bin codes instead of interval labels).
# NOTE(review): the imputed zeros share a bin with the cheapest real prices,
# assuming prices are non-negative — confirm that is intended.
# This overwrites `price` in place, so run it once per data load.
df['price'] = pd.qcut(df['price'].fillna(0), q=10, labels=False)

In [9]:
# Parse `date`, truncate it to hour resolution, and reinterpret the result as an
# int64 count of hours since the Unix epoch (e.g. 2022-06-15 -> 459792).
df['event_time'] = pd.to_datetime(df["date"]).values.astype('datetime64[h]').view('int64')

In [10]:
# Hour offset of each part-of-day label inside its date (four 6-hour slots).
part_of_day_offset_hours = {'morning': 0, 'day': 6, 'evening': 12, 'night': 18}

offset_hours = df['part_of_day'].map(part_of_day_offset_hours)

# Series.map yields NaN for labels missing from the dict, which would silently
# turn event_time into floats with gaps; fail loudly instead.
if offset_hours.isna().any():
    unexpected = df.loc[offset_hours.isna(), 'part_of_day'].unique()
    raise ValueError(f"Unmapped part_of_day values: {list(unexpected)}")

# Rebuild event_time from `date` instead of using `+=`: an in-place increment
# adds the offset again every time the cell is re-run, shifting all timestamps.
# Computed this way the cell gives the same result however often it is executed.
df['event_time'] = (
    pd.to_datetime(df['date']).values.astype('datetime64[h]').view('int64')
    + offset_hours.astype('int64')
)

# Drop the temporary so a full-length Series does not linger in the kernel.
del offset_hours

In [11]:
# Preview the first rows, now including the derived `event_time` column.
df.head()

Unnamed: 0,user_id,url_host,request_cnt,part_of_day,date,price,event_time
0,45098,ad.adriver.ru,1,morning,2022-06-15,4,459792
1,45098,apple.com,1,morning,2022-06-19,4,459888
2,45098,avatars.mds.yandex.net,1,day,2022-06-12,4,459726
3,45098,googleads.g.doubleclick.net,1,day,2022-05-16,4,459078
4,45098,googleads.g.doubleclick.net,1,day,2022-05-30,4,459414


In [12]:
# `event_time` was derived from `date` above, so the raw column is removed in place.
# Re-running this cell raises KeyError because the column is already gone.
del df['date']

# Create transactional data

In [13]:
# Preview again to confirm the frame no longer carries the `date` column.
df.head()

Unnamed: 0,user_id,url_host,request_cnt,part_of_day,price,event_time
0,45098,ad.adriver.ru,1,morning,4,459792
1,45098,apple.com,1,morning,4,459888
2,45098,avatars.mds.yandex.net,1,day,4,459726
3,45098,googleads.g.doubleclick.net,1,day,4,459078
4,45098,googleads.g.doubleclick.net,1,day,4,459414


In [14]:
import pyarrow.parquet as pq
# Load the pickled url_host embedder artifact.
# NOTE: pd.read_pickle unpickles arbitrary Python objects, which can execute code;
# only load artifacts produced by a trusted source.
pretrained_embedder = pd.read_pickle('artifacts/url_host_96.pickle')

In [15]:
# The embedder's `item_to_id` attribute; passed to the preprocessor below for `url_host`.
# Presumably maps each url_host string to its integer id in the pretrained vocabulary — TODO confirm.
pretrained_dict = pretrained_embedder.item_to_id

In [18]:
from ptls.preprocessing import PandasDataPreprocessor

# Configure the per-user sequence preprocessor.
# NOTE(review): `price` is not listed in any cols_* argument here, yet it is
# saved later alongside the other columns — confirm how unlisted columns are handled.
preprocessor = PandasDataPreprocessor(
    col_id='user_id',  # key that identifies one user
    col_event_time='event_time',  # the hour counter built earlier in the notebook
    event_time_transformation='none',  # presumably leaves event_time values untouched — TODO confirm
    cols_category=['part_of_day'],
    # url_host is handled with the pretrained embedder's mapping rather than as a plain category.
    # NOTE(review): `cols_pretrained` looks like a project-specific extension — verify against the installed ptls.
    cols_pretrained= {'url_host': pretrained_dict},
    # Earlier, wider categorical configuration kept for reference:
    #cols_category=['price', 'region_name', 'city_name', 'cpe_manufacturer_name', 'cpe_model_name', 'url_host', 'cpe_type_cd', 'cpe_model_os_type', 'part_of_day'],
    cols_numerical=['request_cnt'],
    return_records=False,
)


libgomp: Invalid value for environment variable OMP_NUM_THREADS

libgomp: Invalid value for environment variable OMP_NUM_THREADS


In [19]:
%%time
import pickle

# Fit the preprocessor on the full frame and produce the transformed output in one call.
trans = preprocessor.fit_transform(df)

# Persist the fitted preprocessor object next to the notebook.
with open('preprocessor_pretrained.p', mode='wb') as sink:
    pickle.dump(preprocessor, sink)

CPU times: user 4min 27s, sys: 56.1 s, total: 5min 23s
Wall time: 5min 23s


In [20]:
%%time

import pickle
import numpy as np


# Columns whose per-row values are wrapped in NumPy arrays before writing.
cols = ['url_host', 'request_cnt', 'part_of_day', 'event_time', 'price']

# NOTE: this binds a second name to the same object, it is not a copy, so the
# conversion below also rewrites these columns on `trans`.
trans_save = trans
for col in tqdm.tqdm(cols):
    trans_save[col] = trans_save[col].apply(np.array)

# A bare `trans_save` expression used to sit here; it was not the cell's last
# statement, so it displayed nothing and has been removed.
trans_save.to_parquet('./data/trans_filtered_pretrained.parquet')

100%|██████████| 5/5 [00:08<00:00,  1.64s/it]


CPU times: user 26.7 s, sys: 8.76 s, total: 35.4 s
Wall time: 35.1 s
