In [1]:
import datasets
import polars as pl
import pandas as pd
import numpy as np

from concurrent.futures import ProcessPoolExecutor, as_completed

from tqdm import tqdm
from IPython.display import display

# Register tqdm's `progress_apply` on pandas objects (used below when attaching embeddings).
tqdm.pandas()

# Seed for NumPy's global random generator.
SEED = 69

np.random.seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the train/test parquet files with polars and convert them to pandas DataFrames.
train = pl.read_parquet("hh_recsys_train_hh.pq").to_pandas()
test = pl.read_parquet("hh_recsys_test_hh.pq").to_pandas()

In [3]:
# Preview the raw training data: one row per session with list-valued vacancy/action/timestamp columns.
train

Unnamed: 0,user_id,session_id,vacancy_id,action_type,action_dt
0,u_332060,s_28301374,"[v_2571684, v_488179, v_2389179, v_1393783, v_...","[2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 2, ...","[2023-11-01T00:40:58.105000000, 2023-11-01T00:..."
1,u_1057881,s_33868982,[v_665861],[2],[2023-11-01T00:23:51.452000000]
2,u_1036784,s_32474802,[v_2594840],[2],[2023-11-01T00:52:34.023000000]
3,u_786220,s_14060785,"[v_1473781, v_1622905, v_1621959, v_2289180, v...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, ...","[2023-11-01T00:58:20.793000000, 2023-11-01T01:..."
4,u_639152,s_23205986,"[v_695738, v_22433, v_1590524, v_502496, v_200...","[2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, ...","[2023-11-01T01:14:20.828000000, 2023-11-01T00:..."
...,...,...,...,...,...
3463059,u_202578,s_8306993,[v_1499243],[2],[2023-11-13T21:01:41.813000000]
3463060,u_895531,s_7293998,[v_2041381],[2],[2023-11-14T17:51:22.169000000]
3463061,u_317562,s_9866576,[v_503192],[2],[2023-11-14T20:03:54.988000000]
3463062,u_225581,s_19724962,[v_2526106],[2],[2023-11-14T20:52:51.985000000]


In [4]:
# Preview the raw test data (same layout as the training frame).
test

Unnamed: 0,user_id,session_id,vacancy_id,action_type,action_dt
0,u_482520,s_25018731,"[v_2597196, v_1223061, v_1223061]","[2, 2, 1]","[2023-11-19T12:03:13.089000000, 2023-11-19T12:..."
1,u_582132,s_481216,"[v_470400, v_470400, v_1530783]","[2, 1, 2]","[2023-11-21T15:39:47.981000000, 2023-11-21T15:..."
2,u_212584,s_16918781,"[v_1572055, v_1572055, v_1572055, v_953153, v_...","[2, 3, 2, 2, 1, 2, 2, 1, 2, 1, 2, 1]","[2023-11-16T08:41:47.031000000, 2023-11-16T08:..."
3,u_425177,s_17505104,"[v_1375331, v_1922852]","[2, 2]","[2023-11-17T12:42:18.513000000, 2023-11-17T12:..."
4,u_700997,s_15528830,"[v_2152997, v_2152997, v_1217630]","[2, 1, 2]","[2023-11-16T17:22:53.530000000, 2023-11-16T17:..."
...,...,...,...,...,...
83184,u_499368,s_19158589,"[v_1716634, v_2232327, v_2232327, v_2232327, v...","[2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2]","[2023-11-18T03:15:15.530000000, 2023-11-18T03:..."
83185,u_1084868,s_24756162,"[v_395706, v_2405231]","[2, 2]","[2023-11-15T18:06:16.550000000, 2023-11-15T18:..."
83186,u_1138032,s_25291467,"[v_1213925, v_1213925, v_2359832, v_2359832]","[2, 1, 2, 1]","[2023-11-19T23:19:44.092000000, 2023-11-19T23:..."
83187,u_608102,s_18813020,"[v_933773, v_933773, v_2021127, v_1508520, v_2...","[3, 2, 2, 2, 2, 1, 2]","[2023-11-16T15:40:09.408000000, 2023-11-16T15:..."


In [5]:
# Flatten the list-valued columns so that every row is a single
# (user, session, vacancy, action, timestamp) event.
# The original session row index is repeated for every exploded event (see the displayed index).
train_pairs = train.explode(["vacancy_id", "action_type", "action_dt"])
test_pairs = test.explode(["vacancy_id", "action_type", "action_dt"])
train_pairs

Unnamed: 0,user_id,session_id,vacancy_id,action_type,action_dt
0,u_332060,s_28301374,v_2571684,2,2023-11-01 00:40:58.105
0,u_332060,s_28301374,v_488179,2,2023-11-01 00:58:13.091
0,u_332060,s_28301374,v_2389179,2,2023-11-01 01:42:19.664
0,u_332060,s_28301374,v_1393783,2,2023-11-01 01:24:21.471
0,u_332060,s_28301374,v_2608935,2,2023-11-01 01:39:45.256
...,...,...,...,...,...
3463059,u_202578,s_8306993,v_1499243,2,2023-11-13 21:01:41.813
3463060,u_895531,s_7293998,v_2041381,2,2023-11-14 17:51:22.169
3463061,u_317562,s_9866576,v_503192,2,2023-11-14 20:03:54.988
3463062,u_225581,s_19724962,v_2526106,2,2023-11-14 20:52:51.985


In [6]:
# Count missing values per column in the exploded training events.
train_pairs.isna().sum()

user_id        0
session_id     0
vacancy_id     0
action_type    0
action_dt      0
dtype: int64

In [7]:
# Count missing values per column in the exploded test events.
test_pairs.isna().sum()

user_id        0
session_id     0
vacancy_id     0
action_type    0
action_dt      0
dtype: int64

In [8]:
# Summary statistics of the number of events per user.
train_pairs.groupby("user_id").count().describe()

Unnamed: 0,session_id,vacancy_id,action_type,action_dt
count,882409.0,882409.0,882409.0,882409.0
mean,24.38333,24.38333,24.38333,24.38333
std,289.57993,289.57993,289.57993,289.57993
min,1.0,1.0,1.0,1.0
25%,2.0,2.0,2.0,2.0
50%,7.0,7.0,7.0,7.0
75%,22.0,22.0,22.0,22.0
max,260689.0,260689.0,260689.0,260689.0


In [9]:
# Per-user event counts, sorted ascending (after `count()` every column holds the same count).
grouped = train_pairs.groupby("user_id").count().sort_values("session_id")
grouped

Unnamed: 0_level_0,session_id,vacancy_id,action_type,action_dt
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
u_0,1,1,1,1
u_22470,1,1,1,1
u_520916,1,1,1,1
u_520920,1,1,1,1
u_773478,1,1,1,1
...,...,...,...,...
u_1100222,4901,4901,4901,4901
u_370418,6483,6483,6483,6483
u_623716,7281,7281,7281,7281
u_1142370,51975,51975,51975,51975


In [10]:
# Keep users with more than 1 and fewer than 1000 events.
subset = grouped[(grouped["session_id"] > 1) & (grouped["session_id"] < 1000)]
subset

Unnamed: 0_level_0,session_id,vacancy_id,action_type,action_dt
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
u_286332,2,2,2,2
u_536987,2,2,2,2
u_463606,2,2,2,2
u_45264,2,2,2,2
u_215132,2,2,2,2
...,...,...,...,...
u_281860,995,995,995,995
u_618629,996,996,996,996
u_141582,996,996,996,996
u_61774,996,996,996,996


In [11]:
# Number of exploded events (rows) and columns in train and test.
train_pairs.shape, test_pairs.shape

((21516070, 5), (547642, 5))

In [12]:
# Drop the odd users with thousands of vacancy events.
# Because `subset` only keeps users with 1 < event count < 1000, users with a single event are dropped as well.

train_pairs = train_pairs[train_pairs["user_id"].isin(subset.reset_index()["user_id"])].copy()
train_pairs

Unnamed: 0,user_id,session_id,vacancy_id,action_type,action_dt
0,u_332060,s_28301374,v_2571684,2,2023-11-01 00:40:58.105
0,u_332060,s_28301374,v_488179,2,2023-11-01 00:58:13.091
0,u_332060,s_28301374,v_2389179,2,2023-11-01 01:42:19.664
0,u_332060,s_28301374,v_1393783,2,2023-11-01 01:24:21.471
0,u_332060,s_28301374,v_2608935,2,2023-11-01 01:39:45.256
...,...,...,...,...,...
3463059,u_202578,s_8306993,v_1499243,2,2023-11-13 21:01:41.813
3463060,u_895531,s_7293998,v_2041381,2,2023-11-14 17:51:22.169
3463061,u_317562,s_9866576,v_503192,2,2023-11-14 20:03:54.988
3463062,u_225581,s_19724962,v_2526106,2,2023-11-14 20:52:51.985


In [13]:
# Raw action codes: 1 - response, 2 - view (vacancy page opened), 3 - vacancy added to favourites.
# Each code is replaced by a weight: response -> 4.0, favourite -> 2.0, view -> 1.0.
ACTION_WEIGHTS = {1: 4.0, 2: 1.0, 3: 2.0}


def _to_action_weights(actions: pd.Series) -> pd.Series:
    """Map raw action codes to weights and fail loudly on any value outside ACTION_WEIGHTS.

    `Series.map` turns unmapped values into NaN. That happens for an unexpected
    action code, and also when this cell is executed a second time: the already
    mapped weight 4.0 is not a key of ACTION_WEIGHTS (while 1.0 and 2.0 would be
    silently re-mapped). Raising here prevents such a corrupted column from
    propagating whenever at least one unmapped value is present.
    """
    weights = actions.map(ACTION_WEIGHTS)
    unmapped = weights.isna()
    if unmapped.any():
        raise ValueError(
            f"action_type contains values outside {list(ACTION_WEIGHTS)}: "
            f"{actions[unmapped].unique().tolist()} - was this cell already executed?"
        )
    return weights


train_pairs.loc[:, "action_type"] = _to_action_weights(train_pairs["action_type"])
test_pairs.loc[:, "action_type"] = _to_action_weights(test_pairs["action_type"])
train_pairs.head()

Unnamed: 0,user_id,session_id,vacancy_id,action_type,action_dt
0,u_332060,s_28301374,v_2571684,1.0,2023-11-01 00:40:58.105
0,u_332060,s_28301374,v_488179,1.0,2023-11-01 00:58:13.091
0,u_332060,s_28301374,v_2389179,1.0,2023-11-01 01:42:19.664
0,u_332060,s_28301374,v_1393783,1.0,2023-11-01 01:24:21.471
0,u_332060,s_28301374,v_2608935,1.0,2023-11-01 01:39:45.256


In [14]:
# Preview the test events after the action codes were replaced by weights.
test_pairs.head()

Unnamed: 0,user_id,session_id,vacancy_id,action_type,action_dt
0,u_482520,s_25018731,v_2597196,1.0,2023-11-19 12:03:13.089
0,u_482520,s_25018731,v_1223061,1.0,2023-11-19 12:03:30.396
0,u_482520,s_25018731,v_1223061,4.0,2023-11-19 12:05:03.473
1,u_582132,s_481216,v_470400,1.0,2023-11-21 15:39:47.981
1,u_582132,s_481216,v_470400,4.0,2023-11-21 15:43:57.620


In [15]:
# Load the vacancy dataset from disk and have it return NumPy values.
vac_dataset = datasets.load_from_disk("vacancies_dataset")
vac_dataset.set_format("numpy")

# Lookup table: vacancy_id -> embedding.
vac2vec = {el["vacancy_id"]: el["embedding"] for el in tqdm(vac_dataset)}

# True only if no vacancy_id occurs twice (a duplicate would collapse into one dict key).
len(vac2vec.keys()) == len(vac_dataset)

100%|██████████| 2734129/2734129 [02:22<00:00, 19201.14it/s]


True

In [16]:
# Attach the vacancy embedding to every event.
# The dict lookup raises KeyError for a vacancy_id that is missing from `vac2vec`.
train_pairs.loc[:, "embedding"] = train_pairs["vacancy_id"].progress_apply(lambda t: vac2vec[t])
test_pairs.loc[:, "embedding"] = test_pairs["vacancy_id"].progress_apply(lambda t: vac2vec[t])

100%|██████████| 20525199/20525199 [00:14<00:00, 1382145.52it/s]
100%|██████████| 547642/547642 [00:00<00:00, 1412993.91it/s]


In [17]:
# Identify each test session by the string "<user_id>/<session_id>".
test_pairs.loc[:, "key"] = test_pairs["user_id"] + "/" + test_pairs["session_id"]
test_pairs.head()

Unnamed: 0,user_id,session_id,vacancy_id,action_type,action_dt,embedding,key
0,u_482520,s_25018731,v_2597196,1.0,2023-11-19 12:03:13.089,"[0.5978287, -0.49921575, -0.21153846, -0.99969...",u_482520/s_25018731
0,u_482520,s_25018731,v_1223061,1.0,2023-11-19 12:03:30.396,"[-0.6776678, -0.017538857, 0.07692308, -0.9997...",u_482520/s_25018731
0,u_482520,s_25018731,v_1223061,4.0,2023-11-19 12:05:03.473,"[-0.6776678, -0.017538857, 0.07692308, -0.9997...",u_482520/s_25018731
1,u_582132,s_481216,v_470400,1.0,2023-11-21 15:39:47.981,"[0.6852401, -0.49921575, -0.21153846, -0.50955...",u_582132/s_481216
1,u_582132,s_481216,v_470400,4.0,2023-11-21 15:43:57.620,"[0.6852401, -0.49921575, -0.21153846, -0.50955...",u_582132/s_481216


In [18]:
SAMPLE_SIZE = 101  # chunk size + 1; the last element of each chunk is used as the target

In [19]:
# Only the first 100,000 distinct users (in order of appearance) are turned into samples.
unique_users = train_pairs["user_id"].unique()[:100_000]


def process_train(user: str) -> list[dict[str, pd.DataFrame | str]]:
    user_subset = train_pairs[train_pairs["user_id"] == user].sort_values("action_dt").reset_index(drop=True).copy()
    user_subset.loc[:, "time_delta"] = user_subset["action_dt"].diff().dt.total_seconds().fillna(0).astype(float) / 60.

    samples = []
    for i in range(0, user_subset.shape[0], SAMPLE_SIZE):
        sample = user_subset[i:i+SAMPLE_SIZE]
        sample.loc[:, "embedding"] *= sample["action_type"]  # попробуем такой трюк, мб значимость фичи будет повыше
        sample_array = np.hstack([sample[["time_delta"]].to_numpy().astype(np.float32), np.vstack(sample["embedding"])])[:-1]

        # заполняем нулями для одинакового размера
        if sample_array.shape[0] < SAMPLE_SIZE - 1:
            sample_array = np.vstack([np.zeros((SAMPLE_SIZE - 1 - sample_array.shape[0], sample_array.shape[1]), dtype=np.float32), sample_array])

        samples.append({"data": sample_array, "target": vac2vec[sample.iloc[-1]["vacancy_id"]]})

    return samples

In [19]:
# Build the samples of all selected users in worker processes.
# `executor.map` returns results in the order of `unique_users` (unlike `as_completed`,
# which yields in completion order), so `samples` has the same order on every run and
# later slices such as `samples[:25000]` pick a reproducible subset.
# `chunksize` sends users to the workers in batches to reduce inter-process overhead.
with ProcessPoolExecutor() as executor:
    per_user_samples = executor.map(process_train, unique_users, chunksize=256)
    samples = [
        sample
        for user_samples in tqdm(per_user_samples, total=len(unique_users))
        for sample in user_samples
    ]

len(samples)

100%|██████████| 100000/100000 [00:03<00:00, 27021.43it/s]
100%|██████████| 100000/100000 [1:18:03<00:00, 21.35it/s]


131238

In [21]:
# NOTE(review): this import is cell-local; the other imports live in the first cell.
import pickle

# Write the raw training samples to disk (presumably a checkpoint after the long parallel step).
# NOTE(review): absolute, machine-specific path.
with open("/HDD/train_users_sample.pickle", "wb") as f:
    pickle.dump(samples, f)

In [23]:
# Only the first 25,000 samples are converted into a `datasets.Dataset`.
train_user_dataset = datasets.Dataset.from_list(samples[:25000])
train_user_dataset

Dataset({
    features: ['data', 'target'],
    num_rows: 25000
})

In [24]:
# Return torch tensors when the dataset is indexed.
train_user_dataset.set_format("torch")

In [25]:
# Sanity check: (batch, SAMPLE_SIZE - 1 events, 1 time-delta column + embedding size).
train_user_dataset[:10]["data"].shape

torch.Size([10, 100, 798])

In [26]:
# Embedding size of the vacancies, for comparison with the feature width above.
vac_dataset[:10]["embedding"].shape

(10, 797)

In [27]:
# Persist the training dataset. NOTE(review): absolute, machine-specific path.
train_user_dataset.save_to_disk("/HDD/train_users_dataset")

Saving the dataset (17/17 shards): 100%|██████████| 25000/25000 [00:03<00:00, 6388.05 examples/s]


In [20]:
# All distinct "<user_id>/<session_id>" keys of the test events.
unique_keys = test_pairs["key"].unique()


def process_test(key: str) -> list[dict[str, pd.DataFrame | str]]:
    user_subset = test_pairs[test_pairs["key"] == key].sort_values("action_dt").reset_index(drop=True).copy()
    user_subset.loc[:, "time_delta"] = user_subset["action_dt"].diff().dt.total_seconds().fillna(0).astype(float) / 60.

    samples = []
    for i in range(0, user_subset.shape[0], SAMPLE_SIZE):
        sample = user_subset[i:i+SAMPLE_SIZE]
        sample.loc[:, "embedding"] *= sample["action_type"]  # попробуем такой трюк, мб значимость фичи будет повыше
        sample_array = np.hstack([sample[["time_delta"]].to_numpy().astype(np.float32), np.vstack(sample["embedding"])])[:-1]

        # заполняем нулями для одинакового размера
        if sample_array.shape[0] < SAMPLE_SIZE - 1:
            sample_array = np.vstack([np.zeros((SAMPLE_SIZE - 1 - sample_array.shape[0], sample_array.shape[1]), dtype=np.float32), sample_array])

        samples.append({"data": sample_array, "target": vac2vec[sample.iloc[-1]["vacancy_id"]], "key": key})

    return samples

In [15]:
# Build the samples of all test sessions in worker processes.
# `executor.map` returns results in the order of `unique_keys` (unlike `as_completed`,
# which yields in completion order), so `samples` has the same order on every run and
# later slices such as `samples[:26000]` pick a reproducible subset.
# `chunksize` sends keys to the workers in batches to reduce inter-process overhead.
with ProcessPoolExecutor() as executor:
    per_key_samples = executor.map(process_test, unique_keys, chunksize=256)
    samples = [
        sample
        for key_samples in tqdm(per_key_samples, total=len(unique_keys))
        for sample in key_samples
    ]

len(samples)

100%|██████████| 83189/83189 [00:06<00:00, 13708.87it/s]
100%|██████████| 83189/83189 [02:05<00:00, 660.68it/s]


83189

In [None]:
# import pickle

# with open("/HDD/test_users_samples.pickle", "wb") as f:
#     pickle.dump(samples, f)

In [None]:
# import pickle
# import datasets

# with open("/HDD/test_users_samples.pickle", "rb") as f:
#     samples = pickle.load(f)

# len(samples)

  from .autonotebook import tqdm as notebook_tqdm


84543

In [18]:
# Only the first 26,000 samples are converted into a `datasets.Dataset`.
# NOTE(review): `samples` is the same name the training cells use; it must hold the test samples here.
test_user_dataset = datasets.Dataset.from_list(samples[:26000])
test_user_dataset

Dataset({
    features: ['data', 'target', 'key'],
    num_rows: 26000
})

In [19]:
# Return torch tensors when the dataset is indexed.
test_user_dataset.set_format("torch")

In [20]:
# Sanity check: (batch, SAMPLE_SIZE - 1 events, 1 time-delta column + embedding size).
test_user_dataset[:10]["data"].shape

torch.Size([10, 100, 798])

In [21]:
# Embedding size of the vacancies, for comparison with the feature width above.
vac_dataset[:10]["embedding"].shape

(10, 797)

In [22]:
# Persist the test dataset. NOTE(review): relative path, whereas the training dataset is saved under /HDD.
test_user_dataset.save_to_disk("test_users_dataset")

Saving the dataset (17/17 shards): 100%|██████████| 26000/26000 [00:02<00:00, 9036.62 examples/s]
