In [1]:
import warnings
from collections import defaultdict
from itertools import chain, product
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Show only TensorFlow log messages of severity ERROR.
tf.logging.set_verbosity(tf.logging.ERROR)
# NOTE: this suppresses *every* Python warning for the rest of the notebook,
# including deprecation and undefined-metric warnings.
warnings.filterwarnings('ignore')

# Fix the random seeds so that Restart & Run All repeats the same customer
# sampling and (as far as the backend allows) the same weight initialisation.
SEED = 42
np.random.seed(SEED)
tf.set_random_seed(SEED)

In [2]:
# Location of the prepared transactions table, relative to the notebook.
fpath = Path('..') / 'data' / 'prepared_data.csv'

In [3]:
# Load the prepared table; the first CSV column is used as the row index.
data = pd.read_csv(fpath, index_col=0)

In [4]:
# Preview the first ten rows of the table.
data.head(10)

Unnamed: 0,customer_id,mccs,mccs_count,transaction_month
0,0001f322716470bf9bfc1708f06f00fc,6011601160116011601160116011,7,2
1,0001f322716470bf9bfc1708f06f00fc,6011601160116011601160116011,7,3
2,0001f322716470bf9bfc1708f06f00fc,554160116011601160116011601160116011,9,4
3,0001f322716470bf9bfc1708f06f00fc,541154115541591260116011601160116011,9,5
4,0001f322716470bf9bfc1708f06f00fc,"5411,5411,5411,5411,5499,5541,5541,5999,5999,6...",18,6
5,0001f322716470bf9bfc1708f06f00fc,5411541155415999601160116011,7,7
6,0001f322716470bf9bfc1708f06f00fc,5411541160116011601160116011,7,8
7,0001f322716470bf9bfc1708f06f00fc,"5211,5411,5541,6011,6011,6011,6011,6011,6011,6...",11,9
8,0007297d86e14bd68bd87b1dbdefe302,601160116011601160116011,6,2
9,0007297d86e14bd68bd87b1dbdefe302,601160116011601160116011,6,3


В столбце `mccs` таблицы `data` для каждого клиента и месяца записан список категорий всех покупок за этот месяц; элементы списка разделены запятыми.

In [5]:
def deserialize_int_list(list_string):
    """Parse a comma-separated string of integers into a list of ints.

    Whitespace around items is ignored. Blank items (produced by an empty
    string or by a leading/trailing/doubled comma) are skipped, so ``''``
    yields ``[]`` instead of raising ``ValueError``.

    Parameters
    ----------
    list_string : str
        Text such as ``'5411,5411,6011'``.

    Returns
    -------
    list of int
        The parsed values, in their original order.

    Raises
    ------
    ValueError
        If a non-blank item is not a valid integer literal.
    """
    return [int(item) for item in list_string.split(',') if item.strip()]

In [6]:
# One list of integer item codes per row of `data`, in row order.
baskets = list(map(deserialize_int_list, data['mccs']))

In [7]:
# Every item code from every basket, flattened into a single list.
all_items = [item for basket in baskets for item in basket]

# Distinct item codes in ascending order.
unique_items = sorted(set(all_items))

In [8]:
# Distinct customer ids as a plain Python list.
unique_customers = data['customer_id'].unique().tolist()

Получаем векторы корзин с помощью Bag of words.

In [9]:
def encode_basket(basket, vocabulary):
    """Encode a basket as a bag-of-words count vector.

    Parameters
    ----------
    basket : iterable
        Items of one basket; repeated items are counted repeatedly.
    vocabulary : sequence
        Known items (a list or a 1-D array). Position ``i`` of the result
        counts occurrences of ``vocabulary[i]``.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(vocabulary)`` holding the item counts.

    Raises
    ------
    ValueError
        If the basket contains an item that is not in ``vocabulary``.
    """
    # Build the item -> column lookup once per call instead of scanning the
    # vocabulary for every item. Iterating (rather than calling `.index`)
    # also makes numpy-array vocabularies work.
    item_to_column = {}
    for column, item in enumerate(vocabulary):
        # On duplicates the first position wins, matching `list.index`.
        item_to_column.setdefault(item, column)

    vector = np.zeros(len(vocabulary))

    for item in basket:
        try:
            column = item_to_column[item]
        except KeyError:
            # Keep the exception type that `list.index` used to raise.
            raise ValueError(f'{item!r} is not in vocabulary') from None

        vector[column] += 1

    return vector

In [10]:
# Count matrix: one row per basket (in the order of `baskets`) and one column
# per entry of `unique_items`.
encoded_baskets = np.array([encode_basket(basket, unique_items) for basket in baskets])

Масштабируем значения в векторах:

* логарифмируем
* располагаем между 0 и 1

In [11]:
# Compress large counts with log2(count + 1); zero counts stay zero.
# NOTE(review): this overwrites `encoded_baskets`, so re-running the cell
# applies the logarithm a second time.
encoded_baskets = np.log2(encoded_baskets + 1)

In [12]:
# Rescale every column with a min-max scaler (default settings).
# NOTE(review): the scaler is fitted on all baskets, including the ones that
# later end up in the test split — confirm this is intended.
scaler = MinMaxScaler()

encoded_baskets = scaler.fit_transform(encoded_baskets)

Функция `get_train_and_test_data` (определена ниже) получает на вход длину последовательности `sequence_length` и возвращает обучающую и тестовую выборки для данной длины. Для каждого пользователя:

1. собираем все корзины.
2. получаем из них всевозможные последовательности корзин длины `sequence_length`, причём, если в качестве `x` мы берём слайс `baskets[i:j]`, то в качестве соответствующего `y` выступит `baskets[i+1 : j+1]`.
3. все `y` бинаризируем по правилу `y = (y > 0).astype(int)`.
4. все такие последовательности, кроме последней (для которой `y` содержит самую последнюю корзину), складываем в обучающую выборку, а последнюю — в тестовую. Пользователи, у которых получается меньше двух последовательностей, пропускаются.

In [14]:
# Row positions of every customer, collected in a single pass over the table.
# This replaces building one boolean mask over the whole frame per customer.
rows_by_customer = data.groupby('customer_id', sort=False).indices

# customer id -> that customer's encoded baskets, in their row order in `data`.
customer2baskets = {
    customer_id: encoded_baskets[rows_by_customer[customer_id]]
    for customer_id in tqdm(unique_customers)
}

100%|██████████| 9988/9988 [00:25<00:00, 397.87it/s]


In [17]:
def get_train_and_test_data(sequence_length=3, customers=None, baskets_by_customer=None):
    """Build next-basket training and test sequences of a fixed length.

    For every customer, each window ``baskets[i : i + sequence_length]`` is an
    input ``x`` and the same window shifted forward by one basket, binarised
    with ``> 0``, is its target ``y``. The customer's last window goes to the
    test split and all earlier windows go to the training split. Customers
    that yield fewer than two windows are skipped.

    Parameters
    ----------
    sequence_length : int, default 3
        Number of consecutive baskets in every sequence; must be at least 1.
    customers : iterable, optional
        Customer ids to process. Defaults to the notebook-level
        ``unique_customers``.
    baskets_by_customer : mapping, optional
        Maps a customer id to a 2-D array with one row per basket. Defaults
        to the notebook-level ``customer2baskets``.

    Returns
    -------
    x_train, x_test, y_train, y_test : numpy.ndarray
        Stacked sequences. Each array is empty when no customer has enough
        baskets for the requested length.

    Raises
    ------
    ValueError
        If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError('sequence_length must be at least 1')

    # Fall back to the notebook-level objects so existing calls keep working.
    if customers is None:
        customers = unique_customers
    if baskets_by_customer is None:
        baskets_by_customer = customer2baskets

    x_train, y_train = [], []
    x_test, y_test = [], []

    for customer_id in customers:
        customer_baskets = baskets_by_customer[customer_id]

        # One basket beyond each window is needed for the shifted target.
        sequence_count = len(customer_baskets) - sequence_length

        # At least one training window plus the held-out one is required.
        if sequence_count < 2:
            continue

        for start_id in range(sequence_count):
            end_id = start_id + sequence_length

            x = customer_baskets[start_id:end_id].copy()
            # Targets only record whether an item is present in the basket.
            y = (customer_baskets[start_id + 1:end_id + 1] > 0).astype(int)

            if start_id < sequence_count - 1:
                x_train.append(x)
                y_train.append(y)
            else:
                # The last window's target ends with the customer's final basket.
                x_test.append(x)
                y_test.append(y)

    return np.array(x_train), np.array(x_test), np.array(y_train), np.array(y_test)

Собираем последовательности всевозможной длины.

In [19]:
# Per-length splits, keyed by sequence length.
x_train, y_train = {}, {}
x_test, y_test = {}, {}

for sequence_length in range(1, 13):
    splits = get_train_and_test_data(sequence_length)

    # Stop at the first length for which no training sequence exists.
    if len(splits[0]) == 0:
        break

    # The function returns (x_train, x_test, y_train, y_test).
    (
        x_train[sequence_length],
        x_test[sequence_length],
        y_train[sequence_length],
        y_test[sequence_length],
    ) = splits

In [31]:
# One input feature and one output unit per item of the vocabulary.
n_items = len(unique_items)

# LSTM over sequences of any length, emitting an output at every step,
# followed by two sigmoid dense layers.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.LSTM(512, input_shape=(None, n_items), return_sequences=True))
model.add(tf.keras.layers.Dense(512, activation='sigmoid'))
model.add(tf.keras.layers.Dense(n_items, activation='sigmoid'))

In [32]:
# Binary cross-entropy loss with the Adam optimizer.
model.compile(loss='binary_crossentropy', optimizer='adam')

Последовательно обучаем модель на сиквенсах каждой длины.

In [33]:
epochs = 10

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}.')

    # Within every outer epoch, fit once on each sequence length in turn and
    # validate on the test split of the same length.
    for sequence_length, x_sequences in x_train.items():
        model.fit(
            x=x_sequences,
            y=y_train[sequence_length],
            validation_data=(x_test[sequence_length], y_test[sequence_length]),
            verbose=1,
        )

    print()

Epoch 1.
Train on 46501 samples, validate on 9617 samples
Train on 36884 samples, validate on 9165 samples
Train on 27719 samples, validate on 8549 samples
Train on 19170 samples, validate on 7703 samples
Train on 11467 samples, validate on 6149 samples
Train on 5318 samples, validate on 4840 samples
Train on 478 samples, validate on 478 samples

Epoch 2.
Train on 46501 samples, validate on 9617 samples
Train on 36884 samples, validate on 9165 samples
Train on 27719 samples, validate on 8549 samples
Train on 19170 samples, validate on 7703 samples
Train on 11467 samples, validate on 6149 samples
Train on 5318 samples, validate on 4840 samples
Train on 478 samples, validate on 478 samples

Epoch 3.
Train on 46501 samples, validate on 9617 samples
Train on 36884 samples, validate on 9165 samples
Train on 27719 samples, validate on 8549 samples
Train on 19170 samples, validate on 7703 samples
Train on 11467 samples, validate on 6149 samples
Train on 5318 samples, validate on 4840 samples


Train on 36884 samples, validate on 9165 samples
Train on 27719 samples, validate on 8549 samples
Train on 19170 samples, validate on 7703 samples
Train on 11467 samples, validate on 6149 samples
Train on 5318 samples, validate on 4840 samples
Train on 478 samples, validate on 478 samples

Epoch 10.
Train on 46501 samples, validate on 9617 samples
Train on 36884 samples, validate on 9165 samples
Train on 27719 samples, validate on 8549 samples
Train on 19170 samples, validate on 7703 samples
Train on 11467 samples, validate on 6149 samples
Train on 5318 samples, validate on 4840 samples
Train on 478 samples, validate on 478 samples



Проверяем результаты.

In [34]:
def f1(y_test, y_pred, threshold=0.5):
    """Mean per-sample F1 score of the last step of every sequence.

    Only the final step of each sequence is scored. Predictions are turned
    into 0/1 labels with ``pred > threshold``. For every sample the score is
    ``2*TP / (2*TP + FP + FN)`` over the item vector, and 0 when that
    denominator is 0 (nothing bought and nothing predicted).

    Parameters
    ----------
    y_test : sequence of 2-D arrays
        True 0/1 targets, indexed as (sample, step, item).
    y_pred : sequence of 2-D arrays
        Predicted probabilities with the same layout.
    threshold : float, default 0.5
        Probability above which an item counts as predicted.

    Returns
    -------
    float
        Mean of the per-sample scores; ``nan`` when there are no samples.
    """
    last_steps = [(true[-1], pred[-1]) for true, pred in zip(y_test, y_pred)]

    # No samples: state the result explicitly instead of averaging an empty list.
    if not last_steps:
        return np.nan

    actual = np.array([true for true, _ in last_steps]) == 1
    predicted = np.array([pred for _, pred in last_steps]) > threshold

    true_positives = (actual & predicted).sum(axis=1)
    false_positives = (~actual & predicted).sum(axis=1)
    false_negatives = (actual & ~predicted).sum(axis=1)

    denominator = 2 * true_positives + false_positives + false_negatives

    # Samples with a zero denominator keep the preset score of 0.
    f1s = np.divide(
        2 * true_positives,
        denominator,
        out=np.zeros(len(last_steps)),
        where=denominator > 0,
    )

    return np.mean(f1s)

In [35]:
# Report the mean last-step F1 on the test split of every sequence length.
print('seq_len\tf1')

for sequence_length, x_sequences in x_test.items():
    y_pred = model.predict(x_sequences)

    print(f'{sequence_length}\t{f1(y_test[sequence_length], y_pred)}')

seq_len	f1
1	0.5161623532610122
2	0.547015217037395
3	0.5521767385920417
4	0.5515519010051602
5	0.5683910411441858
6	0.5851163189989425
7	0.5915963292597508


Топ-5 предсказанных категорий.

In [36]:
def get_nlargest_ids(array, top=5):
    """Return the indices of the `top` largest values of `array`, largest first.

    `array` must provide an `argsort` method (e.g. a numpy array).
    """
    # Ascending argsort, reversed so the largest values come first.
    descending_ids = array.argsort()[::-1]
    return descending_ids[:top]

In [37]:
# Convert the vocabulary to an ndarray so it can be indexed with arrays of ids.
# NOTE: this changes the type of `unique_items` from list to ndarray.
unique_items = np.array(unique_items)

In [38]:
# Compare, for a few random customers, the top categories of their last basket
# with the top-5 categories predicted from all of their earlier baskets.

# A customer needs at least one basket of history plus the basket to predict.
eligible_customers = [
    customer_id
    for customer_id in unique_customers
    if len(customer2baskets[customer_id]) >= 2
]
sample_size = min(5, len(eligible_customers))

for customer_id in np.random.choice(eligible_customers, sample_size, replace=False):
    # Reuse the per-customer lookup instead of masking the whole table again.
    customer_baskets = customer2baskets[customer_id]

    next_basket_real = customer_baskets[-1]

    top_5_ids_real = get_nlargest_ids(next_basket_real)
    # Keep only categories that were actually bought: a basket with fewer than
    # five categories would otherwise be padded with arbitrary zero-count ones.
    top_5_ids_real = top_5_ids_real[next_basket_real[top_5_ids_real] > 0]
    top_5_results_real = unique_items[top_5_ids_real]

    # Everything before the last basket, with a leading batch axis of size 1.
    customer_history = customer_baskets[:-1][np.newaxis, :]

    next_baskets_pred = model.predict(customer_history)[0]
    # The output at the final step is the forecast for the held-out basket.
    next_basket_pred = next_baskets_pred[-1]

    top_5_ids_pred = get_nlargest_ids(next_basket_pred)
    top_5_results_pred = unique_items[top_5_ids_pred]

    # Sort by category code so the two lists are easy to compare by eye.
    top_5_results_real.sort()
    top_5_results_pred.sort()

    print(f'customer id: {customer_id}')

    print(f'real: {top_5_results_real}')
    print(f'pred: {top_5_results_pred}\n')

customer id: 9c228cd3cbeb00c93ed885ae7e197e58
real: [4814 5541 5921 5999 6011]
pred: [5331 5411 5499 5921 6011]

customer id: 8962a7c28f574f50c2b9ffb00a0f55ec
real: [5411 5541 5812 5814 6011]
pred: [5411 5541 5812 5814 6011]

customer id: 38bf8ee18ffcc950a9d43fe31fe0bb20
real: [5072 5074 5311 5411 6011]
pred: [5311 5411 5651 5921 6011]

customer id: f20fd2d988da3846af01432ecd5b24b1
real: [5411 5691 5699 5814 7230]
pred: [5411 5541 5812 5814 6011]

customer id: b6dd3b2f0866b2e4fd97ffe5a26a4928
real: [5499 5511 5651 5912 6011]
pred: [5411 5541 5814 5912 6011]

