In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from tqdm.notebook import tqdm
import implicit

from tqdm.notebook import tqdm
tqdm.pandas()

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

from mapk import mapk
from train_test_split import holdout

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)


INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# articles = pd.read_csv('../data/articles.csv')
# customers = pd.read_csv('../data/customers.csv')
transactions_train = pd.read_csv('../data/transactions_train.csv')

In [3]:
# from sklearn.model_selection import train_test_split
transactions_train['t_dat'] = pd.to_datetime(transactions_train['t_dat'])
transactions_train.query('t_dat > "2019-08-16"', inplace=True)
# transactions_train.query('t_dat > "2020-09-01"', inplace=True)

# train

In [4]:
train, test = holdout(transactions_train)

In [5]:
# отфильтруем товары, которые были куплены реже чем N-раз
N = 1000  # при меньше 1000 ядро падает
popular_items = train.groupby('article_id').size()\
                                            .to_frame(name='cnt')\
                                            .reset_index()\
                                            .query('cnt > @N').article_id.tolist()

train.query('article_id in @popular_items', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.query('article_id in @popular_items', inplace=True)


In [6]:
# удалим из обучающей выборки пользователей, которые есть в тестовой
test_users = test[['customer_id']]

train_users = set(
    (
        pd.merge(train[['customer_id']], test_users, indicator=True, how='outer')
        .query('_merge=="left_only"')
        .drop('_merge', axis=1)
    ).customer_id
)

train = train.query('customer_id in @train_users')

In [7]:
date_from = "2020-08-16"
actual_items = train.query('t_dat > @date_from').article_id.unique()

drop_items = list(train.query('article_id not in @actual_items').article_id.unique())

In [8]:
# # получаем полный набор маппингов
# all_users = transactions_train[['customer_id']]
# all_items = transactions_train[['article_id']]

In [9]:
# df = pd.DataFrame([[1, 1, 5], [2, 2, 5]], columns=['one', 'two', 'three'])
# df

# len(df.one.values)

# interactions = sparse.coo_matrix(
#     (df.three.values, (df.one.values, df.two.values)), 
#     shape=(len(df.one.values)+1, len(df.two.values)+1), 
#     dtype=np.float64
# ).tocsr()

# interactions

In [10]:
def create_mappings(df, column):
    '''
    create_mappings(df, column)
        Создаёт маппинг между оригинальными ключами словаря и новыми порядковыми.
        
        Parameters
        ----------
        df : pandas.DataFrame
            DataFrame с данными.
            
        column : str
            Название колонки, содержащей нужны ключи. 
        
        Returns
        -------
        code_to_idx : dict
            Словарь с маппингом: "оригинальный ключ" -> "новый ключ".
        
        idx_to_code : dict
            Словарь с маппингом: "новый ключ" -> "оригинальный ключ".
    '''
    
    code_to_idx = {}
    idx_to_code = {}
    
    # for idx, code in enumerate(df[column].to_list()):
    for idx, code in enumerate(set(df[column])):
        code_to_idx[code] = idx
        idx_to_code[idx] = code
        
    return code_to_idx, idx_to_code


def map_ids(row, mapping):
    '''
    Вспомогательная функция
    '''
    return mapping[row]

In [13]:
%%time

# словари с маппингами, нужно для tocsr матрицы
user_to_idx, idx_to_user = create_mappings(transactions_train, 'customer_id')
item_to_idx, idx_to_item = create_mappings(transactions_train, 'article_id')

# собираем матрицу взаимодействий 
U = transactions_train['customer_id'].apply(map_ids, args=[user_to_idx]).values
I = transactions_train['article_id'].apply(map_ids, args=[item_to_idx]).values

# тут пока бинарная маска будет лежать для упрощения
# можно попробовать поиграться с ценами
values = np.ones(transactions_train.shape[0])
    
interactions = sparse.coo_matrix(
    (values, (U, I)), 
    shape=(len(transactions_train['customer_id'].unique()), len(transactions_train['article_id'].unique())), 
    dtype=np.float64
).tocsr()


CPU times: user 24.5 s, sys: 785 ms, total: 25.3 s
Wall time: 25.4 s


In [30]:
mappings = (idx_to_user, user_to_idx), (idx_to_item, item_to_idx) 

X_test=test.copy()
col='customer_id'
N=12

(idx_to_uid, uid_to_idx), (idx_to_oid, oid_to_idx) = mappings
    
# predictions = X_test[[col]].copy()
# predictions['target'] = pd.Series(dtype=object)
# predictions = predictions.set_index(col)

base_model = implicit.als.AlternatingLeastSquares(
    factors=5, 
    iterations=75, 
    regularization=0.05, 
    num_threads = 12,
    random_state=42
)

base_model.fit(interactions.T)


  0%|          | 0/75 [00:00<?, ?it/s]

In [31]:
%%time
# https://github.com/benfred/implicit/blob/553010d5d600111fb45c2aeb5e8ac2004d15e827/implicit/recommender_base.pyx

recommendations = base_model.recommend_all(
    interactions,
    N=N,
    filter_already_liked_items=True,  # если пользователь уже купил товар, больше не предлагать
    filter_items=[item_to_idx[i] for i in drop_items],  # сюда положить несезонные товары
)



  0%|          | 0/1025182 [00:00<?, ?it/s]

CPU times: user 22min 8s, sys: 8.23 s, total: 22min 16s
Wall time: 5min 47s


In [32]:
%%time

t = test.iloc[:1_000].copy()
# t = test.copy()
t['predict'] = t.progress_apply(lambda row: [idx_to_item[i] for i in recommendations[user_to_idx[row.customer_id]]], 
                                axis=1)




  0%|          | 0/1000 [00:00<?, ?it/s]

CPU times: user 284 ms, sys: 5.04 ms, total: 290 ms
Wall time: 49.1 ms


In [33]:
mapk(t['target'].to_list(), t['predict'].to_list())

0.0

In [35]:
t

Unnamed: 0,customer_id,target,predict
0,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,[624486001],"[372860001, 158340001, 156231001, 372860002, 1..."
1,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,[827487003],"[706016001, 759871002, 706016002, 448509014, 7..."
2,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,"[757926001, 788575004, 640021019]","[706016003, 706016015, 706016006, 706016019, 4..."
3,000525e3fe01600d717da8423643a8303390a055c578ed...,[874110016],"[759871002, 720125001, 706016001, 448509014, 7..."
4,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,"[903762001, 879189005, 158340001, 867966009, 9...","[759871002, 806388003, 685814001, 685816002, 6..."
...,...,...,...
995,039ce97fc66af4f7e7b1d48253e5ed8e7aabe8b6563c91...,"[893141002, 918525001, 914118001, 673677002]","[751471001, 783346001, 562245046, 796210001, 8..."
996,039d76dd323437c52d439bb8a7ffb7b12b31e727893692...,[915526001],"[759871002, 720125001, 448509014, 806388002, 8..."
997,039eccfe83fe082cc0c0e16aec1374cdd334d0689a2702...,"[902362001, 568601045, 806131012]","[751471001, 783346001, 562245046, 610776002, 7..."
998,039ed8943b80c61940f21540ca83dd6d0561c4960697ed...,"[579541001, 579541075, 579541077, 759814022]","[751471001, 610776002, 783346001, 562245046, 6..."
