https://github.com/KevinLiao159/MyDataSciencePortfolio/blob/af84b7162e320ee72622265b3b2857a8dc772f55/movie_recommender/src/knn_recommender.py#L224

In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.preprocessing import normalize

from tqdm.notebook import tqdm
tqdm.pandas()

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

from mapk import mapk
from train_test_split import holdout

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# articles = pd.read_csv('../data/articles.csv')
# customers = pd.read_csv('../data/customers.csv')
transactions_train = pd.read_csv('../data/transactions_train.csv')

In [3]:
# from sklearn.model_selection import train_test_split
transactions_train['t_dat'] = pd.to_datetime(transactions_train['t_dat'])
transactions_train.query('t_dat > "2019-08-16"', inplace=True)

# train

In [4]:
train, test = holdout(transactions_train)

In [5]:
# отфильтруем товары, которые были куплены реже чем N-раз
N = 1000
popular_items = train.groupby('article_id').size()\
                                                .to_frame(name='cnt')\
                                                .reset_index()\
                                                .query('cnt > @N').article_id.tolist()

In [6]:
train = train.query('article_id in @popular_items').copy()

In [7]:
def create_mappings(df, column):
    '''
    create_mappings(df, column)
        Создаёт маппинг между оригинальными ключами словаря и новыми порядковыми.
        
        Parameters
        ----------
        df : pandas.DataFrame
            DataFrame с данными.
            
        column : str
            Название колонки, содержащей нужны ключи. 
        
        Returns
        -------
        code_to_idx : dict
            Словарь с маппингом: "оригинальный ключ" -> "новый ключ".
        
        idx_to_code : dict
            Словарь с маппингом: "новый ключ" -> "оригинальный ключ".
    '''
    
    code_to_idx = {}
    idx_to_code = {}

    for idx, code in enumerate(set(df[column])):
        code_to_idx[code] = idx
        idx_to_code[idx] = code
        
    return code_to_idx, idx_to_code


def map_ids(row, mapping):
    '''
    Вспомогательная функция
    '''
    return mapping[row]


def make_predict_most_pop(train):
    most_pop = train.query('t_dat > "2020-09-05"')\
        .groupby('article_id').size()\
        .sort_values(ascending=False).index.values
    return list(most_pop)
    
def make_predict_parallel(row):
    """
    # что у первого юзера в истории
    start = set(i for i in np.nonzero(interactions[1]))[1].tolist()
    start

    # рекомендации минус исторические данные
    set([i for i in fit[1].toarray().argsort()[0][-10:].tolist()]) - start
    
    """
    def make_predict_random_walk(customer_id, **params):
        """
        проверить порядок объектов, сет может перемешать сортировку
        """
        fit = params['model']
        
        user_idx = user_to_idx[customer_id]
        # товары, которые пользователь уже купил
        start = set(i for i in np.nonzero(interactions.getrow(user_idx))[1].tolist())
        # рекомендации
        predict = fit.getrow(user_idx).toarray().argsort()[0][-50:].tolist()
        return [idx_to_item[i] for i in predict], [idx_to_item[i] for i in start]

    knn_predict = []
    drop_item = []
    customer_id = row

    if customer_id in user_to_idx:
        knn_predict, drop_item = make_predict_random_walk(customer_id, **params)
        return list(set(knn_predict) - set(drop_item))[-12:]
    else:
        return make_predict_most_pop(train)



In [8]:
%%time

# словари с маппингами, нужно для tocsr матрицы
user_to_idx, idx_to_user = create_mappings(train, 'customer_id')
item_to_idx, idx_to_item = create_mappings(train, 'article_id')

# собираем матрицу взаимодействий 
U = train['customer_id'].apply(map_ids, args=[user_to_idx]).values
I = train['article_id'].apply(map_ids, args=[item_to_idx]).values

# тут пока бинарная маска будет лежать для упрощения
# можно попробовать поиграться с ценами
values = np.ones(train.shape[0])
    
interactions = sparse.coo_matrix(
    (values, (U, I)), 
    shape=(len(train['customer_id'].unique()), len(train['article_id'].unique())), 
    dtype=np.float64
).tocsr()


CPU times: user 11.2 s, sys: 323 ms, total: 11.5 s
Wall time: 11.5 s


In [9]:
# можно отнормировать, если матрица не бинарная
# Pui = normalize(interactions, norm='l2', axis=1)

# косинусное растояние
# sim = Pui.T * Pui

# товары, похожие на 1
# [i for i in sim[1].toarray().argsort()[0][-5:]]

In [10]:
%%time
interactions_transpose = interactions.transpose(copy=True)
Piu = normalize(interactions_transpose, norm='l2', axis=1)
Pui = normalize(interactions, norm='l2', axis=1)
fit = Pui * (Piu * Pui)

# 889598x3786 * (3786x889596 * 889598x3786) = 889598x3786 * 3786x3786 = 889598x3786

CPU times: user 1min 30s, sys: 1min 10s, total: 2min 40s
Wall time: 3min 51s


In [11]:
%%time
params = dict()
params['model'] = fit
    
t = test.iloc[:1_000].copy()
# нестабильно работает с аргументами
t['predict'] = t.customer_id.parallel_apply(make_predict_parallel)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=84), Label(value='0 / 84'))), HBox…

CPU times: user 975 ms, sys: 601 ms, total: 1.58 s
Wall time: 6.11 s


In [12]:
mapk(t['target'].to_list(), t['predict'].to_list())

0.006217467532467533

In [13]:
t

Unnamed: 0,customer_id,target,predict
0,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,[624486001],"[156231001, 841383002, 537116001, 372860002, 3..."
1,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,[827487003],"[554450001, 448509014, 841383002, 372860001, 3..."
2,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,"[757926001, 788575004, 640021019]","[610776002, 720125001, 554450001, 448509014, 3..."
3,000525e3fe01600d717da8423643a8303390a055c578ed...,[874110016],"[909370001, 918522001, 448509014, 751471001, 9..."
4,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,"[903762001, 879189005, 158340001, 867966009, 9...","[803757001, 673677002, 711053003, 448509001, 4..."
...,...,...,...
995,039ce97fc66af4f7e7b1d48253e5ed8e7aabe8b6563c91...,"[893141002, 918525001, 914118001, 673677002]","[896152002, 610776002, 723469001, 803757001, 6..."
996,039d76dd323437c52d439bb8a7ffb7b12b31e727893692...,[915526001],"[673677002, 720125001, 822946001, 448509014, 8..."
997,039eccfe83fe082cc0c0e16aec1374cdd334d0689a2702...,"[902362001, 568601045, 806131012]","[610776002, 762205001, 803757001, 720125001, 6..."
998,039ed8943b80c61940f21540ca83dd6d0561c4960697ed...,"[579541001, 579541075, 579541077, 759814022]","[664074059, 783346001, 695632087, 695632089, 3..."
