https://github.com/KevinLiao159/MyDataSciencePortfolio/blob/af84b7162e320ee72622265b3b2857a8dc772f55/movie_recommender/src/knn_recommender.py#L224

In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.preprocessing import normalize

from tqdm.notebook import tqdm
tqdm.pandas()

from mapk import mapk
from train_test_split import holdout

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [2]:
# articles = pd.read_csv('../data/articles.csv')
# customers = pd.read_csv('../data/customers.csv')
transactions_train = pd.read_csv('../data/transactions_train.csv')

In [3]:
# from sklearn.model_selection import train_test_split
transactions_train['t_dat'] = pd.to_datetime(transactions_train['t_dat'])
transactions_train = transactions_train.query('t_dat > "2019-08-16"')

# train

In [4]:
train, test = holdout(transactions_train)

In [5]:
# фильтрация товаров, которые пользователь из тест-группы уже купил
drop_items = train.groupby('customer_id')['article_id'].apply(list).to_frame('target').reset_index()

In [7]:
def create_mappings(df, column):
    '''
    create_mappings(df, column)
        Создаёт маппинг между оригинальными ключами словаря и новыми порядковыми.
        
        Parameters
        ----------
        df : pandas.DataFrame
            DataFrame с данными.
            
        column : str
            Название колонки, содержащей нужны ключи. 
        
        Returns
        -------
        code_to_idx : dict
            Словарь с маппингом: "оригинальный ключ" -> "новый ключ".
        
        idx_to_code : dict
            Словарь с маппингом: "новый ключ" -> "оригинальный ключ".
    '''
    
    code_to_idx = {}
    idx_to_code = {}

    for idx, code in enumerate(set(df[column])):
        code_to_idx[code] = idx
        idx_to_code[idx] = code
        
    return code_to_idx, idx_to_code


def map_ids(row, mapping):
    '''
    Вспомогательная функция
    '''
    return mapping[row]


def make_predict_most_pop(train):
    most_pop = train.query('t_dat > "2020-09-05"')\
        .groupby('article_id').size()\
        .sort_values(ascending=False).index.values
    return list(most_pop)


def make_predict(row, train, interactions, **params):

    def make_predict_knn(user, train, interactions, **params):
        n_neighbors = params['n_neighbors']
        date_from = params['date_from']
        model = params['model']

        idx = user_to_idx[user]
        distances, indices = model.kneighbors(interactions[idx], n_neighbors=n_neighbors)

        neighbor_users = [idx_to_user[i] for i in indices[0][1:]]

        # query() - покупки за последние n-дней
        local_pop = train.loc[train.customer_id.isin(neighbor_users)]\
                        .query('t_dat > @date_from')\
                        .groupby('article_id').size()\
                        .sort_values(ascending=False).index.values
        return list(local_pop)

    def make_predict_random_walk():
        start = set(i for i in np.nonzero(interactions[1]))[1].tolist()
        # рекомендации
        predict = set([i for i in fit[1].toarray().argsort()[0][-100:].tolist()]) - start
        return predict
        


    knn_predict = []
    drop_item = []
    customer_id = row.customer_id 

    if customer_id in user_to_idx:
        knn_predict = make_predict_knn(customer_id, train, interactions, **params)
        drop_item = drop_items.loc[drop_items.customer_id == customer_id].target.to_list()[0]

    most_pop = make_predict_most_pop(train) #вынести из функции
    
#     acc = []
#     acc = [i for i in (knn_predict + most_pop) if ((i not in acc) and (i not in drop_item))]
#     return acc[:12]
    return [i for i in (knn_predict + most_pop) if ((i not in acc) and (i not in drop_item))][:12]


In [8]:
%%time

# словари с маппингами, нужно для tocsr матрицы
user_to_idx, idx_to_user = create_mappings(train, 'customer_id')
item_to_idx, idx_to_item = create_mappings(train, 'article_id')

# собираем матрицу взаимодействий 
U = train['customer_id'].apply(map_ids, args=[user_to_idx]).values
I = train['article_id'].apply(map_ids, args=[item_to_idx]).values

# тут пока бинарная маска будет лежать для упрощения
# можно попробовать поиграться с ценами
values = np.ones(train.shape[0])
    
interactions = sparse.coo_matrix(
    (values, (U, I)), 
    shape=(len(train['customer_id'].unique()), len(train['article_id'].unique())), 
    dtype=np.float64
).tocsr()


CPU times: user 23.7 s, sys: 840 ms, total: 24.5 s
Wall time: 24.6 s


In [15]:
interactions

<1017822x73335 sparse matrix of type '<class 'numpy.float64'>'
	with 13798926 stored elements in Compressed Sparse Row format>

In [9]:
# можно отнормировать, если матрица не бинарная
# interactions = normalize(interactions, norm='l2', axis=1)

# косинусное растояние
sim = interactions.T * interactions

In [10]:
sim

<73335x73335 sparse matrix of type '<class 'numpy.float64'>'
	with 266189269 stored elements in Compressed Sparse Column format>

In [16]:
# товары, похожие на 1
[i for i in sim[1].toarray().argsort()[0][-5:]]

[37477, 36780, 33022, 69274, 1]

In [None]:
interactions

In [None]:
%%time
# interactions_transpose = interactions.transpose(copy=True)
# Pui = normalize(interactions_transpose, norm='l2', axis=1)
# fit = Pui * Piu * Pui

fit = interactions.T * interactions * interactions.T

In [18]:
fit

<1017822x73335 sparse matrix of type '<class 'numpy.float64'>'
	with 13798926 stored elements in Compressed Sparse Row format>

In [None]:
# предпочтения 1го юзера
start = set(i for i in np.nonzero(interactions[1]))[1].tolist()
start

In [None]:
# рекомендации
set([i for i in fit[1].toarray().argsort()[0][-100:].tolist()]) - start

In [93]:
%%time
params = dict()
params['n_neighbors'] = 100
params['date_from'] = "2020-08-16"
params['model'] = model
    
t = test.iloc[:1_000].copy()
t['predict'] = t.progress_apply(make_predict, 
                                train=train, 
                                interactions=interactions,
                                **params,
                                axis=1)

  0%|          | 0/1000 [00:00<?, ?it/s]

CPU times: user 44min 57s, sys: 8min 25s, total: 53min 22s
Wall time: 31min 10s


In [94]:
mapk(t['target'].to_list(), t['predict'].to_list())

0.006310349927849928

In [95]:
t

Unnamed: 0,customer_id,target,predict
0,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,[624486001],"[297067002, 158340001, 876410006, 923134007, 9..."
1,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,[827487003],"[744276001, 886557002, 797988002, 748588001, 8..."
2,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,"[757926001, 788575004, 640021019]","[237347045, 909370001, 918522001, 448509014, 7..."
3,000525e3fe01600d717da8423643a8303390a055c578ed...,[874110016],"[909370001, 918522001, 448509014, 751471001, 8..."
4,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,"[903762001, 879189005, 158340001, 867966009, 9...","[715624001, 803772002, 677930086, 803757015, 8..."
...,...,...,...
995,039ce97fc66af4f7e7b1d48253e5ed8e7aabe8b6563c91...,"[893141002, 918525001, 914118001, 673677002]","[909370001, 918522001, 448509014, 751471001, 8..."
996,039d76dd323437c52d439bb8a7ffb7b12b31e727893692...,[915526001],"[832997004, 408875001, 823118004, 885586001, 8..."
997,039eccfe83fe082cc0c0e16aec1374cdd334d0689a2702...,"[902362001, 568601045, 806131012]","[916468003, 909370001, 918522001, 448509014, 7..."
998,039ed8943b80c61940f21540ca83dd6d0561c4960697ed...,"[579541001, 579541075, 579541077, 759814022]","[664074079, 886569005, 664074001, 886569004, 8..."
