https://github.com/KevinLiao159/MyDataSciencePortfolio/blob/af84b7162e320ee72622265b3b2857a8dc772f55/movie_recommender/src/knn_recommender.py#L224

In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from tqdm.notebook import tqdm
from sklearn.neighbors import NearestNeighbors

from mapk import mapk
from train_test_split import holdout

# Show every column when a DataFrame is rendered (no column truncation).
pd.set_option('display.max_columns', None)
# Row truncation is left at the pandas default; uncomment to show all rows.
# pd.set_option('display.max_rows', None)

In [2]:
# Only the transaction log is loaded in this notebook; the article and
# customer tables are kept here, disabled, for reference.
# articles = pd.read_csv('../data/articles.csv')
# customers = pd.read_csv('../data/customers.csv')
transactions_path = '../data/transactions_train.csv'
transactions_train = pd.read_csv(transactions_path)

In [3]:
# Convert the transaction date to real timestamps, then keep only the
# purchases made strictly after 2020-08-16.
transactions_train = (
    transactions_train
    .assign(t_dat=lambda frame: pd.to_datetime(frame['t_dat']))
    .query('t_dat > "2020-08-16"')
)

# train

In [4]:
# Split the date-filtered transactions into train and test parts with the
# project's `holdout` helper (imported from train_test_split; the split rule
# itself is not visible in this notebook).
train, test = holdout(transactions_train)

In [5]:
# Remove from the training set every customer who also appears in the test set.
test_users = test[['customer_id']]

# Customers seen only in train. A plain set difference yields the same ids as
# an outer merge filtered on `_merge == "left_only"`, without materialising a
# large intermediate join.
train_users = set(train['customer_id']) - set(test_users['customer_id'])

train = train.loc[train['customer_id'].isin(train_users)]

In [6]:
def create_mappings(df, column):
    '''
    create_mappings(df, column)
        Build a two-way mapping between the original keys found in a column
        and new consecutive integer indices (0, 1, 2, ...).

        Indices are assigned in order of first appearance in ``df[column]``,
        so the same input always produces the same mapping.

        Parameters
        ----------
        df : pandas.DataFrame
            DataFrame holding the data.

        column : str
            Name of the column that contains the keys.

        Returns
        -------
        code_to_idx : dict
            Mapping "original key" -> "new index".

        idx_to_code : dict
            Mapping "new index" -> "original key".
    '''
    # dict.fromkeys de-duplicates exactly like set() but keeps insertion
    # order; iterating a set of str keys can give a different order on every
    # interpreter run, which made the index assignment non-reproducible.
    unique_codes = list(dict.fromkeys(df[column]))

    code_to_idx = {code: idx for idx, code in enumerate(unique_codes)}
    idx_to_code = dict(enumerate(unique_codes))

    return code_to_idx, idx_to_code


def map_ids(row, mapping):
    '''
    Helper: look up one value in ``mapping`` and return what it maps to.

    Despite the parameter name, ``row`` is a single key (one cell value),
    not a whole DataFrame row. The lookup uses ``mapping[row]``, so a plain
    dict raises ``KeyError`` for a key it does not contain.
    '''
    mapped_value = mapping[row]
    return mapped_value

In [7]:
%%time

# Mappings from raw ids to consecutive row/column indices; needed to place
# each transaction in the sparse matrix.
# NOTE(review): both the mappings and the matrix are built from the whole of
# `transactions_train`, i.e. the frame that was passed to `holdout`. Confirm
# that the held-out test purchases are not part of it; otherwise the
# neighbour search sees the targets and the evaluation is leaky.
user_to_idx, idx_to_user = create_mappings(transactions_train, 'customer_id')
item_to_idx, idx_to_item = create_mappings(transactions_train, 'article_id')

# Row (user) and column (item) coordinate of every transaction. Series.map
# with a dict performs the lookup in bulk instead of one Python call per row.
U = transactions_train['customer_id'].map(user_to_idx).values
I = transactions_train['article_id'].map(item_to_idx).values

# Every transaction contributes a weight of 1 (prices could be tried as
# weights later). Duplicate (user, item) pairs are summed when the COO matrix
# is converted to CSR, so a cell holds the purchase count rather than a
# strict 0/1 flag.
values = np.ones(transactions_train.shape[0])

# Take the shape from the mappings so it always agrees with the indices above.
interactions = sparse.coo_matrix(
    (values, (U, I)),
    shape=(len(user_to_idx), len(item_to_idx)),
    dtype=np.float64
).tocsr()


CPU times: user 1.72 s, sys: 54.6 ms, total: 1.77 s
Wall time: 1.77 s


In [8]:
# User-to-user nearest-neighbour index over the rows of the interaction
# matrix, compared by cosine distance.
knn_params = dict(n_neighbors=20, metric='cosine', n_jobs=-1)
model = NearestNeighbors(**knn_params)
model.fit(interactions)

NearestNeighbors(metric='cosine', n_jobs=-1, n_neighbors=20)

In [9]:
def make_predict(x, model, interactions=interactions, top_k=12):
    '''
    Recommend articles for one customer from the purchases of similar customers.

    The customer's nearest neighbours are looked up with ``model``; the
    articles that occur most often among those neighbours' rows in the
    global ``train`` frame are returned.

    Parameters
    ----------
    x : object with a ``customer_id`` attribute
        The row being scored (e.g. a row passed by ``DataFrame.apply`` with
        ``axis=1``).

    model : fitted neighbours model
        Must expose ``kneighbors``; the number of neighbours is whatever the
        model was configured with.

    interactions : sparse matrix
        User-item matrix indexed by the global ``user_to_idx`` mapping.

    top_k : int, default 12
        Maximum number of articles to return.

    Returns
    -------
    list
        Up to ``top_k`` article ids as plain Python values, most frequent
        first. Shorter (possibly empty) when the neighbours have fewer
        distinct articles in ``train``.

    Raises
    ------
    KeyError
        If the customer is missing from ``user_to_idx``.
    '''
    idx = user_to_idx[x.customer_id]

    # Distances are never used, so ask for the neighbour indices only
    # (``return_distance`` is a boolean flag, not a count).
    indices = model.kneighbors(interactions[idx], return_distance=False)

    # Exclude the query customer by index rather than by position: when
    # several customers are at the same distance, the customer is not
    # guaranteed to be the first hit.
    neighbor_users = [idx_to_user[i] for i in indices[0] if i != idx]

    neighbor_purchases = train.loc[train['customer_id'].isin(neighbor_users)]
    local_pop = (
        neighbor_purchases
        .groupby('article_id').size()
        .sort_values(ascending=False)
        .head(top_k)
    )
    return local_pop.index.tolist()

In [10]:
# Register `progress_apply` on pandas objects. `tqdm` is already imported in
# the notebook's import cell, so it is not imported again here.
tqdm.pandas()

In [16]:
# Work on a copy of the first 10,000 test rows (presumably to keep the
# row-by-row prediction time manageable).
t = test.head(10000).copy()

In [17]:
%%time
# Score every row with a progress bar; the extra keyword arguments are
# forwarded to make_predict.
t['predict'] = t.progress_apply(
    make_predict,
    axis=1,
    model=model,
    interactions=interactions,
)

  0%|          | 0/10000 [00:00<?, ?it/s]

CPU times: user 14min 57s, sys: 2min 28s, total: 17min 25s
Wall time: 12min 37s


In [18]:
# Compare the predicted lists with the held-out targets using the project's
# `mapk` helper (presumably mean average precision at k — confirm its default k).
actual = t['target'].to_list()
predicted = t['predict'].to_list()
mapk(actual, predicted)

0.558611424725507

In [19]:
# Display targets and predictions side by side for a visual sanity check.
t

Unnamed: 0,customer_id,target,predict
0,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,[624486001],"[624486001, 855769003, 902388001, 908799002, 9..."
1,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,[827487003],"[621381012, 640021012, 905914002, 623072001, 9..."
2,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,"[757926001, 788575004, 640021019]","[640021019, 399136061, 556255001, 732842021, 7..."
3,000525e3fe01600d717da8423643a8303390a055c578ed...,[874110016],[874110016]
4,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,"[903762001, 879189005, 158340001, 867966009, 9...","[936622001, 935892001, 933989002, 929744001, 9..."
...,...,...,...
9995,24ca3d0100a4ec36b9a2b9ee0f42f1047b4d66ed1586ae...,"[878604002, 871710001]","[871710001, 791587009]"
9996,24cb7d813a28c0ab44741eb9118d74d9a6dda573a10a87...,"[822416001, 685814033, 708132001]",[685814033]
9997,24cc164bd06a174d4319d4251f36d252363b94204fa1d3...,[888727010],"[888727010, 868161002, 855827009, 759423002, 9..."
9998,24cc5b5e9d6bb480c3cf101d88d033d3a5377cfd7ce6b2...,"[832253002, 832253002, 934212003, 865799005, 8...","[865799005, 885077001, 903926002, 914805002, 9..."
