https://github.com/KevinLiao159/MyDataSciencePortfolio/blob/af84b7162e320ee72622265b3b2857a8dc772f55/movie_recommender/src/knn_recommender.py#L224

In [63]:
import pandas as pd
import numpy as np
from scipy import sparse
from tqdm.notebook import tqdm
from sklearn.neighbors import NearestNeighbors

from mapk import mapk
from train_test_split import holdout

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [64]:
# articles = pd.read_csv('../data/articles.csv')
# customers = pd.read_csv('../data/customers.csv')
transactions_train = pd.read_csv('../data/transactions_train.csv')

In [65]:
# from sklearn.model_selection import train_test_split
transactions_train['t_dat'] = pd.to_datetime(transactions_train['t_dat'])
transactions_train = transactions_train.query('t_dat > "2019-08-16"')

# train

In [66]:
train, test = holdout(transactions_train)

In [67]:
# удалим из обучающей выборки пользователей, которые есть в тестовой
# test_users = test[['customer_id']]

# train_users = set(
#     (
#         pd.merge(train[['customer_id']], test_users, indicator=True, how='outer')
#         .query('_merge=="left_only"')
#         .drop('_merge', axis=1)
#     ).customer_id
# )

# train = train.query('customer_id in @train_users')

In [68]:
def create_mappings(df, column):
    '''
    create_mappings(df, column)
        Создаёт маппинг между оригинальными ключами словаря и новыми порядковыми.
        
        Parameters
        ----------
        df : pandas.DataFrame
            DataFrame с данными.
            
        column : str
            Название колонки, содержащей нужны ключи. 
        
        Returns
        -------
        code_to_idx : dict
            Словарь с маппингом: "оригинальный ключ" -> "новый ключ".
        
        idx_to_code : dict
            Словарь с маппингом: "новый ключ" -> "оригинальный ключ".
    '''
    
    code_to_idx = {}
    idx_to_code = {}
    
    # for idx, code in enumerate(df[column].to_list()):
    for idx, code in enumerate(set(df[column])):
        code_to_idx[code] = idx
        idx_to_code[idx] = code
        
    return code_to_idx, idx_to_code


def map_ids(row, mapping):
    '''
    Вспомогательная функция
    '''
    return mapping[row]

In [69]:
%%time

# словари с маппингами, нужно для tocsr матрицы
user_to_idx, idx_to_user = create_mappings(train, 'customer_id')
item_to_idx, idx_to_item = create_mappings(train, 'article_id')

# собираем матрицу взаимодействий 
U = train['customer_id'].apply(map_ids, args=[user_to_idx]).values
I = train['article_id'].apply(map_ids, args=[item_to_idx]).values

# тут пока бинарная маска будет лежать для упрощения
# можно попробовать поиграться с ценами
values = np.ones(train.shape[0])
    
interactions = sparse.coo_matrix(
    (values, (U, I)), 
    shape=(len(train['customer_id'].unique()), len(train['article_id'].unique())), 
    dtype=np.float64
).tocsr()


CPU times: user 22.3 s, sys: 729 ms, total: 23 s
Wall time: 23 s


In [70]:
model = NearestNeighbors(metric='cosine', n_jobs=-1)
model.fit(interactions)

NearestNeighbors(metric='cosine', n_jobs=-1)

In [92]:
def make_predict_most_pop(train=train):
    most_pop = train.query('t_dat > "2020-09-05"')\
        .groupby('article_id').size()\
        .sort_values(ascending=False)[:12].index.values
    return list(most_pop) 

def make_predict(x, model, interactions=interactions, **params):
    n_neighbors = params['n_neighbors']
    date_from = params['date_from']
    
    user = x.customer_id
    if user in user_to_idx:
        idx = user_to_idx[user]

        distances, indices = model.kneighbors(interactions[idx], n_neighbors=n_neighbors)

        neighbor_users = [idx_to_user[i] for i in indices[0][1:]]
    
        # query('t_dat > "2020-08-16"') - покупки за последний месяц
        local_pop = train.loc[train.customer_id.isin(neighbor_users)]\
                        .query('t_dat > @date_from')\
                        .groupby('article_id').size()\
                        .sort_values(ascending=False)[:12].index.values
        return list(local_pop)
    else:
        most_pop = make_predict_most_pop()
        return most_pop

In [86]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [95]:
%%time
params = dict()
params['n_neighbors'] = 100
params['date_from'] = "2020-08-16"
    
t = test.iloc[:100].copy()
t['predict'] = t.progress_apply(make_predict, 
                                     model=model,
                                     interactions=interactions,
                                     **params,
                                     axis=1)

  0%|          | 0/100 [00:00<?, ?it/s]

CPU times: user 3min 51s, sys: 34.8 s, total: 4min 26s
Wall time: 2min 9s


In [96]:
mapk(t['target'].to_list(), t['predict'].to_list())

0.005083333333333333

In [97]:
t

Unnamed: 0,customer_id,target,predict
0,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,[624486001],"[671607001, 297067002, 436261001, 158340001, 8..."
1,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,[827487003],"[873217004, 621381012, 886557002, 744276001, 7..."
2,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,"[757926001, 788575004, 640021019]","[706016001, 706016002, 237347045]"
3,000525e3fe01600d717da8423643a8303390a055c578ed...,[874110016],"[909370001, 918522001, 448509014, 751471001, 8..."
4,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,"[903762001, 879189005, 158340001, 867966009, 9...","[806388001, 715624001, 806388002, 803772002, 6..."
...,...,...,...
95,00686b4fe3ea577b41d9caa524b37d707a1e46aa4c7013...,[900387001],"[872537002, 763275002, 751664001, 919273002, 7..."
96,006908fb1f581e64408d2c5a57bf65e06b26939dce50b8...,"[907149001, 714790024, 640021012, 918890001]","[153115020, 568601023, 751471038, 828928001, 8..."
97,006a396de0a8edc6694241b5aafc5fad652547dfd42b7e...,"[908728002, 909059002]","[372860002, 683001028, 699755061, 700765001, 7..."
98,006a7b180527417a034cf3d6544baf906f12805d8271ee...,"[805000010, 873679002, 711416003, 859400005, 9...","[805000001, 762846008, 878800003, 911870004, 9..."
