# Classification-based collaborative filtering using market basket data
Jong-Seok Lee, Chi-Hyuck Jun*, Jaewook Lee, Sooyoung Kim  
Department of Industrial Engineering, Pohang University of Science and Technology, San 31 Hyoja-dong, Pohang 790-784, South Korea

$$P_{aj} = k_a \sum_{i=1}^n w(a,i)v_{ij}$$
$$ 
w(a,i) = \frac
{\sum_{j}(v_{aj} - \bar{v}_a)(v_{ij} - \bar{v}_i)}
{\sqrt{\sum_{j}(v_{aj} - \bar{v}_a)^2 \sum_{j}(v_{ij} - \bar{v}_i)^2}}
$$
$k_a$ is a normalizing factor such that the absolute values of the weights sum to unity

$$k_a = \frac
{1}
{\sum_{i}|w(a,i)|}$$

<img src="img/wai.png">

C - prior users products  
D - train users products  
A - split from train  
B - split from train  


In [5]:
import numpy as np
import pandas as pd
import lightgbm as lgb
# Directory prefix that is prepended to every CSV file name when loading data.
IDIR = '../data/'

# Limit how many rows / columns pandas prints when a frame is displayed.
pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 25)

In [3]:
# The prior and train order-line files are read with the same column dtypes,
# so a single dtype map is shared between the two reads.
order_products_dtypes = {
    'order_id': np.int32,
    'product_id': np.uint16,
    'add_to_cart_order': np.int16,
    'reordered': np.int8,
}

print('loading prior')
priors = pd.read_csv(IDIR + 'order_products__prior.csv',
                     dtype=order_products_dtypes)

print('loading train')
train = pd.read_csv(IDIR + 'order_products__train.csv',
                    dtype=order_products_dtypes)

print('loading orders')
# Narrow dtypes keep the in-memory footprint of the orders table small;
# eval_set is stored as a pandas categorical.
orders_dtypes = {
    'order_id': np.int32,
    'user_id': np.int32,
    'eval_set': 'category',
    'order_number': np.int16,
    'order_dow': np.int8,
    'order_hour_of_day': np.int8,
    'days_since_prior_order': np.float32,
}
orders = pd.read_csv(IDIR + 'orders.csv', dtype=orders_dtypes)

print('loading products')
# Only the three id columns are loaded (see usecols). The dtype map therefore
# lists exactly those columns; the former 'order_id' entry referred to a
# column that usecols never selects and has been dropped as dead configuration.
products = pd.read_csv(
    IDIR + 'products.csv',
    usecols=['product_id', 'aisle_id', 'department_id'],
    dtype={
        'product_id': np.uint16,
        'aisle_id': np.uint8,
        'department_id': np.uint8})

# Report the shape and column names of each loaded table.
print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
print('train {}: {}'.format(train.shape, ', '.join(train.columns)))

###

print('computing product f')
# Group the prior order lines once by product and derive all per-product
# statistics from that single grouping.
lines_by_product = priors.groupby(priors.product_id)

prods = pd.DataFrame()
# Number of prior order lines per product.
prods['orders'] = lines_by_product.size().astype(np.int32)
# Number of those lines flagged as a reorder, per product.
prods['reorders'] = lines_by_product['reordered'].sum().astype(np.float32)
# Share of a product's order lines that are reorders.
prods['reorder_rate'] = (prods['reorders'] / prods['orders']).astype(np.float32)

# Attach the statistics to the product table and index it by product_id,
# keeping product_id available as a regular column too.
products = products.join(prods, on='product_id').set_index('product_id', drop=False)

# Release the temporaries (the groupby object still references priors).
del prods, lines_by_product


print('add order info to priors')
# Index orders by order_id in place (keeping the column) so it can be joined
# onto order lines by that key.
orders.set_index('order_id', inplace=True, drop=False)
# The join brings in a duplicate key column suffixed with '_'; discard it.
priors = priors.join(orders, on='order_id', rsuffix='_').drop('order_id_', axis=1)

print('computing user f')
# Order-level statistics per user, taken from the orders table.
orders_by_user = orders.groupby('user_id')
usr = pd.DataFrame()
usr['average_days_between_orders'] = (
    orders_by_user['days_since_prior_order'].mean().astype(np.float32))
usr['nb_orders'] = orders_by_user.size().astype(np.int16)

# Basket-level statistics per user, taken from the prior order lines.
priors_by_user = priors.groupby('user_id')
users = pd.DataFrame()
users['total_items'] = priors_by_user.size().astype(np.int16)
# Set of every product id that appears in the user's prior order lines.
users['all_products'] = priors_by_user['product_id'].apply(set)
users['total_distinct_items'] = users['all_products'].map(len).astype(np.int16)

users = users.join(usr)
# Release the temporaries (the groupby objects still reference their frames).
del usr, orders_by_user, priors_by_user

# NOTE(review): nb_orders counts every row of `orders` for the user, while
# total_items is counted from `priors` only - confirm this ratio is intended.
users['average_basket'] = (users['total_items'] / users['nb_orders']).astype(np.float32)
print('user f', users.shape)

loading prior
loading train
loading orders
loading products
priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (1384617, 4): order_id, product_id, add_to_cart_order, reordered
computing product f
add order info to priors
computing user f
user f (206209, 6)


# Данные
Юзеры из трейна и теста не пересекаются.

In [17]:
# Display the prior order lines that belong to order 1187899.
priors.loc[priors.order_id == 1187899]

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order


In [21]:

# User ids owning a 'train' order and user ids owning a 'test' order.
# us_train is defined here as well because the following cell intersects the
# two; without it a top-to-bottom run fails with NameError.
us_train = orders[orders.eval_set == 'train'].user_id
us_test = orders[orders.eval_set == 'test'].user_id


In [26]:
# User ids present in both groups; an empty set means the groups are disjoint.
set(us_train) & set(us_test)

set()

### Сделаем разбивку трейн_Тест

us_train - A side  
us_test - B

In [69]:
from sklearn.model_selection import train_test_split

# user_id of every order whose eval_set is 'train'.
# Original author's note: users and orders in the train set are unique
# (len(us_train) equals len(us_train.unique())).
us_all = orders.loc[orders.eval_set == 'train', 'user_id']

# Reproducible 80/20 split of those users into the A side (us_train) and
# the B side (us_test).
us_train, us_test = train_test_split(us_all, random_state=42, test_size=0.2)

### Create C and D parts

C - prior users products  
user_prior_products_str  
C - должна быть по всем юзерам из множеств A и B  
Выборка C содержит сет по всем заказам  

D - train users products  
user_train_products_str  
D - должна быть по всем юзерам из множеств A и B

In [78]:
# For every user in us_all, render the set of prior product ids as one
# space-separated string of tokens, each token being "<product_id>_C".
user_prior_products_str_C = (
    users.loc[us_all.values, 'all_products']
    .map(lambda product_ids: " ".join(str(pid) + '_C' for pid in product_ids))
)

In [77]:
# Look up a single train order; the value is discarded because this is not
# the last expression of the cell.
train[train.order_id == 1492625]

# Attach the order-level columns to each train order line and discard the
# duplicate key column that the join suffixes with '_'.
train = train.join(orders, on='order_id', rsuffix='_').drop('order_id_', axis=1)

# Set of product ids per user, taken from the train order lines.
users_train = pd.DataFrame()
users_train['all_products'] = train.groupby('user_id')['product_id'].apply(set)

# For every user in us_all, render that set as one space-separated string of
# tokens, each token being "<product_id>_D".
user_train_products_str_D = (
    users_train.loc[us_all.values, 'all_products']
    .map(lambda product_ids: " ".join(str(pid) + '_D' for pid in product_ids))
)

In [76]:
from sklearn.feature_extraction.text import CountVectorizer

In [84]:
#X = user_train_products_str_D.str.cat(user_prior_products_str_C, sep=' ')
#cntvect = CountVectorizer()
#cntvect.fit(X)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [92]:
# Turn each user's space-separated product-token string into a row of token
# counts. A separate CountVectorizer is fitted for each side, so X_D and X_C
# have independent vocabularies (columns); rows follow the order of the input
# Series.
X_D = CountVectorizer().fit_transform(user_train_products_str_D)
X_C = CountVectorizer().fit_transform(user_prior_products_str_C)

In [104]:
# Binarise the count matrices: any positive count becomes 1, everything else
# 0, stored as int16.
X_D = (X_D > 0).astype(np.int16)
X_C = (X_C > 0).astype(np.int16)

Расчет w для X_D

$$P_{aj} = k_a \sum_{i=1}^n w(a,i)v_{ij}$$
$$ 
w(a,i) = \frac
{\sum_{j}(v_{aj} - \bar{v}_a)(v_{ij} - \bar{v}_i)}
{\sqrt{\sum_{j}(v_{aj} - \bar{v}_a)^2 \sum_{j}(v_{ij} - \bar{v}_i)^2}}
$$
$k_a$ is a normalizing factor such that the absolute values of the weights sum to unity

$$k_a = \frac
{1}
{\sum_{i}|w(a,i)|}$$

In [124]:
# Row-wise mean of X_C (one value per row; rows presumably correspond to
# users - confirm against how X_C was built).
u_v_mean = X_C.mean(axis=1)
# TODO(review): a centred version of X_C (X_C_center) was left unfinished here.
# Dense copy of the first two rows, used as a small test matrix of two users.
m_test = X_C[:2].todense()

In [125]:
# Display the two-user test matrix. The cell previously referenced `wm_test`,
# a name that is never assigned; the matrix built in the preceding cell is
# `m_test`.
m_test

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int16)