In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

In [2]:
data = pd.read_csv('retail_train.csv')

data.columns = [col.lower() for col in data.columns]
data.rename(columns={'household_key': 'user_id',
                    'product_id': 'item_id'},
           inplace=True)


test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
data_train['price'] = data_train['sales_value'] / (np.maximum(data_train['quantity'], 1))
data_train['price'].max()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_train['price'] = data_train['sales_value'] / (np.maximum(data_train['quantity'], 1))


499.99

In [4]:
data_train['price'].quantile(0.20)

0.99

In [5]:
data_train['price'].quantile(0.99995)

82.01435450003483

In [6]:
def prefilter_items(data):
    # Уберем самые популярные товары (их и так купят)
    popularity = (data.groupby('item_id')['user_id'].nunique() / data['user_id'].nunique()).reset_index()
    popularity.rename(columns={'user_id': 'share_unique_users'}, inplace=True)
    
    top_popular = popularity[popularity['share_unique_users'] > 0.5].item_id.tolist()
    data = data[~data['item_id'].isin(top_popular)]
    
    # Уберем самые НЕ популярные товары (их и так НЕ купят)
    top_notpopular = popularity[popularity['share_unique_users'] < 0.01].item_id.tolist()
    data = data[~data['item_id'].isin(top_notpopular)]
    
    # Уберем товары, которые не продавались за последние 12 месяцев
    last_12_months = data.loc[data['day'] < 365, 'item_id'].unique()
    data = data[data['item_id'].isin(last_12_months)]
    
    # Уберем слишком дешевые товары (на них не заработаем). 1 покупка из рассылок стоит 60 руб. 
    data = data[data['price'] >= 1]
    
    # Уберем слишком дорогие товарыs
    data = data[data['price'] < 100]

    return data

In [7]:
data_train = prefilter_items(data_train)

In [8]:
data_train

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
0,2375,26984851472,1,1004906,1,1.39,364,-0.60,1631,1,0.0,0.0,1.39
4,2375,26984851472,1,8160430,1,1.50,364,-0.39,1631,1,0.0,0.0,1.50
6,2375,26984851516,1,1043142,1,1.57,364,-0.68,1642,1,0.0,0.0,1.57
7,2375,26984851516,1,1085983,1,2.99,364,-0.40,1642,1,0.0,0.0,2.99
8,2375,26984851516,1,1102651,1,1.89,364,0.00,1642,1,0.0,0.0,1.89
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2282313,1541,41297771177,635,972931,1,1.99,304,0.00,1300,91,0.0,0.0,1.99
2282317,1168,41297772063,635,908531,1,1.00,304,-0.89,1526,91,0.0,0.0,1.00
2282321,462,41297773713,635,993339,1,1.99,304,0.00,2040,91,0.0,0.0,1.99
2282323,462,41297773713,635,10180324,1,3.00,304,-0.29,2040,91,0.0,0.0,3.00
