In [1]:
import implicit
import pandas as pd
import numpy as np
from collections import Counter
from scipy import sparse
from scipy.sparse import csr_matrix
from implicit.evaluation import precision_at_k, train_test_split, mean_average_precision_at_k
from implicit.als import AlternatingLeastSquares

In [4]:
orders = pd.read_csv('data/orders.csv', sep=';')
products = pd.read_csv('data/products.csv', sep=';')
users = pd.read_csv('data/users.csv', sep=';')

In [6]:
last_orders_1 = orders.groupby('user_id').agg({'order_id':'last'}).reset_index()
train_orders = orders[~orders.order_id.isin(list(last_orders_1['order_id']))]
last_orders_2 = train_orders.groupby('user_id').agg({'order_id':'last'}).reset_index()

last_orders = pd.concat([last_orders_1, last_orders_2])

test_orders = orders[orders.order_id.isin(list(last_orders['order_id']))]
train_orders = orders[~orders.order_id.isin(list(last_orders['order_id']))]

In [7]:
user_id = 11
test_orders[test_orders.user_id == user_id].drop(columns=['Unnamed: 0']).to_csv("test_user_id_11.csv", sep=';', index=False)

In [8]:
test_orders[test_orders.user_id == 3].drop(columns=['Unnamed: 0'])

Unnamed: 0,order_id,product_id,user_id,day
29967041,3160850,39190,3,28
29967042,3160850,47766,3,28
29967043,3160850,16797,3,28
29967044,3160850,43961,3,28
29967045,3160850,48523,3,28
30582483,3225766,39190,3,29
30582484,3225766,22035,3,29
30582485,3225766,43961,3,29
30582486,3225766,18599,3,29
30582487,3225766,21903,3,29


In [9]:
train_orders['cnt'] = (np.log2(train_orders['day']+1) + 1)/ (np.log2(31) + 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [10]:
data_test = np.array([1]*len(test_orders))
row_ind_test = np.array(test_orders['user_id'])
col_ind_test = np.array(test_orders['product_id'])

In [11]:
item_user_data_test = csr_matrix((data_test, (row_ind_test, col_ind_test)), \
                                 shape=(len(users)+1, len(products)+1)).T.tocsr()

In [12]:
data_train = np.array(train_orders['cnt'])
row_ind_train = np.array(train_orders['user_id'])
col_ind_train = np.array(train_orders['product_id'])

In [13]:
item_user_data_train = csr_matrix((data_train, (row_ind_train, col_ind_train)), \
                                 shape=(len(users)+1, len(products)+1)).T.tocsr()

In [14]:
model = AlternatingLeastSquares(factors=100, iterations=15, calculate_training_loss=True)
model.fit(item_user_data_train)

100%|██████████| 15.0/15 [02:57<00:00, 11.01s/it, loss=0.0013]


In [27]:
for idx, score in model.similar_items(55):
    print(products[products.product_id == idx]['product_name'].values[0], score)

Lasting Color Shampoo 0.0417644
Lasting Color Conditioner 0.041692156
Lemongrass Pure Essential Oil 0.04167495
Grapefruit & Green Tea Dishwashing Liquid 0.04167036
Lactase Enzyme 0.04166927
Mediterranean Fig Hand Wash 0.041667435
Superfood+ Turmeric Tamari Almonds 0.041666467
Ecopouf Dual Cleansing Pad 0.041665778
Organic White Balsamic Condiment 0.04166556
Res-Q Ointment 0.0416653


In [None]:
import pickle

In [76]:
pickle.dump(model, open('model.pkl', 'wb'))

In [149]:
user_items_train = item_user_data_train.T.tocsr()

In [245]:
userid = 21

In [247]:
recommendations = model.recommend(userid, user_items_train, N=10)

In [248]:
recommendations[-20:]

[(21709, 0.6420796),
 (33198, 0.62010527),
 (49235, 0.46374303),
 (30776, 0.43378443),
 (5785, 0.43195465),
 (20119, 0.39857566),
 (26620, 0.37768528),
 (47141, 0.3681677),
 (5876, 0.3483123),
 (1940, 0.33878064)]

In [249]:
for idx, _ in recommendations:
    print(products.loc[products.product_id == idx, 'product_name'].values[0])

Sparkling Lemon Water
Sparkling Natural Mineral Water
Organic Half & Half
Organic Raw Kombucha Gingerade
Organic Reduced Fat 2% Milk
Sparkling Water Berry
Peach Pear Flavored Sparkling Water
Cola
Organic Lemon
Organic 2% Reduced Fat Milk


In [250]:
for idx, row in test_orders[(test_orders.user_id == userid)].iterrows():
    print(products.loc[products.product_id == row['product_id'], 'product_name'].values[0])

Morning Fog Chardonnay
Chardonnay
Pinot Noir
Clara
Pepperidge Farm® Goldfish Xtra Cheddar Baked Snack Crackers
White Cheddar Baked Snack Crackers
Classic Hummus with Rold Gold Petzels Snack
Boomchickapop Sea Salt Popcorn
Easy Mac Original Flavor Macaroni & Cheese Dinner
Total 2% All Natural Greek Strained Yogurt with Honey
Vanilla Skyr Nonfat Yogurt
Icelandic Style Skyr Blueberry Non-fat Yogurt
Non Fat Acai & Mixed Berries Yogurt
Natural Almonds 100 Calorie Packs
Sugarfree Energy Drink
Organic Fuji Apple
Original Semisoft Cheese
Organic Half & Half


In [89]:
test_orders[test_orders.user_id == userid]

Unnamed: 0.1,Unnamed: 0,order_id,product_id,user_id,day
29474805,29474805,3108588,12427,1,28
29474806,29474806,3108588,196,1,28
29474807,29474807,3108588,10258,1,28
29474808,29474808,3108588,25133,1,28
29474809,29474809,3108588,46149,1,28
29474810,29474810,3108588,49235,1,28
31927070,31927070,3367565,196,1,30
31927071,31927071,3367565,12427,1,30
31927072,31927072,3367565,10258,1,30
31927073,31927073,3367565,25133,1,30


## Метрики

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
metrics_df = pd.DataFrame(columns=['factors', 'iterations', 'regularization', 'type', 'precision', 'map'])

In [None]:
factors=[20, 30, 50, 100, 150] 
iterations=[10, 15, 20] 
regularization=[0.01, 0.05, 0.1]
for f in factors:
    for i in iterations:
        for r in regularization:
            model = AlternatingLeastSquares(factors=f, iterations=i, regularization=r, calculate_training_loss=True)
            model.fit(item_user_data_train)
            p = precision_at_k(model, item_user_data_train.T.tocsr(), item_user_data_test.T.tocsr(), K=10, num_threads=4)
            map_k = mean_average_precision_at_k(model, item_user_data_train.T.tocsr(), item_user_data_test.T.tocsr(), K=10, num_threads=4)
            print 'Factors: ' + str(f) + ' iters: ' + str(i) + ' regul: ' + str(r) + \
                ' Precision : ' + str(p), ' MAP: ' + str(map_k)
            type_label = 'f' + str(f) + '_iter' + str(i) + '_reg' + str(r)
            metrics_df = metrics_df.append({'factors': f, 'iterations': i, 'regularization': r, \
                                            'type': type_label, 'precision': p, 'map': map_k}, ignore_index=True)

In [None]:
sns.set(rc = {'figure.figsize':(20,7)})
ax = sns.lineplot(x = 'type', y = 'precision', data=metrics_df_cp, markers=True)
plt.xticks(metrics_df_cp['type'], metrics_df_cp['type'], rotation='vertical')
plt.show()

In [None]:
sns.set(rc = {'figure.figsize':(20,8.27)})
ax = sns.lineplot(x = 'type', y = 'map', data=metrics_df_cp, markers=True)
plt.xticks(metrics_df_cp['type'], metrics_df_cp['type'], rotation='vertical')
plt.show()