# Import modules

In [1]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
from scipy.sparse.linalg import spsolve
import sys
import argparse
import implicit
from make_datasets_SASRec import make_datasets

In [2]:
# Data Preprocessing

In [3]:
# Load the raw purchase log and reduce it to three columns renamed to
# user (trans_id), timestamps (trans_seq) and item (pd_c).
# NOTE(review): the absolute, machine-specific path below is hard-coded;
# consider a configurable data directory so the notebook runs elsewhere.
file_path = 'C:\\Users\\CHOYEONGKYU\\Desktop\\l point\\purchase_information.csv'
data = pd.read_csv(file_path, encoding='utf-8')

# Drop rows whose product code is the 'unknown' placeholder or whose purchase
# count is zero.
removed_data = data[~(data.pd_c == 'unknown')]
removed_data = removed_data[~(removed_data.buy_ct == 0)]

# Order rows by transaction, then by position inside the transaction, and
# make the product code an integer.
removed_data = removed_data.sort_values(by=['trans_id','trans_seq'])
removed_data = removed_data.astype({'pd_c':'int'})

# rename() without inplace=True returns a new frame, so the column slice is
# never mutated in place (the in-place rename raised SettingWithCopyWarning).
new_data = removed_data[['trans_id','trans_seq', 'pd_c']].rename(
    columns={'trans_id':'user','trans_seq':'timestamps','pd_c':'item'}
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [4]:
# Preview the first rows of the (user, timestamps, item) frame.
new_data.head()

Unnamed: 0,user,timestamps,item
478560,1,1,897
478481,1,2,548
478503,1,3,894
478521,1,4,1447
478543,1,5,1529


In [4]:
def parse_args(argv=None):
  """Build the namespace of settings used by this notebook.

  Args:
    argv: Optional sequence of command-line style tokens, e.g.
      ``['--max_len', '100']``. When omitted, an empty list is parsed so the
      declared defaults are used and the kernel's own ``sys.argv`` is never
      consulted.

  Returns:
    argparse.Namespace exposing ``max_len`` (int, default 50).
  """
  parser = argparse.ArgumentParser(description='Matrix Factorization')
  parser.add_argument('--max_len', default=50, type=int)
  # An explicit list keeps argparse from falling back to sys.argv.
  return parser.parse_args(args=[] if argv is None else list(argv))

In [5]:
# Parse with no command-line tokens, i.e. take the defaults declared in parse_args.
args = parse_args()

In [7]:
# Build the datasets from the (user, timestamps, item) frame and args.max_len.
# make_datasets comes from make_datasets_SASRec; its internals are not visible
# here. The result is unpacked as three parts: train, test and dataset info.
item_dataset = make_datasets(new_data, args.max_len)
d_train, d_test, d_info = item_dataset

In [8]:
# Number of entries in the training split.
len(d_train)

38760

In [9]:
# d_info unpacks into five fields; the last two are not used here.
# items_usr_clicked holds, per session, the full sequence of items bought.
num_usr, num_item, items_usr_clicked, _, _ = d_info
all_items = list(range(num_item))

In [10]:
# Report, one per line: user count, item count, number of per-session sequences.
print(num_usr, num_item, len(items_usr_clicked), sep='\n')

38760
1182
38760


In [32]:
# Inspect the target stored for the first test row.
d_test.target.values[0]

[6]

In [42]:
# Take the first element of every test target, then count how many distinct
# items occur as test targets.
test_items = [target[0] for target in d_test.target.values]
print(len(set(test_items)))


966


In [11]:
# Hold out the last item of every session (the test item) and keep the rest.
# The truncated sequences are rebuilt from d_info[2] (the mapping that
# items_usr_clicked was unpacked from) rather than sliced in place, so
# re-running this cell no longer strips one more item on every execution.
items_usr_clicked = {
  session: sequence[:-1] for session, sequence in d_info[2].items()
}
  

In [12]:
# Spot-check the item sequence kept for session 2.
items_usr_clicked[2]

[7, 8, 9, 6, 10, 11, 12]

In [13]:
# Reshape the {session: [items, ...]} mapping into a long two-column table:
# one row per (session, item) occurrence, sessions in ascending order.
wide_sessions = pd.DataFrame.from_dict(items_usr_clicked, orient="index").sort_index()
# Stacking turns each row into (session, position) -> item entries; the
# position level is then discarded and the session label becomes a column.
stacked_items = wide_sessions.stack().astype(int)
df = stacked_items.reset_index(level=1, drop=True).reset_index()
df.columns = ['session_id', 'item']
print(df.head(10))

   session_id  item
0           1     1
1           1     2
2           1     3
3           1     4
4           1     5
5           2     7
6           2     8
7           2     9
8           2     6
9           2    10


In [14]:
# Give every (session, item) occurrence a unit weight.
df['quantity'] = 1

In [15]:
# Preview the first rows of df.
df.head()

Unnamed: 0,session_id,item,quantity
0,1,1,1
1,1,2,1
2,1,3,1
3,1,4,1
4,1,5,1


In [16]:
# Collapse repeated (session, item) rows by summing the remaining columns.
grouped_df = df.groupby(['session_id', 'item'], as_index=False).sum()

In [17]:
# Spot-check the aggregated rows of session 5.
grouped_df[grouped_df.session_id==5]

Unnamed: 0,session_id,item,quantity
25,5,17,2
26,5,18,2
27,5,23,1
28,5,24,1
29,5,25,2
30,5,26,1
31,5,27,1
32,5,28,2
33,5,29,1
34,5,30,1


In [18]:
# Same session before aggregation, for comparison.
df[df.session_id==5]

Unnamed: 0,session_id,item,quantity
25,5,23,1
26,5,17,1
27,5,18,1
28,5,24,1
29,5,25,1
30,5,18,1
31,5,17,1
32,5,25,1
33,5,26,1
34,5,27,1


In [19]:
# Axis labels and cell values for the session x item matrix.
# Both label lists are sorted so that position i lines up with category code i
# from `astype('category').cat.codes` (categories are sorted for sortable
# values). `products` was previously left in first-appearance order, which did
# not match the matrix columns.
sessions = list(np.sort(grouped_df.session_id.unique()))
products = list(np.sort(grouped_df.item.unique()))
quantity = list(grouped_df.quantity)


In [46]:
# NOTE(review): leftover interactive help lookup (IPython `?` syntax, not plain
# Python); consider removing it before sharing the notebook.
?np.in1d(np.array())

[1;31mSignature:[0m [0mnp[0m[1;33m.[0m[0min1d[0m[1;33m([0m[0mar1[0m[1;33m,[0m [0mar2[0m[1;33m,[0m [0massume_unique[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m [0minvert[0m[1;33m=[0m[1;32mFalse[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Test whether each element of a 1-D array is also present in a second array.

Returns a boolean array the same length as `ar1` that is True
where an element of `ar1` is in `ar2` and False otherwise.

We recommend using :func:`isin` instead of `in1d` for new code.

Parameters
----------
ar1 : (M,) array_like
    Input array.
ar2 : array_like
    The values against which to test each value of `ar1`.
assume_unique : bool, optional
    If True, the input arrays are both assumed to be unique, which
    can speed up the calculation.  Default is False.
invert : bool, optional
    If True, the values in the returned array are inverted (that is,
    False where an element of `ar1` is in `ar2` and True otherwise).
    De

In [20]:
# Report the number of distinct products, then the number of sessions.
print(len(products), len(sessions), sep='\n')

1161
38760


In [21]:
# Integer category codes of each row's session and item, used as matrix coordinates.
rows = grouped_df.session_id.astype('category').cat.codes
cols = grouped_df.item.astype('category').cat.codes
# CSR matrix of shape (number of sessions, number of products) filled from `quantity`.
purchase_spares = sparse.csr_matrix((quantity, (rows,cols)), shape=(len(sessions),len(products)))

In [22]:
# Display the sparse matrix summary.
purchase_spares

<38760x1161 sparse matrix of type '<class 'numpy.int32'>'
	with 292296 stored elements in Compressed Sparse Row format>

In [23]:
# Sparsity: percentage of matrix cells that hold no non-zero entry.
n_rows, n_cols = purchase_spares.shape
matrix_size = n_rows * n_cols
num_purchases = len(purchase_spares.nonzero()[0])
sparsity = 100 * (1 - num_purchases / matrix_size)

In [24]:
# Author's rule of thumb: sparsity should be below 99.5% for collaborative
# filtering to work.
sparsity

99.3504585296651

# Implicit-feedback recommendation (ALS)

In [46]:
# alpha multiplies the purchase counts before they are passed to ALS.
# NOTE(review): alpha=40, factors=20, regularization=0.1 and iterations=50 are
# hard-coded here; consider moving them to a configuration cell.
alpha = 40
# NOTE(review): the library reports this function as deprecated in favour of
# the AlternatingLeastSquares class; confirm the expected matrix orientation and
# return order against the installed `implicit` version before migrating.
user_vecs, item_vecs = implicit.alternating_least_squares((purchase_spares*alpha).astype('double'), 
                                                          factors=20, 
                                                          regularization = 0.1, 
                                                         iterations = 50)

This method is deprecated. Please use the AlternatingLeastSquares class instead
100%|██████████| 50.0/50 [00:08<00:00,  5.58it/s]


In [48]:
# Show the dimensions of the two factor matrices, one per line.
print(user_vecs.shape, item_vecs.shape, sep='\n')

(38760, 20)
(1161, 20)


In [50]:
# Score matrix: product of the user factors with the transposed item factors.
# NOTE(review): both factor arrays are wrapped as CSR before multiplying, so
# `pred` is a scipy sparse matrix; confirm whether a plain dense product
# would serve better here.
pred = sparse.csr_matrix(user_vecs).dot(sparse.csr_matrix(item_vecs.T))

In [54]:
# Report the dimensions of the score matrix. `pred.shape` yields the same tuple
# as `np.shape(pred.toarray())` without materialising the dense array, which
# was previously done twice (the first result was simply discarded).
pred.shape

(38760, 1161)

In [62]:
# First element of the first test row's target.
d_test.target.values[0][0]

6