In [1]:
import random
import pandas as pd
import numpy as np
import implicit
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from scipy import stats
from sklearn.preprocessing import MinMaxScaler



In [2]:
raw_data = pd.read_json("./sample_data",lines=True,encoding="utf-8")
raw_data.head()

Unnamed: 0,time,user_id,product_id,belong_cate_lvl1_id,belong_cate_lvl2_id,belong_cate_lvl3_id,belong_cate_lvl1_name,belong_cate_lvl2_name,belong_cate_lvl3_name,href
0,2016-03-31 00:00:00,7892FA71D8D1EAEBE438135718A5C420,2920660,8,9,11,Thời trang nữ,Áo nữ,Áo kiểu,https://www.sendo.vn/san-pham/ao-kieu-from-dai...
1,2016-03-31 00:00:06,2CEBC824EBEF1B52EA7562B716D020D8,1885579,1722,1726,1748,Túi xách,Túi xách nam,Túi đeo chéo nam,https://www.sendo.vn/san-pham/tui-deo-may-tinh...
2,2016-03-31 00:00:07,81ADFE4DC80154EAF829FC0CBADD71EA,2633702,8,52,53,Thời trang nữ,"Đồ lót, đồ ngủ và đồ mặc nhà",Áo ngực,https://www.sendo.vn/san-pham/ao-nguc-7-kieu-c...
3,2016-03-31 00:00:13,F942D7B0377D64F1280EA7BB8DCC3A47,2936475,528,529,542,Phụ kiện công nghệ,Phụ kiện Điện thoại,Phụ kiện khác,https://www.sendo.vn/san-pham/kinh-thuc-te-ao-...
4,2016-03-31 00:00:16,C8BFABB5F293BC9890124E192FBFC898,2146364,8,52,56,Thời trang nữ,"Đồ lót, đồ ngủ và đồ mặc nhà",Đồ lót bộ,https://www.sendo.vn/san-pham/set-ao-yem-quan-...


In [3]:
raw_data = raw_data.groupby(["user_id","product_id"]).count().reset_index()
raw_data.head()

Unnamed: 0,user_id,product_id,time,belong_cate_lvl1_id,belong_cate_lvl2_id,belong_cate_lvl3_id,belong_cate_lvl1_name,belong_cate_lvl2_name,belong_cate_lvl3_name,href
0,0009C208AA7A874A5791FF58F4904BA1,2551554,3,3,3,3,3,3,3,3
1,00287A7AE65343DA3028BE1518910877,1348373,1,1,1,1,1,1,1,1
2,002A34CB240ED93E8DDECDB800C717B5,2344633,1,1,1,1,1,1,1,1
3,00492A3DF96D01889EA3A13D0DEDA181,1903157,1,1,1,1,1,1,1,1
4,006C598E51FA386A33570B0205F7F7BF,2437450,1,1,1,1,1,1,1,1


In [5]:
raw_data.head()

Unnamed: 0,user_id,product_id,time
0,0009C208AA7A874A5791FF58F4904BA1,2551554,3
1,00287A7AE65343DA3028BE1518910877,1348373,1
2,002A34CB240ED93E8DDECDB800C717B5,2344633,1
3,00492A3DF96D01889EA3A13D0DEDA181,1903157,1
4,006C598E51FA386A33570B0205F7F7BF,2437450,1


In [6]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2389 entries, 0 to 2388
Data columns (total 3 columns):
user_id       2389 non-null object
product_id    2389 non-null int64
time          2389 non-null int64
dtypes: int64(2), object(1)
memory usage: 56.1+ KB


In [7]:
# Drop rows with missing values
data = raw_data[["user_id","product_id","time"]].dropna()

# Convert artists names into numerical IDs
data['user_id_code'] = data['user_id'].astype("category").cat.codes
data['product_id_code'] = data['product_id'].astype("category").cat.codes

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2389 entries, 0 to 2388
Data columns (total 5 columns):
user_id            2389 non-null object
product_id         2389 non-null int64
time               2389 non-null int64
user_id_code       2389 non-null int16
product_id_code    2389 non-null int16
dtypes: int16(2), int64(2), object(1)
memory usage: 84.0+ KB


In [9]:
data.head()

Unnamed: 0,user_id,product_id,time,user_id_code,product_id_code
0,0009C208AA7A874A5791FF58F4904BA1,2551554,3,0,927
1,00287A7AE65343DA3028BE1518910877,1348373,1,1,150
2,002A34CB240ED93E8DDECDB800C717B5,2344633,1,2,635
3,00492A3DF96D01889EA3A13D0DEDA181,1903157,1,3,245
4,006C598E51FA386A33570B0205F7F7BF,2437450,1,4,730


In [10]:
product_names=json.load(open("data/docs/product_name_mapping.json","r"))

In [11]:
item_lookup = data[['product_id', 'product_id_code']].drop_duplicates()
item_lookup['product_id_code'] = item_lookup.product_id_code.astype(str)
item_lookup['product_name'] = item_lookup.product_id.apply(lambda x: product_names[str(x)].replace("-"," "))
item_lookup.head()

Unnamed: 0,product_id,product_id_code,product_name
0,2551554,927,moc khoa kiem katana
1,1348373,150,gn016 giay the thao tre trung
2,2344633,635,giay nike thea ms 131
3,1903157,245,dong ho nam casio efr 535l 1avdf
4,2437450,730,nhan da my inox cao cap bh vinh vien nb043


In [12]:
data = data.drop(['user_id', 'product_id'], axis=1)

In [13]:

# # Drop any rows that have 0 plays
# data = data.loc[data.time != 0]
# data.head()

Unnamed: 0,time,user_id_code,product_id_code
0,3,0,927
1,1,1,150
2,1,2,635
3,1,3,245
4,1,4,730


In [14]:
users = list(np.sort(data.user_id_code.unique()))
products = list(np.sort(data.product_id_code.unique()))
time = list(data.time)

In [15]:
users[:5]

[0, 1, 2, 3, 4]

In [16]:
len(users)

859

In [17]:
products[:5]

[0, 1, 2, 3, 4]

In [18]:
len(products)

2123

In [19]:
time[:5]

[3, 1, 1, 1, 1]

In [20]:
len(time)

2389

In [21]:
# Get the rows and columns for our new matrix
rows = data.user_id_code.astype(int)
cols = data.product_id_code.astype(int)

In [22]:
data_sparse = sparse.csr_matrix((time, (rows, cols)), shape=(len(users), len(products)))

In [23]:
pd.DataFrame(list(data_sparse.toarray()), columns=range(len(products))).to_csv("./sample_sparse")

In [24]:
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)
alpha_val = 15
data_conf = (data_sparse * alpha_val).astype('double')

#Fit the model
model.fit(data_conf)



HBox(children=(IntProgress(value=0, max=20), HTML(value='')))




In [62]:
item_id = 2 
n_similar = 10

# Use implicit to get similar items.
similar = model.similar_items(item_id, n_similar)

print("product:",item_lookup.product_name.loc[data.product_id_code==item_id].iloc[0])

# Print the names of our most similar artists
for item in similar:
    idx, score = item
    print(item_lookup.product_name.loc[raw_data.product_id.loc[data.product_id_code == idx].iloc[0]==item_lookup.product_id].iloc[0])

product: giay sandal xuat khau nhat ban nv25
giay sandal xuat khau nhat ban nv25
ao kieu han quoc
giay tay nam da bo that t64
ban phim apple mini
dong ho casio efb 300l 7av chinh hang danh cho nam 2683269
quan tay nam cong so
op nokia lumia 620 sgp nham ben dep oln15
quy cotui xach henry thoi trang
sandal phong cach lee min ho gsd1414
do lot cosplay ho 3 vong sexy ql44


In [33]:
sparse_user_item = sparse.csr_matrix((time, (cols, rows)))

In [44]:
raw_data.columns

Index(['user_id', 'product_id', 'time'], dtype='object')

In [57]:
user_id = 2000

# Use the implicit recommender.
recommended = model.recommend(user_id, sparse_user_item, N=len(item_lookup), recalculate_user=True)

product_ids = []
scores = []

# Get artist names from ids
for item in recommended:
    idx, score = item
    product_ids.append(item_lookup.product_name.loc[raw_data.product_id.loc[data.product_id_code == idx].iloc[0]==item_lookup.product_id].iloc[0])
    scores.append(score)

# Create a dataframe of artist names and scores

recommendations = pd.DataFrame({'product': product_ids, 'score': scores})


print (recommendations)

                                           product     score
0                                   dau ca omega 3  0.063189
1                       jeans ngan cac loai qtt990  0.050269
2                                bat lua diem xang  0.048660
3       nuoc hoa chanel no5 eau de parfum 50ml b10  0.038022
4         ao so mi phong cach han quoc mk4019 s130  0.037326
..                                             ...       ...
853          tinker bell kids bo gap ngan 3831 bee -0.036661
854  ao khoac jean tui cai bk01 hang loai 1 y hinh -0.038580
855              tai nghe sony q89 gia re nhat hcm -0.039391
856                       danh lua sinh ton gerber -0.051051
857                        ao so mi form dai meari -0.053656

[858 rows x 2 columns]


In [50]:
alpha = 40
user_vecs, item_vecs = implicit.alternating_least_squares((data_sparse*alpha).astype('double'), 
                                                          factors=20, 
                                                          regularization = 0.1, 
                                                          iterations = 20)

NotImplementedError: adding a nonzero scalar to a sparse matrix is not supported

In [25]:
user_vecs.shape

(859, 20)

In [26]:
item_vecs.shape

(2123, 20)

In [48]:
# do it in chunks else we get memory error
u_idx = data['user_id_code'].values
i_idx = data['product_id_code'].values
n_chunks = 30
chunks = np.array_split(np.arange(user_vecs.shape[0]), n_chunks)
res = []
previous_max = 0
for i,idx in enumerate(chunks):
    print(f'Doing Chunk {i+1}/{n_chunks}')
    score = (user_vecs[idx].dot(item_vecs))
    score = (-score).argsort() / score.shape[1]
    sel = (u_idx >= idx.min()) & (u_idx <= idx.max())
    chunk_score = np.asarray(score[u_idx[sel] - previous_max, i_idx[sel]]).reshape(-1)
    res.append(chunk_score)
    previous_max = idx.max() + 1
np.concatenate(res)

Doing Chunk 1/30


ValueError: shapes (29,20) and (2123,20) not aligned: 20 (dim 1) != 2123 (dim 0)

In [28]:
item_id = 1488

# Get the item row for Jay-Z
item_vec = item_vecs[item_id].T

# Calculate the similarity score between Mr Carter and other artists
# and select the top 10 most similar.
scores = item_vecs.dot(item_vec).reshape(1,-1)[0]
top_10 = np.argsort(scores)[::-1][:10]

products = []
product_scores = []

# Get and print the actual artists names and scores
for idx in top_10:
    products.append(item_lookup.product_name.loc[item_lookup.product_id_code == str(idx)].iloc[0])
    product_scores.append(scores[idx])

similar = pd.DataFrame({'product': products, 'score': product_scores})
print("Top 10 similar items of `{}`".format(item_lookup.product_name.loc[item_lookup.product_id_code == str(idx)].iloc[0]))
similar

Top 10 similar items of `giay vai g010`


Unnamed: 0,product,score
0,quan jogger kaki han quoc,16.458139
1,ma mm1011 quan jeans nam tre trung phong cach,15.349257
2,quan jogger kaki han quoc,14.561874
3,giay nam adidas running men,14.519055
4,quan jogger kaki han quoc qk3,13.854018
5,quan nu thoi trang co gian chat dep q330,12.40118
6,quan jogger,10.972501
7,ma mm1005 quan jogger kaki nhieu mau tre trung,10.678747
8,kham rang ca sau game vui,10.43117
9,giay vai g010,10.229251


In [39]:
user_id = 2

#------------------------------
# GET ITEMS CONSUMED BY USER
#------------------------------

# Let's print out what the user has listened to
consumed_idx = data_sparse[user_id,:].nonzero()[1].astype(str)
consumed_items = item_lookup.loc[item_lookup.product_id_code.isin(consumed_idx)]
consumed_items

Unnamed: 0,product_id,product_id_code,product_name
2,2344633,635,giay nike thea ms 131


In [40]:
raw_data[raw_data.product_id==2799724]

Unnamed: 0,user_id,product_id,time
124,0FAE211D8FD332CCF1EFAA525A6907D3,2799724,1
833,5F27822DBFC1BB575DA3B35D779CCAFC,2799724,1


In [47]:
#------------------------------
# CREATE USER RECOMMENDATIONS
#------------------------------

def recommend(user_id, user_vecs, item_vecs, item_lookup, num_items=None):
    """Recommend items for a given user given a trained model
    
    Args:
        user_id (int): The id of the user we want to create recommendations for.
        
        data_sparse (csr_matrix): Our original training data.
        
        user_vecs (csr_matrix): The trained user x features vectors
        
        item_vecs (csr_matrix): The trained item x features vectors
        
        item_lookup (pandas.DataFrame): Used to map artist ids to artist names
        
        num_items (int): How many recommendations we want to return:
        
    Returns:
        recommendations (pandas.DataFrame): DataFrame with num_items artist names and scores
    
    """
    # This is where we calculate the recommendation by taking the 
    # dot-product of the user vectors with the item vectors.
    # Pui
    rec_vector = user_vecs[user_id,:].dot(item_vecs.T)
    recommend_vector = rec_vector.reshape(-1,1)[:,0]
   
    # Get all the artist indices in order of recommendations (descending) and
    # select only the top "num_items" items.
    if num_items:
        item_idx = np.argsort(recommend_vector)[::-1][:num_items]
    else:
        item_idx = np.argsort(recommend_vector)[::-1]
    products = []
    scores = []

    # Loop through our recommended artist indicies and look up the actial artist name
    for idx in item_idx:
        products.append(item_lookup.product_name.loc[item_lookup.product_id_code == str(idx)].iloc[0])
        scores.append(recommend_vector[idx])

    # Create a new dataframe with recommended artist names and scores
    recommendations = pd.DataFrame({'product': products, 'score': scores})
    
    return recommendations

# Let's generate and print our recommendations
recommendations = recommend(user_id, user_vecs, item_vecs, item_lookup)


In [46]:
recommendations["rank"] = [100-stats.percentileofscore(recommendations.score.values, x) for x in recommendations.score.values]

In [43]:
recommendations.head()

Unnamed: 0,product,score,rank
0,giay the thao ls0891,0.279376,0.0
1,giay sdidas nu nam sieu nhe,0.270372,0.047103
2,giay nike thea ms 131,0.252442,0.094206
3,giay adidas neo 2015 ms 026,0.236952,0.141309
4,giay tang chieu cao nike air max hang nhap si ...,0.235298,0.188413


In [44]:
recommendations.tail()

Unnamed: 0,product,score,rank
2118,giay the thao pt 28 new 2016,-0.154836,99.764484
2119,haco ao so mi tinh nhan de thuong,-0.159346,99.811587
2120,set ao kem quan ong rong,-0.159904,99.858691
2121,ao so mi nam tron body tay dai cong so,-0.161745,99.905794
2122,ban phim co motospeed k87 tkl rainbow,-0.25267,99.952897


In [35]:
recommendations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2123 entries, 0 to 2122
Data columns (total 3 columns):
product    2123 non-null object
score      2123 non-null float64
rank       2123 non-null float64
dtypes: float64(2), object(1)
memory usage: 49.9+ KB


In [36]:
PR = sum((recommendations["score"] * recommendations["rank"]))/sum(recommendations["score"])

In [37]:
PR

-63.05024613777266