In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import annoy
import scipy.sparse as sparse
import random
from implicit.als import AlternatingLeastSquares as ALS
import implicit.evaluation

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_parquet('data/market_final.parquet')

In [3]:
df.head()

Unnamed: 0,event_time,event_type,product_id,category_code,sub_category,product,brand,price,user_id,user_session
0,2019-11-01 00:00:00,view,1003461,electronics,smartphone,smartphone,xiaomi,489.07,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33
1,2019-11-01 00:00:00,view,5000088,appliances,sewing_machine,sewing_machine,janome,293.65,530496790,8e5f4f83-366c-4f70-860e-ca7417414283
2,2019-11-01 00:00:01,view,1306421,computers,notebook,notebook,hp,514.56,514028527,df8184cc-3694-4549-8c8c-6b5171877376
3,2019-11-01 00:00:01,view,1004775,electronics,smartphone,smartphone,xiaomi,183.27,558856683,313628f1-68b8-460d-84f6-cec7a8796ef2
4,2019-11-01 00:00:01,view,1306894,computers,notebook,notebook,hp,360.09,520772685,816a59f3-f5ae-4ccd-9b23-82aa8c23d33c


제품 카테고리, 브랜드, 가격 정보를 활용한 컨텐츠 기반 필터링 모델

In [4]:
df_cb = df[['product_id', 'category_code', 'sub_category', 'product', 'brand', 'price']]
df_cb = df_cb.drop_duplicates(['product_id'], keep = 'first')
df_cb = df_cb.sort_values(by='product_id', ascending=True)
df_cb.reset_index(drop=True, inplace=True)

In [5]:
df_cb.shape

(69773, 6)

In [6]:
df_cb.head()

Unnamed: 0,product_id,category_code,sub_category,product,brand,price
0,1000365,electronics,smartphone,smartphone,sony,1029.09
1,1000978,electronics,smartphone,smartphone,samsung,301.14
2,1001588,electronics,smartphone,smartphone,meizu,127.87
3,1001606,electronics,smartphone,smartphone,apple,363.07
4,1001618,electronics,smartphone,smartphone,apple,501.66


In [7]:
df_cb_price = df_cb['price'].to_frame()
df_cb_price.head()

Unnamed: 0,price
0,1029.09
1,301.14
2,127.87
3,363.07
4,501.66


In [8]:
# 최대 최소 범위에서 스케일링
scaler = MinMaxScaler()
price_feature_scaled = scaler.fit_transform(df_cb_price)
price_feature_scaled = pd.DataFrame(price_feature_scaled, columns=df_cb_price.columns, index=list(df_cb_price.index.values))
price_feature_scaled.head()

Unnamed: 0,price
0,0.399611
1,0.116726
2,0.049392
3,0.140792
4,0.194649


In [9]:
df_cb['product_features'] = df_cb['category_code'] + ' ' + df_cb['sub_category'] + ' ' + df_cb['product'] + ' ' + df_cb['brand']
df_cb.head()

Unnamed: 0,product_id,category_code,sub_category,product,brand,price,product_features
0,1000365,electronics,smartphone,smartphone,sony,1029.09,electronics smartphone smartphone sony
1,1000978,electronics,smartphone,smartphone,samsung,301.14,electronics smartphone smartphone samsung
2,1001588,electronics,smartphone,smartphone,meizu,127.87,electronics smartphone smartphone meizu
3,1001606,electronics,smartphone,smartphone,apple,363.07,electronics smartphone smartphone apple
4,1001618,electronics,smartphone,smartphone,apple,501.66,electronics smartphone smartphone apple


In [10]:
product_lookup = df_cb['product_id'].to_frame()
product_lookup

Unnamed: 0,product_id
0,1000365
1,1000978
2,1001588
3,1001606
4,1001618
...,...
69768,100028296
69769,100028349
69770,100028351
69771,100028391


In [11]:
tfidf = TfidfVectorizer()
df_tfidf = tfidf.fit_transform(df_cb['product_features'])
df_tfidf = pd.DataFrame(df_tfidf.todense(), columns = tfidf.get_feature_names_out())
df_tfidf.head()

Unnamed: 0,aardwolf,abtoys,accessories,accumaster,acd,acer,aces,acme,acoustic,acqua,...,zobo,zongshen,zoom,zorg,zotac,zpao,zte,zubr,zuru,zwerg
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
df_tfidf.insert(0, 'product_price', price_feature_scaled)

In [13]:
df_tfidf

Unnamed: 0,product_price,aardwolf,abtoys,accessories,accumaster,acd,acer,aces,acme,acoustic,...,zobo,zongshen,zoom,zorg,zotac,zpao,zte,zubr,zuru,zwerg
0,0.399611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.116726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.049392,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.140792,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.194649,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69768,0.044414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69769,0.036611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69770,0.036611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69771,0.028710,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
pca = PCA(n_components=100)
df_tfidf_pca = pca.fit_transform(df_tfidf)
df_tfidf_pca.shape

(69773, 100)

In [15]:
annoy_index = annoy.AnnoyIndex(f=df_tfidf_pca.shape[1], metric='angular')

for idx in range(df_tfidf_pca.shape[0]):
    vector = df_tfidf_pca[idx, :]
    annoy_index.add_item(idx, vector)

In [16]:
annoy_index.build(n_trees=10)
annoy_index.save('cb.ann')

True

In [17]:
get_nns_list = annoy_index.get_nns_by_vector(vector=df_tfidf_pca[36235], n=16, include_distances=True)
get_nns_list

([36235,
  36231,
  36267,
  35173,
  36075,
  36069,
  64204,
  64209,
  64893,
  64930,
  64943,
  35529,
  36194,
  36195,
  36196,
  35537],
 [0.00020413610036484897,
  0.004422598984092474,
  0.02029496431350708,
  0.02046213299036026,
  0.020615287125110626,
  0.020855436101555824,
  0.02091101184487343,
  0.02091101184487343,
  0.023182062432169914,
  0.023182062432169914,
  0.023182062432169914,
  0.023766987025737762,
  0.023766987025737762,
  0.023766987025737762,
  0.023766987025737762,
  0.026500549167394638])

In [18]:
df_cb.iloc[get_nns_list[0]] # 777번 아이템과 유사한 15개의 아이템 추천 리스트

Unnamed: 0,product_id,category_code,sub_category,product,brand,price,product_features
36235,13903348,construction,components,faucet,lava,191.0,construction components faucet lava
36231,13903343,construction,components,faucet,lava,197.28,construction components faucet lava
36267,13903393,construction,components,faucet,elleci,176.15,construction components faucet elleci
35173,13901420,construction,components,faucet,cobra,167.58,construction components faucet cobra
36075,13903045,construction,components,faucet,florentina,174.01,construction components faucet florentina
36069,13903028,construction,components,faucet,damixa,163.73,construction components faucet damixa
64204,100003671,construction,components,faucet,drgans,165.0,construction components faucet drgans
64209,100003707,construction,components,faucet,drgans,165.0,construction components faucet drgans
64893,100005488,construction,components,faucet,jacobdelafon,181.73,construction components faucet jacobdelafon
64930,100005650,construction,components,faucet,jacobdelafon,181.73,construction components faucet jacobdelafon


사용자의 구매 기록(purchase)를 활용한 ALS모델 기반 추천 시스템

In [19]:
df_implicit = df[['user_id', 'product_id', 'event_type']]
df_implicit

Unnamed: 0,user_id,product_id,event_type
0,520088904,1003461,view
1,530496790,5000088,view
2,514028527,1306421,view
3,558856683,1004775,view
4,520772685,1306894,view
...,...,...,...
42018767,562661595,12301059,view
42018768,545223467,28719425,view
42018769,579969851,1004233,view
42018770,531607492,2701706,view


In [20]:
df_implicit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42018772 entries, 0 to 42018771
Data columns (total 3 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   user_id     int32 
 1   product_id  int32 
 2   event_type  object
dtypes: int32(2), object(1)
memory usage: 641.2+ MB


In [21]:
def event_type_score(df, value, score):
    df.loc[df['event_type'] == value, 'event_type'] = score

In [22]:
dict = {'view':np.nan, 'cart':np.nan, 'purchase':1}
for k, v in dict.items():
    event_type_score(df_implicit, k, v)

In [23]:
df_implicit = df_implicit.dropna()
df_implicit.shape

(659251, 3)

In [24]:
df_implicit

Unnamed: 0,user_id,product_id,event_type
83,513351129,1005161,1
389,562958505,1004856,1
528,557746614,1801881,1
530,514166940,5800823,1
612,515240495,30000218,1
...,...,...,...
42018611,574868869,1004767,1
42018622,547804983,1004874,1
42018634,515582054,1005130,1
42018681,579876821,1004767,1


In [25]:
df_implicit['event_type'] = df_implicit['event_type'].astype('int')
df_implicit.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 659251 entries, 83 to 42018706
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   user_id     659251 non-null  int32
 1   product_id  659251 non-null  int32
 2   event_type  659251 non-null  int32
dtypes: int32(3)
memory usage: 12.6 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_implicit['event_type'] = df_implicit['event_type'].astype('int')


In [26]:
df_implicit_grouped = df_implicit.groupby(['product_id', 'user_id']).sum().reset_index()
df_implicit_grouped

Unnamed: 0,product_id,user_id,event_type
0,1000978,512405575,1
1,1000978,516351275,1
2,1000978,519261186,1
3,1000978,520341367,1
4,1000978,536719764,1
...,...,...,...
506945,100024607,565302483,1
506946,100025474,521856690,1
506947,100026889,563035784,1
506948,100027045,578925058,1


In [27]:
user_unique = df_implicit_grouped['user_id'].unique()
product_unique = df_implicit_grouped['product_id'].unique()

user_to_idx = {v:k for k,v in enumerate(user_unique)}
product_to_idx = {v:k for k,v in enumerate(product_unique)}

In [28]:
temp_user_data = df_implicit_grouped['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(df_implicit_grouped):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    df_implicit_grouped['user_id'] = temp_user_data   # 데이터프레임['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# product_to_idx을 통해 product_id 컬럼도 동일한 방식으로 인덱싱해 줍니다. 
temp_product_data = df_implicit_grouped['product_id'].map(product_to_idx.get).dropna()
if len(temp_product_data) == len(df_implicit_grouped):
    print('product column indexing OK!!')
    df_implicit_grouped['product_id'] = temp_product_data
else:
    print('product column indexing Fail!!')

user_id column indexing OK!!
product column indexing OK!!


In [29]:
df_implicit_grouped

Unnamed: 0,product_id,user_id,event_type
0,0,0,1
1,0,1,1
2,0,2,1
3,0,3,1
4,0,4,1
...,...,...,...
506945,20767,330391,1
506946,20768,326237,1
506947,20769,330392,1
506948,20770,330393,1


In [30]:
num_user = df_implicit_grouped['user_id'].nunique()
num_product = df_implicit_grouped['product_id'].nunique()

implicit_matrix = sparse.csr_matrix((df_implicit_grouped.event_type, (df_implicit_grouped.user_id, df_implicit_grouped.product_id)),
                                    shape=(num_user,num_product))

In [31]:
implicit_matrix.T

<20772x330394 sparse matrix of type '<class 'numpy.intc'>'
	with 506950 stored elements in Compressed Sparse Column format>

In [32]:
matrix_size = implicit_matrix.shape[0]* implicit_matrix.shape[1]
num_purchases = len(implicit_matrix.nonzero()[0])
sparsity = 100 * (1 - (num_purchases / matrix_size))
sparsity

99.99261322855628

In [33]:
def make_train (matrix, percentage = .2):
    '''
    -----------------------------------------------------
    설명
    유저-아이템 행렬 (matrix)에서 
    1. 0 이상의 값을 가지면 1의 값을 갖도록 binary하게 테스트 데이터를 만들고
    2. 훈련 데이터는 원본 행렬에서 percentage 비율만큼 0으로 바뀜
    
    -----------------------------------------------------
    반환
    training_set: 훈련 데이터에서 percentage 비율만큼 0으로 바뀐 행렬
    test_set:     원본 유저-아이템 행렬의 복사본
    user_inds:    훈련 데이터에서 0으로 바뀐 유저의 index
    '''
    test_set = implicit_matrix.copy()
    
    training_set = implicit_matrix.copy()
    nonzero_inds = training_set.nonzero()
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1]))
    
    random.seed(0)
    num_samples = int(np.ceil(percentage * len(nonzero_pairs)))
    samples = random.sample(nonzero_pairs, num_samples)
    
    user_inds = [index[0] for index in samples]
    item_inds = [index[1] for index in samples]
    
    training_set[user_inds, item_inds] = 0
    training_set.eliminate_zeros()
    
    return training_set, test_set, list(set(user_inds))

# 훈련, 테스트 데이터 생성
mat_train, mat_test, product_users_altered = make_train(implicit_matrix, 0.2)

In [34]:
als_model = ALS(factors=30,
                regularization=0.1,
                alpha = 40,
                iterations = 30,
                calculate_training_loss = True)
als_model.fit(mat_train)

100%|██████████| 30/30 [04:47<00:00,  9.57s/it, loss=0.000998]


In [35]:
implicit.evaluation.precision_at_k(als_model, mat_train, mat_test, K=15, show_progress=True)

100%|██████████| 330394/330394 [00:44<00:00, 7489.55it/s]


0.02781315044727998

In [36]:
# 330394번째 유저의 latent factors 확인
als_model.user_factors[330393]

array([-8.5597418e-10, -1.5552704e-09, -7.2105766e-11, -3.7473702e-10,
        5.9785932e-10, -6.6317152e-10,  6.4601058e-10, -8.3007196e-10,
       -5.1634413e-10,  6.3866651e-10,  5.7529981e-10, -2.6126368e-10,
        3.7322939e-10, -3.2966567e-09,  1.1515937e-09,  2.3807933e-10,
       -3.2644942e-10,  2.6577327e-09,  8.8510732e-10, -2.2930491e-09,
        1.1114054e-10,  5.0200422e-10,  8.0209994e-10,  4.9730757e-09,
       -1.9994324e-09,  1.2762804e-09,  3.1840397e-10,  3.2364753e-09,
       -1.7749727e-10,  4.8948929e-09], dtype=float32)

In [37]:
# 20772번째 제품의 latent factors 확인
als_model.item_factors[20771]

array([5.43810164e-10, 3.77862963e-10, 4.69908445e-10, 1.07930664e-10,
       9.28573107e-10, 2.90957147e-10, 6.72667144e-10, 5.67818959e-10,
       3.18575166e-10, 2.16631990e-10, 6.14132689e-10, 4.55474880e-10,
       3.86160881e-10, 2.38243703e-10, 3.63687358e-10, 8.90356011e-10,
       5.74156278e-11, 2.57902505e-10, 5.08326714e-10, 2.02825534e-11,
       5.42728806e-10, 4.06811779e-10, 6.92490787e-10, 9.82230075e-10,
       7.11281145e-10, 1.01613717e-11, 8.73291439e-11, 1.23528965e-12,
       1.15711107e-10, 9.41056344e-10], dtype=float32)

In [38]:
# 구매한 아이템과 비슷한 아이템에 대한 추천

product_idx = product_to_idx[1004650]
similar_product = als_model.similar_items(product_idx, N=16)

idx_to_product = {v:k for k,v in product_to_idx.items()}
product_id_list = [idx_to_product[i] for i in similar_product[0]]

In [39]:
df_cb[df_cb['product_id'] == 1004650]

Unnamed: 0,product_id,category_code,sub_category,product,brand,price,product_features
777,1004650,electronics,smartphone,smartphone,samsung,622.27,electronics smartphone smartphone samsung


In [40]:
df_cb[df_cb['product_id'].isin(product_id_list)]

Unnamed: 0,product_id,category_code,sub_category,product,brand,price,product_features
471,1004158,electronics,smartphone,smartphone,samsung,733.61,electronics smartphone smartphone samsung
735,1004573,electronics,smartphone,smartphone,samsung,720.45,electronics smartphone smartphone samsung
777,1004650,electronics,smartphone,smartphone,samsung,622.27,electronics smartphone smartphone samsung
779,1004653,electronics,smartphone,smartphone,samsung,588.92,electronics smartphone smartphone samsung
787,1004665,electronics,smartphone,smartphone,samsung,782.22,electronics smartphone smartphone samsung
1099,1005072,electronics,smartphone,smartphone,samsung,990.24,electronics smartphone smartphone samsung
2184,1306610,computers,notebook,notebook,lenovo,720.71,computers notebook notebook lenovo
8866,2900847,appliances,kitchen,microwave,bbk,97.79,appliances kitchen microwave bbk
37961,17000060,computers,desktop,desktop,e-blue,294.86,computers desktop desktop e-blue
45790,21411780,electronics,clocks,clocks,tissot,360.37,electronics clocks clocks tissot


In [41]:
# ID 519261186 유저가 선호할 만한 아이템 추천
user = user_to_idx[519261186]
# recommend에서는 user*item CSR Matrix를 받음.
product_recommend = als_model.recommend(user, implicit_matrix[user], N=15)
product_recommend

(array([739, 672, 670, 536, 731,  18,  13, 462, 452, 510, 523, 738, 778,
         30, 701]),
 array([0.83791304, 0.7931893 , 0.77002525, 0.6492033 , 0.62172914,
        0.6086109 , 0.58955985, 0.56608593, 0.5652786 , 0.5576108 ,
        0.54713184, 0.49755776, 0.49480116, 0.4926841 , 0.48991564],
       dtype=float32))

In [62]:
product_id_list2 = [idx_to_product[i] for i in product_recommend[0]]
df_cb[df_cb['product_id'].isin(product_id_list2)]

Unnamed: 0,product_id,category_code,sub_category,product,brand,price,product_features
28,1002524,electronics,smartphone,smartphone,apple,531.26,electronics smartphone smartphone apple
33,1002532,electronics,smartphone,smartphone,apple,532.57,electronics smartphone smartphone apple
51,1002629,electronics,smartphone,smartphone,apple,358.31,electronics smartphone smartphone apple
851,1004750,electronics,smartphone,smartphone,samsung,195.06,electronics smartphone smartphone samsung
862,1004767,electronics,smartphone,smartphone,samsung,242.63,electronics smartphone smartphone samsung
923,1004839,electronics,smartphone,smartphone,oppo,179.12,electronics smartphone smartphone oppo
938,1004856,electronics,smartphone,smartphone,samsung,128.42,electronics smartphone smartphone samsung
951,1004873,electronics,smartphone,smartphone,samsung,362.29,electronics smartphone smartphone samsung
1124,1005098,electronics,smartphone,smartphone,samsung,139.64,electronics smartphone smartphone samsung
1126,1005100,electronics,smartphone,smartphone,samsung,139.68,electronics smartphone smartphone samsung


In [60]:
# ID 1002532 아이템을 추천한 이유를 추론 가능
explain = als_model.explain(user, implicit_matrix, itemid=672)
explain_list = [(idx_to_product[i[0]], i[1]) for i in explain[1]]
explain_product_list = []
for i in range(0,7):
    explain_product_list.append(explain_list[i][0])

In [61]:
df_cb[df_cb['product_id'].isin(explain_product_list)]

Unnamed: 0,product_id,category_code,sub_category,product,brand,price,product_features
1,1000978,electronics,smartphone,smartphone,samsung,301.14,electronics smartphone smartphone samsung
12,1002101,electronics,smartphone,smartphone,samsung,370.64,electronics smartphone smartphone samsung
52,1002633,electronics,smartphone,smartphone,apple,358.31,electronics smartphone smartphone apple
922,1004838,electronics,smartphone,smartphone,oppo,178.87,electronics smartphone smartphone oppo
1131,1005105,electronics,smartphone,smartphone,apple,1348.61,electronics smartphone smartphone apple
1186,1005160,electronics,smartphone,smartphone,xiaomi,212.08,electronics smartphone smartphone xiaomi
1237,1005212,electronics,smartphone,smartphone,samsung,193.22,electronics smartphone smartphone samsung


In [64]:
# als 모델 저장
als_model.save('als.npz')