In [2]:
# 데이터 불러오기
import pandas as pd
import numpy as np

# Read the three raw event tables; the first CSV column is used as the row index.
cart_data, purchase_data, pv_data = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'data/cart_data.csv',
        'data/purchase_data.csv',
        'data/pv_data.csv',
    )
]

In [3]:
# user_id is the key used from here on, so the cluster_id column is dropped
# from every table.
cart_data = cart_data.drop(columns='cluster_id')
purchase_data = purchase_data.drop(columns='cluster_id')
pv_data = pv_data.drop(columns='cluster_id')

# Implicit scores: a cart event is worth 2 points, a purchase 3 points.
# A user putting the same item in the cart several times counts only once.
# The timestamp column gets a table-specific name (datetime_cart /
# datetime_purchase) so the two stay distinguishable after the outer join.
cart_data = (
    cart_data
    .assign(rating_cart=2)
    .drop_duplicates(['user_id', 'item_code'])
    .assign(datetime_cart=lambda frame: frame['datetime'])
    .drop(columns='datetime')
)
purchase_data = (
    purchase_data
    .assign(rating_purchase=3)
    .assign(datetime_purchase=lambda frame: frame['datetime'])
    .drop(columns='datetime')
)

In [4]:
# Outer join on (user_id, item_code): keeps pairs that appear only in the
# cart table, only in the purchase table, or in both.
item_df = cart_data.merge(purchase_data, how='outer', on=['user_id', 'item_code'])
item_df

Unnamed: 0,user_id,item_code,rating_cart,datetime_cart,rating_purchase,datetime_purchase
0,4288,368702,2.0,2022-09-14 02:01:55.134303,,
1,4288,370878,2.0,2022-09-14 02:02:12.095274,,
2,1640,375142,2.0,2022-09-14 02:02:19.164900,,
3,1363,374281,2.0,2022-09-14 02:02:50.550594,,
4,4291,211908,2.0,2022-09-14 02:02:53.209671,,
...,...,...,...,...,...,...
13688,3339,277577,,,3.0,2022-09-19 12:31:42.561816
13689,3337,379577,,,3.0,2022-09-19 12:32:25.632943
13690,3340,332558,,,3.0,2022-09-19 12:32:57.696375
13691,3340,332558,,,3.0,2022-09-19 12:32:58.256464


In [5]:
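# Number of cart + purchase rows per user_id (disabled).
# NOTE: rating_5 is only assigned in a later cell (In [7]), so this line cannot
# run at this position on a fresh kernel.
#  rating_5.groupby('user_id').count()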

In [6]:
# Combine the two partial scores into a single `rating` column.
# A missing score (NaN after the outer join) is treated as 0, so the total is
# 2 (cart only), 3 (purchase only) or 5 (both). The partial columns are
# dropped once the total exists.
item_df = (
    item_df
    .assign(
        rating=item_df['rating_cart'].fillna(0) + item_df['rating_purchase'].fillna(0)
    )
    .drop(columns=['rating_cart', 'rating_purchase'])
)

In [7]:
# Rows whose total score is 5, i.e. the item was both put in the cart and purchased.
rating_5 = item_df.loc[item_df['rating'].eq(5.0)]

In [8]:
# (rows, columns) of the subset of item_df whose rating equals 5.
rating_5.shape

(2062, 5)

In [9]:
# Wide user x item table: one row per user_id, one column per item_code, the
# cell holds the rating. Pairs that never occurred are filled with 0.
df = item_df.pivot_table(
    values='rating',
    index='user_id',
    columns='item_code',
).fillna(0)

In [10]:
# Preview the first rows of the user_id x item_code rating table.
df.head()

item_code,655,2401,8287,11843,13039,13101,13157,13357,13361,13393,...,379661,379664,379665,379674,379676,379682,379686,379699,379711,379950
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Dense NumPy array with the pivot table's values (rows = users, columns = items).
matrix = df.to_numpy()
# Mean rating of each user, taken across all item columns; the pivot's missing
# pairs were filled with 0, so those zeros are part of the mean.
user_ratings_mean = matrix.mean(axis=1)

# Center every row by subtracting that user's mean rating.
matrix_user_mean = matrix - user_ratings_mean[:, np.newaxis]

In [12]:
# (number of users, number of items) of the rating array.
matrix.shape

(4723, 3625)

In [13]:
# One mean value per user row of `matrix`.
user_ratings_mean.shape

(4723,)

In [14]:
# Mean-centering keeps the shape of `matrix`.
matrix_user_mean.shape

(4723, 3625)

In [15]:
# Preview of the mean-centered ratings, labelled with the item_code columns of df.
pd.DataFrame(matrix_user_mean, columns = df.columns).head()
# 1. There are rating values from 4723 users over 3625 items.
# 2. The mean rating of each of the 4723 users is computed.
# 3. Each user's ratings over the 3625 items are shifted: (value from 1) - (value from 2).

item_code,655,2401,8287,11843,13039,13101,13157,13357,13361,13393,...,379661,379664,379665,379674,379676,379682,379686,379699,379711,379950
0,-0.001655,-0.001655,-0.001655,-0.001655,-0.001655,-0.001655,-0.001655,-0.001655,-0.001655,-0.001655,...,-0.001655,-0.001655,-0.001655,-0.001655,-0.001655,-0.001655,-0.001655,-0.001655,-0.001655,-0.001655
1,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828,...,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828
2,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828,...,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828,-0.000828
3,-0.008552,-0.008552,-0.008552,-0.008552,-0.008552,-0.008552,-0.008552,-0.008552,-0.008552,-0.008552,...,-0.008552,-0.008552,-0.008552,-0.008552,-0.008552,-0.008552,-0.008552,-0.008552,-0.008552,-0.008552
4,-0.001379,-0.001379,-0.001379,-0.001379,-0.001379,-0.001379,-0.001379,-0.001379,-0.001379,-0.001379,...,-0.001379,-0.001379,-0.001379,-0.001379,-0.001379,-0.001379,-0.001379,-0.001379,-0.001379,-0.001379


In [16]:
# Matrix factorization step: truncated SVD of the mean-centered rating matrix.
from scipy.sparse.linalg import svds

# k = 12 keeps 12 singular values/vectors. U, sigma and Vt are consumed by the
# following cells: sigma is expanded with np.diag and the three factors are
# multiplied back together to produce the predicted ratings.
U, sigma, Vt = svds(matrix_user_mean, k = 12)

In [17]:
# Shapes of the three SVD factors, one per line.
print(U.shape, sigma.shape, Vt.shape, sep='\n')

(4723, 12)
(12,)
(12, 3625)


현재 sigma는 0이 아닌 특이값만 담은 1차원 배열로 표현된 상태  
즉, 0이 포함된 대각행렬로 변환하려면 numpy의 diag를 이용해야 함.

In [18]:
# svds returned the singular values as a 1-D array; expand them into a square
# diagonal matrix so they can take part in the U . Sigma . Vt product.
# np.diag applied to a 2-D array does the opposite (it extracts the diagonal),
# so without the ndim guard a second run of this cell would collapse sigma
# back to 1-D and silently change the later reconstruction.
if sigma.ndim == 1:
    sigma = np.diag(sigma)
sigma.shape

(12, 12)

In [19]:
# First row of the diagonal matrix built with np.diag: only the entry on the
# diagonal (position 0) is non-zero.
sigma[0]

array([24.95271672,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ])

In [20]:
# Second row of the diagonal matrix: the non-zero entry sits at position 1.
sigma[1]

array([ 0.        , 25.74658596,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ])

현재 까지 상황을 정리하면    

1. 원본 user-item 평점 행렬이 있었음  
2. 이를 user의 평균 점수를 빼서 matrix_user_mean 이라는 행렬로 만듦  
3. 2번의 값에 SVD를 적용해 U, Sigma, Vt 행렬을 구했음  
4. Sigma는 0이 포함되지 않은 값(특이값)으로만 구성되어 있으므로 이를 대각행렬로 변환  
5. 여기까지가 matrix_user_mean에 SVD를 적용해 분해한 상태  
6. 이제 다시 원본 행렬로 복구

원본 행렬로 복구시키는 방법

U, Sigma, Vt의 내적을 수행
즉, np.dot(np.dot(U, sigma), Vt)를 수행

그리고 아까 사용자 평균을 빼주었으니 여기서는 더해줌

In [21]:
# Multiply the three SVD factors back together (U . Sigma . Vt) to rebuild the
# mean-centered rating matrix from the kept components, then add each user's
# mean rating back onto that user's row.
svd_user_predicted_ratings = U.dot(sigma).dot(Vt) + user_ratings_mean[:, np.newaxis]

In [22]:
# Wrap the predicted ratings in a DataFrame whose columns are the item codes of df.
# No index is passed, so rows get the default 0..n-1 positional index instead of
# the user_id labels of df; recommend_items later addresses rows by position.
df_svd_preds = pd.DataFrame(svd_user_predicted_ratings, columns = df.columns)
df_svd_preds.head()

item_code,655,2401,8287,11843,13039,13101,13157,13357,13361,13393,...,379661,379664,379665,379674,379676,379682,379686,379699,379711,379950
0,0.001697,0.001687,0.001697,0.001594,0.001686,0.001698,0.001697,0.001697,0.001677,0.001697,...,0.001361,0.001636,0.001329,0.001563,0.001697,0.001693,0.001564,0.001697,0.001314,0.001697
1,0.000396,0.000544,0.000397,0.003751,0.000488,0.000387,0.000396,0.000395,0.000166,0.000396,...,0.002653,0.00058,0.010229,0.002308,0.000396,0.000376,1.2e-05,0.000396,0.007327,0.000396
2,0.000851,0.000846,0.000851,0.000814,0.000847,0.000851,0.000851,0.000851,0.000838,0.000851,...,0.000679,0.000824,0.000654,0.000748,0.000851,0.000847,0.000794,0.000851,0.000679,0.000851
3,0.007433,0.007282,0.007433,0.003537,0.007427,0.007428,0.007433,0.007432,0.008096,0.007433,...,0.012534,0.008528,0.009578,0.012779,0.007433,0.007524,0.008269,0.007433,0.006912,0.007433
4,0.001402,0.001397,0.001402,0.001245,0.001395,0.001402,0.001402,0.001402,0.001387,0.001402,...,0.000905,0.00134,0.001592,0.00129,0.001402,0.001402,0.001259,0.001402,0.001323,0.001402


In [23]:
# (number of users, number of items) of the prediction table.
df_svd_preds.shape

(4723, 3625)

인자로 사용자 아이디, 상품 정보 테이블, 평점 테이블 등을 받음  
사용자 아이디에 SVD로 나온 결과의 상품 평점이 가장 높은 데이터 순으로 정렬   
사용자가 산 데이터를 제외 
사용자가 안 산 상품에서 평점이 높은 것을 추천  

In [24]:
# Ratings table loaded from a separate CSV (first column used as the index).
# Presumably it has user_id / item_code / rating columns, since recommend_items
# filters, merges and sorts on them -- verify against the file.
# FIXME(review): absolute, machine-specific path; the other inputs are read from
# the relative data/ directory, so this cell will not run on another machine.
ratings = pd.read_csv('/Users/yangjeonghyeon/Desktop/2022_02/kdt/ratings.csv', index_col=0)
# Item-side view of the interaction table: indexed by item_code, user_id removed.
items = item_df.set_index('item_code').drop(columns='user_id')
items

Unnamed: 0_level_0,datetime_cart,datetime_purchase,rating
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
368702,2022-09-14 02:01:55.134303,,2.0
370878,2022-09-14 02:02:12.095274,,2.0
375142,2022-09-14 02:02:19.164900,,2.0
374281,2022-09-14 02:02:50.550594,,2.0
211908,2022-09-14 02:02:53.209671,,2.0
...,...,...,...
277577,,2022-09-19 12:31:42.561816,3.0
379577,,2022-09-19 12:32:25.632943,3.0
332558,,2022-09-19 12:32:57.696375,3.0
332558,,2022-09-19 12:32:58.256464,3.0


In [25]:
# Save the predictions to CSV. reset_index() turns the current row index into an
# ordinary column, so it is written to the file as data; index=False then avoids
# writing a second index column.
df_svd_preds.reset_index().to_csv('df_svd_preds.csv',index = False)

In [26]:
# Replace the in-memory predictions with the copy read back from the CSV.
# NOTE(review): the column labels now come from the CSV header (including the
# column produced by reset_index() when the file was written) rather than from
# df.columns -- confirm that downstream code matching on item_code copes with that.
df_svd_preds = pd.read_csv('df_svd_preds.csv')

In [27]:
# Write the three tables used by the recommender to CSV files in the working
# directory. items is written together with its index; the other two are
# written without one.
items.to_csv('items.csv')
df_svd_preds.to_csv('df_svd_preds.csv',index=False)
ratings.to_csv('ratings_final.csv',index=False)

# Read the files straight back so the variables hold exactly what is on disk.
# No index_col is given, so the index that was written for items comes back as
# an ordinary column and every table gets a fresh default row index.
items = pd.read_csv('items.csv')
df_svd_preds = pd.read_csv('df_svd_preds.csv')
ratings = pd.read_csv('ratings_final.csv')

In [32]:
def recommend_items(df_svd_preds, user_id, ori_items_df, ori_ratings_df, num_recommendations=3):
    """Recommend the best-scored items a user has not interacted with yet.

    Parameters
    ----------
    df_svd_preds : pandas.DataFrame
        Predicted ratings, one row per user and one column per item code.
        Rows are addressed by position (``user_id - 1``). Column labels may be
        item codes or their string form (as produced by a CSV round trip).
        When the item codes of ``ori_items_df`` are numeric, labels that are
        not numeric (e.g. a leftover ``index`` column) are ignored.
    user_id : int
        1-based user id; row ``user_id - 1`` of ``df_svd_preds`` is used.
    ori_items_df : pandas.DataFrame
        Item table with ``item_code`` either as a column or as the index.
        ``rating``, ``datetime_cart`` and ``datetime_purchase`` columns are
        dropped when present.
    ori_ratings_df : pandas.DataFrame
        Observed ratings with ``user_id``, ``item_code`` and ``rating`` columns.
    num_recommendations : int, default 3
        Maximum number of items to return.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``user_history``: the user's observed ratings joined with the item
        table, highest rating first.
        ``recommendations``: up to ``num_recommendations`` distinct items the
        user has no history for, with a ``Predictions`` column, best first.

    Raises
    ------
    IndexError
        If ``user_id - 1`` is not a valid row position of ``df_svd_preds``.
    """
    # Rows of df_svd_preds are positional, so user ids are assumed to be 1..n
    # in row order. Reject ids outside that range explicitly: a non-positive id
    # would otherwise wrap around and silently return another user's scores.
    user_row_number = user_id - 1
    if not 0 <= user_row_number < len(df_svd_preds):
        raise IndexError(
            f"user_id {user_id} is outside the prediction table (1..{len(df_svd_preds)})"
        )
    user_predictions = df_svd_preds.iloc[user_row_number]

    # Observed ratings of this user, taken from the table passed in by the
    # caller rather than from a notebook-level global.
    user_data = ori_ratings_df[ori_ratings_df['user_id'] == user_id]

    # Make sure item_code is a regular column, whether the caller passed it as
    # the index or as a column, and drop the items table's own rating column so
    # it cannot collide with the user's rating in the merge below.
    if 'item_code' in ori_items_df.columns:
        items_df = ori_items_df.copy()
    else:
        items_df = ori_items_df.reset_index()
    items_df = items_df.drop(columns='rating', errors='ignore')

    # NOTE: if the items table holds several rows per item_code, the history
    # contains one row per matching items row.
    user_history = user_data.merge(items_df, on='item_code').sort_values(
        ['rating'], ascending=False
    )

    # Long-form (item_code, Predictions) table built with explicit column
    # names, so it does not depend on the axis name of df_svd_preds.columns.
    predictions = pd.DataFrame({
        'item_code': user_predictions.index,
        'Predictions': user_predictions.to_numpy(),
    })
    if pd.api.types.is_numeric_dtype(items_df['item_code']):
        # After a CSV round trip the labels are strings and may include
        # non-item columns; coerce to numbers and drop whatever is not a code
        # so the merge keys have matching dtypes.
        numeric_codes = pd.to_numeric(predictions['item_code'], errors='coerce')
        predictions = (
            predictions
            .assign(item_code=numeric_codes)
            .dropna(subset=['item_code'])
            .astype({'item_code': items_df['item_code'].dtype})
        )

    # Candidates: items without history for this user, one row per item so the
    # top-N cannot contain the same item more than once.
    unseen = ~items_df['item_code'].isin(user_history['item_code'])
    candidates = items_df.loc[unseen].drop_duplicates(subset='item_code')

    recommendations = (
        candidates
        .merge(predictions, on='item_code')
        .sort_values('Predictions', ascending=False)
        .head(num_recommendations)
        .drop(columns=['datetime_cart', 'datetime_purchase'], errors='ignore')
    )

    return user_history, recommendations