In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import scipy.sparse as sparse
import random
import implicit
from implicit.als import AlternatingLeastSquares as ALS

%cd /home/user_3/medistream-recsys/Script
from preprocessing import drop_columns,dict_to_column,dict_to_set,set_to_column,key_to_element

pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 100)

/home/user_3/medistream-recsys/Script


# 1. LoadData

In [3]:
# products name 확인 용
products_df = pd.read_json("/fastcampus-data/products/products.json")
products_df = key_to_element(['_id'],products_df)
df = pd.read_json('/fastcampus-data/select_column_version_3.json')

100%|██████████| 5141/5141 [00:00<00:00, 789041.16it/s]


In [4]:
date_state = "2022-05-12"

# paid orders만 가져오기
df['date_paid'] = pd.to_datetime(df['date_paid'])
df_only_paid = df[~df['date_paid'].isna()]
# 3개월치 데이터만 가져오기
df_date = df_only_paid[df_only_paid['date_paid'] > date_state]
# 취소 안된 것만 가져오기
complete_df = df_date[(df_date['paid'] == True) & (df_date['cancelled']==False)]
# 도서 카테고리만 가져오기
only_book = complete_df[complete_df['name'] == '도서']

# 유저가 중복으로 아이템 구매 삭제
df_duplicated_book = only_book.drop_duplicates(subset=['customer_id','product_ids'])
df_book = df_duplicated_book.sort_values(by='date_paid').reset_index(drop=True)

# 전체
# df_book = complete_df[complete_df['name'].isin(['도서','소모품'])].sort_values(by='date_paid')

# 2. Train_Test_split

In [5]:
from datetime import datetime, timedelta

date = df_book['date_paid'].max()-timedelta(weeks=3)
train_before_preprocess = df_book[df_book['date_paid'] < date]
test_before_preprocess = df_book[df_book['date_paid'] >= date]
# train 3개 이상 구매 기록유저에 대한 학습 진행
# train_drop_row_index = train_before_preprocess['customer_id'].value_counts()[train_before_preprocess['customer_id'].value_counts()>2].index
# train = train_before_preprocess[train_before_preprocess['customer_id'].isin(train_drop_row_index)]

In [6]:
# 아이템 중복확인
# product_ids, name_x 수는 일치
len(df_book.product_ids.unique()), len(df_book.name_x.unique())

(252, 252)

In [7]:
len(train_before_preprocess.product_ids.unique()),len(test_before_preprocess.product_ids.unique())

(243, 130)

In [8]:
# test 아이템에 train 없는 아이템 확인
len(set(test_before_preprocess.product_ids.unique())-set(train_before_preprocess.product_ids.unique()))

9

In [9]:
# test 만 있는 item 제거
only_test_items = set(test_before_preprocess.product_ids.unique())-set(train_before_preprocess.product_ids.unique())
test = test_before_preprocess[~test_before_preprocess['product_ids'].isin(only_test_items)]
train = train_before_preprocess

In [10]:
len(test.customer_id.unique()), len(train_before_preprocess.customer_id.unique())

(491, 2323)

In [11]:
##################################################################################################################

# 3. 도서 Sparse matrix 만들기

In [12]:
# # 고객과 아이템 얼마나 구매했는가
# grouped_purchased = train.groupby(['customer_id','product_ids']).sum().reset_index()
# customers = list(np.sort(grouped_purchased['customer_id'].unique()))
# products = list (grouped_purchased['product_ids'].unique())
# quantity = list(grouped_purchased['paid'])

# rows = grouped_purchased['customer_id'].astype('category').cat.codes
# cols = grouped_purchased['product_ids'].astype('category').cat.codes
# print(len(customers)) 
# print(len(products)) 

# # most popular matrix
# most_popular_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape = (len(customers),len(products)))
# # ALS matrix
# purchase_sparse = sparse.csr_matrix(([1 for _ in range(len(quantity))], (rows, cols)), shape = (len(customers),len(products)))
# purchase_sparse

In [13]:
PdIds = train.product_ids.unique()

PdIdToIndex = {}
indexToPdId = {}

colIdx = 0

for PdId in PdIds:
    PdIdToIndex[PdId] = colIdx
    indexToPdId[colIdx] = PdId
    colIdx += 1
    
userIds = train.customer_id.unique()

userIdToIndex = {}
indexToUserId = {}

rowIdx = 0

for userId in userIds:
    userIdToIndex[userId] = rowIdx
    indexToUserId[rowIdx] = userId
    rowIdx += 1

import scipy.sparse as sp

rows = []
cols = []
vals = []

for row in train.itertuples():
    rows.append(userIdToIndex[row.customer_id])
    cols.append(PdIdToIndex[row.product_ids])
    vals.append(1)

purchase_sparse = sp.csr_matrix((vals, (rows, cols)), shape=(rowIdx,colIdx))

matrix = purchase_sparse.todense()
matrix

matrix([[1, 0, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 1, 0]])

In [14]:
# Sparsity: 얼마나 비어있나?
matrix_size = purchase_sparse.shape[0]* purchase_sparse.shape[1]
num_purchases = len(purchase_sparse.nonzero()[0])
sparsity = 100 * (1 - (num_purchases / matrix_size))
sparsity

99.10999151444935

# 4. Test에 Random 추천

In [15]:
np.random.seed(42)

In [16]:
item_ids = df_book['product_ids'].unique()
user_ids = test['customer_id'].unique()

In [17]:
# test 실제 값
ground_trues = []
for user_id in user_ids:
    ground_trues.append({'id': user_id,\
    'items':list(test[test['customer_id']==user_id].product_ids)
    })

In [19]:
## # test 예측값| 랜덤 추천( 100개 추천)
predict_list = []
products = train['product_ids'].unique()
np.random.seed(42)
for user_id in test['customer_id'].unique():
    random_products_num = np.random.randint(len(products),size=len(products))
    random_products = [indexToPdId[num] for num in random_products_num]
    train_purchase_list = list(train[train['customer_id']==user_id].product_ids)
    predict_list.append({'id':user_id ,'items':[product for product in random_products \
                                               if product not in train_purchase_list]})
for idx, pred_list in enumerate(predict_list):
    predict_list[idx]['items'] = pred_list['items'][:100]
# ALS 
evaluator = CustomEvaluator()
evaluator.evaluate(ground_trues, predict_list)

nDCG: 0.0882093


In [30]:
len(predict_list[0]['items'])

100

# 5. 평가

## 평가지표

In [18]:
class CustomEvaluator:
    # relavence 모두 1로 동일하게 봄
    def _idcg(self, l):
        return sum((1.0 / np.log(i + 2) for i in range(l)))
    

    def __init__(self):
        self._idcgs = [self._idcg(i) for i in range(1000)]
    '''
    idcgs 예시, item 3개 추천되므로 3.074281787960283 가 됩니다.
    [0, 1.4426950408889634, 2.352934267515801, 3.074281787960283]
    '''

    def _ndcg(self, gt, rec):
        dcg = 0.0
        for i, r in enumerate(rec):
            if r in gt:
                dcg += 1.0 / np.log(i + 2)

        return dcg / self._idcgs[len(gt)]

    def _eval(self, gt_list, rec_list):
        gt_dict = {g["id"]: g for g in gt_list}
        ndcg_score = 0.0

        for rec in rec_list:
            gt = gt_dict[rec["id"]]
            ndcg_score += self._ndcg(gt["items"], rec["items"])


        ndcg_score = ndcg_score / len(rec_list)


        return ndcg_score

    def evaluate(self, gt_list, rec_list):
        try:
            ndcg_score = self._eval(gt_list, rec_list)
            print(f"nDCG: {ndcg_score:.6}")
        except Exception as e:
            print(e)


In [32]:
# ALS 
evaluator = CustomEvaluator()
evaluator.evaluate(ground_trues, predict_list)

nDCG: 0.0870465


In [33]:
len(predict_list),len(ground_trues)

(491, 491)

In [34]:
# 아이템 맞춘 개수
cnt = 0
total = 0
for gt, pred_list in zip(ground_trues, predict_list):
    for pred in pred_list['items']:
        total += 1
        if pred in gt['items']:
            cnt += 1
cnt,total

(245, 49100)