# Test Evaluation
- train(약 5개월), test(마지막 3주) evaluation 최종 평가를 진행합니다.
- 결과적으로, 앙상블 결과가 가장 높은 NDCG 점수를 보였으며, NDCG와 entropy 점수 모두 base model(medi_popular)을 넘는 성능을 보임(7. 결론 참조)

# 1.Dataload

In [582]:
import random

import implicit
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sparse
import seaborn as sns
from implicit.als import AlternatingLeastSquares as ALS
from implicit.lmf import LogisticMatrixFactorization as LMF
from tqdm import tqdm

%cd ../../util
from utils import *

pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 100)

In [583]:
# products name 확인 용
products_df = pd.read_json("/fastcampus-data/products/products.json")
products_df = key_to_element(['_id'],products_df)

100%|██████████| 5141/5141 [00:00<00:00, 573085.55it/s]


In [505]:
df = pd.read_json('/fastcampus-data/select_column_version_4.json')

In [506]:
from dateutil.relativedelta import relativedelta
from datetime import datetime

df['date_paid'] = pd.to_datetime(df['date_paid'])
# 5개월 전 날짜 확인
df['date_paid'].max()-relativedelta(months=5)

Timestamp('2022-04-13 08:59:21.151000+0000', tz='UTC')

In [507]:
def product_name_fill(product_name_preprocess_df):
    """Make ``product_ids`` and ``name_x`` map one-to-one.

    Step 1: every row's ``name_x`` is replaced by the name found on the last
    row that carries the same ``product_ids``.
    Step 2: every row's ``product_ids`` is replaced by the id found on the
    last row that carries the same (already updated) ``name_x``.

    "Last" means last in the frame's current row order, so callers that want
    the most recent id/name to win must sort the frame first.

    Args:
        product_name_preprocess_df: DataFrame with ``product_ids`` and
            ``name_x`` columns.

    Returns:
        The same DataFrame object; both columns are overwritten in place.
    """
    product_ids = product_name_preprocess_df['product_ids']

    # dict(zip(...)) keeps the last value seen for each key, which is exactly
    # the "last row wins" rule of a row-by-row loop, without iterrows overhead.
    product_ids_to_name = dict(zip(product_ids, product_name_preprocess_df['name_x']))
    product_name_preprocess_df['name_x'] = product_ids.map(product_ids_to_name)

    # Re-read name_x: the reverse mapping must be built from the updated names.
    names = product_name_preprocess_df['name_x']
    name_to_product_ids = dict(zip(names, product_ids))
    product_name_preprocess_df['product_ids'] = names.map(name_to_product_ids)
    return product_name_preprocess_df

In [508]:
# Build the purchase table used for recommendation.
# Rows without a product id or product name cannot be used.
df = df.dropna(subset=['product_ids','name_x'])

# Start of the evaluation window: 5 months before the latest payment
# (the value printed by the cell above, truncated to the date).
date_state = "2022-04-13"
# Keep only orders that have a payment timestamp.
df['date_paid'] = pd.to_datetime(df['date_paid'])
df_only_paid = df[~df['date_paid'].isna()]
# Keep only orders that are paid and not cancelled.
complete_df = df_only_paid[(df_only_paid['paid'] == True) & (df_only_paid['cancelled']==False)]
# Keep only the book category ('도서').
only_book = complete_df[complete_df['name'] == '도서']

# Drop repeat purchases of the same item by the same user.
# NOTE(review): this runs before the sort below, so the row kept is the first
# one in file order — presumably also the earliest purchase; confirm.
df_duplicated_book = only_book.drop_duplicates(subset=['customer_id','product_ids'])

df_sort = df_duplicated_book.sort_values(by='date_paid').reset_index(drop=True)
df_sort = product_name_fill(df_sort)
# product_name_fill rewrites product_ids, which can create new
# (customer_id, product_ids) duplicates, so de-duplicate once more.
df_sort = df_sort.drop_duplicates(subset=['customer_id','product_ids']).reset_index(drop=True)

# Restrict to the 5-month window.
df_book = df_sort[df_sort['date_paid'] >= date_state].reset_index(drop=True)

# All history before the last 3 weeks (the test period); not limited to the 5-month window.
mediprediction_all_df = df_sort[df_sort['date_paid'] < '2022-08-23'].reset_index(drop=True)

In [509]:
# none 값 확인하기
df_book.isna().sum()

_id                 0
date_created        0
regular_price       0
sale_price          0
three_months        0
date_paid           0
customer_id         0
paid                0
name_x              0
category_id_y       0
product_ids         0
quantity            0
price               0
price_total         0
age_group        1078
한의사 여부              2
사업자 여부              2
cancelled           0
name                0
slug                0
dtype: int64

## 전체 데이터 EDA

In [510]:
print('중복 제거 전:',len(only_book), '중복 제거 후:',len(df_book))

중복 제거 전: 38395 중복 제거 후: 6957


In [511]:
print('전체 데이터 수:',len(df_book))

전체 데이터 수: 6957


In [512]:
print('아이템 수:',len(df_book.product_ids.unique()),'유저 수:',len(df_book.customer_id.unique()))

아이템 수: 267 유저 수: 2902


# promotion 전처리 함수
- train 만 전처리하여 학습 진행

In [513]:
def promotion_proprof(df):
    """Return the 2022 purchases of ``df`` with promotion-week sales removed.

    A purchase is dropped when its title was on promotion in the ISO week in
    which it was paid. Promotions are matched in two ways:

    * by a distinctive part of the title (``partial_title_rules``), and
    * by the exact title listed for that week in ``promotion_dict``.

    Args:
        df: DataFrame with a datetime ``date_paid`` column and a string
            ``name_x`` (product title) column. It is not modified.

    Returns:
        A new DataFrame holding the rows paid on or after 2022-01-01 that are
        not promotion purchases, with two helper columns added:
        ``date_paid_date`` (calendar date) and ``date_paid_week`` (ISO week
        number).
    """
    # Copy after filtering so the helper columns are written to an independent
    # frame: the caller's data stays untouched and pandas does not emit
    # SettingWithCopyWarning.
    promotion_book_df = df[df['date_paid'] >= '2022-01-01'].copy()
    promotion_book_df['date_paid_date'] = promotion_book_df['date_paid'].dt.date
    # NOTE(review): only the ISO week number is kept, not the ISO year, so the
    # table below is only meaningful for data from a single year (2022).
    promotion_book_df['date_paid_week'] = promotion_book_df['date_paid_date'].apply(lambda x: x.isocalendar()[1])

    # ISO week number -> exact titles that were on promotion in that week.
    promotion_dict = {
        2:['트리거포인트 침치료'],
        3:['藥徵, 약의 징표','파킨슨병 한의진료','침의 과학적 접근의 이해','길익동동','Medical acupuncture 침의 과학적 접근과 임상활용',
          '동의보감 약선','수화론(水火論)'],
        4:['실전한약가이드','음양승강으로 해석하는 사상의학: 생리병리'],
        5:['음양승강으로 해석하는 사상의학: 생리병리'],
        6:['윤상훈·권병조의 알짜 근육학','임상 한의사를 위한 기본 한약처방 강의 2판','트리거포인트 침치료','KCD 한방내과 진찰진단 가이드라인',
          '실전한약가이드','음양승강으로 해석하는 사상의학: 생리병리','藥徵, 약의 징표','증보운곡본초학','통증치료를 위한 근육 초음파와 주사 테크닉'],
        7:['오국통 온병명방'],
        9:['병태생리 Visual map','NEO 인턴 핸드북','보험한약 브런치 the # 2판 개정판','Kendall 자세와 통증치료에 있어서 근육의 기능과 검사 5판',
          '사상방 사용설명서','실전한약가이드','일차진료 한의사를 위한 보험한약입문 - 둘째 판','증보운곡본초학'],
        10:['한눈에 보는 스트레칭 해부학'],
        11:['임산부에게 사용할 수 있는 한방처방'],
        12:['임산부에게 사용할 수 있는 한방처방'],
        13:['MRI 자신감 키우기_족부편'],
        14:['장골의 PI 변위는 없다'],
        15:['윤상훈·권병조의 알짜 근육학','임상 한의사를 위한 기본 한약처방 강의 2판','KCD 한방내과 진찰진단 가이드라인','트리거포인트 침치료',
           '음양승강으로 해석하는 사상의학: 생리병리','침의 과학적 접근의 이해','실전한약가이드','임산부에게 사용할 수 있는 한방처방','한눈에 보는 스트레칭 해부학',
           'MRI 자신감 키우기_족부편'],
        16:['환자상담의 달인','병의원 경영과 자산 관리 클리닉','우리 병원의 문제? 현장에서 답을 찾다!','근육학','스파이랄 및 키네지오 테이핑',
           '요양병원 주치의 진료핵심'],
        17:['오당 본초강론','운동기능장애 치료 매뉴얼','K. 한의학 임상총론','한방 활용 가이드','최강통증매선','암 치료에 이용되는 천연약물',
           '왕문원 임상 평형침법','중국 왕문원 평형침구학'],
        18:['초음파 가이드 근골격계 통증 치료의 정석'],
        19:['초음파 가이드 근골격계 통증 치료의 정석','섭혜민 명의경방험안'],
        20:['카이로프랙틱 기본테크닉론'],
        21:['흔히보는 정형외과 외래진료 가이드북'],
        22:['趙紹琴(조소금) 내과학','한의학 상담','숨찬 세상, 호흡기를 편하게',
            '의학심오(醫學心悟)','안면마비 침구치료','중경서 독법 강해(상,하) /개정판'],
        23:['선생님, 이제 그만 저 좀 포기해 주세요','한의학 상담','숨찬 세상, 호흡기를 편하게',
            '의학심오(醫學心悟)','중경서 독법 강해(상,하) /개정판','안면마비 침구치료'],
        24:['황황교수의 임상의를 위한 근거기반 상한금궤 처방 매뉴얼','황황교수의 개원 한의사를 위한 상한금궤 처방 강의록',
            '선생님, 이제 그만 저 좀 포기해 주세요'],
        25:['황황교수의 임상의를 위한 근거기반 상한금궤 처방 매뉴얼',
            '황황교수의 개원 한의사를 위한 상한금궤 처방 강의록','약침의 정석 –통증편','갑상선 진료 완전정복',
            '신경학 증상의 감별법','이것이 알고싶다! 당뇨병진료','어지럼질환의 진단과 치료','증례와 함께 하는 한약처방',
            '뇌의학의 첫걸음','HAPPY 소아청소년 진료'],
        26:['약침의 정석 –통증편','갑상선 진료 완전정복','신경학 증상의 감별법',
            '증례와 함께 하는 한약처방','이것이 알고싶다! 당뇨병진료','HAPPY 소아청소년 진료','어지럼질환의 진단과 치료',
            '뇌의학의 첫걸음','실전, 임상한의학 내과질환을 중심으로','실전, 임상한의학 알레르기질환','침구대성','평주온열경위'],
        27:['침구과 진료매뉴얼','실전, 임상한의학 내과질환을 중심으로','실전, 임상한의학 알레르기질환','내과학 5권세트','한방순환 신경내과학',
            '침구대성'],
        28:['감별진단의 정석','기본통증진료학','약처방의 정석 (1, 2권 세트)','QBook: Case based Review',
            'SMART 내과 1권 : 바이탈, 감염, 종양, 류마티스','일차진료아카데미 처방가이드'],
        29:['비만문답','사암침의 해석과 임상'],
        30:['플로차트 정형외과 진단','침구과 진료매뉴얼','내과학 5권세트','한방순환 신경내과학'],
        31:['외래에서 꼭 알아야 할 통증증후군 137가지'],
        32:['SMART 기본 일차진료매뉴얼 3판(세트)','SMART 소아진료매뉴얼 3판','SMART 응급진료매뉴얼(세트)'],
        33:['SMART 기본 일차진료매뉴얼 3판(세트)','SMART 소아진료매뉴얼 3판','SMART 응급진료매뉴얼(세트)'],
        34:['초음파 유도하 침 시술 가이드북'],
        35:['영어 진료 가이드북','초음파 유도하 침 시술 가이드북'],
        36:['영어 진료 가이드북','소아피부질환해설'],
        37:['소아피부질환해설','醫學心悟(의학심오) 톺아보기'],
    }

    # (title fragment, ISO week): drops titles that merely contain the
    # fragment, so differently-spelled variants of a promoted title are
    # caught as well as the exact spelling in promotion_dict.
    partial_title_rules = [
        ('침의 과학적 접근과 임상활용', 3),
        ('의학심오', 22),
        ('의학심오', 23),
        ('약처방의 정석', 28),
        ('초음파 유도하 침', 34),
        # Previously week 34 was filtered twice; promotion_dict lists this
        # title for weeks 34 and 35, so the second rule must target week 35.
        ('초음파 유도하 침', 35),
        ('영어 진료 가이드북', 35),
        ('영어 진료 가이드북', 36),
        ('의학심오', 37),
    ]

    all_promotion_df = promotion_book_df
    for title_fragment, week in partial_title_rules:
        # regex=False: the fragments are literal text, not patterns.
        is_promotion = (all_promotion_df['name_x'].str.contains(title_fragment, regex=False)) & \
                       (all_promotion_df['date_paid_week'] == week)
        all_promotion_df = all_promotion_df[~is_promotion]

    for week, titles in promotion_dict.items():
        is_promotion = (all_promotion_df['name_x'].isin(titles)) & (all_promotion_df['date_paid_week'] == week)
        all_promotion_df = all_promotion_df[~is_promotion]

    return all_promotion_df

# 2.train test split
- 마지막 3주 분량을 test로 선정합니다.
- train에 없는 test 아이템 수를 확인합니다. (최종 test는 해당 아이템을 삭제하지 않고 원본 그대로 사용합니다.)

In [514]:
from datetime import datetime, timedelta
df_book['date_paid'].max()

Timestamp('2022-09-13 08:51:40+0000', tz='UTC')

In [515]:
datetime(2022,9,13)-timedelta(days=21)

datetime.datetime(2022, 8, 23, 0, 0)

In [516]:
# Split point: 21 days before 2022-09-13 (computed in the cell above).
date = '2022-08-23'
# Train = purchases before the split; promotion-week purchases are removed from train only.
train_before = df_book[df_book['date_paid'] < date]
train_before_preprocess = promotion_proprof(train_before)
# Test = purchases on or after the split, left unfiltered.
test_before_preprocess = df_book[df_book['date_paid'] >= date]

In [517]:
len(train_before)

5983

In [518]:
len(set(train_before.customer_id)), len(set(train_before.product_ids))

(2555, 256)

In [519]:
len(train_before_preprocess)

2887

In [520]:
len(set(train_before_preprocess.customer_id)), len(set(train_before_preprocess.product_ids))

(1482, 251)

## 전체 아이템 중복 확인

In [521]:
# product_ids, name_x 수는 일치
len(df_book.product_ids.unique()), len(df_book.name_x.unique())

(267, 267)

In [522]:
# Count unique names again after dropping duplicate (product_ids, name_x) pairs.
# The result (267) equals the unique product_ids / name_x counts, so ids and names map one-to-one.
len(df_book.drop_duplicates(subset=['product_ids','name_x']).name_x.unique())

267

## train test 아이템 중복 확인

In [523]:
len(train_before_preprocess.product_ids.unique()),len(test_before_preprocess.product_ids.unique())

(251, 131)

In [524]:
len(set(train_before_preprocess.product_ids.unique())-set(test_before_preprocess.product_ids.unique()))

131

In [525]:
# test 아이템에 train 없는 아이템 확인
len(set(test_before_preprocess.product_ids.unique())-set(train_before_preprocess.product_ids.unique()))

11

In [526]:
# test 만 있는 item 제거
only_test_items = set(test_before_preprocess.product_ids.unique())-set(train_before_preprocess.product_ids.unique())
if_prepro_test = test_before_preprocess[~test_before_preprocess['product_ids'].isin(only_test_items)]

In [527]:
test = test_before_preprocess.copy()

In [528]:
# train 변수 명 변경
train = train_before_preprocess.copy()

In [529]:
# test 전처리 진행했을 경우
print('원본 test 수:', len(test))
print('전처리 진행했을 경우 test 수:', len(if_prepro_test))

원본 test 수: 974
전처리 진행했을 경우 test 수: 384


# train test eda

### 전처리 전후 비교

In [530]:
print('train 전처리 전:',len(train_before), 'train 전처리 후:',len(train))

train 전처리 전: 5983 train 전처리 후: 2887


In [531]:
print('test 전처리 전:',len(test_before_preprocess), 'test 전처리 후:',len(test))

test 전처리 전: 974 test 전처리 후: 974


### user 수 비교 

In [532]:
print('전처리 전 train 유저수 : ',len(train_before.customer_id.unique()), '전처리 후 train 유저 수:',len(train.customer_id.unique()))

전처리 전 train 유저수 :  2555 전처리 후 train 유저 수: 1482


In [533]:
print('test 유저 수:',len(test.customer_id.unique()))

test 유저 수: 744


In [534]:
# 신규 유저는 MP 같은 다른 방법으로 추천 진행해야 함
print('test 만 있는 신규 유저 :',len(set(test['customer_id'].unique())- set(train['customer_id'].unique())))

test 만 있는 신규 유저 : 514


### item 개수 비교

In [535]:
print('전처리 전 train 아이템 수:',len(set(train_before.product_ids)), '전처리 후 train 아이템 수 :',len(set(train.product_ids)))

전처리 전 train 아이템 수: 256 전처리 후 train 아이템 수 : 251


In [536]:
print('test 아이템 수 :',len(set(test.product_ids)))

test 아이템 수 : 131


In [537]:
print('train 만 있는 아이템 수:',  len(set(train.product_ids)-set(test.product_ids)))

train 만 있는 아이템 수: 131


In [538]:
print('test 만 있는 아이템 수:', len(set(test.product_ids) - set(train.product_ids)))

test 만 있는 아이템 수: 11


# 3. sparse matrix 만들기

## ALS MF Matrix

In [539]:
import scipy.sparse as sp

# Column side: every product seen in train gets a column index, assigned in
# order of first appearance; both lookup directions are kept.
PdIds = train.product_ids.unique()
PdIdToIndex = {PdId: position for position, PdId in enumerate(PdIds)}
indexToPdId = dict(enumerate(PdIds))
colIdx = len(PdIds)

# Row side: same scheme for the users seen in train.
userIds = train.customer_id.unique()
userIdToIndex = {userId: position for position, userId in enumerate(userIds)}
indexToUserId = dict(enumerate(userIds))
rowIdx = len(userIds)

# One (row, col, 1) triple per train purchase.
rows = [userIdToIndex[customer] for customer in train.customer_id]
cols = [PdIdToIndex[product] for product in train.product_ids]
vals = [1] * len(rows)

# users x items purchase matrix in CSR form.
purchase_sparse = sp.csr_matrix((vals, (rows, cols)), shape=(rowIdx, colIdx))

# Dense view, only for eyeballing the matrix in the notebook.
matrix = purchase_sparse.todense()
matrix

matrix([[1, 0, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

### Most_popular_matrix

In [540]:
most_popular = mediprediction_all_df.groupby(['product_ids']).count()['customer_id'].reset_index()

### Medistream_prediction_matrix
- 메디스트림 메디마켓에서 제공하는 정렬 추천 성능 비교를 위한 df 구현
- 인기도순, 최신순, 과거순, 높은 가격순, 낮은 가격순, 이름순 (총 6 가지)
- 각각 구현해보고 학습 모델 대비 성능 비교

In [541]:
# Product attributes needed to reproduce the shop's built-in sort orders.
medistream_prediction_df = mediprediction_all_df[['date_created','regular_price','sale_price','three_months','product_ids','name_x']]
# One row per product. The explicit .copy() makes the result an independent
# frame, so the column assignment below no longer triggers
# SettingWithCopyWarning.
# NOTE(review): drop_duplicates keeps the first row per product, i.e. the
# attributes as of its first row in mediprediction_all_df — confirm that is
# the intended snapshot for three_months / sale_price.
medistream_prediction_preprop_df = medistream_prediction_df.drop_duplicates(subset=['product_ids'], ignore_index=True).copy()
medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])
# A sale_price of 0 would have to be replaced by regular_price, but no such rows exist (no preprocessing needed).

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])


# Sparsity 확인

In [542]:
# Sparsity: 얼마나 비어있나?
matrix_size = purchase_sparse.shape[0]* purchase_sparse.shape[1]
num_purchases = len(purchase_sparse.nonzero()[0])
sparsity = 100 * (1 - (num_purchases / matrix_size))
sparsity

99.22388717733654

# 4. MP & Base Model & prediction
- Model 학습 진행 및 predict 진행
- MP 모델과 base model prediction 진행

In [543]:
# Ground truth: for every test user, the products they bought in the test period.
# A single groupby pass replaces re-filtering the whole test frame once per
# user; sort=False keeps users in order of first appearance, i.e. the same
# order as test['customer_id'].unique().
# NOTE: groupby skips rows whose customer_id is missing; the data has none.
ground_trues = [
    {'id': user_id, 'items': list(user_rows.product_ids)}
    for user_id, user_rows in test.groupby('customer_id', sort=False)
]

# most popular prediction

In [554]:
# Rank all books by number of buyers, most popular first.
most_popular_list = most_popular.sort_values(by='customer_id',ascending=False).index
# Resolve the ranking to product ids once instead of once per user.
popular_product_ids = [most_popular.product_ids.loc[num] for num in most_popular_list]

# Most-popular prediction per test user: the 15 most popular books the user
# has not already bought in train.
predict_popular_list = []
for user_id in test['customer_id'].unique():
    # A set makes the "already purchased" check O(1) per candidate.
    purchased = set(train.loc[train['customer_id'] == user_id, 'product_ids'])
    predict_popular_list.append({
        'id': user_id,
        'items': [product_id for product_id in popular_product_ids if product_id not in purchased][:15],
    })

# medistream prediction
- 메디스트림 메디마켓에서 제공하는 정렬 추천 성능 비교
- 인기도순, 최신순, 과거순, 높은 가격순, 낮은 가격순, 이름순 (총 6 가지)
- 각각 구현해보고 학습 모델 대비 성능 비교

In [555]:
# Each variable below is the row index of medistream_prediction_preprop_df,
# reordered by one of the shop's sort options.
# By popularity (three_months, descending)
medistream_popular_list = medistream_prediction_preprop_df.sort_values(by='three_months', ascending=False).index
# Newest first
medistream_latest_list = medistream_prediction_preprop_df.sort_values(by='date_created', ascending=False).index
# Oldest first
medistream_oldest_list = medistream_prediction_preprop_df.sort_values(by='date_created', ascending=True).index
# Highest price first
medistream_high_price_list = medistream_prediction_preprop_df.sort_values(by='sale_price', ascending=False).index
# Lowest price first
medistream_low_price_list = medistream_prediction_preprop_df.sort_values(by='sale_price', ascending=True).index
# By name (ascending)
medistream_name_sort_list = medistream_prediction_preprop_df.sort_values(by='name_x',ascending=True).index

def medistream_prediction_method(predict_num:int ,medi_predict_list:list)->list:
    """Give every test user the same top-``predict_num`` products of one sort order.

    Unlike the most-popular baseline, products the user already bought are
    not filtered out.

    Reads the module-level ``test`` and ``medistream_prediction_preprop_df``.

    Args:
        predict_num: Number of products to recommend per user.
        medi_predict_list: Row labels of ``medistream_prediction_preprop_df``
            in recommendation order (the callers in this notebook pass the
            index of a sorted frame).

    Returns:
        One ``{'id': user_id, 'items': [product_id, ...]}`` dict per unique
        ``customer_id`` in ``test``, in order of first appearance.
    """
    # The ranking does not depend on the user, so truncate first and resolve
    # the labels to product ids once rather than once per user.
    product_ids = medistream_prediction_preprop_df.product_ids
    top_items = [product_ids.loc[num] for num in medi_predict_list[:predict_num]]

    # list(top_items): every user gets an independent list object.
    return [{'id': user_id, 'items': list(top_items)} for user_id in test['customer_id'].unique()]

In [556]:
medistream_predict_popular_list = medistream_prediction_method(15, medistream_popular_list)
medistream_predict_latest_list = medistream_prediction_method(15, medistream_latest_list)
medistream_predict_oldest_list = medistream_prediction_method(15, medistream_oldest_list)
medistream_predict_high_price_list = medistream_prediction_method(15, medistream_high_price_list)
medistream_predict_low_price_list = medistream_prediction_method(15, medistream_low_price_list)
medistream_predict_name_sort_list = medistream_prediction_method(15, medistream_name_sort_list)

# 5. evaluation

## NDCG & Entropy Diversity 평가지표

In [557]:
class CustomEvaluator:
    """NDCG and entropy-diversity metrics for per-user recommendation lists.

    Ground truth and recommendations are lists of
    ``{'id': user_id, 'items': [item, ...]}`` dicts. Every relevant item has
    relevance 1.
    """

    def _idcg(self, l):
        """Return the ideal DCG when ``l`` relevant items fill the top ranks."""
        return sum((1.0 / np.log(i + 2) for i in range(l)))

    def __init__(self):
        # Cache IDCG for the common ground-truth sizes; larger sizes are
        # computed on demand in _ndcg.
        self._idcgs = [self._idcg(i) for i in range(1000)]

    def _ndcg(self, gt, rec):
        """Return NDCG of the ranked list ``rec`` against the items in ``gt``.

        The normaliser is the IDCG for ``len(gt)`` items, regardless of how
        long ``rec`` is. Returns 0.0 when ``gt`` is empty (nothing could have
        been hit) instead of dividing by zero.
        """
        if not gt:
            return 0.0

        relevant = set(gt)
        dcg = 0.0
        for i, r in enumerate(rec):
            if r in relevant:
                dcg += 1.0 / np.log(i + 2)

        n_relevant = len(gt)
        if n_relevant < len(self._idcgs):
            idcg = self._idcgs[n_relevant]
        else:
            # Beyond the cache: compute directly instead of raising IndexError.
            idcg = self._idcg(n_relevant)
        return dcg / idcg

    def _entropy_diversity(self,rec_list):
        """Return the entropy of the item distribution over all recommendations.

        Each item's probability is its share of all recommended slots, so the
        probabilities sum to 1 even when users received lists of different
        lengths. Returns 0.0 when nothing was recommended.
        """
        import math

        freq = {}
        total = 0
        for rec in rec_list:
            for r in rec['items']:
                freq[r] = freq.get(r, 0) + 1
                total += 1
        if total == 0:
            return 0.0

        sz = float(total)
        ent = -sum([v / sz * math.log(v / sz) for v in freq.values()])
        return ent

    def _eval(self, gt_list, rec_list):
        """Return ``(mean NDCG over rec_list, entropy diversity of rec_list)``.

        Recommendations are matched to ground truth by ``'id'``.

        Raises:
            KeyError: if a recommendation's id has no ground-truth entry.
        """
        if not rec_list:
            return 0.0, 0.0

        gt_dict = {g["id"]: g for g in gt_list}
        ndcg_score = 0.0

        for rec in rec_list:
            gt = gt_dict[rec["id"]]
            ndcg_score += self._ndcg(gt["items"], rec["items"])

        ndcg_score = ndcg_score / len(rec_list)
        ent = self._entropy_diversity(rec_list)

        return ndcg_score, ent

    def evaluate(self, gt_list, rec_list):
        """Print NDCG and entropy diversity; on failure print the error instead of raising."""
        try:
            ndcg_score, ent_score = self._eval(gt_list, rec_list)
            print(f"NDCG: {ndcg_score:.6}")
            print(f"Entropy Diversity: {ent_score:.6} ")
        except Exception as e:
            # Deliberately best-effort: a failed evaluation should not stop the notebook.
            print(e)


# most popular NDCG & Entropy

In [567]:
# most popular
evaluator = CustomEvaluator()
evaluator.evaluate(ground_trues, predict_popular_list)

NDCG: 0.0538018
Entropy Diversity: 2.74294 


## medistream prediction NDCG & Entropy

In [570]:
def medistream_prediction(ground_trues:list, predict_list:list):
    """Score one set of recommendations against the ground truth.

    Args:
        ground_trues: ``{'id', 'items'}`` dicts with each user's actual purchases.
        predict_list: ``{'id', 'items'}`` dicts with each user's recommendations.

    Returns:
        ``(ndcg, entropy, cnt)`` where ``cnt`` is the total number of
        recommended items, over all users, that the user actually bought.

    Raises:
        AssertionError: if the two lists differ in length.
        KeyError: if a recommendation's id has no ground-truth entry.
    """
    # Validate before doing any work.
    assert len(predict_list) == len(ground_trues)

    evaluator = CustomEvaluator()
    ndcg, entropy = evaluator._eval(ground_trues, predict_list)

    # Match users by id, as _eval does, rather than by list position, so the
    # hit count stays correct even if the two lists are ordered differently.
    purchased_by_user = {gt['id']: set(gt['items']) for gt in ground_trues}
    cnt = 0
    for pred_list in predict_list:
        purchased = purchased_by_user[pred_list['id']]
        cnt += sum(1 for pred in pred_list['items'] if pred in purchased)
    return ndcg, entropy, cnt

In [571]:
medistream_predict_score = {'medistream_predict':['medi_popular','latest','oldest','high_price','low_price','name_sort'], \
                            'ndcg':[], 'entropy':[], 'cnt':[]}

medistream_predict_list = [medistream_predict_popular_list, medistream_predict_latest_list, medistream_predict_oldest_list,\
                          medistream_predict_high_price_list, medistream_predict_low_price_list, medistream_predict_name_sort_list]

for medistream_predict in medistream_predict_list:
    ndcg, entropy, cnt = medistream_prediction(ground_trues, medistream_predict)
    medistream_predict_score['ndcg'].append(ndcg)
    medistream_predict_score['entropy'].append(entropy)
    medistream_predict_score['cnt'].append(cnt)
pd.DataFrame(medistream_predict_score)

Unnamed: 0,medistream_predict,ndcg,entropy,cnt
0,medi_popular,0.061266,2.70805,151
1,latest,0.016588,2.70805,52
2,oldest,0.015806,2.70805,45
3,high_price,0.003865,2.70805,12
4,low_price,0.001136,2.70805,7
5,name_sort,0.007651,2.70805,22


# 6. hyper parameter tuned model train & evaluation

## 6-1. ALS MF hyper parameter tuned

In [572]:
# Grid search over ALS hyper parameters; one result row per combination.
als_mf_hyper_parameter = {'factor':[],'regularization':[],'iteration':[],'NDCG':[],'entropy':[]}

factors = [5]
regularizations = [0.01]
iterations = [5]

# Cold-start fallback: books ranked by number of buyers. It does not depend
# on the hyper parameters, so it is built once outside the grid.
most_popular_list = most_popular.sort_values(by='customer_id',ascending=False).index
popular_product_ids = [most_popular.product_ids.loc[num] for num in most_popular_list]
test_user_ids = test['customer_id'].unique()

for factor in factors:
    for regularization in regularizations:
        for iteration in iterations:
            als_model = ALS(factors=factor, regularization=regularization, iterations = iteration, random_state=42)
            als_model.fit(purchase_sparse, show_progress=False)

            als_predict_list = []
            for user_id in test_user_ids:
                # Explicit membership test instead of a bare `except:` so that
                # only genuinely new users take the fallback and any real
                # error in recommend() surfaces.
                if user_id in userIdToIndex:
                    user_index = userIdToIndex[user_id]
                    result = als_model.recommend(user_index, purchase_sparse[user_index], N=15)
                    items = [indexToPdId[num] for num in result[0]]
                else:
                    # New user: userIdToIndex holds every train user, so this
                    # user has no train purchases to exclude.
                    items = popular_product_ids

                # Keep 15 recommendations per user.
                als_predict_list.append({'id':user_id ,'items':items[:15]})

            evaluator = CustomEvaluator()
            ndcg, entropy = evaluator._eval(ground_trues, als_predict_list)

            als_mf_hyper_parameter['factor'].append(factor)
            als_mf_hyper_parameter['regularization'].append(regularization)
            als_mf_hyper_parameter['iteration'].append(iteration)
            als_mf_hyper_parameter['NDCG'].append(ndcg)
            als_mf_hyper_parameter['entropy'].append(entropy)

In [573]:
pd.DataFrame(als_mf_hyper_parameter).sort_values(by='NDCG',ascending=False).head()

Unnamed: 0,factor,regularization,iteration,NDCG,entropy
0,5,0.01,5,0.053031,3.335207


## 6-2. LMF hyper parameter tuned

In [574]:
# Grid search over LMF hyper parameters; one result row per combination.
lmf_hyper_parameter = {'factor':[],'regularization':[],'iteration':[],'NDCG':[],'entropy':[]}

factors = [15]
regularizations = [0.005]
iterations = [50]

# Cold-start fallback: books ranked by number of buyers. It does not depend
# on the hyper parameters, so it is built once outside the grid.
most_popular_list = most_popular.sort_values(by='customer_id',ascending=False).index
popular_product_ids = [most_popular.product_ids.loc[num] for num in most_popular_list]
test_user_ids = test['customer_id'].unique()

for factor in factors:
    for regularization in regularizations:
        for iteration in iterations:
            lmf_model = LMF(factors=factor, regularization=regularization, iterations = iteration, random_state=42)
            lmf_model.fit(purchase_sparse, show_progress=False)

            lmf_predict_list = []
            for user_id in test_user_ids:
                # Explicit membership test instead of a bare `except:` so that
                # only genuinely new users take the fallback and any real
                # error in recommend() surfaces.
                if user_id in userIdToIndex:
                    user_index = userIdToIndex[user_id]
                    result = lmf_model.recommend(user_index, purchase_sparse[user_index], N=20)
                    items = [indexToPdId[num] for num in result[0]]
                else:
                    # New user: userIdToIndex holds every train user, so this
                    # user has no train purchases to exclude.
                    items = popular_product_ids

                # Keep 15 recommendations per user.
                lmf_predict_list.append({'id':user_id ,'items':items[:15]})

            evaluator = CustomEvaluator()
            ndcg, entropy = evaluator._eval(ground_trues, lmf_predict_list)

            lmf_hyper_parameter['factor'].append(factor)
            lmf_hyper_parameter['regularization'].append(regularization)
            lmf_hyper_parameter['iteration'].append(iteration)
            lmf_hyper_parameter['NDCG'].append(ndcg)
            lmf_hyper_parameter['entropy'].append(entropy)

In [575]:
pd.DataFrame(lmf_hyper_parameter).sort_values(by='NDCG',ascending=False).head()

Unnamed: 0,factor,regularization,iteration,NDCG,entropy
0,15,0.005,50,0.054209,3.551187


# 6-3 ensemble (medi popular & lmf MIX) hyper parameter tuned
- top5,3 모두 평가하여 결과 확인

### 1) ensemble top5

In [576]:
# Ensemble: the shop's `top` most popular books first, then LMF recommendations.
top5_medipop_lmf_mix_hyper_parameter = {'factor':[],'regularization':[],'iteration':[],'top':[],'NDCG':[],'entropy':[]}

factors = [40]
regularizations = [0.005]
iterations = [50]
tops = [5]

# The shop's popularity ranking as product ids; independent of the grid.
medi_popular_product_ids = [medistream_prediction_preprop_df.product_ids.loc[num] for num in medistream_popular_list]
test_user_ids = test['customer_id'].unique()

for factor in factors:
    for regularization in regularizations:
        for iteration in iterations:
            # The fitted model does not depend on `top`, so fit once per
            # hyper-parameter combination rather than once per `top`.
            lmf_model = LMF(factors=factor, regularization=regularization, iterations = iteration, random_state=42)
            lmf_model.fit(purchase_sparse, show_progress=False)

            for top in tops:
                lmf_predict_list = []
                for user_id in test_user_ids:
                    # Explicit membership test instead of a bare `except:` so
                    # that only genuinely new users take the fallback and any
                    # real error in recommend() surfaces.
                    if user_id in userIdToIndex:
                        user_index = userIdToIndex[user_id]
                        purchased = set(train.loc[train['customer_id'] == user_id, 'product_ids'])
                        popular_head = [product_id for product_id in medi_popular_product_ids[:top]
                                        if product_id not in purchased]
                        result = lmf_model.recommend(user_index, purchase_sparse[user_index], N=20)
                        lmf_items = [indexToPdId[num] for num in result[0]]
                        # dict.fromkeys removes duplicates while keeping order:
                        # popular books first, then the LMF ranking.
                        items = list(dict.fromkeys(popular_head + lmf_items))
                    else:
                        # New user: no train purchases to exclude, so fall
                        # back to the plain popularity ranking.
                        items = medi_popular_product_ids

                    # Keep 15 recommendations per user.
                    lmf_predict_list.append({'id':user_id ,'items':items[:15]})

                evaluator = CustomEvaluator()
                ndcg, entropy = evaluator._eval(ground_trues, lmf_predict_list)

                top5_medipop_lmf_mix_hyper_parameter['factor'].append(factor)
                top5_medipop_lmf_mix_hyper_parameter['regularization'].append(regularization)
                top5_medipop_lmf_mix_hyper_parameter['iteration'].append(iteration)
                top5_medipop_lmf_mix_hyper_parameter['top'].append(top)
                top5_medipop_lmf_mix_hyper_parameter['NDCG'].append(ndcg)
                top5_medipop_lmf_mix_hyper_parameter['entropy'].append(entropy)

### 2) ensemble top3

In [577]:
# Ensemble: the shop's `top` most popular books first, then LMF recommendations.
top3_medipop_lmf_mix_hyper_parameter = {'factor':[],'regularization':[],'iteration':[],'top':[],'NDCG':[],'entropy':[]}

factors = [40]
regularizations = [0.005]
iterations = [50]
tops = [3]

# The shop's popularity ranking as product ids; independent of the grid.
medi_popular_product_ids = [medistream_prediction_preprop_df.product_ids.loc[num] for num in medistream_popular_list]
test_user_ids = test['customer_id'].unique()

for factor in factors:
    for regularization in regularizations:
        for iteration in iterations:
            # The fitted model does not depend on `top`, so fit once per
            # hyper-parameter combination rather than once per `top`.
            lmf_model = LMF(factors=factor, regularization=regularization, iterations = iteration, random_state=42)
            lmf_model.fit(purchase_sparse, show_progress=False)

            for top in tops:
                lmf_predict_list = []
                for user_id in test_user_ids:
                    # Explicit membership test instead of a bare `except:` so
                    # that only genuinely new users take the fallback and any
                    # real error in recommend() surfaces.
                    if user_id in userIdToIndex:
                        user_index = userIdToIndex[user_id]
                        purchased = set(train.loc[train['customer_id'] == user_id, 'product_ids'])
                        popular_head = [product_id for product_id in medi_popular_product_ids[:top]
                                        if product_id not in purchased]
                        result = lmf_model.recommend(user_index, purchase_sparse[user_index], N=20)
                        lmf_items = [indexToPdId[num] for num in result[0]]
                        # dict.fromkeys removes duplicates while keeping order:
                        # popular books first, then the LMF ranking.
                        items = list(dict.fromkeys(popular_head + lmf_items))
                    else:
                        # New user: no train purchases to exclude, so fall
                        # back to the plain popularity ranking.
                        items = medi_popular_product_ids

                    # Keep 15 recommendations per user.
                    lmf_predict_list.append({'id':user_id ,'items':items[:15]})

                evaluator = CustomEvaluator()
                ndcg, entropy = evaluator._eval(ground_trues, lmf_predict_list)

                top3_medipop_lmf_mix_hyper_parameter['factor'].append(factor)
                top3_medipop_lmf_mix_hyper_parameter['regularization'].append(regularization)
                top3_medipop_lmf_mix_hyper_parameter['iteration'].append(iteration)
                top3_medipop_lmf_mix_hyper_parameter['top'].append(top)
                top3_medipop_lmf_mix_hyper_parameter['NDCG'].append(ndcg)
                top3_medipop_lmf_mix_hyper_parameter['entropy'].append(entropy)

In [579]:
pd.DataFrame(top5_medipop_lmf_mix_hyper_parameter).sort_values(by='NDCG',ascending=False).head()

Unnamed: 0,factor,regularization,iteration,top,NDCG,entropy
0,40,0.005,50,5,0.060843,3.242231


In [578]:
pd.DataFrame(top3_medipop_lmf_mix_hyper_parameter).sort_values(by='NDCG',ascending=False).head()

Unnamed: 0,factor,regularization,iteration,top,NDCG,entropy
0,40,0.005,50,3,0.061518,3.330821


# 7. 결론

- 각 모델은 하이퍼파라미터를 튜닝한 결과로 최종 evaluation 진행
- ALS MF: factor = 3 , regularizations = 0.01 , iterations = 5
- LMF: factor = 15 , regularizations = 0.005 , iterations = 5
- ensemble(medi_mp_lmf_mix): factor = 40 , regularizations = 0.005 , iterations = 50 , tops = 3
- ensemble은 top3가 top5보다 높은 점수인 것을 확인
- 결과, ensemble(medi_mp_lmf_mix) 모델이 NDCG 0.061518 diversity 3.330821 로 base model(0.061266, 2.70805) 대비 모두 높은 점수를 보이는 것을 확인

In [580]:
# Final summary cell: gather the split statistics and every model's NDCG / entropy
# into a single one-row table and display it.
#
# NOTE: despite its name, all_prediction_df is a dict of one-element lists; it is
# only turned into a DataFrame for display at the end. The key order below is the
# column order of the displayed table, so it is kept exactly as before.
all_prediction_df = {
    'first_day': [], 'last_day': [], 'train_데이터수': [], 'train_유저수': [], 'test_데이터수': [],
    'test_유저수': [], 'test_신규유저수': [], 'test_신규아이템수': [], '원본_test수': [], '전처리진행test수': [],
    'als_mf': [], 'lmf': [], 'top5_medi_mp_lmf_mix': [], 'top3_medi_mp_lmf_mix': [], 'mp': [],
    'medi_popular': [], 'latest': [], 'oldest': [], 'high_price': [], 'low_price': [], 'name_sort': [],
    'als_mf_entropy': [], 'lmf_entropy': [], 'top5_medi_mp_lmf_mix_entropy': [],
    'top3_medi_mp_lmf_mix_entropy': [], 'mp_entropy': [], 'medi_popular_entropy': [],
    'latest_entropy': [], 'oldest_entropy': [], 'high_price_entropy': [], 'low_price_entropy': [],
    'name_sort_entropy': [],
}
medistream_predict_df = pd.DataFrame(medistream_predict_score)

# --- split statistics -------------------------------------------------------
train_first_paid, train_last_paid = train['date_paid'].min(), train['date_paid'].max()
test_first_paid, test_last_paid = test['date_paid'].min(), test['date_paid'].max()

# 'first_day' holds the train date range and 'last_day' the test date range
# (each as "<min date> <max date>"); the key names are kept for compatibility.
all_prediction_df['first_day'].append(f"{train_first_paid.date()} {train_last_paid.date()}")
all_prediction_df['last_day'].append(f"{test_first_paid.date()} {test_last_paid.date()}")
all_prediction_df['train_데이터수'].append(len(train))
all_prediction_df['train_유저수'].append(len(set(train.customer_id)))
all_prediction_df['test_데이터수'].append(len(test))
all_prediction_df['test_유저수'].append(len(set(test.customer_id)))
# Users / items that appear in test but never in train (cold-start cases).
all_prediction_df['test_신규유저수'].append(len(set(test['customer_id'].unique()) - set(train['customer_id'].unique())))
all_prediction_df['test_신규아이템수'].append(len(set(test.product_ids.unique()) - set(train.product_ids.unique())))
# NOTE(review): this records len(test) again, i.e. the same value as 'test_데이터수' — confirm that is intended.
all_prediction_df['원본_test수'].append(len(test))
all_prediction_df['전처리진행test수'].append(len(if_prepro_test))

# --- tuned models -----------------------------------------------------------
# Report NDCG and entropy from the SAME run: the one with the highest NDCG.
# Previously entropy was taken as the maximum over all runs independently, so the
# reported (NDCG, entropy) pair could describe two different hyper-parameter settings.
tuned_search_logs = {
    'als_mf': als_mf_hyper_parameter,
    'lmf': lmf_hyper_parameter,
    'top5_medi_mp_lmf_mix': top5_medipop_lmf_mix_hyper_parameter,
    'top3_medi_mp_lmf_mix': top3_medipop_lmf_mix_hyper_parameter,
}
for model_name, search_log in tuned_search_logs.items():
    ranked_runs = pd.DataFrame(search_log).sort_values(by='NDCG', ascending=False)
    all_prediction_df[model_name].append(ranked_runs['NDCG'].iloc[0])
    all_prediction_df[f'{model_name}_entropy'].append(ranked_runs['entropy'].iloc[0])

# --- most-popular baseline --------------------------------------------------
# Evaluate once and reuse both scores; index 0 is NDCG and index 1 is entropy.
mp_scores = evaluator._eval(ground_trues, predict_popular_list)
all_prediction_df['mp'].append(mp_scores[0])
all_prediction_df['mp_entropy'].append(mp_scores[1])

# --- rule-based baselines stored in medistream_predict_score ----------------
for strategy in ['medi_popular', 'latest', 'oldest', 'high_price', 'low_price', 'name_sort']:
    # First row recorded for this strategy supplies both of its scores.
    strategy_row = medistream_predict_df[medistream_predict_df['medistream_predict'] == strategy].iloc[0]
    all_prediction_df[strategy].append(strategy_row['ndcg'])
    all_prediction_df[f'{strategy}_entropy'].append(strategy_row['entropy'])

print('train 총 기간:', train_last_paid - train_first_paid)
print('test 총 기간:', test_last_paid - test_first_paid)
display(pd.DataFrame(all_prediction_df))

train 총 기간: 131 days 13:31:51.843000
test 총 기간: 21 days 05:53:44.939000


Unnamed: 0,first_day,last_day,train_데이터수,train_유저수,test_데이터수,test_유저수,test_신규유저수,test_신규아이템수,원본_test수,전처리진행test수,als_mf,lmf,top5_medi_mp_lmf_mix,top3_medi_mp_lmf_mix,mp,medi_popular,latest,oldest,high_price,low_price,name_sort,als_mf_entropy,lmf_entropy,top5_medi_mp_lmf_mix_entropy,top3_medi_mp_lmf_mix_entropy,mp_entropy,medi_popular_entropy,latest_entropy,oldest_entropy,high_price_entropy,low_price_entropy,name_sort_entropy
0,2022-04-13 2022-08-22,2022-08-23 2022-09-13,2887,1482,974,744,514,11,974,384,0.053031,0.054209,0.060843,0.061518,0.053802,0.061266,0.016588,0.015806,0.003865,0.001136,0.007651,3.335207,3.551187,3.242231,3.330821,2.742936,2.70805,2.70805,2.70805,2.70805,2.70805,2.70805
