# Cross Validation 3week
- Time-Series 데이터이기 때문에 Time-Series-CV 방법 중 blocking 방법을 이용하여 평가 진행
- valid data 3주 기간 중 하루(1day)를 predict 대상으로 두고 나머지 데이터로 학습을 진행하며, 대상 날짜를 하루씩 옮겨 가며 순차적으로 평가하게 됨
- 학습 데이터의 기간은 동일하게 학습을 진행
- 아래 그림과 같이 학습이 진행되며 전체 21day를 평가하여 평균을 내서 평가를 진행하게 됩니다.

![Walk-Forward](https://user-images.githubusercontent.com/86936634/199648078-2fcafd47-75bc-45e8-9626-36648efa0bb0.gif)

- 수치상으로 보았을 때 과적합되는 경향도 확인되지만
- 데이터 수가 전체 1만건 가량으로 적으며 15개 아이템을 prediction 하기 때문에 적합도를 확인하기 쉽지 않음
- 하지만 MF와 LMF, ensemble(medi_mp_lmf_mix) 모두 일정하게 약 0.2 정도의 NDCG score를 내는 것으로 보아 적합하다고 판단
- 모델 CV 결과는 아래 링크에서 확인할 수 있습니다.
- https://docs.google.com/spreadsheets/d/1Y_YDjP-QcCq7Qfgk2Cr0epKyX_6Fk4Eel1oklIJLLfE/edit#gid=1936801230

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import scipy.sparse as sparse
import random
import implicit
from implicit.als import AlternatingLeastSquares as ALS

%cd ../../util
from utils import *

pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 100)

from dateutil.relativedelta import relativedelta
from datetime import datetime, timedelta
import scipy.sparse as sp
from implicit.lmf import LogisticMatrixFactorization as LMF

/home/user_3/medistream-recsys/Script


# 1.Dataload

In [57]:
# Load the product catalogue; it is kept around to look up product names.
products_df = pd.read_json("/fastcampus-data/products/products.json")
# key_to_element comes from the project's `utils` module (star-imported above).
# NOTE(review): presumably it unpacks the nested '_id' field into a plain
# column -- confirm against utils.
products_df = key_to_element(['_id'],products_df)

100%|██████████| 5141/5141 [00:00<00:00, 667437.92it/s]


In [58]:
# Load the order records that the rest of the notebook works on.
df = pd.read_json('/fastcampus-data/select_column_version_4.json')

In [59]:
# Parse the payment timestamp column so it can be compared and sliced by date.
df['date_paid'] = pd.to_datetime(df['date_paid'])
# Independent copy of the full frame; it is passed to module() further down.
all_df = df.copy()

In [60]:
def product_name_fill(product_name_preprocess_df):
    """Make product ids and product names consistent with each other, in place.

    Two passes are applied to the frame:

    1. every ``product_ids`` value gets the ``name_x`` found on the *last* row
       carrying that id;
    2. every ``name_x`` value gets the ``product_ids`` found on the *last* row
       carrying that name.

    "Last" refers to the current row order, so callers that want the most
    recent name/id to win must sort the frame first.

    Args:
        product_name_preprocess_df: DataFrame with ``product_ids`` and
            ``name_x`` columns. Both columns are overwritten.

    Returns:
        The same DataFrame object that was passed in.
    """
    frame = product_name_preprocess_df

    # dict(zip(...)) keeps the value of the last occurrence of a repeated key,
    # which is exactly what the row-by-row overwrite used to produce, without
    # the cost of DataFrame.iterrows().
    id_to_name = dict(zip(frame['product_ids'], frame['name_x']))
    frame['name_x'] = frame['product_ids'].map(id_to_name)

    # Built from the already-normalised names, so ids sharing a name collapse
    # onto the id of the last row with that name.
    name_to_id = dict(zip(frame['name_x'], frame['product_ids']))
    frame['product_ids'] = frame['name_x'].map(name_to_id)
    return frame

In [61]:
def promotion_proprof(df):
    """Drop purchases of books that were on promotion in the week they were paid.

    Only rows paid on or after 2022-01-01 are considered. For each remaining
    row the ISO week number of ``date_paid`` is computed, and the row is
    removed when, for that week, its ``name_x`` either

    * equals one of the titles listed for the week in ``promotion_dict``, or
    * contains one of the title fragments listed for the week in
      ``promotion_name_fragments`` (catches titles that differ from the
      canonical spelling).

    Args:
        df: DataFrame with a datetime ``date_paid`` column and a ``name_x``
            column. It is not modified.

    Returns:
        A new DataFrame holding the surviving rows, with two extra columns:
        ``date_paid_date`` (calendar date) and ``date_paid_week`` (ISO week).
    """
    # ISO week number -> titles promoted during that week.
    promotion_dict = {
        2:['트리거포인트 침치료'],
        3:['藥徵, 약의 징표','파킨슨병 한의진료','침의 과학적 접근의 이해','길익동동','Medical acupuncture 침의 과학적 접근과 임상활용',
          '동의보감 약선','수화론(水火論)'],
        4:['실전한약가이드','음양승강으로 해석하는 사상의학: 생리병리'],
        5:['음양승강으로 해석하는 사상의학: 생리병리'],
        6:['윤상훈·권병조의 알짜 근육학','임상 한의사를 위한 기본 한약처방 강의 2판','트리거포인트 침치료','KCD 한방내과 진찰진단 가이드라인',
          '실전한약가이드','음양승강으로 해석하는 사상의학: 생리병리','藥徵, 약의 징표','증보운곡본초학','통증치료를 위한 근육 초음파와 주사 테크닉'],
        7:['오국통 온병명방'],
        9:['병태생리 Visual map','NEO 인턴 핸드북','보험한약 브런치 the # 2판 개정판','Kendall 자세와 통증치료에 있어서 근육의 기능과 검사 5판',
          '사상방 사용설명서','실전한약가이드','일차진료 한의사를 위한 보험한약입문 - 둘째 판','증보운곡본초학'],
        10:['한눈에 보는 스트레칭 해부학'],
        11:['임산부에게 사용할 수 있는 한방처방'],
        12:['임산부에게 사용할 수 있는 한방처방'],
        13:['MRI 자신감 키우기_족부편'],
        14:['장골의 PI 변위는 없다'],
        15:['윤상훈·권병조의 알짜 근육학','임상 한의사를 위한 기본 한약처방 강의 2판','KCD 한방내과 진찰진단 가이드라인','트리거포인트 침치료',
           '음양승강으로 해석하는 사상의학: 생리병리','침의 과학적 접근의 이해','실전한약가이드','임산부에게 사용할 수 있는 한방처방','한눈에 보는 스트레칭 해부학',
           'MRI 자신감 키우기_족부편'],
        16:['환자상담의 달인','병의원 경영과 자산 관리 클리닉','우리 병원의 문제? 현장에서 답을 찾다!','근육학','스파이랄 및 키네지오 테이핑',
           '요양병원 주치의 진료핵심'],
        17:['오당 본초강론','운동기능장애 치료 매뉴얼','K. 한의학 임상총론','한방 활용 가이드','최강통증매선','암 치료에 이용되는 천연약물',
           '왕문원 임상 평형침법','중국 왕문원 평형침구학'],
        18:['초음파 가이드 근골격계 통증 치료의 정석'],
        19:['초음파 가이드 근골격계 통증 치료의 정석','섭혜민 명의경방험안'],
        20:['카이로프랙틱 기본테크닉론'],
        21:['흔히보는 정형외과 외래진료 가이드북'],
        22:['趙紹琴(조소금) 내과학','한의학 상담','숨찬 세상, 호흡기를 편하게',
           '의학심오(醫學心悟)','안면마비 침구치료','중경서 독법 강해(상,하) /개정판'],
        23:['선생님, 이제 그만 저 좀 포기해 주세요','한의학 상담','숨찬 세상, 호흡기를 편하게',
           '의학심오(醫學心悟)','중경서 독법 강해(상,하) /개정판','안면마비 침구치료'],
        24:['황황교수의 임상의를 위한 근거기반 상한금궤 처방 매뉴얼','황황교수의 개원 한의사를 위한 상한금궤 처방 강의록',
           '선생님, 이제 그만 저 좀 포기해 주세요'],
        25:['황황교수의 임상의를 위한 근거기반 상한금궤 처방 매뉴얼',
           '황황교수의 개원 한의사를 위한 상한금궤 처방 강의록','약침의 정석 –통증편','갑상선 진료 완전정복',
           '신경학 증상의 감별법','이것이 알고싶다! 당뇨병진료','어지럼질환의 진단과 치료','증례와 함께 하는 한약처방',
           '뇌의학의 첫걸음','HAPPY 소아청소년 진료'],
        26:['약침의 정석 –통증편','갑상선 진료 완전정복','신경학 증상의 감별법',
           '증례와 함께 하는 한약처방','이것이 알고싶다! 당뇨병진료','HAPPY 소아청소년 진료','어지럼질환의 진단과 치료',
           '뇌의학의 첫걸음','실전, 임상한의학 내과질환을 중심으로','실전, 임상한의학 알레르기질환','침구대성','평주온열경위'],
        27:['침구과 진료매뉴얼','실전, 임상한의학 내과질환을 중심으로','실전, 임상한의학 알레르기질환','내과학 5권세트','한방순환 신경내과학',
           '침구대성'],
        28:['감별진단의 정석','기본통증진료학','약처방의 정석 (1, 2권 세트)','QBook: Case based Review',
           'SMART 내과 1권 : 바이탈, 감염, 종양, 류마티스','일차진료아카데미 처방가이드'],
        29:['비만문답','사암침의 해석과 임상'],
        30:['플로차트 정형외과 진단','침구과 진료매뉴얼','내과학 5권세트','한방순환 신경내과학'],
        31:['외래에서 꼭 알아야 할 통증증후군 137가지'],
        32:['SMART 기본 일차진료매뉴얼 3판(세트)','SMART 소아진료매뉴얼 3판','SMART 응급진료매뉴얼(세트)'],
        33:['SMART 기본 일차진료매뉴얼 3판(세트)','SMART 소아진료매뉴얼 3판','SMART 응급진료매뉴얼(세트)'],
        34:['초음파 유도하 침 시술 가이드북'],
        35:['영어 진료 가이드북','초음파 유도하 침 시술 가이드북'],
        36:['영어 진료 가이드북','소아피부질환해설'],
        37:['소아피부질환해설','醫學心悟(의학심오) 톺아보기'],
    }

    # (title fragment, ISO week) pairs matched by substring.
    # The '초음파 유도하 침' fragment used to be listed twice for week 34;
    # promotion_dict shows that book on promotion in weeks 34 and 35, so the
    # second entry now targets week 35.
    promotion_name_fragments = [
        ('침의 과학적 접근과 임상활용', 3),
        ('의학심오', 22),
        ('의학심오', 23),
        ('약처방의 정석', 28),
        ('초음파 유도하 침', 34),
        ('초음파 유도하 침', 35),
        ('영어 진료 가이드북', 35),
        ('영어 진료 가이드북', 36),
        ('의학심오', 37),
    ]

    # Copy after filtering so the new columns are written to an independent
    # frame (no SettingWithCopyWarning, caller's data untouched).
    promotion_book_df = df[df['date_paid'] >= '2022-01-01'].copy()
    promotion_book_df['date_paid_date'] = promotion_book_df['date_paid'].dt.date
    promotion_book_df['date_paid_week'] = promotion_book_df['date_paid_date'].apply(lambda x: x.isocalendar()[1])

    names = promotion_book_df['name_x']
    weeks = promotion_book_df['date_paid_week']

    # One boolean mask instead of a chain of re-filtered frames; every rule is
    # evaluated per row, so OR-ing the rules removes the same rows.
    is_promotion = pd.Series(False, index=promotion_book_df.index)
    for fragment, week in promotion_name_fragments:
        # Literal substring match; rows with a missing name never match.
        is_promotion = is_promotion | (names.str.contains(fragment, regex=False, na=False) & (weeks == week))
    for week, titles in promotion_dict.items():
        is_promotion = is_promotion | (names.isin(titles) & (weeks == week))

    return promotion_book_df[~is_promotion]

In [62]:
# Inspect the most recent payment timestamp in the data.
df['date_paid'].max()

Timestamp('2022-09-13 08:59:21.151000+0000', tz='UTC')

In [63]:
# Reference date for the folds and the start of the training window, which is
# obtained by stepping back a fixed number of months from the reference date.
# Changing only `months` still yields 21 folds.
last_date_state = '2022-09-13'
test_day = pd.to_datetime(last_date_state)
split_day = test_day - relativedelta(months=4)

# Original planning note: train 5 months / validation 3 weeks / test 3 weeks,
# so the folds are cut starting from 42 days back.
# NOTE(review): the code above uses months=4 while the note says 5 months, and
# range(21, 42) combined with `i + 1` gives offsets of 22..42 days -- confirm
# both are intended.
fold_offsets = [timedelta(days=i + 1) for i in range(21, 42)]

# One entry per fold, both windows shifted back by the same number of days.
split_day_list = [str((split_day - offset).to_pydatetime().date()) for offset in fold_offsets]
test_day_list = [str((test_day - offset).to_pydatetime().date()) for offset in fold_offsets]

In [64]:
class CustomEvaluator:
    """Scores top-N recommendation lists with NDCG and entropy diversity.

    Ground truth and recommendations are lists of dicts shaped like
    ``{'id': user_id, 'items': [item, ...]}``. Every relevant item carries the
    same relevance of 1.
    """

    # Number of precomputed ideal-DCG values (ground-truth sizes 0..999).
    _IDCG_CACHE_SIZE = 1000

    def _idcg(self, l):
        """Return the ideal DCG of a ranking with ``l`` relevant items."""
        return sum((1.0 / np.log(i + 2) for i in range(l)))

    def __init__(self):
        # Running sum instead of calling _idcg() for every size: same values,
        # linear instead of quadratic work each time an evaluator is created.
        running = 0.0
        self._idcgs = [running]
        for i in range(self._IDCG_CACHE_SIZE - 1):
            running += 1.0 / np.log(i + 2)
            self._idcgs.append(running)

    def _ndcg(self, gt, rec):
        """Return the NDCG of ``rec`` against the relevant items ``gt``.

        The ideal DCG is based on ``len(gt)``. An empty ``gt`` scores 0.0
        (there is nothing to find) instead of dividing by zero.
        """
        n_relevant = len(gt)
        if n_relevant == 0:
            return 0.0

        dcg = 0.0
        for i, r in enumerate(rec):
            if r in gt:
                dcg += 1.0 / np.log(i + 2)

        # Ground truths larger than the cache are computed on demand.
        if n_relevant < len(self._idcgs):
            idcg = self._idcgs[n_relevant]
        else:
            idcg = self._idcg(n_relevant)
        return dcg / idcg

    def _entropy_diversity(self, rec_list):
        """Return the Shannon entropy (natural log) of the recommended items.

        Item frequencies are normalised by the total number of recommended
        items, so the probabilities sum to 1 even when users received lists of
        different lengths. Returns 0.0 when nothing was recommended.
        """
        import math

        freq = {}
        for rec in rec_list:
            for r in rec['items']:
                freq[r] = freq.get(r, 0) + 1

        sz = float(sum(freq.values()))
        if sz == 0:
            return 0.0
        ent = -sum([v / sz * math.log(v / sz) for v in freq.values()])
        return ent

    def _eval(self, gt_list, rec_list):
        """Return ``(mean NDCG, entropy diversity)`` for ``rec_list``.

        Every ``id`` in ``rec_list`` must be present in ``gt_list``; a missing
        id raises ``KeyError``. An empty ``rec_list`` yields ``(0.0, 0.0)``.
        """
        if len(rec_list) == 0:
            return 0.0, 0.0

        gt_dict = {g["id"]: g for g in gt_list}
        ndcg_score = 0.0

        for rec in rec_list:
            gt = gt_dict[rec["id"]]
            ndcg_score += self._ndcg(gt["items"], rec["items"])

        ndcg_score = ndcg_score / len(rec_list)
        ent = self._entropy_diversity(rec_list)

        return ndcg_score, ent

    def evaluate(self, gt_list, rec_list):
        """Print both scores; on failure print the exception instead of raising."""
        try:
            ndcg_score, ent_score = self._eval(gt_list, rec_list)
            print(f"NDCG: {ndcg_score:.6}")
            print(f"Entropy Diversity: {ent_score:.6} ")
        except Exception as e:
            print(e)

In [65]:
def _prepare_book_orders(df):
    """Return paid, non-cancelled, de-duplicated book orders sorted by payment time.

    Works on a copy, so the caller's frame is left untouched.
    """
    orders = df.dropna(subset=['product_ids', 'name_x']).copy()
    orders['date_paid'] = pd.to_datetime(orders['date_paid'])

    # Paid orders only.
    orders = orders[orders['date_paid'].notna()]
    # Explicit comparisons on purpose: rows with missing flags are dropped.
    orders = orders[(orders['paid'] == True) & (orders['cancelled'] == False)]
    # Book category only.
    orders = orders[orders['name'] == '도서']

    # A user buying the same item more than once counts once.
    orders = orders.drop_duplicates(subset=['customer_id', 'product_ids'])
    orders = orders.sort_values(by='date_paid').reset_index(drop=True)

    # Unifying ids/names can create new duplicates, hence the second pass.
    orders = product_name_fill(orders)
    return orders.drop_duplicates(subset=['customer_id', 'product_ids']).reset_index(drop=True)


def module(df: pd.DataFrame, split_date, test_date, all_df) -> pd.DataFrame:
    """Evaluate every recommender on one cross-validation fold (one test day).

    Training data are the cleaned book orders paid in ``[split_date,
    test_date)`` with promotion purchases removed; the test set is every order
    paid on the calendar day ``test_date``. Each recommender proposes 15 items
    per test user and is scored with ``CustomEvaluator`` (NDCG and entropy).

    Recommenders:
        * ``als_mf`` / ``lmf`` -- implicit ALS / logistic MF; users unseen in
          training fall back to the most-purchased ranking.
        * ``top5_medi_mp_lmf_mix`` / ``top3_medi_mp_lmf_mix`` -- the first 5 / 3
          items of the ``three_months`` ranking (minus items the user already
          bought) followed by LMF recommendations; unseen users get the
          ``three_months`` ranking.
        * ``mp`` -- most purchased before ``test_date``, minus items the user
          already bought in training.
        * ``medi_popular``, ``latest``, ``oldest``, ``high_price``,
          ``low_price``, ``name_sort`` -- fixed catalogue orderings, identical
          for every user.

    Args:
        df: Raw order frame. It is not modified.
        split_date: First day of the training window (anything comparable with
            the ``date_paid`` column, e.g. ``'YYYY-MM-DD'``).
        test_date: The day to predict (``'YYYY-MM-DD'`` or anything
            ``pd.to_datetime`` accepts).
        all_df: Unused; kept so existing callers keep working.

    Returns:
        A one-row DataFrame with fold statistics, one NDCG column per
        recommender and one ``<name>_entropy`` column per recommender.
    """
    top_n = 15

    df_sort = _prepare_book_orders(df)

    # Orders inside the fold window, and the full purchase history before the
    # test day (used for the popularity and catalogue rankings).
    df_book = df_sort[df_sort['date_paid'] >= split_date].reset_index(drop=True)
    history = df_sort[df_sort['date_paid'] < test_date].reset_index(drop=True)

    train = promotion_proprof(df_book[df_book['date_paid'] < test_date])
    # Compare date with date: a datetime.date never equals a Timestamp on
    # current pandas, which would silently produce an empty test set.
    test_day = pd.to_datetime(test_date).date()
    test = df_book[df_book['date_paid'].dt.date == test_day]

    # Items that appear only in the test day cannot be recommended by a model.
    only_test_items = set(test['product_ids'].unique()) - set(train['product_ids'].unique())
    if_prepro_test = test[~test['product_ids'].isin(only_test_items)]

    # ---- user x item purchase matrix -------------------------------------
    item_ids = train['product_ids'].unique()
    item_index = {item: col for col, item in enumerate(item_ids)}
    user_ids = train['customer_id'].unique()
    user_index = {user: row for row, user in enumerate(user_ids)}

    rows = [user_index[user] for user in train['customer_id']]
    cols = [item_index[item] for item in train['product_ids']]
    purchase_sparse = sp.csr_matrix(
        ([1] * len(rows), (rows, cols)), shape=(len(user_ids), len(item_ids))
    )

    # Items each training user already owns (set lookup instead of rescanning
    # the frame for every user and every candidate).
    purchased = {}
    for user, item in zip(train['customer_id'], train['product_ids']):
        purchased.setdefault(user, set()).add(item)

    # ---- ground truth ----------------------------------------------------
    test_users = test['customer_id'].unique()
    bought_on_test_day = {}
    for user, item in zip(test['customer_id'], test['product_ids']):
        bought_on_test_day.setdefault(user, []).append(item)
    ground_trues = [
        {'id': user, 'items': bought_on_test_day.get(user, [])} for user in test_users
    ]

    # ---- rankings shared by baselines and fallbacks ----------------------
    most_popular = history.groupby(['product_ids']).count()['customer_id'].reset_index()
    mp_ranked = most_popular.sort_values(by='customer_id', ascending=False)['product_ids'].tolist()

    catalog = history[['date_created', 'regular_price', 'sale_price', 'three_months', 'product_ids', 'name_x']]
    catalog = catalog.drop_duplicates(subset=['product_ids'], ignore_index=True).copy()
    catalog['date_created'] = pd.to_datetime(catalog['date_created'])

    def ranked_catalog(column, ascending):
        """Product ids of the catalogue ordered by ``column``."""
        return catalog.sort_values(by=column, ascending=ascending)['product_ids'].tolist()

    catalog_popular = ranked_catalog('three_months', False)

    # ---- model predictions -----------------------------------------------
    def recommend_items(model, user_row, n):
        """Top-``n`` product ids recommended by ``model`` for one matrix row."""
        result = model.recommend(user_row, purchase_sparse[user_row], N=n)
        return [item_ids[idx] for idx in result[0]]

    def predict(model, fallback_ranked, head_size=None):
        """Build the prediction list for every test user.

        Users unknown to the model get ``fallback_ranked``. With ``head_size``
        set, the first ``head_size`` catalogue-popular items are placed in
        front of the model output (ensemble).
        """
        predictions = []
        for user in test_users:
            seen = purchased.get(user, ())
            user_row = user_index.get(user)
            if user_row is None:
                # New user: no factors were learned, use the fallback ranking.
                items = [item for item in fallback_ranked if item not in seen]
            elif head_size is None:
                items = recommend_items(model, user_row, top_n)
            else:
                head = [item for item in catalog_popular[:head_size] if item not in seen]
                # Ask for 20 so that 15 remain after removing duplicates.
                items = list(dict.fromkeys(head + recommend_items(model, user_row, 20)))
            predictions.append({'id': user, 'items': items[:top_n]})
        return predictions

    evaluator = CustomEvaluator()
    # name -> (ndcg, entropy); insertion order defines the output column order.
    scores = {}

    # Tuned hyper-parameters: (name, model class, parameters, fallback, head).
    model_specs = [
        ('als_mf', ALS, {'factors': 5, 'regularization': 0.01, 'iterations': 5}, mp_ranked, None),
        ('lmf', LMF, {'factors': 15, 'regularization': 0.005, 'iterations': 50}, mp_ranked, None),
        ('top5_medi_mp_lmf_mix', LMF, {'factors': 40, 'regularization': 0.005, 'iterations': 50}, catalog_popular, 5),
        ('top3_medi_mp_lmf_mix', LMF, {'factors': 40, 'regularization': 0.005, 'iterations': 50}, catalog_popular, 3),
    ]
    for name, model_cls, params, fallback_ranked, head_size in model_specs:
        model = model_cls(random_state=42, **params)
        model.fit(purchase_sparse, show_progress=False)
        scores[name] = evaluator._eval(ground_trues, predict(model, fallback_ranked, head_size))

    # ---- baselines -------------------------------------------------------
    # Most popular, skipping what the user already bought in training.
    predict_popular_list = [
        {'id': user,
         'items': [item for item in mp_ranked if item not in purchased.get(user, ())][:top_n]}
        for user in test_users
    ]
    scores['mp'] = evaluator._eval(ground_trues, predict_popular_list)

    # Fixed catalogue orderings: (name, sort column, ascending).
    catalog_orderings = [
        ('medi_popular', 'three_months', False),
        ('latest', 'date_created', False),
        ('oldest', 'date_created', True),
        ('high_price', 'sale_price', False),
        ('low_price', 'sale_price', True),
        ('name_sort', 'name_x', True),
    ]
    for name, column, ascending in catalog_orderings:
        ranked = ranked_catalog(column, ascending)
        predictions = [{'id': user, 'items': ranked[:top_n]} for user in test_users]
        scores[name] = evaluator._eval(ground_trues, predictions)

    # ---- one result row --------------------------------------------------
    result = {
        'first_day': str(train['date_paid'].min().date()) + ' ' + str(train['date_paid'].max().date()),
        'last_day': str(test['date_paid'].min().date()) + ' ' + str(test['date_paid'].max().date()),
        'train_데이터수': len(train),
        'train_유저수': len(set(train['customer_id'])),
        'test_데이터수': len(test),
        'test_유저수': len(set(test['customer_id'])),
        'test_신규유저수': len(set(test['customer_id'].unique()) - set(train['customer_id'].unique())),
        'test_신규아이템수': len(only_test_items),
        '원본_test수': len(test),
        '전처리진행test수': len(if_prepro_test),
    }
    for name, (ndcg, _) in scores.items():
        result[name] = ndcg
    for name, (_, entropy) in scores.items():
        result[name + '_entropy'] = entropy

    print('train 총 기간:',train['date_paid'].max()-train['date_paid'].min())
    print('valid 총 기간:',test['date_paid'].max()-test['date_paid'].min())

    return pd.DataFrame([result])

In [66]:
def CrossValidation():
    """Evaluate every (split day, test day) pair and stack the results.

    Walks the notebook-level ``split_day_list`` and ``test_day_list`` in
    lockstep, calls ``module`` once per pair with the global ``df`` and
    ``all_df``, and concatenates the frames it returns.

    Returns:
        A single DataFrame holding the rows of every per-pair result, with
        the index renumbered from zero.
    """
    from tqdm import tqdm

    # The progress bar wraps the zipped pairs, so it ticks once per pair.
    day_pairs = tqdm(zip(split_day_list, test_day_list))
    fold_frames = [
        module(df, split_day, test_day, all_df)
        for split_day, test_day in day_pairs
    ]
    return pd.concat(fold_frames, ignore_index=True)

In [67]:
# Run the cross-validation over every split/test day pair and show the
# combined per-pair results table as the cell output.
twenty_df = CrossValidation()
twenty_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_paid'] = pd.to_datetime(df['date_paid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])
1it [00:17, 17.20s/it]

train 총 기간: 122 days 14:59:35.538000
test 총 기간: 0 days 10:26:56.965000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_paid'] = pd.to_datetime(df['date_paid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])
2it [00:38, 19.34s/it]

train 총 기간: 122 days 21:07:04.713000
test 총 기간: 0 days 13:44:46.500000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_paid'] = pd.to_datetime(df['date_paid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])
3it [00:55, 18.34s/it]

train 총 기간: 122 days 16:35:30.710000
test 총 기간: 0 days 21:14:26.909000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_paid'] = pd.to_datetime(df['date_paid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])
4it [01:13, 18.19s/it]

train 총 기간: 122 days 22:22:02.318000
test 총 기간: 0 days 16:24:28.670000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_paid'] = pd.to_datetime(df['date_paid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])
5it [01:35, 19.83s/it]

train 총 기간: 122 days 21:45:05.364000
test 총 기간: 0 days 22:08:31.172000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_paid'] = pd.to_datetime(df['date_paid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])
6it [01:55, 19.76s/it]

train 총 기간: 122 days 05:59:10.200000
test 총 기간: 0 days 21:45:19.098000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_paid'] = pd.to_datetime(df['date_paid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])
7it [02:13, 19.26s/it]

train 총 기간: 122 days 19:10:27.839000
test 총 기간: 0 days 14:47:56.361000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_paid'] = pd.to_datetime(df['date_paid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])
8it [02:31, 18.78s/it]

train 총 기간: 122 days 16:02:08
test 총 기간: 0 days 21:12:34.296000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_paid'] = pd.to_datetime(df['date_paid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])
9it [02:48, 18.31s/it]

train 총 기간: 122 days 15:05:40.767000
test 총 기간: 0 days 08:32:23.038000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_paid'] = pd.to_datetime(df['date_paid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])
10it [03:05, 17.95s/it]

train 총 기간: 122 days 23:29:49.118000
test 총 기간: 0 days 13:43:21.405000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_paid'] = pd.to_datetime(df['date_paid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])
11it [03:23, 17.94s/it]

train 총 기간: 122 days 22:00:27.777000
test 총 기간: 0 days 23:18:01.972000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_paid'] = pd.to_datetime(df['date_paid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])
12it [03:41, 17.91s/it]

train 총 기간: 122 days 14:08:06.703000
test 총 기간: 0 days 23:47:33.414000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_paid'] = pd.to_datetime(df['date_paid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])
13it [04:02, 18.67s/it]

train 총 기간: 122 days 17:37:36.132000
test 총 기간: 0 days 16:54:10.591000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_paid'] = pd.to_datetime(df['date_paid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])
14it [04:23, 19.51s/it]

train 총 기간: 122 days 20:38:18.546000
test 총 기간: 0 days 23:28:54.649000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_paid'] = pd.to_datetime(df['date_paid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])
15it [04:40, 18.84s/it]

train 총 기간: 122 days 15:05:07.967000
test 총 기간: 0 days 19:46:16.719000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_paid'] = pd.to_datetime(df['date_paid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])
16it [04:58, 18.41s/it]

train 총 기간: 122 days 14:02:30.995000
test 총 기간: 0 days 13:24:00


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_paid'] = pd.to_datetime(df['date_paid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])
17it [05:15, 18.05s/it]

train 총 기간: 122 days 15:28:08.337000
test 총 기간: 0 days 14:43:19.778000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_paid'] = pd.to_datetime(df['date_paid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])
18it [05:33, 18.18s/it]

train 총 기간: 122 days 10:34:19.664000
test 총 기간: 0 days 14:26:25.297000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_paid'] = pd.to_datetime(df['date_paid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])
19it [06:01, 21.09s/it]

train 총 기간: 122 days 20:09:42.563000
test 총 기간: 0 days 11:24:56.665000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_paid'] = pd.to_datetime(df['date_paid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])
20it [06:20, 20.42s/it]

train 총 기간: 122 days 16:33:08.155000
test 총 기간: 0 days 23:49:41.735000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_paid'] = pd.to_datetime(df['date_paid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  medistream_prediction_preprop_df['date_created'] = pd.to_datetime(medistream_prediction_preprop_df['date_created'])
21it [06:41, 19.12s/it]

train 총 기간: 122 days 16:17:50.960000
test 총 기간: 0 days 17:30:35.552000





Unnamed: 0,first_day,last_day,train_데이터수,train_유저수,test_데이터수,test_유저수,test_신규유저수,test_신규아이템수,원본_test수,전처리진행test수,als_mf,lmf,top5_medi_mp_lmf_mix,top3_medi_mp_lmf_mix,mp,medi_popular,latest,oldest,high_price,low_price,name_sort,als_mf_entropy,lmf_entropy,top5_medi_mp_lmf_mix_entropy,top3_medi_mp_lmf_mix_entropy,mp_entropy,medi_popular_entropy,latest_entropy,oldest_entropy,high_price_entropy,low_price_entropy,name_sort_entropy
0,2022-04-21 2022-08-21,2022-08-22 2022-08-22,2741,1421,10,8,5,0,10,10,0.441521,0.262526,0.434639,0.372139,0.301959,0.425974,0.142483,0.019553,0.0,0.015852,0.0,3.284273,3.450166,3.151517,3.271551,2.758286,2.70805,2.70805,2.70805,2.70805,2.70805,2.70805
1,2022-04-20 2022-08-20,2022-08-21 2022-08-21,2737,1425,28,15,9,0,28,28,0.333776,0.311489,0.207888,0.213219,0.282351,0.206744,0.085556,0.088352,0.0,0.0,0.049902,3.413769,3.642396,3.242338,3.321362,2.742388,2.70805,2.70805,2.70805,2.70805,2.70805,2.70805
2,2022-04-19 2022-08-19,2022-08-20 2022-08-20,2738,1427,12,7,3,0,12,12,0.210899,0.270354,0.206523,0.219101,0.194802,0.197236,0.050052,0.033885,0.0,0.0,0.023671,3.456378,3.488702,3.359082,3.435062,2.758685,2.70805,2.70805,2.70805,2.70805,2.70805,2.70805
3,2022-04-18 2022-08-18,2022-08-19 2022-08-19,2760,1439,19,14,6,0,19,19,0.16744,0.204789,0.276922,0.286064,0.252103,0.248316,0.06994,0.04379,0.018761,0.012603,0.011503,3.4896,3.758705,3.287905,3.420011,2.802319,2.70805,2.70805,2.70805,2.70805,2.70805,2.70805
4,2022-04-17 2022-08-17,2022-08-18 2022-08-18,2730,1422,69,53,37,0,69,69,0.128657,0.135959,0.165644,0.168949,0.132538,0.180751,0.217109,0.010065,0.07433,0.0,0.0726,3.290191,3.485916,3.188539,3.26786,2.755239,2.70805,2.70805,2.70805,2.70805,2.70805,2.70805
5,2022-04-16 2022-08-16,2022-08-17 2022-08-17,2691,1409,42,30,15,2,42,40,0.287282,0.258927,0.259495,0.212231,0.234618,0.230214,0.059215,0.065449,0.017763,0.012524,0.019444,3.593421,3.713041,3.281175,3.41622,2.755415,2.70805,2.70805,2.70805,2.70805,2.70805,2.70805
6,2022-04-15 2022-08-15,2022-08-16 2022-08-16,2679,1407,32,18,6,1,32,31,0.197017,0.254487,0.186331,0.1798,0.328102,0.194532,0.070859,0.02194,0.021439,0.0,0.035903,3.746213,3.836479,3.432582,3.552278,2.776645,2.70805,2.70805,2.70805,2.70805,2.70805,2.70805
7,2022-04-14 2022-08-14,2022-08-15 2022-08-15,2669,1404,18,12,7,0,18,18,0.27146,0.16605,0.248059,0.250751,0.190257,0.28715,0.128143,0.0,0.021887,0.0,0.032421,3.303551,3.476448,3.277373,3.369497,2.78454,2.70805,2.70805,2.70805,2.70805,2.70805,2.70805
8,2022-04-13 2022-08-13,2022-08-14 2022-08-14,2691,1409,8,8,6,1,8,7,0.204607,0.15625,0.100129,0.100129,0.22024,0.100129,0.138919,0.039433,0.0,0.0,0.034868,3.120363,3.213225,3.041801,3.128332,2.733168,2.70805,2.70805,2.70805,2.70805,2.70805,2.70805
9,2022-04-12 2022-08-12,2022-08-13 2022-08-13,2697,1407,8,7,4,1,8,7,0.330781,0.198122,0.162164,0.175958,0.323123,0.161561,0.0,0.047619,0.0,0.038605,0.0,3.314365,3.504081,3.171418,3.271504,2.74953,2.70805,2.70805,2.70805,2.70805,2.70805,2.70805
