<a href="https://colab.research.google.com/github/world4jason/data-course-sample/blob/main/weak_1/hw1_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import subprocess
import pathlib
import datetime


In [5]:

class DatasetLoader:
    """Download the Amazon "All Beauty" files and load them into DataFrames.

    After ``prepare_data()`` succeeds, ``data`` holds the ratings CSV with the
    columns ``asin``, ``reviewerID``, ``overall`` and ``unixReviewTime``, and
    ``meta_data`` holds the line-delimited, gzip-compressed JSON metadata.
    """

    def __init__(self,
                 data_file_name:str="All_Beauty.csv",
                 meta_data_file_name:str="meta_All_Beauty.json.gz",
                 base_file_folder:str="data",
                 make_dir:bool=False,
                ):
        """Remember the target file names and create the target folder.

        Args:
            data_file_name: Local file name for the ratings CSV.
            meta_data_file_name: Local file name for the metadata archive.
            base_file_folder: Folder that holds both files.
            make_dir: Accepted for backward compatibility; it is not consulted,
                the folder is always created when missing.
        """
        self.data      = None
        self.meta_data = None

        self._data_url      = "http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv"
        self._meta_data_url = "http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz"

        self._data_file_name      = data_file_name
        self._meta_data_file_name = meta_data_file_name

        self.base_file_folder = base_file_folder

        pathlib.Path(self.base_file_folder).mkdir(parents=True, exist_ok=True)

    def _file_path(self, file_name:str) -> pathlib.Path:
        """Return the location of ``file_name`` inside the base folder."""
        return pathlib.Path(self.base_file_folder) / file_name

    def _download(self, url:str, destination:pathlib.Path) -> bool:
        """Fetch ``url`` into ``destination`` with wget; return True on success."""
        try:
            # An argument list (no shell) keeps spaces or shell metacharacters
            # in the folder/file names from being interpreted by a shell.
            return_code = subprocess.call(["wget", url, "-O", str(destination)],
                                          stdout=subprocess.DEVNULL,
                                          stderr=subprocess.STDOUT)
        except OSError:
            # wget is missing or not executable: report a failed download
            # instead of raising, so the method still returns a bool.
            return False
        return return_code == 0

    def download_dataset(self) -> bool :
        """Download the ratings and metadata files.

        Both downloads are always attempted, even if the first one fails.

        Returns:
            True only when both wget invocations exit with status 0.
        """
        data_ok      = self._download(self._data_url, self._file_path(self._data_file_name))
        meta_data_ok = self._download(self._meta_data_url, self._file_path(self._meta_data_file_name))

        return data_ok and meta_data_ok

    def prepare_data(self) -> None:
        """Load both files from disk into ``self.data`` and ``self.meta_data``.

        Raises:
            AssertionError: If either file is missing from the base folder.
        """
        data_file_path      = self._file_path(self._data_file_name)
        meta_data_file_path = self._file_path(self._meta_data_file_name)

        if not data_file_path.exists():
            raise AssertionError("data path : {path} is not exist.".format(path=data_file_path))

        if not meta_data_file_path.exists():
            # Report the metadata path here (not the ratings path) so the
            # message names the file that is actually missing.
            raise AssertionError("meta data path : {path} is not exist.".format(path=meta_data_file_path))

        self.meta_data = self.read_json(data_file_path=meta_data_file_path, lines=True, compression='gzip')
        self.data      = self.read_csv(data_file_path=data_file_path, names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

    def read_json(self, data_file_path:str, **kwargs):
        """Thin wrapper around ``pd.read_json``; ``kwargs`` are passed through."""
        return pd.read_json(path_or_buf=data_file_path, **kwargs)

    def read_csv(self, data_file_path:str, **kwargs) -> pd.DataFrame:
        """Thin wrapper around ``pd.read_csv``; ``kwargs`` are passed through."""
        return pd.read_csv(data_file_path, **kwargs)
    
    
class Dataset:
    """Time-based train/test split of the ratings plus per-user test items.

    The ratings frame is expected to carry the columns ``asin``,
    ``reviewerID`` and ``unixReviewTime`` (seconds since the epoch).
    """

    def __init__(self,
                 dataset_loader:"DatasetLoader",
                 split_date="2018-09-01",
                 test_end_date="2018-09-30"):
        """Take the frames from ``dataset_loader`` and set the split window.

        Args:
            dataset_loader: Object exposing ``data`` (ratings) and
                ``meta_data``.
            split_date: First day of the test period; earlier reviews are
                training data.
            test_end_date: Last day (inclusive) of the test period.
        """
        self.ratings_trainings = None
        self.ratings_testings  = None

        self.ratings_testings_by_user = None

        self.ratings = dataset_loader.data
        self.meta_data = dataset_loader.meta_data

        self._split_date = pd.Timestamp(split_date)
        # Exclusive upper bound at the start of the following day, so that
        # reviews at any time of day on test_end_date are kept.
        self._test_end_exclusive = pd.Timestamp(test_end_date).normalize() + pd.Timedelta(days=1)

    def get_train_test(self, force_rearm=False):
        """Return ``(training, testing)`` frames, splitting on first use.

        Args:
            force_rearm: When true, redo the split and the per-user mapping
                even if cached results exist.
        """
        already_split = self.ratings_trainings is not None and self.ratings_testings is not None

        if not already_split or force_rearm:
            self._train_test_split()
            self._generate_evaluation()

        return self.ratings_trainings, self.ratings_testings

    def get_evaluation_data(self):
        """Return ``{reviewerID: [asin, ...]}`` for the test period."""
        if self.ratings_testings_by_user is None:
            # Reuse an existing split; only split when none was made yet.
            if self.ratings_testings is None:
                self._train_test_split()
            self._generate_evaluation()
        return self.ratings_testings_by_user

    def _train_test_split(self):
        """Split the ratings by review date and cache both parts.

        Adds a ``DATE`` column to ``self.ratings`` in place; the returned
        frames carry that column as well.
        """
        ratings = self.ratings
        ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

        before_split = ratings['DATE'] < self._split_date
        in_test_window = (ratings['DATE'] >= self._split_date) & (ratings['DATE'] < self._test_end_exclusive)

        self.ratings_trainings = ratings[before_split]
        self.ratings_testings  = ratings[in_test_window]

        return self.ratings_trainings, self.ratings_testings

    def _generate_evaluation(self):
        """Group the test-period items by reviewer and cache the mapping."""
        # Only the asin column is needed, so aggregate just that column.
        ratings_testings_by_user = (
            self.ratings_testings.groupby('reviewerID')['asin'].agg(list).to_dict()
        )
        self.ratings_testings_by_user = ratings_testings_by_user
        return ratings_testings_by_user
    

In [6]:
# Build the loader with its default file names and the default "data" folder;
# the constructor creates that folder when it does not exist yet.
dataset_loader = DatasetLoader()

In [7]:
# Fetch both files, then load them into DataFrames. download_dataset() reports
# failure through its boolean return value, so stop here instead of letting
# prepare_data() fail later on a missing or incomplete file.
if not dataset_loader.download_dataset():
    raise RuntimeError("dataset download failed; check network access and that wget is installed")
dataset_loader.prepare_data()

In [8]:
# Hand the loaded ratings and metadata frames over to a Dataset, which owns
# the train/test split and the per-user evaluation mapping.
dataset = Dataset(dataset_loader)

In [9]:
# First call performs the date-based split and also fills
# dataset.ratings_testings_by_user; later calls return the cached frames.
train, test = dataset.get_train_test()

In [10]:
def popularity_recommender(training_data, top_k=5, week_delta=1, rate_low_bound=1, base_date=datetime.date(2018,9,1)):
    """Recommend the most frequently reviewed items of a recent time window.

    Args:
        training_data: DataFrame with ``DATE``, ``overall`` and ``asin`` columns.
        top_k: Maximum number of items to return.
        week_delta: Length of the look-back window in weeks, ending at
            ``base_date``.
        rate_low_bound: Only reviews with ``overall`` strictly greater than
            this value are counted.
        base_date: End of the window (exclusive); the window start is also
            exclusive.

    Returns:
        Up to ``top_k`` item ids ordered by descending review count. Items
        counted only once in the window are never recommended.
    """
    window_end = pd.Timestamp(base_date)
    window_start = pd.Timestamp(base_date - datetime.timedelta(days=7 * week_delta))

    # date filter: bound the window on both sides so rows dated on or after
    # base_date can never leak into the popularity count.
    in_window = (training_data.DATE > window_start) & (training_data.DATE < window_end)
    df = training_data[in_window]
    # rate filter
    df = df[df.overall > rate_low_bound]
    # value_counts() is already sorted by descending count
    v_count = df.asin.value_counts()
    # discard items seen only once
    recommend_list = v_count[v_count > 1].index[:top_k].tolist()
    return recommend_list

In [11]:
def recommender(training_data, users=None):
    """Run ``popularity_recommender`` over a fixed hyper-parameter grid.

    Args:
        training_data: DataFrame forwarded to ``popularity_recommender``.
        users: Not used by this function; kept for interface compatibility.

    Returns:
        One recommendation list per grid point, ordered with ``week_delta``
        as the outermost loop, then ``rate_low_bound``, then ``top_k``.
    """
    top_k = [1,3,5,10,15,20]
    rate_low_bound = [0,1,2,3,4]
    week_delta = list(range(1,14))

    result = []
    for delta in week_delta:
        for rate in rate_low_bound:
            for k in top_k:
                result.append(
                    popularity_recommender(training_data, top_k=k, week_delta=delta, rate_low_bound=rate)
                )

    return result

In [12]:
def evaluate(ratings_testings_by_user=None, recommend_list=None, method=None):
    '''
    Score one recommendation list against the items users actually reviewed.

    * ratings_testings_by_user: dict mapping each user to the list of items
      that user really has in the test period
    * recommend_list: list of recommended items, the same for every user
    * method: str, currently not used
    * returns score: float, the total number of recommended items found in the
      users' lists divided by the number of users; 0.0 when there are no users
    '''
    if not ratings_testings_by_user:
        # No users to score: avoid dividing by zero.
        return 0.0

    # Build the lookup set once instead of once per user.
    recommended = set(recommend_list or ())
    total = sum(len(recommended & set(items)) for items in ratings_testings_by_user.values())

    score = total / len(ratings_testings_by_user)
    return score



In [13]:
def recommend_and_evaluation(training_data,
                             top_k = (1,3,5,10,15,20),
                             rate_low_bound = (0,1,2,3,4),
                             week_delta = tuple(range(1,14)),
                             base_date=datetime.date(2018,9,1),
                             ratings_testings_by_user=None,
                            ):
    """Grid-search the popularity recommender and print one score per setting.

    Args:
        training_data: DataFrame forwarded to ``popularity_recommender``.
        top_k: Iterable of list lengths to try.
        rate_low_bound: Iterable of rating thresholds to try.
        week_delta: Iterable of look-back window lengths, in weeks.
        base_date: End date of every look-back window.
        ratings_testings_by_user: Mapping of user to test-period items used
            for scoring. When omitted, it is taken from the module-level
            ``dataset``.

    Returns:
        None; each result is printed, with the score rounded to 3 decimals.
    """
    if ratings_testings_by_user is None:
        # get_evaluation_data() builds the mapping if it was not built yet.
        ratings_testings_by_user = dataset.get_evaluation_data()

    for delta in week_delta:
        start_date = base_date - datetime.timedelta(days=7*delta)
        for rate in rate_low_bound:
            for k in top_k:
                result = popularity_recommender(training_data,top_k=k,week_delta=delta,rate_low_bound=rate,base_date=base_date)
                score = round(evaluate(ratings_testings_by_user, result),3)
                print("date_range:{start_date}~{end_date}, rating_minimun_thresold:{rate},top_k:{top_k} socre:{score}".format(start_date=start_date,
                                                                                                                              end_date=base_date,
                                                                                                                              rate=rate,
                                                                                                                              top_k=k,
                                                                                                                              score=score
                                                                                                                             ))

In [14]:
# Evaluate the full default grid (every top_k / rate_low_bound / week_delta
# combination) on the training frame; results are printed, nothing is returned.
recommend_and_evaluation(train)

date_range:2018-08-25~2018-09-01, rating_minimun_thresold:0,top_k:1 socre:0.084
date_range:2018-08-25~2018-09-01, rating_minimun_thresold:0,top_k:3 socre:0.098
date_range:2018-08-25~2018-09-01, rating_minimun_thresold:0,top_k:5 socre:0.134
date_range:2018-08-25~2018-09-01, rating_minimun_thresold:0,top_k:10 socre:0.158
date_range:2018-08-25~2018-09-01, rating_minimun_thresold:0,top_k:15 socre:0.163
date_range:2018-08-25~2018-09-01, rating_minimun_thresold:0,top_k:20 socre:0.178
date_range:2018-08-25~2018-09-01, rating_minimun_thresold:1,top_k:1 socre:0.084
date_range:2018-08-25~2018-09-01, rating_minimun_thresold:1,top_k:3 socre:0.098
date_range:2018-08-25~2018-09-01, rating_minimun_thresold:1,top_k:5 socre:0.134
date_range:2018-08-25~2018-09-01, rating_minimun_thresold:1,top_k:10 socre:0.156
date_range:2018-08-25~2018-09-01, rating_minimun_thresold:1,top_k:15 socre:0.166
date_range:2018-08-25~2018-09-01, rating_minimun_thresold:1,top_k:20 socre:0.197
date_range:2018-08-25~2018-09-01, 

In [15]:
# Narrower run: only short lists (top_k 1, 3, 5) with the rating threshold
# fixed at 1, still sweeping the look-back window from 1 to 13 weeks.
recommend_and_evaluation(training_data=train,
                         top_k = [1,3,5],
                         rate_low_bound = [1],
                         week_delta = list(range(1,14)),
                         base_date=datetime.date(2018,9,1))

date_range:2018-08-25~2018-09-01, rating_minimun_thresold:1,top_k:1 socre:0.084
date_range:2018-08-25~2018-09-01, rating_minimun_thresold:1,top_k:3 socre:0.098
date_range:2018-08-25~2018-09-01, rating_minimun_thresold:1,top_k:5 socre:0.134
date_range:2018-08-18~2018-09-01, rating_minimun_thresold:1,top_k:1 socre:0.003
date_range:2018-08-18~2018-09-01, rating_minimun_thresold:1,top_k:3 socre:0.101
date_range:2018-08-18~2018-09-01, rating_minimun_thresold:1,top_k:5 socre:0.113
date_range:2018-08-11~2018-09-01, rating_minimun_thresold:1,top_k:1 socre:0.084
date_range:2018-08-11~2018-09-01, rating_minimun_thresold:1,top_k:3 socre:0.087
date_range:2018-08-11~2018-09-01, rating_minimun_thresold:1,top_k:5 socre:0.103
date_range:2018-08-04~2018-09-01, rating_minimun_thresold:1,top_k:1 socre:0.084
date_range:2018-08-04~2018-09-01, rating_minimun_thresold:1,top_k:3 socre:0.087
date_range:2018-08-04~2018-09-01, rating_minimun_thresold:1,top_k:5 socre:0.103
date_range:2018-07-28~2018-09-01, rating