# Sample Code

## 載入資料

In [1]:
import pandas as pd
import numpy as np



In [2]:
## Sample Data

metadata = pd.DataFrame({
    'asin': ['A1', 'A2', 'AB3', 'B4', 'B5'],
    'title': ['Hello World', 'Hello Python', 'Python Tutorial', 'Java Tutorial', 'C++ Tutorial']
})
metadata

Unnamed: 0,asin,title
0,A1,Hello World
1,A2,Hello Python
2,AB3,Python Tutorial
3,B4,Java Tutorial
4,B5,C++ Tutorial


In [3]:
ratings = pd.DataFrame([
    {'asin': 'A1','reviewerID': 'USER1', 'overall': 5, 'unixReviewTime': 1424304000 ,'DATE': '2015-02-19'},
    {'asin': 'A2','reviewerID': 'USER1', 'overall': 5, 'unixReviewTime': 1424304000 ,'DATE': '2015-02-19'},
    {'asin': 'AB3','reviewerID': 'USER1', 'overall': 5, 'unixReviewTime': 1424304000 ,'DATE': '2015-02-19'},
    {'asin': 'B4','reviewerID': 'USER2', 'overall': 5, 'unixReviewTime': 1424304000 ,'DATE': '2015-02-19'},
    {'asin': 'AB3','reviewerID': 'USER2', 'overall': 5, 'unixReviewTime': 1424304000 ,'DATE': '2015-02-19'},
    {'asin': 'A1','reviewerID': 'USER3', 'overall': 5, 'unixReviewTime': 1424304000 ,'DATE': '2015-02-19'},
    
    {'asin': 'A1','reviewerID': 'USER1', 'overall': 5, 'unixReviewTime': 1537315200 ,'DATE': '2018-09-19'},
    {'asin': 'A2','reviewerID': 'USER1', 'overall': 5, 'unixReviewTime': 1537315200 ,'DATE': '2018-09-19'},
    {'asin': 'B5','reviewerID': 'USER2', 'overall': 5, 'unixReviewTime': 1537315200 ,'DATE': '2018-09-19'},
    {'asin': 'AB3','reviewerID': 'USER2', 'overall': 5, 'unixReviewTime': 1537315200 ,'DATE': '2018-09-19'},
    {'asin': 'A2','reviewerID': 'USER3', 'overall': 5, 'unixReviewTime': 1537315200 ,'DATE': '2018-09-19'},

])
ratings

Unnamed: 0,asin,reviewerID,overall,unixReviewTime,DATE
0,A1,USER1,5,1424304000,2015-02-19
1,A2,USER1,5,1424304000,2015-02-19
2,AB3,USER1,5,1424304000,2015-02-19
3,B4,USER2,5,1424304000,2015-02-19
4,AB3,USER2,5,1424304000,2015-02-19
5,A1,USER3,5,1424304000,2015-02-19
6,A1,USER1,5,1537315200,2018-09-19
7,A2,USER1,5,1537315200,2018-09-19
8,B5,USER2,5,1537315200,2018-09-19
9,AB3,USER2,5,1537315200,2018-09-19


In [4]:
metadata.head()

Unnamed: 0,asin,title
0,A1,Hello World
1,A2,Hello Python
2,AB3,Python Tutorial
3,B4,Java Tutorial
4,B5,C++ Tutorial


In [5]:
ratings.head()

Unnamed: 0,asin,reviewerID,overall,unixReviewTime,DATE
0,A1,USER1,5,1424304000,2015-02-19
1,A2,USER1,5,1424304000,2015-02-19
2,AB3,USER1,5,1424304000,2015-02-19
3,B4,USER2,5,1424304000,2015-02-19
4,AB3,USER2,5,1424304000,2015-02-19


## 資料整理

In [6]:
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

## 資料切分

In [7]:
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin', 'overall']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: dict(zip(rating['asin'], rating['overall'])) for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

In [8]:
ratings_trainings

Unnamed: 0,asin,reviewerID,overall,unixReviewTime,DATE
0,A1,USER1,5,1424304000,2015-02-19
1,A2,USER1,5,1424304000,2015-02-19
2,AB3,USER1,5,1424304000,2015-02-19
3,B4,USER2,5,1424304000,2015-02-19
4,AB3,USER2,5,1424304000,2015-02-19
5,A1,USER3,5,1424304000,2015-02-19


In [9]:
ratings_testings

Unnamed: 0,asin,reviewerID,overall,unixReviewTime,DATE
6,A1,USER1,5,1537315200,2018-09-19
7,A2,USER1,5,1537315200,2018-09-19
8,B5,USER2,5,1537315200,2018-09-19
9,AB3,USER2,5,1537315200,2018-09-19
10,A2,USER3,5,1537315200,2018-09-19


In [10]:
users

['USER1', 'USER2', 'USER3']

In [11]:
ratings_testings_by_user

{'USER1': {'A1': 5, 'A2': 5}, 'USER2': {'B5': 5, 'AB3': 5}, 'USER3': {'A2': 5}}

In [12]:
pd.DataFrame(ratings_testings_by_user)

Unnamed: 0,USER1,USER2,USER3
A1,5.0,,
A2,5.0,,5.0
B5,,5.0,
AB3,,5.0,


## 產生推薦

In [13]:
import math

# 計算 Item 之間的相似度
def ItemSimilarity_cos(train):  
    C = dict() 
    N = dict() 
    for u,items in train.items():
        for i in items.keys(): 
            if i not in N.keys():
                N[i]=0
            N[i] += items[i]* items[i] 
            for j in items.keys():  
                if i == j:  
                    continue  
                if i not in C.keys():
                    C[i]=dict()
                if j not in C[i].keys():
                    C[i][j]=0
                C[i][j] += items[i]*items[j]  
    W = dict() 
    for i,related_items in C.items():
        if i not in W.keys():
            W[i]=dict()        
        for j,cij in related_items.items(): 
            W[i][j] = cij / (math.sqrt( N[i]) *math.sqrt( N[j]) ) 
    
    return W  

# 找出特定 User 的未知 Item 的分數
def RecommendItemCF(train, user_id, W):  
    rank = dict()  
    ru = train[user_id]  
    for i, pi in ru.items(): 
        tmp = W[i]
        for j,wj in sorted(tmp.items(),key=lambda d: d[1],reverse=True): 
            if j not in rank.keys():
                rank[j] = 0
            if j in ru:  
                continue  
            rank[j] += pi*wj  
    return dict(sorted(list(rank.items()), key=lambda val:val[1],reverse=True))


In [14]:
W = ItemSimilarity_cos (ratings_testings_by_user)
pd.DataFrame(W)

Unnamed: 0,A1,A2,B5,AB3
A2,0.707107,,,
A1,,0.707107,,
AB3,,,1.0,
B5,,,,1.0


In [15]:
RecommendItemCF(ratings_testings_by_user, 'USER1', W) 

{'A2': 0, 'A1': 0}

In [16]:
RecommendItemCF(ratings_testings_by_user, 'USER2', W)  

{'AB3': 0, 'B5': 0}

In [17]:
RecommendItemCF(ratings_testings_by_user, 'USER3', W)  

{'A1': 3.5355339059327373}

In [18]:
def recommender(training_data, users=[], k=2):
    '''
    * training_data: dataframe 輸入的訓練資料集（2018-09-01 以前資料）
    * users: [] 需要被推薦的使用者
    * k: int 每個使用者需要推薦的商品數
    * recommendations: dict
      {
          使用者一： [推薦商品一, 推薦商品二, ...],
          使用者二： [...], ...
      }
    '''
    recommendations = {}
    '''
    cf-item-based
    '''
    ratings_trainings = training_data
    recommendations = {user: list(RecommendItemCF(ratings_testings_by_user, user, W).keys()) for user in users}

    return recommendations

ratings_by_user = recommender(ratings_trainings, users)
ratings_by_user

{'USER1': ['A2', 'A1'], 'USER2': ['AB3', 'B5'], 'USER3': ['A1']}

## 結果評估

In [19]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

evaluate(ratings_testings_by_user, ratings_by_user)

0.8

In [20]:
ratings_testings_by_user

{'USER1': {'A1': 5, 'A2': 5}, 'USER2': {'B5': 5, 'AB3': 5}, 'USER3': {'A2': 5}}

In [21]:
ratings_by_user

{'USER1': ['A2', 'A1'], 'USER2': ['AB3', 'B5'], 'USER3': ['A1']}