# Sample Code

## 基礎建設

In [1]:
import pandas as pd
import gzip, json

def parse(path):
    """Yield one decoded JSON value per line of a gzip-compressed file.

    Args:
        path: Location of a gzip file in which every line is a JSON document.

    Yields:
        The object returned by ``json.loads`` for each line, in file order.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data.
        json.JSONDecodeError: If a line is not valid JSON.
    """
    # The context manager closes the handle once the generator is exhausted
    # or closed; previously the file object was left open.
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield json.loads(l)

def getDF(path):
    """Load every record yielded by ``parse(path)`` into a DataFrame.

    Args:
        path: File location handed straight to ``parse``.

    Returns:
        A ``pandas.DataFrame`` with one row per parsed record, indexed by the
        record's 0-based position in the file.
    """
    # enumerate() supplies the consecutive integer keys 0, 1, 2, ...
    rows_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(rows_by_position, orient='index')



## 載入資料

In [2]:
# Download the All_Beauty ratings CSV and the gzipped All_Beauty product
# metadata into the current working directory.
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

In [3]:
# Load the product metadata (gzipped JSON lines) and the ratings CSV.
# The CSV is read with header=None, so the column names are supplied here.
# NOTE(review): the absolute /content/... paths presumably assume the files were
# downloaded into Google Colab's working directory — confirm before running elsewhere.
metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

In [5]:
# Preview the first rows of the product metadata.
metadata.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,,6546546450,[],[]
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,[],[]
2,[],,[No7 Stay Perfect Foundation now stays perfect...,,No7 Stay Perfect Foundation Cool Vanilla by No7,[],,No7,[],"956,696 in Beauty & Personal Care (","[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]","{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':...",All Beauty,,,$28.76,7250468162,[],[]
3,[],,[],,Wella Koleston Perfect Hair Colour 44/44 Mediu...,[B0041PBXX8],,,[],"1,870,258 in Beauty & Personal Care (",[],"{'  Item Weight: ': '1.76 ounces', 'Sh...",All Beauty,,,,7367905066,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
4,[],,[Lacto Calamine Skin Balance Daily Nourishing ...,,Lacto Calamine Skin Balance Oil control 120 ml...,[],,Pirmal Healthcare,[],"67,701 in Beauty & Personal Care (","[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...","{'Shipping Weight:': '12 ounces (', 'ASIN: ': ...",All Beauty,,,$12.15,7414204790,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...


In [6]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,asin,reviewerID,overall,unixReviewTime
0,143026860,A1V6B6TNIC10QE,1.0,1424304000
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800
2,143026860,A1572GUYS7DGSR,4.0,1407628800
3,143026860,A1PSGLFK1NSVO,5.0,1362960000
4,143026860,A6IKXKZMTKGSC,5.0,1324771200


## 資料整理

In [7]:
# Convert the Unix epoch seconds in `unixReviewTime` to pandas datetimes and
# store them in a new `DATE` column (this adds a column to `ratings` in place).
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

## 資料切分

In [8]:
# Training set: every rating dated strictly before 2018-09-01.
ratings_trainings = ratings.loc[ratings['DATE'] < '2018-09-01']

# Test set: ratings dated from 2018-09-01 through 2018-09-30, both bounds inclusive.
# NOTE(review): the upper bound is compared as 2018-09-30 00:00:00; the sample
# rows show midnight timestamps, so the whole last day is presumably covered — confirm.
ratings_testings = ratings.loc[ratings['DATE'].between('2018-09-01', '2018-09-30')]

# Map each test-period reviewer to the list of items (asin) they rated.
ratings_testings_by_user = ratings_testings.groupby('reviewerID')['asin'].agg(list).to_dict()

# The reviewers who need recommendations are exactly the test-period reviewers.
users = list(ratings_testings_by_user)

## 產生推薦

In [12]:
!pip install lightfm

Collecting lightfm
  Downloading lightfm-1.16.tar.gz (310 kB)
[K     |████████████████████████████████| 310 kB 1.1 MB/s eta 0:00:01
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25ldone
[?25h  Created wheel for lightfm: filename=lightfm-1.16-cp39-cp39-macosx_11_0_x86_64.whl size=448292 sha256=da0b043b20fc1e5437c90f76099ae376055db62341dab84e4431127b9011258f
  Stored in directory: /Users/wei/Library/Caches/pip/wheels/d7/75/52/e42e5f9cd86d4902a352aff4dadde75ec041af713ffcf3ed05
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.16
You should consider upgrading via the '/Users/wei/.virtualenvs/py39/bin/python -m pip install --upgrade pip' command.[0m


In [13]:
from lightfm import LightFM
import numpy as np

def recommend_items(model, data, user_id, item_list, k):
    """Return the ``k`` best-scoring items for one user.

    Args:
        model: Object exposing ``predict(user_id, item_indices)`` that returns
            one score per item index (presumably a fitted LightFM model).
        data: Object whose ``.shape`` unpacks to ``(n_users, n_items)``; only
            the item count is used.
        user_id: User identifier passed unchanged to ``model.predict``.
        item_list: NumPy array of item labels, indexable by the ranking
            produced from the scores.
        k: Maximum number of items to return.

    Returns:
        A list of at most ``k`` item labels, highest predicted score first.
    """
    # Only the number of items (second dimension) is needed.
    _, item_count = data.shape

    # Score every item for this user.
    candidate_indices = np.arange(item_count)
    predicted = model.predict(user_id, candidate_indices)

    # Negating the scores makes the ascending argsort rank best-first.
    ranking = np.argsort(-predicted)
    ranked_items = item_list[ranking]
    return ranked_items[:k].tolist()



In [14]:
from lightfm.data import Dataset

def recommender(training_data, users=None, k=10, random_state=None):
    '''
    Train a matrix-factorisation LightFM model (WARP loss) on the training
    ratings and return the top-k recommended items for each requested user.

    * training_data: dataframe with the training ratings (data before 2018-09-01);
      the columns "reviewerID", "asin" and "overall" are used
    * users: iterable of user ids that need recommendations (None means no users)
    * k: int, number of items to recommend per user
    * random_state: optional seed forwarded to LightFM for repeatable training;
      the default None keeps the unseeded behaviour
    * returns recommendations: dict
      {
          user one: [recommended item one, recommended item two, ...],
          user two: [...], ...
      }
    '''
    # Materialise once: `users` is iterated twice below (fitting the id mapping
    # and building the result). This also avoids a mutable default argument.
    users = [] if users is None else list(users)

    # Keep only the columns needed for the rating matrix and register all ids
    # with a LightFM Dataset. The requested users are appended so that users
    # absent from the training data still receive an internal id.
    training_data = training_data[["reviewerID","asin","overall"]]
    data = Dataset()
    data.fit(np.append(training_data.reviewerID.unique(), users, 0), training_data.asin.unique())
    interactions_matrix, weights_matrix = data.build_interactions([tuple(i) for i in training_data.values])

    # Fetch the user / item id mappings; the feature mappings are not used.
    # NOTE(review): item_list assumes the key order of item_id_map matches the
    # internal item indices used by model.predict — confirm against LightFM.
    user_id_map, _, item_id_map, _ = data.mapping()
    item_list = np.array(list(item_id_map.keys())).astype(object)

    # Train the model, using the ratings as sample weights.
    model = LightFM(loss='warp', item_alpha=1e-6, no_components=30, random_state=random_state)
    model = model.fit(interactions_matrix, sample_weight=weights_matrix, epochs=3, num_threads=2)

    # Possible extension (not implemented here): a hybrid model that also
    # passes user / item features (e.g. item_features=...) to fit().

    # Recommend the top-k items for every requested user.
    return {
        user: recommend_items(model, interactions_matrix, user_id_map[user], item_list, k)
        for user in users
    }

# Results: the cold-start problem is severe — users who never appear in the
#          training data cannot be given differentiated, personalised
#          recommendations. In practice one might actively ask such users to
#          rate some items first and then train before recommending.
#          Besides that, a hybrid model trained with user / item features
#          could also be tried.
ratings_by_user = recommender(ratings_trainings, users)
ratings_by_user


{'A100XQFWKQ30O2': ['B000FOI48G',
  'B000GLRREU',
  'B001QY8QXM',
  '1620213982',
  'B01DKQAXC0',
  'B006IB5T4W',
  'B00W259T7G',
  'B00005JS5C',
  'B00VF344X0',
  'B0067F28ZW'],
 'A103T1QOGFCSEH': ['B000FOI48G',
  'B000GLRREU',
  'B001QY8QXM',
  '1620213982',
  'B01DKQAXC0',
  'B006IB5T4W',
  'B00W259T7G',
  'B00005JS5C',
  'B00VF344X0',
  'B0067F28ZW'],
 'A106UKKSJ2KXPF': ['B000FOI48G',
  'B000GLRREU',
  'B001QY8QXM',
  '1620213982',
  'B01DKQAXC0',
  'B006IB5T4W',
  'B00W259T7G',
  'B00005JS5C',
  'B00VF344X0',
  'B0067F28ZW'],
 'A10A7GV4D5A11V': ['B000FOI48G',
  'B000GLRREU',
  'B001QY8QXM',
  '1620213982',
  'B01DKQAXC0',
  'B006IB5T4W',
  'B00W259T7G',
  'B00005JS5C',
  'B00VF344X0',
  'B0067F28ZW'],
 'A1119JJ37ZLB8R': ['B000FOI48G',
  'B000GLRREU',
  'B001QY8QXM',
  '1620213982',
  'B01DKQAXC0',
  'B006IB5T4W',
  'B00W259T7G',
  'B00005JS5C',
  'B00VF344X0',
  'B0067F28ZW'],
 'A113UOOLBSZN52': ['B000FOI48G',
  'B000GLRREU',
  'B001QY8QXM',
  '1620213982',
  'B01DKQAXC0',
  'B006

## 結果評估

In [15]:
def evaluate(ratings_testings_by_user=None, ratings_by_user=None, method=None):
    '''
    Score recommendations as: (number of recommended items that the user
    actually rated in the test period) / (total number of test ratings).

    * ratings_testings_by_user: dict {user: [items actually purchased]}
      (data from 2018-09-01 onward); None is treated as empty
    * ratings_by_user: dict {user: [recommended items]} learned from the
      training data; None is treated as empty
    * method: str, currently ignored
    * returns score: float; 0.0 when there are no test ratings
    '''
    ratings_testings_by_user = ratings_testings_by_user or {}
    ratings_by_user = ratings_by_user or {}

    hits = 0
    n_test_ratings = 0
    for user, actual_items in ratings_testings_by_user.items():
        # Derive the denominator from the argument itself instead of reading a
        # notebook-level global, so the function works on any input.
        n_test_ratings += len(actual_items)
        if user in ratings_by_user:
            # Set intersection counts each correctly recommended item once.
            hits += len(set(ratings_by_user[user]) & set(actual_items))

    # Guard against division by zero when the test set is empty.
    if n_test_ratings == 0:
        return 0.0
    return hits / n_test_ratings

# Score the recommendations against the held-out test-period ratings.
evaluate(ratings_testings_by_user, ratings_by_user)

0.08135593220338982