# Surprise Collabrative filtering

## 基礎建設

In [1]:
import pandas as pd
import gzip, json

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

## 載入資料

In [3]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

In [4]:
metadata = getDF('/content/meta_All_Beauty.json.gz')
ratings = pd.read_csv('/content/All_Beauty.csv', names=['asin', 'reviewerID', 'overall', 'unixReviewTime'], header=None)

In [6]:
metadata.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,,6546546450,[],[]
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,[],[]
2,[],,[No7 Stay Perfect Foundation now stays perfect...,,No7 Stay Perfect Foundation Cool Vanilla by No7,[],,No7,[],"956,696 in Beauty & Personal Care (","[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]","{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':...",All Beauty,,,$28.76,7250468162,[],[]
3,[],,[],,Wella Koleston Perfect Hair Colour 44/44 Mediu...,[B0041PBXX8],,,[],"1,870,258 in Beauty & Personal Care (",[],"{'  Item Weight: ': '1.76 ounces', 'Sh...",All Beauty,,,,7367905066,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
4,[],,[Lacto Calamine Skin Balance Daily Nourishing ...,,Lacto Calamine Skin Balance Oil control 120 ml...,[],,Pirmal Healthcare,[],"67,701 in Beauty & Personal Care (","[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...","{'Shipping Weight:': '12 ounces (', 'ASIN: ': ...",All Beauty,,,$12.15,7414204790,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...


In [7]:
ratings.head()

Unnamed: 0,asin,reviewerID,overall,unixReviewTime
0,143026860,A1V6B6TNIC10QE,1.0,1424304000
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800
2,143026860,A1572GUYS7DGSR,4.0,1407628800
3,143026860,A1PSGLFK1NSVO,5.0,1362960000
4,143026860,A6IKXKZMTKGSC,5.0,1324771200


---

## 資料整理

In [6]:
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

### 去除重複資料

In [7]:
print('Original Num of metadata: {0}'.format(len(metadata)))
print('Num of duplicated metadata: {0}'.format(metadata.duplicated(subset=['asin', 'title']).sum()))

print('Drop duplicated metadata...')
metadata.drop_duplicates(subset=['asin', 'title'], keep=False, inplace=True)
metadata = metadata.reset_index(drop=True)
print('Current Num of metadata: {0}'.format(len(metadata)))

Original Num of metadata: 32892
Num of duplicated metadata: 404
Drop duplicated metadata...
Current Num of metadata: 32084


## 資料切分

In [8]:
ratings_trainings = ratings[
    (ratings['DATE'] < '2018-09-01')
]
ratings_testings = ratings[
    (ratings['DATE'] >= '2018-09-01') & 
    (ratings['DATE'] <= '2018-09-30')
]
ratings_testings_by_user = ratings_testings.groupby('reviewerID').agg(list).reset_index()[['reviewerID', 'asin']].to_dict('records')
ratings_testings_by_user = { rating['reviewerID']: rating['asin'] for rating in ratings_testings_by_user }
users = list(ratings_testings_by_user.keys())

---

保留原始資料，複製一份出來處理。

In [9]:
metadata_tmp = metadata.copy()

ratings_trainings_tmp = ratings_trainings.copy()
ratings_testings_tmp = ratings_testings.copy()

users_testing_tmp = users.copy()

---

## 資料探索

In [10]:
import numpy as np
import matplotlib.pyplot as plt

### 訓練集資料

In [11]:
# 如果只有一筆就直接填該筆評分，如果有多筆，overall_mean填入評分平均值、overall_lastest填入最近的評分值。
ratings_trainings_unique = ratings_trainings_tmp.groupby(['reviewerID', 'asin', 'DATE']) \
                    .agg(overall=('overall', np.mean)) \
                    .reset_index()
ratings_trainings_unique.head()

Unnamed: 0,reviewerID,asin,DATE,overall
0,A0010876CNE3ILIM9HV0,B01FHDYGQ8,2016-10-12,5.0
1,A001170867ZBE9FORRQL,B01B18T01Y,2017-02-16,5.0
2,A001170867ZBE9FORRQL,B01FCW88D6,2016-10-16,5.0
3,A0028738FYF1SKPPC7B1,B00TZ8XK1E,2015-08-20,5.0
4,A0045356A23634W7RI4M,B018H0EJI8,2016-05-23,3.0


In [12]:
ratings_trainings_item = ratings_trainings_unique.groupby('asin') \
       .agg({'reviewerID':'size', 'overall':'mean'}) \
       .rename(columns={'reviewerID':'rating_count','overall':'mean_score'}) \
       .reset_index().sort_values('rating_count', ascending=False)
ratings_trainings_item.head()

Unnamed: 0,asin,rating_count,mean_score
939,B000FOI48G,8667,4.394369
1059,B000GLRREU,8341,4.405107
59,1620213982,4792,4.798414
3301,B001QY8QXM,4542,4.555923
29552,B01DKQAXC0,4196,4.212107


In [13]:
ratings_trainings_item = ratings_trainings_item[ratings_trainings_item['rating_count'] > 1]

### 商品資料集

根據上份作業的資料探索，將多餘欄位剔除，新增有意義欄位(商品評論數、平均評分、季節性等)。主要是想用於找不到相似用戶時的推薦設計使用。

在此的ratings_trainings都不剔除重複評分，視其各自為獨立購買紀錄，皆列入分析計算。

In [14]:
# 剔除有問題的資料
metadata_tmp = metadata_tmp.drop(metadata[metadata['title'] == ''].index).reset_index(drop=True)

In [15]:
# 剔除評分數小於等於1的商品
metadata_tmp = metadata_tmp[metadata_tmp['asin'].isin(ratings_trainings_item['asin'].unique())]

In [16]:
metadata_tmp['asin'].count()

18741

#### 剔除多餘欄位

In [17]:
metadata_tmp = metadata_tmp.drop(['category', 'tech1', 'fit', 'tech2', 'feature', 'date', 'imageURLHighRes', 'main_cat', 'details'], axis=1)

##### `rank`: 內含有商品類別和其商品於該類別排名，將其拆成`RANK`、`CATEGORY`兩欄位。

In [18]:
metadata_tmp[['RANK', 'CATEGORY']] = metadata_tmp['rank'].str.split(' in ', 1, expand=True)
metadata_tmp['CATEGORY'] = metadata_tmp['CATEGORY'].str.replace(' \(', '')
metadata_tmp['CATEGORY'] = metadata_tmp['CATEGORY'].str.replace('\&amp;', '&')
metadata_tmp['CATEGORY'].value_counts()

Beauty & Personal Care       18574
Grocery & Gourmet Food          25
Health & Household              11
Sports & Outdoors                3
Clothing, Shoes & Jewelry        2
Automotive                       1
Toys & Games                     1
Tools & Home Improvement         1
Name: CATEGORY, dtype: int64

In [19]:
metadata_tmp = metadata_tmp.drop(['rank'], axis=1)

##### `similar_item`: 將其資料html表格轉為asin列表，與`also_buy`、`also_view`合併成`SIMILAR_ITEM`欄位。

In [20]:
from bs4 import BeautifulSoup
metadata['asin'] = metadata['asin'].astype(str)

def convert_similar_table_to_asin_list(html):
#     if (html == ''): return []
    asin_list = []
    
    soup = BeautifulSoup(html)
    
    compare_list = soup.findAll('th', {'class': 'comparison_image_title_cell'})
    for item in compare_list:
        title = item.span.text.strip()
        if (len(metadata[metadata['title'] == title]) > 0):
            asin = metadata[metadata['title'] == title]['asin'].iloc[0]
#             if (not any(asin.str in s for s in asin_list)):  # 去除重複
            asin_list.append(asin)
    
    return asin_list

In [21]:
metadata_tmp['SIMILAR_ITEM'] = metadata_tmp['similar_item'].apply(lambda x: convert_similar_table_to_asin_list(x) if x != ''  else x)

In [22]:
metadata_tmp['SIMILAR'] = [set(list(x)+y+z) for x, y, z in zip(metadata_tmp.SIMILAR_ITEM, metadata_tmp.also_view, metadata_tmp.also_buy)]

In [23]:
metadata_tmp = metadata_tmp.drop(['SIMILAR_ITEM', 'similar_item', 'also_view', 'also_buy'], axis=1)

In [24]:
metadata_tmp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18741 entries, 0 to 32082
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  18741 non-null  object
 1   title        18741 non-null  object
 2   brand        18741 non-null  object
 3   price        18741 non-null  object
 4   asin         18741 non-null  object
 5   imageURL     18741 non-null  object
 6   RANK         18618 non-null  object
 7   CATEGORY     18618 non-null  object
 8   SIMILAR      18741 non-null  object
dtypes: object(9)
memory usage: 1.4+ MB


#### 加入評分
(同用戶對同商品不同時間或不同評分，做為獨立評分(未合併)，分別計入)

In [25]:
rating_mean = ratings_trainings_tmp.groupby('asin')['overall'].mean().round(2).reset_index(name='RATE')
rating_count = pd.DataFrame(ratings_trainings_tmp.asin.value_counts())
rating_count.reset_index(inplace=True)
rating_count.columns = ['asin', 'RATE(COUNT)']
# merge
rating_data = rating_mean.merge(rating_count, on='asin')
metadata_tmp = pd.merge(rating_data, metadata_tmp, left_on="asin", right_on="asin", how='right')

In [26]:
metadata_tmp.fillna({'RATE(COUNT)':0}, inplace=True)

In [27]:
# clear not-used DataFrame
del rating_data

#### 加入季節性
(`None`: 代表商品無季節性)

In [28]:
def MonthToSeason(x):   
    global season
    if x == 3 or x == 4 or x == 5:
         season = "Spring"
    elif x == 6 or x == 7 or x == 8:
         season = "Summer"
    elif x == 9 or x == 10 or x == 11:
         season = "Autumn"
    elif x == 12 or x == 1 or x == 2:
         season = "Winter"
    else:
         season = np.nan 
    return season

tmp_data = ratings_trainings_tmp.drop('unixReviewTime', axis=1)
tmp_data['SEASON'] = tmp_data['DATE'].dt.month.apply(lambda x : MonthToSeason(x))
tmp_data.head()

Unnamed: 0,asin,reviewerID,overall,DATE,SEASON
0,143026860,A1V6B6TNIC10QE,1.0,2015-02-19,Winter
1,143026860,A2F5GHSXFQ0W6J,4.0,2014-12-18,Winter
2,143026860,A1572GUYS7DGSR,4.0,2014-08-10,Summer
3,143026860,A1PSGLFK1NSVO,5.0,2013-03-11,Spring
4,143026860,A6IKXKZMTKGSC,5.0,2011-12-25,Winter


In [29]:
tmp_data = tmp_data.groupby(['SEASON', 'asin']).size().reset_index(name='count')

tmp_data = tmp_data.pivot(index='asin', columns='SEASON',values='count').reset_index()
tmp_data.fillna(0.0, inplace=True)
tmp_data.set_index("asin" , inplace=True)

In [30]:
tmp_data['sum'] = tmp_data[['Autumn', 'Spring', 'Summer', 'Winter']].sum(axis=1).astype(int)

sum_threshold = 60
tmp_data = tmp_data[tmp_data['sum'] > sum_threshold]  # 希望至少有一定的銷售紀錄(評分紀錄)才做判別

tmp_data['mean'] = tmp_data[['Autumn', 'Spring', 'Summer', 'Winter']].mean(axis=1).round(2).astype(float)
tmp_data['std'] = tmp_data[['Autumn', 'Spring', 'Summer', 'Winter']].std(axis=1).round(3).astype(float)
tmp_data['fstd'] = tmp_data['std'] / tmp_data['mean']  # fixed std

In [31]:
# 使用fstd篩選
season_tmp_data = tmp_data[tmp_data['fstd'] > 1]
print('以最低至少{0}的銷售紀錄為門檻時，得到的季節性商品占全部商品的約{1}%。\n'.format(sum_threshold, round(len(season_tmp_data) / len(tmp_data) * 100)))
print(season_tmp_data)

以最低至少60的銷售紀錄為門檻時，得到的季節性商品占全部商品的約2%。

SEASON      Autumn  Spring  Summer  Winter  sum   mean      std      fstd
asin                                                                     
B000I6AZHE     1.0   161.0     1.0     1.0  164  41.00   80.000  1.951220
B00182JYSS    32.0     7.0   133.0    18.0  190  47.50   57.911  1.219179
B002GP80EU    93.0     4.0    10.0    13.0  120  30.00   42.166  1.405533
B002OXSKF8     2.0     0.0     1.0   104.0  107  26.75   51.506  1.925458
B00D8FUHFC    28.0     9.0   208.0    21.0  266  66.50   94.659  1.423444
B00KGKQR2Y     7.0    63.0    16.0    10.0   96  24.00   26.268  1.094500
B00RCNHRQ8    12.0    30.0    12.0    87.0  141  35.25   35.528  1.007887
B00U0PABWW     8.0    10.0    58.0     7.0   83  20.75   24.865  1.198313
B0189CM3U0     0.0    33.0     9.0   260.0  302  75.50  123.786  1.639550
B019FOXB70    10.0    84.0     7.0   293.0  394  98.50  134.468  1.365157
B01AUOTSVW     7.0    63.0    22.0     7.0   99  24.75   26.462  1.069172
B

In [32]:
season_data = season_tmp_data.drop(['sum', 'mean', 'std', 'fstd'], axis=1)
season_data = season_data.idxmax(axis=1).reset_index(name='SEASON')
season_data

Unnamed: 0,asin,SEASON
0,B000I6AZHE,Spring
1,B00182JYSS,Summer
2,B002GP80EU,Autumn
3,B002OXSKF8,Winter
4,B00D8FUHFC,Summer
5,B00KGKQR2Y,Spring
6,B00RCNHRQ8,Winter
7,B00U0PABWW,Summer
8,B0189CM3U0,Winter
9,B019FOXB70,Winter


In [33]:
metadata_tmp = pd.merge(season_data, metadata_tmp, left_on="asin", right_on="asin", how='right')
metadata_tmp['SEASON'].fillna('None', inplace=True)

In [34]:
# clear not-used DataFrame
del tmp_data
del season_data

---

## 若沒有相似商品的替代推薦

### 近一個月熱銷
由`A6_cf-user-based.ipynb`作業發現，近一個月熱銷比其他最近熱銷、長期熱銷更好，故在這熱銷只選擇這種方式。

In [35]:
def get_recent_topN(k=10):
    recent_training_data = ratings_trainings[(ratings_trainings['DATE'] >= '2018-08-01') \
                                  & (ratings_trainings['DATE'] < '2018-09-01')]
    
    recent_top = recent_training_data.groupby('asin') \
           .agg({'asin':'size', 'overall':'mean'}) \
           .rename(columns={'asin':'sold_count','overall':'mean_score'}) \
           .reset_index().sort_values('sold_count', ascending=False) \
           .head(k).asin.reset_index(drop=True).to_list()
    
    return recent_top

---

## 產生推薦

In [36]:
def recomend_item_sort(user, recommended_items, k=10):
    # 儲存原本list順序
    recommended_dict = dict()
    for idx in range(len(recommended_items)):
        if recommended_items[idx] not in recommended_dict:
            recommended_dict[recommended_items[idx]] = idx
#     recommended_dict = {k: v for v, k in enumerate(recommended_items) if k not in recommended_dict}
    
    for item in recommended_items:
        if ((metadata_tmp['asin'].str.contains(item).any()) & (item in recommended_dict)):
            item_info = metadata_tmp[metadata_tmp['asin'] == item]
            # 剔除除分數過低的
            if (item_info['RATE'].values[0] < 3):
#                 recommended_items.remove(item)
                del recommended_dict[item]
            # 剔除非當季商品
            elif ((item_info['SEASON'].values[0] != 'None') or (item_info['SEASON'].values[0] == 'Autumn') or (item_info['SEASON'].values[0] == 'Winter')):
#                 recommended_items.remove(item)
                del recommended_dict[item]
            # TODO: 剔除近兩年未銷售出去過的商品
        
    
#     # 剔除重複商品
#     recommended_items = list(set(recommended_items))
    
#     # 依照RATE(COUNT)排序
# #     print(metadata_tmp[metadata_tmp['asin'].isin(recommended_items)].sort_values('RATE', ascending=False) \
# #                                .head(k))
#     recommended_list = metadata_tmp[metadata_tmp['asin'].isin(recommended_items)].sort_values('RATE', ascending=False) \
#                                .head(k).asin.reset_index(drop=True).to_list()
    # 依照舊有list順序排序
    # TODO: 考慮[每個用戶相似度x商品評論數]，才接著非來自相似用戶的推薦
    recommended_list = [key for (key, value) in sorted(recommended_dict.items(), key=lambda x: x[1])]
    
    return recommended_list

In [37]:
import time
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise import KNNBasic

In [38]:
def recommender(training_data, users=[], k=10, user_based=False, algo=KNNBasic, is_sorted=True):

    training_data = (
        training_data
        .sort_values("DATE", ascending=False)
        .groupby(['reviewerID', 'asin']).head(1)
    )

    reader = Reader(rating_scale=(0, 5))
    training_data = training_data[['reviewerID', 'asin', 'overall']]
    data = Dataset.load_from_df(training_data, reader=reader)

    sim_options = {
        'name': 'cosine',
        'user_based': user_based  # compute similarities between items
    }
    algo_impl = algo(sim_options=sim_options)
    trainset = data.build_full_trainset()
    algo_impl.fit(trainset)

    recommendation = {}
    
    # 如果要排序就先找出兩倍k的資料
    n = 2*k if is_sorted else k
    
    for user in users:
        items_user_rated = set(training_data.loc[training_data['reviewerID'] == user]['asin'].to_list())
        recommend_item_list = []
        recommend_item_set = set()
        for item in items_user_rated:
            iid = algo_impl.trainset.to_inner_iid(item)
            recommend_items_iid = algo_impl.get_neighbors(iid, n)
            for sim_item_iid in recommend_items_iid:
                item_raw_id = algo_impl.trainset.to_raw_iid(sim_item_iid)
                if item_raw_id not in items_user_rated and item_raw_id not in recommend_item_set:
                    recommend_item_list.append(item_raw_id)
                    recommend_item_set.add(item_raw_id)

            if len(recommend_item_list) >= n:
                recommend_item_list = recommend_item_list[:n]
                break
                
        if len(recommend_item_list) < n:
            recommend_item_list = recommend_item_list + get_recent_topN(n-len(recommend_item_list))
            
        recommendation[user] = recomend_item_sort(user, recommend_item_list, k) if is_sorted else recommend_item_list

    return recommendation

---

## 結果評估

In [39]:
def evaluate(ratings_testings_by_user={}, ratings_by_user={}, method=None):
    '''
    * ratings_testings_by_user: dict 真實被購買的商品資料（2018-09-01 以後資料）
    * ratings_by_user: dict 利用訓練資料學習的推薦商品
    * method: str
    * score: float
    '''
    total = 0
    for d in ratings_testings_by_user:
        if d in ratings_by_user:
            total += len(set(ratings_by_user[d]) & set(ratings_testings_by_user[d]))

    score = total / len(ratings_testings)
    return score

### 方案1：item-based (KNNBasic)

In [44]:
for k in [10, 20]:
    ratings_by_user = recommender(ratings_trainings_unique, users)
    print(f'top{k} Recall： {evaluate(ratings_testings_by_user, ratings_by_user)}')

Computing the cosine similarity matrix...
Done computing similarity matrix.
top10 Recall： 0.16440677966101694
Computing the cosine similarity matrix...
Done computing similarity matrix.
top20 Recall： 0.16440677966101694


### 方案2：item-based (KNNWithMeans)

In [43]:
from surprise import KNNWithMeans
for k in [10, 20]:
    ratings_by_user = recommender(ratings_trainings_unique, users, algo=KNNWithMeans)
    print(f'top{k} Recall： {evaluate(ratings_testings_by_user, ratings_by_user)}')

Computing the cosine similarity matrix...
Done computing similarity matrix.
top10 Recall： 0.16440677966101694
Computing the cosine similarity matrix...
Done computing similarity matrix.
top20 Recall： 0.16440677966101694
