<a href="https://colab.research.google.com/github/tony-chengchunchang/data-course-sample/blob/main/week3_cf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sample Code

## 基礎建設

In [None]:
import numpy as np
import pandas as pd
from itertools import combinations
from collections import defaultdict
import gzip, json
from sklearn.preprocessing import LabelEncoder

def parse(path):
    """Yield one decoded JSON value per non-blank line of a gzip-compressed file.

    Args:
        path: Location of the gzip-compressed, one-JSON-document-per-line file.

    Yields:
        The object returned by ``json.loads`` for each non-blank line, in file order.
    """
    # The context manager releases the file handle once the generator is
    # exhausted or closed; the previous version never closed it.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # A blank line (e.g. a trailing newline) is not a JSON document.
            if not l.strip():
                continue
            yield json.loads(l)

def getDF(path):
    """Collect every record produced by ``parse(path)`` into a DataFrame.

    Each record becomes one row; rows are labelled 0, 1, 2, ... in the order
    the records are read.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

## 載入資料

In [None]:
# Download the All_Beauty ratings CSV and the matching product-metadata archive.
# wget saves both files into the current working directory.
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz

--2022-01-08 11:27:07--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15499476 (15M) [application/octet-stream]
Saving to: ‘All_Beauty.csv’


2022-01-08 11:27:07 (25.2 MB/s) - ‘All_Beauty.csv’ saved [15499476/15499476]

--2022-01-08 11:27:08--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_All_Beauty.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10329961 (9.9M) [application/octet-stream]
Saving to: ‘meta_All_Beauty.json.gz’


2022-01-08 11:27:08 (18.7 MB/s) - ‘meta_All_Beauty.json.gz’ saved [10329961/10329961]



In [None]:
# wget stores its downloads in the current working directory, so read them with
# relative paths rather than Colab's absolute /content/ prefix; the notebook
# then also runs outside Colab.
metadata = getDF('meta_All_Beauty.json.gz')
# header=None: the first line of the CSV is data, so column names are supplied here.
ratings = pd.read_csv(
    'All_Beauty.csv',
    names=['asin', 'reviewerID', 'overall', 'unixReviewTime'],
    header=None,
)

In [None]:
metadata.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,,6546546450,[],[]
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,[],[]
2,[],,[No7 Stay Perfect Foundation now stays perfect...,,No7 Stay Perfect Foundation Cool Vanilla by No7,[],,No7,[],"956,696 in Beauty & Personal Care (","[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]","{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':...",All Beauty,,,$28.76,7250468162,[],[]
3,[],,[],,Wella Koleston Perfect Hair Colour 44/44 Mediu...,[B0041PBXX8],,,[],"1,870,258 in Beauty & Personal Care (",[],"{'  Item Weight: ': '1.76 ounces', 'Sh...",All Beauty,,,,7367905066,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
4,[],,[Lacto Calamine Skin Balance Daily Nourishing ...,,Lacto Calamine Skin Balance Oil control 120 ml...,[],,Pirmal Healthcare,[],"67,701 in Beauty & Personal Care (","[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...","{'Shipping Weight:': '12 ounces (', 'ASIN: ': ...",All Beauty,,,$12.15,7414204790,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...


In [None]:
ratings.head()

Unnamed: 0,asin,reviewerID,overall,unixReviewTime
0,143026860,A1V6B6TNIC10QE,1.0,1424304000
1,143026860,A2F5GHSXFQ0W6J,4.0,1418860800
2,143026860,A1572GUYS7DGSR,4.0,1407628800
3,143026860,A1PSGLFK1NSVO,5.0,1362960000
4,143026860,A6IKXKZMTKGSC,5.0,1324771200


## 資料整理

In [None]:
# unixReviewTime holds epoch seconds (unit='s'); convert it to pandas datetimes
# in a new DATE column so reviews can be filtered by calendar date.
ratings['DATE'] = pd.to_datetime(ratings['unixReviewTime'], unit='s')

## 資料切分

In [None]:
# Chronological split: everything dated before 2018-09-01 is training data, and
# reviews dated 2018-09-01 through 2018-09-30 (both bounds inclusive) are test data.
ratings_trainings = ratings[ratings['DATE'] < '2018-09-01']
ratings_testings = ratings[ratings['DATE'].between('2018-09-01', '2018-09-30')]

# reviewerID -> list of ASINs that reviewer rated inside the test window.
ratings_testings_by_user = ratings_testings.groupby('reviewerID')['asin'].agg(list).to_dict()
# The reviewers who need recommendations are exactly the keys of that mapping.
users = list(ratings_testings_by_user)

## 建立user-to-item矩陣
每一個row代表一個user的所有評價

In [None]:
training_data = ratings_trainings

# user_to_items maps reviewerID -> {asin: rating}. If a user rated the same item
# more than once, the rating read last wins.
user_to_items = defaultdict(dict)
# Walking the three columns in lockstep visits the same values as iterrows()
# without constructing a Series for every row.
for user, item, rating in zip(
    training_data['reviewerID'], training_data['asin'], training_data['overall']
):
    user_to_items[user][item] = float(rating)

print("total users before filtering: ", len(user_to_items))

# remove obscure user to decrease data size
# filtering params
remove_obscure_user = True
user_rating_threshold = 3
# Snapshot the keys first: entries are deleted from the dict while looping.
all_users = list(user_to_items.keys())
for user in all_users:
    # Deliberately not named `ratings`: this cell runs at notebook scope, and
    # that name would replace the ratings DataFrame with a plain dict.
    user_ratings = user_to_items[user]
    if remove_obscure_user and len(user_ratings) < user_rating_threshold:
        del user_to_items[user]

print("total users  after filtering: ", len(user_to_items))

total users before filtering:  323489
total users  after filtering:  4793


In [None]:
# pd.DataFrame on the nested dict puts the outer keys (users) on the columns, so
# transpose to get one row per user and one column per item. (user, item) pairs
# with no rating are NaN.
user_item_matrix = pd.DataFrame(user_to_items).T

In [None]:
user_item_matrix.head(10)

Unnamed: 0,0992916305,B0013NB7DW,B019809F9Y,1620213982,B001QY8QXM,B00MXENWO2,B00JVU3K9I,B00CPN9H54,B001ET7FZE,B00W259T7G,B01BNEYGQU,B000V2MBZ4,B00BWIT33Y,B00G5L867C,B00HYCWWNK,B001AS5PV0,B00C6CCD8C,B00UARRJMA,B0018OVL4G,B000GLRREU,B000FOI48G,B0000530HU,B000FTYALG,B00GUYNF3O,B000EG8HLE,B004E3ION4,B000FED5D0,B00D3M0CRS,B00G14QJA4,B00IPVCCVQ,B00C4207LY,B00GBYNAJM,B00NNLG1QU,B00NV702XO,B00NNLIHLM,B00OTZ24VU,B00SKWG0OK,B00FOGXKG6,B00NV5SV70,B00NV6U3UC,...,B019SUK0WU,B01GST8AJ4,B01928CVUC,B01CU2F04G,B0199MABE4,B0199MCLTC,B0199MA62G,B0199MCIK4,B0199MCGTW,B019SKZXRW,B019SKZVAG,B019SKZXMM,B019Z9L2SK,B01AKH96TI,B01AKH9918,B01AKH959E,B01CH8H0B4,B01C4LDWLG,B01CE0HDNA,B01CEG0T4Y,B01CJX0MCG,B01CJXETY8,B01CJXWQT8,B01CKLWWNE,B01D6OZV32,B01ENV3OWC,B01D1K2BCK,B01D2BA1MK,B01D2BTQKI,B01DK5ERYG,B01DPXQQFG,B01DWI7FXQ,B01EW2E11K,B01EW2SKTY,B01EW2YP0W,B01FNU042A,B01G17BLR6,B01G5699V8,B01G93Z22M,B01G93YXKY
A2V9BG2MDQVCYX,5.0,5.0,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
A3GAN4X1TF8TPY,,,,4.0,5.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
A3F5DIB5CVJAT0,,,,5.0,,,5.0,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
A3NCIN6TNL0MGA,,,,5.0,5.0,,,,5.0,5.0,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
A37BVM0O7RSIIR,,,,5.0,5.0,,,,,,,5.0,4.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
A2JC3NNXBTY45E,,,,5.0,,,,,,,,,,,5.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
A3U4FCEAT75XLS,,,,5.0,5.0,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
A2CU3X1KLYHQVM,,,,5.0,5.0,,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
A3EFF6RH8WTRXO,,,,4.0,5.0,,,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
AJX4CKVWPVMJ9,,,,5.0,5.0,,,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
user_item_matrix.shape

(4793, 4853)

## 將 user id（reviewerID）進行編碼以利後續運算
其實這裡可以有別的方法，不一定要這樣做，只是當時沒想到


In [None]:
# Replace the reviewerID row labels with integer codes. `encoder` is kept so the
# codes can be mapped back to reviewerIDs afterwards.
encoder = LabelEncoder()
user_encoded = encoder.fit_transform(user_item_matrix.index)
user_item_matrix.index = user_encoded
# Missing ratings become 0 and rows are put in ascending code order.
user_item_matrix = user_item_matrix.fillna(0).sort_index()

In [None]:
user_item_matrix.head(10)

Unnamed: 0,0992916305,B0013NB7DW,B019809F9Y,1620213982,B001QY8QXM,B00MXENWO2,B00JVU3K9I,B00CPN9H54,B001ET7FZE,B00W259T7G,B01BNEYGQU,B000V2MBZ4,B00BWIT33Y,B00G5L867C,B00HYCWWNK,B001AS5PV0,B00C6CCD8C,B00UARRJMA,B0018OVL4G,B000GLRREU,B000FOI48G,B0000530HU,B000FTYALG,B00GUYNF3O,B000EG8HLE,B004E3ION4,B000FED5D0,B00D3M0CRS,B00G14QJA4,B00IPVCCVQ,B00C4207LY,B00GBYNAJM,B00NNLG1QU,B00NV702XO,B00NNLIHLM,B00OTZ24VU,B00SKWG0OK,B00FOGXKG6,B00NV5SV70,B00NV6U3UC,...,B019SUK0WU,B01GST8AJ4,B01928CVUC,B01CU2F04G,B0199MABE4,B0199MCLTC,B0199MA62G,B0199MCIK4,B0199MCGTW,B019SKZXRW,B019SKZVAG,B019SKZXMM,B019Z9L2SK,B01AKH96TI,B01AKH9918,B01AKH959E,B01CH8H0B4,B01C4LDWLG,B01CE0HDNA,B01CEG0T4Y,B01CJX0MCG,B01CJXETY8,B01CJXWQT8,B01CKLWWNE,B01D6OZV32,B01ENV3OWC,B01D1K2BCK,B01D2BA1MK,B01D2BTQKI,B01DK5ERYG,B01DPXQQFG,B01DWI7FXQ,B01EW2E11K,B01EW2SKTY,B01EW2YP0W,B01FNU042A,B01G17BLR6,B01G5699V8,B01G93Z22M,B01G93YXKY
0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 建立相似度矩陣

In [None]:
def similarity(x,y):
    """Cosine similarity of two rating vectors, restricted to co-rated items.

    Args:
        x: 1-D sequence of ratings in which 0 marks an unrated item.
        y: 1-D sequence of the same length as ``x``, with the same convention.

    Returns:
        float: ``sum(x*y) / sqrt(sum(x**2) * sum(y**2))`` taken over the
        positions where both vectors are non-zero, or 0.0 when there is no
        such position.
    """
    # Accept lists as well as arrays; float dtype keeps the arithmetic uniform.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    in_common = (x != 0) & (y != 0)
    if not in_common.any():
        return 0.0

    # Only the co-rated entries enter the numerator and both norms. Note that
    # with a single co-rated item and positive ratings the score is always 1.0.
    x = x[in_common]
    y = y[in_common]
    return ((x * y).sum()) / ((x**2).sum() * (y**2).sum()) ** 0.5

In [None]:
def _co_rated_cosine_matrix(rating_values):
    """Return the matrix of co-rated cosine similarities between all row pairs.

    Entry (i, j) is sum(x*y) / sqrt(sum(x**2) * sum(y**2)) taken over the
    columns where both row i (x) and row j (y) are non-zero, and 0 when the two
    rows share no non-zero column. This is the same quantity the pairwise
    Python loop produced, up to floating-point rounding, computed with three
    matrix products instead of one function call per pair.

    Args:
        rating_values: 2-D array-like, one row per user, 0 meaning "unrated".

    Returns:
        Square float ndarray with one row and one column per input row.
    """
    rating_values = np.asarray(rating_values, dtype=float)
    rated = (rating_values != 0).astype(float)

    # Zeros contribute nothing, so the plain dot product already sums x*y over
    # co-rated columns only.
    dot = rating_values @ rating_values.T
    # [i, j] = sum of row i's squared ratings over the columns row j has rated;
    # the transpose gives the matching quantity for row j.
    squares_over_common = (rating_values ** 2) @ rated.T
    norm = np.sqrt(squares_over_common * squares_over_common.T)

    # Pairs with no co-rated column have norm 0 and keep similarity 0.
    result = np.zeros_like(dot)
    np.divide(dot, norm, out=result, where=norm > 0)
    return result


users_count = user_item_matrix.shape[0]
# .values is in row order; the loop this replaces addressed rows by integer
# label 0..users_count-1, which is the same order for a frame whose index holds
# those labels in ascending order.
sim_matrix = _co_rated_cosine_matrix(user_item_matrix.values)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
-----2294/4793-----
-----2295/4793-----
-----2296/4793-----
-----2297/4793-----
-----2298/4793-----
-----2299/4793-----
-----2300/4793-----
-----2301/4793-----
-----2302/4793-----
-----2303/4793-----
-----2304/4793-----
-----2305/4793-----
-----2306/4793-----
-----2307/4793-----
-----2308/4793-----
-----2309/4793-----
-----2310/4793-----
-----2311/4793-----
-----2312/4793-----
-----2313/4793-----
-----2314/4793-----
-----2315/4793-----
-----2316/4793-----
-----2317/4793-----
-----2318/4793-----
-----2319/4793-----
-----2320/4793-----
-----2321/4793-----
-----2322/4793-----
-----2323/4793-----
-----2324/4793-----
-----2325/4793-----
-----2326/4793-----
-----2327/4793-----
-----2328/4793-----
-----2329/4793-----
-----2330/4793-----
-----2331/4793-----
-----2332/4793-----
-----2333/4793-----
-----2334/4793-----
-----2335/4793-----
-----2336/4793-----
-----2337/4793-----
-----2338/4793-----
-----2339/4793-----
-----2340/4793-

In [None]:
# A user must not act as their own neighbour, so clear the self-similarity
# entries on the diagonal. Neighbours whose similarity is <= 0 are not removed
# here; they are filtered out when recommendations are generated.
np.fill_diagonal(sim_matrix, 0)

In [None]:
# Decode the integer codes 0..users_count-1 back into reviewerIDs. These are the
# only users that have a row in the similarity matrix, i.e. the only ones
# collaborative filtering can serve.
valid_user_ids = encoder.inverse_transform(np.arange(users_count))

# Wrap the similarity matrix in a DataFrame labelled by reviewerID on both axes
# so it can be looked up by real user id.
sim_df = pd.DataFrame(sim_matrix, columns=valid_user_ids, index=valid_user_ids)

In [None]:
sim_df.head()

Unnamed: 0,A100UD67AHFODS,A100WO06OQR8BQ,A101CAMZDHU1V9,A1046ICIMHENBN,A104MYYMJJ3WVD,A105A034ZG9EHO,A10658GHKIVJJ4,A106T050P52BKP,A1074UGOIWFB9Z,A107Y7EGEBNYMN,A10CLDIHSUDO1Y,A10H5JQ2PR8JCM,A10HR0ECYKDNTN,A10I57PJGNHURG,A10JB7YPWZGRF4,A10JPZAYDGFHEV,A10M2MLE2R0L6K,A10NI0O4MI6CMJ,A10OTDWTW5SD94,A10OYW0QYN13GL,A10P0NAKKRYKTZ,A10PKL0Z2UIJ4B,A10QJ6CMP3UHC6,A10QJVO4NWCL74,A10T9Y91J5WTVS,A10UMAMEHEI51T,A10Y59HW4O47N0,A10YXOHVD6CATU,A10Z57SSF7FEAN,A10ZBR6O8S8OCY,A10ZJZNO4DAVB,A1115OU8UPVNGJ,A11177HE8AJYPD,A1118RD3AJD5KH,A113ASXTCA2RLV,A11459ZN3F8VJ5,A114GOSSRSQ89R,A115HWIIU6I86Q,A115LE3GBAO8I6,A117FS98WND9A3,...,AYY463Q7V3LTU,AYYP2ZJFR7KKN,AZ0B1DLIUHMG7,AZ22EPSN22DDD,AZ26CDSJ363AH,AZ2OW83HIT8RG,AZ40BVI3QY8GN,AZ4LJ8GSD0PP,AZ4T3DDT8L9EQ,AZ520NWW40I9B,AZ8PQ5IFG4I71,AZ9EXMA8Q8126,AZANVKBYIYUJC,AZBAI4J0F2YJT,AZBJKXB19STRA,AZBLP8S3CHH3,AZCOSCQG73JZ1,AZD3ON9ZMEGL6,AZEVJCQESFBBC,AZFXRPA58G1KF,AZFYUPGEE6KLW,AZG4BKZCPRIW6,AZGLMGV318SN0,AZGVJ71SZFQAT,AZH8S229H7VPK,AZIOWW5HT8CPL,AZJMUP77WBQZQ,AZK1WY67FG09D,AZK41OIXZ425Y,AZLF8T5XUD47M,AZMAOC6QC0WEP,AZN7PTS80FJC2,AZN926JQW89WW,AZPI1JA9XKV8P,AZPU73G6DBPJ6,AZQZIAWSFBHLW,AZRD4IZU6TBFV,AZTZ7SIIRXLXE,AZWXG6KBXXC2N,AZZZ5UJWUVCYZ
A100UD67AHFODS,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A100WO06OQR8BQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A101CAMZDHU1V9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1046ICIMHENBN,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A104MYYMJJ3WVD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 產生推薦

In [None]:
def recommender(training_data, users=None, k=10):
    """Recommend up to ``k`` items per user with user-based collaborative filtering.

    For a user that has a row in the notebook-level ``sim_df``, neighbours with
    similarity > 0 are visited from most to least similar. Each neighbour
    contributes the items it rated strictly above its own average rating,
    skipping items the target user has already rated and items already
    recommended. If fewer than ``k`` items result, the list is topped up with
    the most-reviewed items in ``training_data``. Users without a row in
    ``sim_df`` receive the ``k`` most-reviewed items.

    Reads the notebook-level ``sim_df`` (user-by-user similarity table) and
    ``user_to_items`` (reviewerID -> {asin: rating}).

    Args:
        training_data: DataFrame with an ``asin`` column; used to rank items by
            number of reviews for the popularity fallback.
        users: Iterable of reviewerIDs to produce recommendations for.
            ``None`` means no users.
        k: Maximum number of items per user.

    Returns:
        dict mapping each requested reviewerID to a list of at most ``k`` ASINs.
    """
    if users is None:
        users = []

    # Rank items once, from the training rows only: the fallback then never
    # sees test-period reviews and does not depend on a notebook-level global.
    popular_items = training_data['asin'].value_counts().index.to_list()

    recommendations = {}
    for user in users:
        rec_for_user = []
        # `excluded` holds everything that must not be appended: items the user
        # already rated plus items already placed in the list.
        excluded = set()

        if user in sim_df.index:
            print('User: {} using cf'.format(user))
            excluded.update(user_to_items.get(user, ()))

            sim_users = sim_df.loc[user].sort_values(ascending=False)
            for sim_user_id in sim_users[sim_users > 0].index:
                sim_user_purchased_dict = user_to_items[sim_user_id]
                avg_rating_score = sum(sim_user_purchased_dict.values()) / len(sim_user_purchased_dict)
                for item, rating in sim_user_purchased_dict.items():
                    # Only items this neighbour liked more than their own average.
                    if rating > avg_rating_score and item not in excluded:
                        rec_for_user.append(item)
                        excluded.add(item)
                if len(rec_for_user) >= k:
                    break
            rec_for_user = rec_for_user[:k]

        # Top up (or, for users without similarity data, fill entirely) with
        # the most-reviewed items, never repeating an excluded item.
        if len(rec_for_user) < k:
            for item in popular_items:
                if item in excluded:
                    continue
                rec_for_user.append(item)
                excluded.add(item)
                if len(rec_for_user) >= k:
                    break

        recommendations[user] = rec_for_user

    return recommendations

ratings_by_user = recommender(ratings_trainings, users)

User: A1SA3N793WT2LM using cf
User: ACE1ZIOI3U6PY using cf
User: ASGIVOW34XNQS using cf


## 只有三個users符合CF條件，可推測對這個問題來說，CF也不是適合的方法

In [None]:
ratings_by_user

{'A100XQFWKQ30O2': ['B000FOI48G',
  'B000GLRREU',
  '1620213982',
  'B001QY8QXM',
  'B01DKQAXC0',
  'B00W259T7G',
  'B006IB5T4W',
  'B00005JS5C',
  'B0012Y0ZG2',
  'B000WYJTZG'],
 'A103T1QOGFCSEH': ['B000FOI48G',
  'B000GLRREU',
  '1620213982',
  'B001QY8QXM',
  'B01DKQAXC0',
  'B00W259T7G',
  'B006IB5T4W',
  'B00005JS5C',
  'B0012Y0ZG2',
  'B000WYJTZG'],
 'A106UKKSJ2KXPF': ['B000FOI48G',
  'B000GLRREU',
  '1620213982',
  'B001QY8QXM',
  'B01DKQAXC0',
  'B00W259T7G',
  'B006IB5T4W',
  'B00005JS5C',
  'B0012Y0ZG2',
  'B000WYJTZG'],
 'A10A7GV4D5A11V': ['B000FOI48G',
  'B000GLRREU',
  '1620213982',
  'B001QY8QXM',
  'B01DKQAXC0',
  'B00W259T7G',
  'B006IB5T4W',
  'B00005JS5C',
  'B0012Y0ZG2',
  'B000WYJTZG'],
 'A1119JJ37ZLB8R': ['B000FOI48G',
  'B000GLRREU',
  '1620213982',
  'B001QY8QXM',
  'B01DKQAXC0',
  'B00W259T7G',
  'B006IB5T4W',
  'B00005JS5C',
  'B0012Y0ZG2',
  'B000WYJTZG'],
 'A113UOOLBSZN52': ['B000FOI48G',
  'B000GLRREU',
  '1620213982',
  'B001QY8QXM',
  'B01DKQAXC0',
  'B00W

## 範例原版

In [None]:
# import pandas as pd
# from itertools import combinations
# from collections import defaultdict

# # header: user_id,item_id,rating,timestamp

# def recommender(training_data, users=[], k=10):

#     # loading data from dataframe
#     # user_to_items dict:
#     # {
#     #   'user': {
#     #       'item': ratings...
#     #   }...
#     # }
#     user_to_items = defaultdict(dict)
#     for _, row in training_data.iterrows():
#         row = dict(row)
#         user = row['reviewerID']
#         item = row['asin']
#         rating = float(row['overall'])

#         user_to_items[user][item] = rating

#     print("total users before filtering: ", len(user_to_items))

#     # remove obscure user to decrease data size
#     # filtering params
#     remove_obscure_user = True
#     user_rating_threshold = 3
#     all_users = list(user_to_items.keys())
#     for user in all_users:
#         ratings = user_to_items[user]
#         if remove_obscure_user and len(ratings) < user_rating_threshold:
#             del user_to_items[user]

#     print("total users  after filtering: ", len(user_to_items))

#     # generate item to user mapping dict
#     # {
#     #   'item': {
#     #       'user': ratings...
#     #   }...
#     # }
#     item_to_users = defaultdict(dict)
#     for user, items in user_to_items.items():
#         for item, rating in items.items():
#             item_to_users[item][user] = rating

#     # prepare data of computing user similarity 
#     init_sim = lambda: [0 for _ in range(3)]
#     factory = lambda: defaultdict(init_sim)
#     pre_user_similarity = defaultdict(factory)
#     n = len(item_to_users)
#     index = 0
#     for item, user_ratings in item_to_users.items():
#         if len(user_ratings) > 1:
#             # print(f"item: {item} have been rated by {len(user_ratings)} users progress: {index}/{n}")
#             for user1, user2 in combinations(user_ratings.keys(), 2):
#                 xy = user_ratings[user1] * user_ratings[user2]
#                 xx = user_ratings[user1] ** 2
#                 yy = user_ratings[user2] ** 2
#                 pre_user_similarity[user1][user2][0] += xy
#                 pre_user_similarity[user1][user2][1] += xx
#                 pre_user_similarity[user1][user2][2] += yy

#                 pre_user_similarity[user2][user1][0] += xy
#                 pre_user_similarity[user2][user1][1] += xx
#                 pre_user_similarity[user2][user1][2] += yy
#         index += 1

#     user_similarity = {}
#     for src_user in pre_user_similarity:
#         user_similarity_order = []
#         for dst_user, val in pre_user_similarity[src_user].items():
#             xy = val[0]
#             xx = val[1]
#             yy = val[2]
#             div = ((xx*yy) ** 0.5)
#             if div == 0:
#                 continue
#             similarity = xy / div
#             if similarity < 0:
#                 continue
#             for i, s in enumerate(user_similarity_order):
#                 target_similarity = s[1]
#                 if target_similarity < similarity:
#                     user_similarity_order.insert(i, (dst_user, similarity))
#                     break
#             else:
#                 user_similarity_order.append((dst_user, similarity))
#         user_similarity[src_user] = user_similarity_order

#     recommendation = {}
#     for user in users:
#         if user in user_similarity:
#             sim_users = user_similarity[user]
#             recommended_items = []
#             recommended_items_set = set()
#             user_have_rated = set(user_to_items[user])
#             stop_recommend = False
#             for sim_user, _ in sim_users:
#                 items_from_sim_user = sorted(list(user_to_items[sim_user].items()), key=lambda item: item[1])
#                 for item, _ in items_from_sim_user:
#                     if item not in user_have_rated and item not in recommended_items_set:
#                         recommended_items.append(item)
#                         recommended_items_set.add(item)
#                     if len(recommended_items) >= k:
#                         stop_recommend = True
#                         break
#                 if stop_recommend:
#                     break
#             recommendation[user] = recommended_items
#         else:
#             recommendation[user] = []
#     return recommendation

# ratings_by_user = recommender(ratings_trainings, users)
# ratings_by_user

total users before filtering:  323489
total users  after filtering:  4793


{'A100XQFWKQ30O2': [],
 'A103T1QOGFCSEH': [],
 'A106UKKSJ2KXPF': [],
 'A10A7GV4D5A11V': [],
 'A1119JJ37ZLB8R': [],
 'A113UOOLBSZN52': [],
 'A12M4U7WK4ALCR': [],
 'A12T8YTW6VWT7S': [],
 'A1364JXGKB46MM': [],
 'A137DALOQFKBTI': [],
 'A13FEZ3WV7S2EY': [],
 'A13IV4I1B0RXMG': [],
 'A13JU88JAHN72I': [],
 'A13K55R6VH1OOD': [],
 'A13P7VFU075A': [],
 'A13SWYE4QLB6NG': [],
 'A13ZTQ0Q4ATA41': [],
 'A142EDN04OD62U': [],
 'A142I22FIC8MZK': [],
 'A14834QTII5TLT': [],
 'A14A447VPACTBC': [],
 'A14AP6MN5XO6LB': [],
 'A14CLF25IX25US': [],
 'A14LYXC3HTBAHI': [],
 'A14VUW4KZ34EOE': [],
 'A14Y32P26G9YL': [],
 'A157T25PBS7MX4': [],
 'A15HZDSERD85C8': [],
 'A15JJ8J1FGADIX': [],
 'A15ZCL70JXXH89': [],
 'A1617KN2IAWZ6J': [],
 'A16E0O88262HKA': [],
 'A16NSZ58PTVIYF': [],
 'A16UGDXRTDLJG5': [],
 'A16X9HR3UFQQXY': [],
 'A16Y7V1CZCWKFV': [],
 'A174YOBOSW9WDN': [],
 'A1786SKRAJXH86': [],
 'A17K2BUZ20WD2': [],
 'A17LYRFV645L0V': [],
 'A18LNGVXDZBTUR': [],
 'A19503XX7GU6J2': [],
 'A19HVHRZDYFEOP': [],
 'A19JM38B861BO

## 結果評估

In [None]:
def evaluate(ratings_testings_by_user=None, ratings_by_user=None, method=None):
    '''
    Score recommendations as the fraction of test-period interactions that were hit.

    * ratings_testings_by_user: dict mapping user id -> list of items the user
      actually rated in the test period (data from 2018-09-01 onwards)
    * ratings_by_user: dict mapping user id -> list of items recommended from
      the training data
    * method: str, accepted for interface compatibility; not used
    * returns score: float, the number of distinct recommended items per user
      that also appear in that user's test list, summed over users and divided
      by the total number of items listed in ratings_testings_by_user; 0.0 when
      that total is zero
    '''
    ratings_testings_by_user = ratings_testings_by_user or {}
    ratings_by_user = ratings_by_user or {}

    hits = 0
    test_interactions = 0
    for user, actual_items in ratings_testings_by_user.items():
        # The denominator is taken from the argument itself (one entry per test
        # interaction) rather than from a notebook-level DataFrame.
        test_interactions += len(actual_items)
        if user in ratings_by_user:
            hits += len(set(ratings_by_user[user]) & set(actual_items))

    if test_interactions == 0:
        return 0.0
    return hits / test_interactions

evaluate(ratings_testings_by_user, ratings_by_user)

0.08305084745762711