In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [2]:
#載入資料集
rating_df = pd.read_csv('userstar.csv')
rating_df.head()

Unnamed: 0,BOOKNAME,CONTENT,ISBN,USER,USERSTAR
0,「氣內臟」自癒按摩法：每天按摩脾．肝．腎30秒，美肌、消除便秘、好眠、減肥，天天都有好氣色！,深入淺出，圖片清晰介唔按壓身體及伸展。\n按照個人體質、情志、節氣等，以吃，嗅香芬、調息、伸...,9789865683498,啟迪,5.0
1,唐鳳：我所看待的自由與未來,一開始是對唐鳳好奇，想更了解她，讀完之後真正體認到她是個傳奇人物，同時她的家庭給了她很大的自...,9789575036942,Miley Guan,5.0
2,老夫老妻重返青春（1）,看了很暖心的漫畫，會覺得要好好過每一天,9789865241810,蘇蘇,0.0
3,祕密瑜伽士的日常,直指心性的好書，書中道理發人省思,9789864779079,kbboss7416,5.0
4,花開千年（01）,好唯美的畫風。喜歡故事開頭引用的一段《佛經》：「彼岸花，開一千年，落一千年，花葉永不想見。情...,9789571068121,Annie,5.0


In [3]:
# 檢查空值
rating_df.isna().any()

BOOKNAME    False
CONTENT      True
ISBN        False
USER         True
USERSTAR     True
dtype: bool

In [4]:
# 清理空值並檢查空值
rating_df=rating_df.dropna(axis=0)
rating_df.isna().any()

BOOKNAME    False
CONTENT     False
ISBN        False
USER        False
USERSTAR    False
dtype: bool

In [5]:
# 清理重複
duplicates = rating_df.duplicated()

if duplicates.sum() > 0:
    print('> {} duplicates'.format(duplicates.sum()))
    rating_df = rating_df[~duplicates]

print('> {} duplicates'.format(rating_df.duplicated().sum()))

> 72 duplicates
> 0 duplicates


In [6]:
print('Rating Dataframe shape : ', rating_df.shape)

Rating Dataframe shape :  (100091, 5)


In [7]:
# 移除 CONTENT 欄位
rating_df.drop(['CONTENT'], axis = 1, inplace = True)

In [8]:
# 給user_id
import statsmodels.api as sm
rating_df['user_id'] = pd.Categorical(rating_df.USER).codes   
rating_df.head()

Unnamed: 0,BOOKNAME,ISBN,USER,USERSTAR,user_id
0,「氣內臟」自癒按摩法：每天按摩脾．肝．腎30秒，美肌、消除便秘、好眠、減肥，天天都有好氣色！,9789865683498,啟迪,5.0,8064
1,唐鳳：我所看待的自由與未來,9789575036942,Miley Guan,5.0,942
2,老夫老妻重返青春（1）,9789865241810,蘇蘇,0.0,8727
3,祕密瑜伽士的日常,9789864779079,kbboss7416,5.0,4513
4,花開千年（01）,9789571068121,Annie,5.0,203


In [9]:
book_features_df = rating_df.pivot_table(index = 'BOOKNAME',columns = 'user_id',values = 'USERSTAR')
book_features_df.head()

user_id,0,1,2,3,4,5,6,7,8,9,...,8944,8945,8946,8947,8948,8949,8950,8951,8952,8953
BOOKNAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
81,,,,,,,,,,,...,,,,,,,,,,
BACCANO！大騷動！ (15) 1710,,,,,,,,,,,...,,,,,,,,,,
Mr. Adult 大人先生,,,,,,,,,,,...,,,,,,,,,,
TWO CAN TOUCAN大嘴鳥「兩罐」的故事,,,,,,,,,,,...,,,,,,,,,,
Visual C# 2008 程式設計範例教本,,,,,,,,,,,...,,,,,,,,,,


In [10]:
missing_value = book_features_df.isna().sum()
missing_value

user_id
0       46141
1       46132
2       46080
3       46147
4       46129
        ...  
8949    46108
8950    46147
8951    46146
8952    46146
8953    46140
Length: 8954, dtype: int64

In [11]:
rows = book_features_df.shape[0]
cols = book_features_df.shape[1]

count_empty_or_zero_cells = book_features_df.isna().sum().sum()
total_elements = rows * cols

sparsity_of_matrix = count_empty_or_zero_cells/total_elements

print('Total Empty cells are : ', count_empty_or_zero_cells)
print('Total cells in Matrix are : ', total_elements)
print('Sparsity of Matrix are : ', sparsity_of_matrix)

Total Empty cells are :  413118241
Total cells in Matrix are :  413218146
Sparsity of Matrix are :  0.9997582269777668


In [12]:
# 空值用 0 取代
book_features_df.fillna(0, inplace = True)

In [13]:
book_features_df.values

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
from scipy.sparse import csr_matrix
book_features_df_matrix = csr_matrix(book_features_df.values)

In [16]:
# 建立餘弦相似模型 (K-近鄰演算法)
from sklearn.neighbors import NearestNeighbors
nearest_neighbor_model = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
nearest_neighbor_model.fit(book_features_df_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [17]:
# 隨機選擇一本書
total_no_of_books = book_features_df.shape[0]
print('Total books in our pivot table : ', total_no_of_books)
random_book_index = np.random.choice(total_no_of_books)
print('Random book Index : ', random_book_index)

Total books in our pivot table :  46149
Random book Index :  9977


In [18]:
book_features_df.iloc[random_book_index]

user_id
0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
8949    0.0
8950    0.0
8951    0.0
8952    0.0
8953    0.0
Name: 危機中的盼望 Hope in Any Crisis, Length: 8954, dtype: float64

In [19]:
# 向量顯示 random_book
one_dimensional_representation_of_book_vector = book_features_df.iloc[random_book_index].values.reshape(1, -1)
one_dimensional_representation_of_book_vector

array([[0., 0., 0., ..., 0., 0., 0.]])

In [20]:
# 相似書本與 random_book 的餘弦距離指數
distances, indices = nearest_neighbor_model.kneighbors(one_dimensional_representation_of_book_vector, n_neighbors = 6)
print('Distance :', distances)
print('Indices :', indices)

Distance : [[0. 1. 1. 1. 1. 1.]]
Indices : [[ 9977 30767 30765 30769 30768 30771]]


In [21]:
indices = indices.flatten()
distances = distances.flatten()
for i in range(0, len(indices)):
    ## same book
    if i == 0:
        print('Recommendations for {0}:\n'.format(book_features_df.index[random_book_index]))
    else:
        ## similar books
        print('{0}: {1}, with distance of {2}'.format(i, book_features_df.index[indices[i]], distances[i]))

Recommendations for 危機中的盼望 Hope in Any Crisis:

1: 生命是長期而持續的累積, with distance of 1.0
2: 生命是什麼？, with distance of 1.0
3: 生命最後一個月的花嫁              全, with distance of 1.0
4: 生命暗章, with distance of 1.0
5: 生命永不落：一個心理醫師追尋老化意義的旅程, with distance of 1.0


In [22]:
# 找出全部書本的相似資料集 
my_dict = {}
for book_index in range(book_features_df.shape[0]):
    
    one_dimensional_representation_of_book_vector = book_features_df.iloc[book_index].values.reshape(1, -1)
    distances, indices = nearest_neighbor_model.kneighbors(one_dimensional_representation_of_book_vector, n_neighbors = 4)
    indices = indices.flatten()
    distances = distances.flatten()
    similar_books = []
    
    for i in range(0, len(indices)):
        ## same book
        if i == 0:
            original_book = book_features_df.index[book_index]
        else:
            ## similar books
            similar_books.append(book_features_df.index[indices[i]])

        my_dict[original_book] = similar_books

In [23]:
# 秀出前三本相似
recommended_book_df = pd.DataFrame(my_dict)
recommended_book_df = recommended_book_df.T
recommended_book_df.columns = ['1st_Similar_book', '2nd_Similar_book', '3rd_Similar_book']
recommended_book_df.head()

Unnamed: 0,1st_Similar_book,2nd_Similar_book,3rd_Similar_book
81,餐飲業邪惡的賺錢祕密,イメージ別レイアウトスタイルシリーズクールandスタイリッシュ編,雑誌をデザインする集団キャップ
BACCANO！大騷動！ (15) 1710,The Chronicles of Narnia,涼宮春日的陰謀,忍者：闇影軍團的真實面貌！
Mr. Adult 大人先生,字母會C獨身,奧森巴赫之眼,有效期滿的初戀（全）（限）
TWO CAN TOUCAN大嘴鳥「兩罐」的故事,數位設計案內所--名片篇(附光碟)(平裝),眾神的山嶺（上）,紙之月
Visual C# 2008 程式設計範例教本,我的四千金,我上班，我存到100萬,魚類圖鑑-台灣七百多種常見魚類圖鑑(精裝)


In [24]:
# 書本推薦
book_name = "忘了我是誰"
result = recommended_book_df.loc[book_name, : ]
print('I have book recommendatation for you: \n')
for books in list(result.values):
      print(books)

I have book recommendatation for you: 

一起去看宋朝的活色生香
7個習慣教出優秀的孩子（教育現場篇）自我領導力教育的奇蹟
茶行的女兒
