In [1]:
from __future__ import (absolute_import, division, print_function, unicode_literals)
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise import KNNBasic
import os
import pandas as pd
from surprise import SVD
from surprise.model_selection import train_test_split

user = pd.read_csv('BX-Users.csv', sep=';', error_bad_lines=False, encoding="latin-1")
user.columns = ['userID', 'Location', 'Age']
rating = pd.read_csv('BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding="latin-1")
rating.columns = ['userID', 'ISBN', 'bookRating']
df = pd.merge(user, rating, on='userID', how='inner')
df.drop(['Location', 'Age'], axis=1, inplace=True)
df.head()

"""
User 정보와 Book Rating 정보가 담긴 csv 파일을 읽어서 데이터 프레임에 merge
그 후에 Location, Age 열을 삭제
--> 유저, 아이템, 레이팅 자료구조 획득
"""


'\nUser 정보와 Book Rating 정보가 담긴 csv 파일을 읽어서 데이터 프레임에 merge\n그 후에 Location, Age 열을 삭제\n--> 유저, 아이템, 레이팅 자료구조 획득\n'

In [2]:
#원본 데이터 출력
df

Unnamed: 0,userID,ISBN,bookRating
0,2,0195153448,0
1,7,034542252,0
2,8,0002005018,5
3,8,0060973129,0
4,8,0374157065,0
5,8,0393045218,0
6,8,0399135782,0
7,8,0425176428,0
8,8,0671870432,0
9,8,0679425608,0


In [3]:
#데이터가 너무 크기 때문에 user를 400여명 정도 남긴 후 삭제
idx = []

for i in range(3000, 1149780):
    idx.append(i)

df = df.drop(idx)

df

Unnamed: 0,userID,ISBN,bookRating
0,2,0195153448,0
1,7,034542252,0
2,8,0002005018,5
3,8,0060973129,0
4,8,0374157065,0
5,8,0393045218,0
6,8,0399135782,0
7,8,0425176428,0
8,8,0671870432,0
9,8,0679425608,0


In [4]:
#현재 자료구조 내에 있는 유저 출력
print(len(df['userID'].unique()))
df['userID'].unique()

431


array([   2,    7,    8,    9,   10,   12,   14,   16,   17,   19,   20,
         22,   23,   26,   32,   36,   38,   39,   42,   44,   51,   53,
         56,   64,   67,   68,   69,   70,   73,   75,   77,   78,   79,
         81,   82,   83,   85,   86,   87,   88,   91,   92,   95,   97,
         99,  100,  102,  107,  109,  110,  114,  125,  129,  132,  133,
        135,  137,  139,  141,  144,  151,  160,  162,  165,  169,  176,
        178,  182,  183,  185,  190,  193,  199,  202,  204,  207,  212,
        215,  217,  221,  224,  226,  228,  230,  232,  233,  236,  237,
        241,  242,  243,  244,  247,  249,  250,  254,  256,  257,  269,
        272,  273,  278,  280,  289,  291,  300,  302,  306,  310,  311,
        313,  316,  323,  326,  327,  332,  334,  337,  338,  339,  343,
        345,  356,  357,  359,  361,  362,  364,  367,  369,  372,  376,
        382,  383,  384,  387,  388,  392,  393,  400,  406,  408,  413,
        422,  424,  430,  431,  432,  433,  435,  4

In [5]:
#surprise 라이브러리를 이용하여 data로드
reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(df[['userID', 'ISBN', 'bookRating']], reader)


In [6]:
from surprise.model_selection import KFold
from collections import defaultdict

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k metrics for each user.'''

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

#SVD를 이용하여 precision recall 계산

kf = KFold(n_splits=5)
algo = SVD()

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    P = sum(prec for prec in precisions.values()) / len(precisions)
    R = sum(rec for rec in recalls.values()) / len(recalls)
    F1 = 2.0*P*R / (P + R)
    
    print("precision : ", P)
    print("recall : ", R)
    print("F1 : ", F1)
    print()

precision :  0.9081981981981981
recall :  0.5073195273195273
F1 :  0.6509938693333399

precision :  0.8638576779026217
recall :  0.5964962389450162
F1 :  0.7057027065945827

precision :  0.8823863636363637
recall :  0.5113602543290042
F1 :  0.6474883016889201

precision :  0.8942460317460318
recall :  0.4972619663807552
F1 :  0.6391261002780131

precision :  0.9161048689138577
recall :  0.4995349220068322
F1 :  0.6465294027166848



In [7]:

#KNNWithMeans를 이용하여 precision recall 계산

from surprise import KNNWithMeans
sim_options = {'name' : 'cosine', 'user_based' : True}
algo = KNNWithMeans(k = 40, min_k = 1, sim_options = sim_options)

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    P = sum(prec for prec in precisions.values()) / len(precisions)
    R = sum(rec for rec in recalls.values()) / len(recalls)
    F1 = 2.0*P*R / (P + R)
    
    print("precision : ", P)
    print("recall : ", R)
    print("F1 : ", F1)
    print()

Computing the cosine similarity matrix...
Done computing similarity matrix.
precision :  0.9613453815261043
recall :  0.3062822719449225
F1 :  0.46455762742525786

Computing the cosine similarity matrix...
Done computing similarity matrix.
precision :  0.9382352941176471
recall :  0.3377450980392157
F1 :  0.4966916001898263

Computing the cosine similarity matrix...
Done computing similarity matrix.
precision :  0.9807692307692307
recall :  0.34637862137862135
F1 :  0.51195123963712

Computing the cosine similarity matrix...
Done computing similarity matrix.
precision :  0.9664804469273743
recall :  0.29326678907684495
F1 :  0.44998966344228725

Computing the cosine similarity matrix...
Done computing similarity matrix.
precision :  0.9540540540540541
recall :  0.32683865683865687
F1 :  0.4868819112276468



  sim = construction_func[name](*args)


In [8]:

#KNNWithZScore를 이용하여 precision recall 계산
from surprise import KNNWithZScore

sim_options = {'name' : 'cosine', 'user_based' : True}
algo = KNNWithZScore(k = 40, min_k = 1, sim_options = sim_options)



for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    P = sum(prec for prec in precisions.values()) / len(precisions)
    R = sum(rec for rec in recalls.values()) / len(recalls)
    F1 = 2.0*P*R / (P + R)
    
    print("precision : ", P)
    print("recall : ", R)
    print("F1 : ", F1)
    print()

Computing the cosine similarity matrix...
Done computing similarity matrix.
precision :  0.9646182495344507
recall :  0.3332617103566824
F1 :  0.49537759672024706

Computing the cosine similarity matrix...
Done computing similarity matrix.
precision :  0.9576427255985267
recall :  0.3390405634880773
F1 :  0.500784936521994

Computing the cosine similarity matrix...
Done computing similarity matrix.
precision :  0.9741379310344828
recall :  0.32708333333333334
F1 :  0.4897311323358914

Computing the cosine similarity matrix...
Done computing similarity matrix.
precision :  0.9657142857142857
recall :  0.3148051948051948
F1 :  0.4748258475804114

Computing the cosine similarity matrix...
Done computing similarity matrix.
precision :  0.985897435897436
recall :  0.3145737453429761
F1 :  0.47696166344644203



In [9]:

#PMF를 이용하여 precision recall 계산

from surprise import SVD

algo = SVD(n_factors = 100, n_epochs = 20, biased = False, lr_all = 0.005, reg_all = 0.02)



for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    P = sum(prec for prec in precisions.values()) / len(precisions)
    R = sum(rec for rec in recalls.values()) / len(recalls)
    F1 = 2.0*P*R / (P + R)
    
    print("precision : ", P)
    print("recall : ", R)
    print("F1 : ", F1)
    print()

precision :  1.0
recall :  0.3028571428571429
F1 :  0.4649122807017544

precision :  1.0
recall :  0.3431952662721893
F1 :  0.5110132158590308

precision :  1.0
recall :  0.2994350282485876
F1 :  0.4608695652173913

precision :  1.0
recall :  0.2784090909090909
F1 :  0.43555555555555553

precision :  1.0
recall :  0.22162162162162163
F1 :  0.36283185840707965



In [10]:
#PMF with biased를 이용하여 precision recall 계산

from surprise import SVD

algo = SVD(n_factors = 100, n_epochs = 20, biased = True, lr_all = 0.005, reg_all = 0.02)

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    P = sum(prec for prec in precisions.values()) / len(precisions)
    R = sum(rec for rec in recalls.values()) / len(recalls)
    F1 = 2.0*P*R / (P + R)
    
    print("precision : ", P)
    print("recall : ", R)
    print("F1 : ", F1)
    print()

precision :  0.9145714285714286
recall :  0.5227940630797774
F1 :  0.665290096216611

precision :  0.8586917562724012
recall :  0.48573070016193953
F1 :  0.620478996019878

precision :  0.8767258382642997
recall :  0.4982976036386189
F1 :  0.6354369982966509

precision :  0.8517647058823529
recall :  0.5302941176470588
F1 :  0.653641951353855

precision :  0.9135838150289018
recall :  0.5389644247683494
F1 :  0.6779660211676751



In [11]:
import math

def NDCG(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k metrics for each user.'''

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
        
    
    total_idcg = 0
    total_dcg = 0

    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)   #real raing으로 정렬
        real_sort = user_ratings
        user_ratings.sort(key=lambda x: x[0], reverse=True)   #predict rating으로 정렬
        predict_sort = user_ratings
        
        idcg = real_sort[0][1] 
        dcg = user_ratings[0][1]
        
        for i in range(1, len(real_sort)):
            idcg = idcg + (real_sort[i][1] / math.log(i+1, 2))
            dcg = dcg + (predict_sort[i][0] / math.log(i+1, 2))
            
        total_idcg = total_idcg + idcg
        total_dcg = total_dcg + dcg

    return total_dcg / total_idcg

#SVD를 이용하여 NDCG 계산


kf = KFold(n_splits=5)
algo = SVD()

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    ndcg = NDCG(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    print("NDCG : ", ndcg)
    
    

NDCG :  1.0505465692100986
NDCG :  0.9579802013542316
NDCG :  0.949030023677149
NDCG :  0.9477541574940522
NDCG :  0.9642924335189541


In [12]:

#KNNWithMeans를 이용하여 NDCG 계산

from surprise import KNNWithMeans
sim_options = {'name' : 'cosine', 'user_based' : True}
algo = KNNWithMeans(k = 40, min_k = 1, sim_options = sim_options)

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    ndcg = NDCG(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    print("NDCG : ", ndcg)
    

Computing the cosine similarity matrix...
Done computing similarity matrix.
NDCG :  0.934592143980597
Computing the cosine similarity matrix...
Done computing similarity matrix.
NDCG :  0.9974924316170849
Computing the cosine similarity matrix...
Done computing similarity matrix.
NDCG :  1.0048707093506717
Computing the cosine similarity matrix...
Done computing similarity matrix.
NDCG :  1.0361548011453388
Computing the cosine similarity matrix...
Done computing similarity matrix.
NDCG :  0.9805642162917809


In [13]:

#KNNWithZScore를 이용하여 NDCG 계산
from surprise import KNNWithZScore

sim_options = {'name' : 'cosine', 'user_based' : True}
algo = KNNWithZScore(k = 40, min_k = 1, sim_options = sim_options)


for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    ndcg = NDCG(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    print("NDCG : ", ndcg)
    

Computing the cosine similarity matrix...
Done computing similarity matrix.
NDCG :  1.0210621223395067
Computing the cosine similarity matrix...
Done computing similarity matrix.
NDCG :  0.9896432743760775
Computing the cosine similarity matrix...
Done computing similarity matrix.
NDCG :  1.0245831342729435
Computing the cosine similarity matrix...
Done computing similarity matrix.
NDCG :  0.995614779961006
Computing the cosine similarity matrix...
Done computing similarity matrix.
NDCG :  0.9400273880071047


In [14]:

#PMF를 이용하여 NDCG 계산

from surprise import SVD

algo = SVD(n_factors = 100, n_epochs = 20, biased = False, lr_all = 0.005, reg_all = 0.02)

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    ndcg = NDCG(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    print("NDCG : ", ndcg)

NDCG :  0.9414084098228521
NDCG :  0.9990801762775184
NDCG :  0.9565784856772684
NDCG :  1.0282053343452202
NDCG :  0.9335564492889935
