In [1]:
import pandas as pd

In [182]:
# Loading the dataset 
def loaddata(filename):
    df = pd.read_csv(f'{filename}.csv',sep=';',error_bad_lines=False,warn_bad_lines=False,encoding='latin-1')
    return df

book   = loaddata("../data/BX-CSV-Dump/BX-Books")
user   = loaddata("../data/BX-CSV-Dump/BX-Users")
rating = loaddata("../data/BX-CSV-Dump/BX-Book-Ratings")

  if (yield from self.run_code(code, result)):


# Explorating Data

In [183]:
rating.shape

(1149780, 3)

In [184]:
rating.head(3)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0


In [185]:
# Check datatypes & missing values
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [186]:
# Check for duplicate values
print(f'Duplicate entries: {rating.duplicated().sum()}')

Duplicate entries: 0


In [187]:
rating['Book-Rating'].value_counts()

0     716109
8     103736
10     78610
7      76457
9      67541
5      50974
6      36924
4       8904
3       5996
2       2759
1       1770
Name: Book-Rating, dtype: int64

Ratings are of two types, an implicit rating & explicit rating. An implicit rating is based on tracking user interaction with an item such as a user clicking on an item and in this case is recorded as rating '0'. An explicit rating is when a user explicitly rates an item, in this case on a numeric scale of '1-10'

Majority of ratings are implicit i.e., rating '0'
Rating of '8' has the highest rating count among explicit ratings '1-10'

评分有两种类型，显式和隐式的。显示的就是用户直接对商品打分1-10，隐式的就是，曝光给用户item，但是，用户没点，所以打了0分

大部分评分都是隐式的即评分为0

In [206]:
rating_users = rating['User-ID'].value_counts().reset_index().rename({"index":'user_id','User-ID':'count'},axis=1)


In [207]:
rating_users

Unnamed: 0,user_id,count
0,11676,13602
1,198711,7550
2,153662,6109
3,98391,5891
4,35859,5850
...,...,...
105278,158698,1
105279,17920,1
105280,277135,1
105281,275086,1


In [208]:
rating_users.describe()

Unnamed: 0,user_id,count
count,105283.0,105283.0
mean,139474.738619,10.920851
std,80574.738184,90.562825
min,2.0,1.0
25%,69376.0,1.0
50%,139146.0,1.0
75%,209643.5,4.0
max,278854.0,13602.0


In [211]:
rating_books = rating['ISBN'].value_counts().reset_index().rename({"index":'ISBN','ISBN':'count'},axis=1)


In [212]:
rating_books.describe()

Unnamed: 0,count
count,340556.0
mean,3.376185
std,12.436252
min,1.0
25%,1.0
50%,1.0
75%,2.0
max,2502.0


In order to avoid rating bias & for making good recommendations, limit the dataset to only those
users that have made at least 250 ratings & books that have received at least 50 ratings

In [214]:
rating = rating[rating['User-ID'].isin(rating_users[rating_users['count']>250]['user_id'])]
rating = rating[rating['ISBN'].isin(rating_books[rating_books['count']> 50]['ISBN'])]

rating

Unnamed: 0,User-ID,ISBN,Book-Rating
1456,277427,002542730X,10
1468,277427,006092988X,0
1469,277427,0060930535,0
1470,277427,0060932139,0
1471,277427,0060934417,0
...,...,...,...
1147440,275970,1400031354,0
1147441,275970,1400031362,0
1147470,275970,1558744606,0
1147517,275970,1573229725,0


In [215]:
# For the recommendation system, it is prefered to have the book titles rather than ISBN for easier interpretation

rating = rating.merge(book, on="ISBN")[['User-ID','Book-Title','Book-Rating']] # merging with the book dataframe
rating       

Unnamed: 0,User-ID,Book-Title,Book-Rating
0,277427,Politically Correct Bedtime Stories: Modern Ta...,10
1,3363,Politically Correct Bedtime Stories: Modern Ta...,0
2,11676,Politically Correct Bedtime Stories: Modern Ta...,6
3,12538,Politically Correct Bedtime Stories: Modern Ta...,10
4,13552,Politically Correct Bedtime Stories: Modern Ta...,0
...,...,...,...
79308,234828,Ringworld,8
79309,236283,Ringworld,0
79310,249628,Ringworld,0
79311,261829,Ringworld,0


In [216]:
# Check for duplicate values
print(f'Duplicate entries: {rating.duplicated().sum()}')

Duplicate entries: 531


In [217]:
rating.drop_duplicates(inplace=True)
rating

Unnamed: 0,User-ID,Book-Title,Book-Rating
0,277427,Politically Correct Bedtime Stories: Modern Ta...,10
1,3363,Politically Correct Bedtime Stories: Modern Ta...,0
2,11676,Politically Correct Bedtime Stories: Modern Ta...,6
3,12538,Politically Correct Bedtime Stories: Modern Ta...,10
4,13552,Politically Correct Bedtime Stories: Modern Ta...,0
...,...,...,...
79308,234828,Ringworld,8
79309,236283,Ringworld,0
79310,249628,Ringworld,0
79311,261829,Ringworld,0


In [226]:
rating['User-ID'].value_counts()

11676     1505
35859      722
76352      655
16795      596
153662     544
          ... 
41700        1
203820       1
193560       1
173632       1
153621       1
Name: User-ID, Length: 686, dtype: int64

In [227]:
rating['Book-Title'].value_counts()

Wild Animus                                                               290
The Lovely Bones: A Novel                                                 218
Bridget Jones's Diary                                                     216
The Pelican Brief                                                         196
The Notebook                                                              189
                                                                         ... 
Stupid White Men. Eine Abrechnung mit dem Amerika unter George W. Bush      4
Illuminati.                                                                 3
MÃ?Â¶rder ohne Gesicht.                                                     3
Russendisko.                                                                2
Free                                                                        1
Name: Book-Title, Length: 1913, dtype: int64

In [218]:
list_of_distinct_users = list(rating['User-ID'].unique())

# Data Transformation

In [219]:
from surprise import Dataset, KNNBaseline, SVD, accuracy, Reader
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, SVD

In [220]:
user_based_sim_option = {'name': 'cosine', 'user_based': True}
# item-based
item_based_sim_option = {'name': 'cosine', 'user_based': False}

In [221]:
reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(rating[['User-ID','Book-Title','Book-Rating']], reader)
raw_ratings = data.raw_ratings

threshold   = int(len(raw_ratings)*0.7)



train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings  = raw_ratings[threshold:]

data.raw_ratings = train_raw_ratings        # data is now the trainset
trainset         = data.build_full_trainset() 
testset          = data.construct_testset(test_raw_ratings)

# User Based CF 

## KNN (K-Nearest Neighbors) & SVD (Singluar Value decomposition) algorithms

In [222]:
# Trying KNN (K-Nearest Neighbors) & SVD (Singluar Value decomposition) algorithms using default model parameters

models=[KNNBasic(sim_option = user_based_sim_option)
        ,KNNWithMeans(sim_option = user_based_sim_option)
        ,KNNWithZScore(sim_option = user_based_sim_option)
        ,KNNBaseline(sim_option = user_based_sim_option)
        ,SVD()] 
results = {}

for model in models:
    # perform 5 fold cross validation
    # evaluation metrics: mean absolute error & root mean square error
    CV_scores = cross_validate(model, data, measures=["MAE","RMSE"], cv=5, n_jobs=-1)  
    
    # storing the average score across the 5 fold cross validation for each model
    result = pd.DataFrame.from_dict(CV_scores).mean(axis=0).\
             rename({'test_mae':'MAE', 'test_rmse': 'RMSE'})
    results[str(model).split("algorithms.")[1].split("object ")[0]] = result

In [223]:
performance_df = pd.DataFrame.from_dict(results)
print("Model Performance: \n")
performance_df.T.sort_values(by='RMSE')

Model Performance: 



Unnamed: 0,MAE,RMSE,fit_time,test_time
knns.KNNBaseline,2.397846,3.30551,0.177434,2.391942
knns.KNNWithMeans,2.405467,3.307819,0.130655,1.494682
knns.KNNWithZScore,2.391456,3.334144,0.179888,1.864789
matrix_factorization.SVD,2.449474,3.369426,5.682561,0.209353
knns.KNNBasic,2.475973,3.494708,0.107945,1.462651


# Item based CF

## KNN (K-Nearest Neighbors) & SVD (Singluar Value decomposition) algorithms

In [224]:
# Trying KNN (K-Nearest Neighbors) & SVD (Singluar Value decomposition) algorithms using default model parameters

models=[KNNBasic(sim_option = item_based_sim_option)
        ,KNNWithMeans(sim_option = item_based_sim_option)
        ,KNNWithZScore(sim_option = item_based_sim_option)
        ,KNNBaseline(sim_option = item_based_sim_option)
        ,SVD()] 
results = {}

for model in models:
    # perform 5 fold cross validation
    # evaluation metrics: mean absolute error & root mean square error
    CV_scores = cross_validate(model, data, measures=["MAE","RMSE"], cv=5, n_jobs=-1)  
    
    # storing the average score across the 5 fold cross validation for each model
    result = pd.DataFrame.from_dict(CV_scores).mean(axis=0).\
             rename({'test_mae':'MAE', 'test_rmse': 'RMSE'})
    results[str(model).split("algorithms.")[1].split("object ")[0]] = result

In [228]:
performance_df = pd.DataFrame.from_dict(results)
print("Model Performance: \n")
performance_df.T.sort_values(by='RMSE')

Model Performance: 



Unnamed: 0,MAE,RMSE,fit_time,test_time
knns.KNNWithMeans,2.400698,3.299146,0.171292,1.948496
knns.KNNBaseline,2.402539,3.305673,0.188824,2.568579
knns.KNNWithZScore,2.390553,3.332113,0.223256,2.037724
matrix_factorization.SVD,2.447145,3.372398,5.513562,0.17817
knns.KNNBasic,2.476888,3.489958,0.153143,1.747954


# tuning parameters for SVD

In [229]:
# Hyperparameter tuning - SVD

param_grid = {"n_factors": range(10,100,20),
              "n_epochs" : [5, 10, 20],
              "lr_all"   : [0.002, 0.005],
              "reg_all"  : [0.2, 0.5]}

gridsearchSVD = GridSearchCV(SVD, param_grid, measures=['mae', 'rmse'], cv=5, n_jobs=-1)
                                    
gridsearchSVD.fit(data)

print(f'MAE Best Parameters:  {gridsearchSVD.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchSVD.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchSVD.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchSVD.best_score["rmse"]}\n')

MAE Best Parameters:  {'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}
MAE Best Score:       2.4197942228249327

RMSE Best Parameters: {'n_factors': 70, 'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.2}
RMSE Best Score:      3.190270081093822



# Building recommendations

In [231]:
reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(rating[['User-ID','Book-Title','Book-Rating']], reader)
trainset = data.build_full_trainset()

In [317]:
rating.head()

Unnamed: 0,User-ID,Book-Title,Book-Rating
0,277427,Politically Correct Bedtime Stories: Modern Ta...,10
1,3363,Politically Correct Bedtime Stories: Modern Ta...,0
2,11676,Politically Correct Bedtime Stories: Modern Ta...,6
3,12538,Politically Correct Bedtime Stories: Modern Ta...,10
4,13552,Politically Correct Bedtime Stories: Modern Ta...,0


# CF Recommendation

## Item Based

* According to these items that target user clicked, we find the similar items because of the similarity matrix between items, then, recommend them out of the weighted cosine distance

In [232]:
# KNNWithMeans

def generate_recommendationsKNN(userID=13552, like_recommend=5, get_recommend =10):
    
    ''' This function generates "get_recommend" number of book recommendations using 
        KNNWithMeans & item based filtering. The function needs as input three 
        different parameters:
        (1) userID i.e., userID for which recommendations need to be generated 
        (2) like_recommend i.e., number of top recommendations for the userID to be 
        considered for making recommendations 
        (3) get_recommend i.e., number of recommendations to generate for the userID
        Default values are: userID=13552, like_recommend=5, get_recommend=10
    '''
    
    # Compute item based similarity matrix
    sim_options       = {'name':'cosine','min_support':3,'user_based':False}
    similarity_matrix = KNNWithMeans(sim_options=sim_options).fit(trainset).\
                        compute_similarities() 
    
    userID      = trainset.to_inner_uid(userID)    # converts the raw userID to innerID
    userRatings = trainset.ur[userID]              # method .ur takes user innerID & 
                                                   # returns back user ratings
    
    
    # userRatings is a list of tuples [(,),(,),(,)..]. Each tuple contains item & rating
    # given by the user for that item. Next, the tuples will be sorted within the list 
    # in decreasing order of rating. Then top 'like_recommend' items & ratings are extracted
    
    temp_df = pd.DataFrame(userRatings).sort_values(by=1, ascending=False).\
              head(like_recommend)
    userRatings = temp_df.to_records(index=False) 
    # for each (item,rating) in top like_recommend user items, multiply the user rating for
    # the item with the similarity score (later is obtained from item similarity_matrix) for
    # all items. This helps calculate the weighted rating for all items. The weighted ratings 
    # are added & divided by sum of weights to estimate rating the user would give an item
    
    recommendations   = {}

    for user_top_item, user_top_item_rating  in userRatings:

        all_item_indices          =   list(pd.DataFrame(similarity_matrix)[user_top_item].index)
        all_item_weighted_rating  =   list(pd.DataFrame(similarity_matrix)[user_top_item].values*\
                                          user_top_item_rating)
        
        all_item_weights          =   list(pd.DataFrame(similarity_matrix)[user_top_item].values)
        
        
        # All items & final estimated ratings are added to a dictionary called recommendations
        
        for index in range(len(all_item_indices)):
            if index in recommendations:
                # sum of weighted ratings
                recommendations[index] += all_item_weighted_rating[index]        
            else:                        
                recommendations[index]  = all_item_weighted_rating[index]

    
    for index in range(len(all_item_indices)):                               
            if all_item_weights[index]  !=0:
                # final ratings (sum of weighted ratings/sum of weights)
                recommendations[index]   =recommendations[index]/\
                                          (all_item_weights[index]*like_recommend)
    # convert dictionary recommendations to a be a list of tuples [(,),(,),(,)]
    # with each tuple being an item & estimated rating user would give that item
    # sort the tuples within the list to be in decreasing order of estimated ratings

    temp_df = pd.Series(recommendations).reset_index().sort_values(by=0, ascending=False)
    recommendations = list(temp_df.to_records(index=False))
    
    # return get_recommend number of recommedations (only return items the user 
    # has not previously rated)
    
    final_recommendations = []
    count = 0
    
    for item, score in recommendations:
        flag = True
        for userItem, userRating in trainset.ur[userID]:
            if item == userItem: 
                flag = False       # If item in recommendations has not been rated by user, 
                break              # add to final_recommendations
        if flag == True:
            final_recommendations.append(trainset.to_raw_iid(item)) 
            count +=1              # trainset has the items stored as inner id,  
                                   # convert to raw id & append 
            
        if count > get_recommend:  # Only get 'get_recommend' number of recommendations
            break
    
    return(final_recommendations)

In [233]:
recommendationsKNN = generate_recommendationsKNN(userID=13552, like_recommend=5, get_recommend=10)
recommendationsKNN

Computing the cosine similarity matrix...
Done computing similarity matrix.


  sim = construction_func[name](*args)


Computing the cosine similarity matrix...
Done computing similarity matrix.


['The Lake House',
 'Harry Potter and the Chamber of Secrets (Book 2)',
 'Why Girls Are Weird : A Novel',
 'SKINNY LEGS AND ALL',
 '2nd Chance',
 'Round Ireland With a Fridge',
 "Harry Potter and the Sorcerer's Stone (Book 1)",
 'Summer Pleasures',
 'And Then There Were None : A Novel',
 'This Present Darkness',
 'Vittorio the Vampire: New Tales of the Vampires']

### Let me check the cosine similarity

In [296]:
test_itemid1 = 'The Notebook'
test_itemid2 =  'The Pelican Brief'
test_itemid3 = 'The Lovely Bones: A Novel'

In [297]:
sim_options       = {'name':'msd','min_support':3,'user_based':False}
similarity_matrix = KNNWithMeans(sim_options=sim_options).fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [298]:
similarity_matrix.sim

array([[1.        , 0.03240741, 0.03619048, ..., 0.03418803, 0.03636364,
        0.        ],
       [0.03240741, 1.        , 0.04145078, ..., 0.        , 0.        ,
        0.        ],
       [0.03619048, 0.04145078, 1.        , ..., 0.        , 0.03030303,
        0.04504505],
       ...,
       [0.03418803, 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.03636364, 0.        , 0.03030303, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.04504505, ..., 0.        , 0.        ,
        1.        ]])

In [299]:
itemID1 = trainset.to_inner_iid(test_itemid1)
itemID2 = trainset.to_inner_iid(test_itemid2)
itemID3 = trainset.to_inner_iid(test_itemid3)

In [269]:
similarity_matrix.sim[itemID1][itemID2]

0.04952380952380952

In [300]:
similarity_matrix.sim[itemID1][itemID3]

0.02972972972972973

In [301]:
similarity_matrix.sim.shape

(1913, 1913)

In [308]:
user_dict = {}
user_int = 0
item_dict = {}
item_int = 0

In [309]:
for i,col in rating.iterrows():
    if col[0] not in user_dict:
        user_dict[col[0]] = user_int 
        user_int += 1
    if col[1] not in item_dict:
        item_dict[col[1]] = item_int
        item_int += 1

In [310]:
print((len(user_dict),len(item_dict)))

(686, 1913)


In [311]:
rating_matrix = np.zeros((len(user_dict),len(item_dict)))

In [312]:
for i,col in rating.iterrows():
    rating_matrix[user_dict[col[0]]][item_dict[col[1]]] = col[2]

In [288]:
rating_matrix2 = rating_matrix/rating_matrix.sum(axis=0).reshape([1,-1])

  """Entry point for launching an IPython kernel.


In [289]:
rating_matrix2[np.isnan(rating_matrix2)] = 0

In [313]:
from sklearn.metrics.pairwise import cosine_similarity

In [314]:
cosine_similarity(rating_matrix2[:,item_dict[test_itemid1]].reshape(1,-1),rating_matrix2[:,item_dict[test_itemid2]].reshape(1,-1))


array([[0.14084645]])

In [315]:
cosine_similarity(rating_matrix2[:,item_dict[test_itemid1]].reshape(1,-1),rating_matrix2[:,item_dict[test_itemid3]].reshape(1,-1))

array([[0.10757089]])

Although the value of the similarities of item1,item2 and item3 in surprise is different from the similarties from rating matrix, **their order is the same**. 

In [318]:
def getitem_score(x):
    return rating[(rating['Book-Title'] == x)&(rating['Book-Rating'] > 0)]['Book-Rating'].mean()

## User Based

In [332]:
def get_similar_users_recommendations(uid, n=10):
    # 获取训练集，这里取数据集全部数据
    # 考虑基线评级的协同过滤算法
    sim_options       = {'name':'cosine','min_support':3,'user_based':True}
    algo = KNNWithMeans(sim_options=sim_options)
    # 拟合训练集
    algo.fit(trainset)
    # 将原始id转换为内部id
    inner_id = algo.trainset.to_inner_uid(uid)
    # 使用get_neighbors方法得到10个最相似的用户
    neighbors = algo.get_neighbors(inner_id, k=10)
    neighbors_uid = ( algo.trainset.to_raw_uid(x) for x in neighbors )
    recommendations = set()
    #把评分为5的电影加入推荐列表
    for user in neighbors_uid:
        if len(recommendations) > n:
            break
        item = rating[rating['User-ID']==user]
        item['score'] = item['Book-Title'].apply(lambda x:getitem_score(x))
        item = item.sort_values(by='score',ascending=False)
        for i,col in item.iterrows():
            if len(recommendations) >= n:
                break
            recommendations.add(col[1])
    return recommendations
    

In [333]:
recommendations = get_similar_users_recommendations(13552,10)

Computing the cosine similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [334]:
recommendations

{"Angela's Ashes (MMP) : A Memoir",
 'Cloud Nine',
 'Considering Kate (The Stanislaskis) (Silhouette Special Edition)',
 'Hiding in the Shadows (Shadows Trilogy (Paperback))',
 'Killjoy',
 'Nights in Rodanthe',
 "Patty Jane's House of Curl",
 'SILENT NIGHT : The Story of the World War I Christmas Truce',
 'The Da Vinci Code',
 "The Pilot's Wife : A Novel"}

# SVD

In [335]:
def generate_recommendationsSVD(userID=13552, get_recommend =10):
    
    ''' This function generates "get_recommend" number of book recommendations 
        using Singular value decomposition. The function needs as input two 
        different parameters:
        (1) userID i.e., userID for which recommendations need to be generated 
        (2) get_recommend i.e., number of recommendations to generate for the userID
        Default values are: userID=13552, get_recommend=10
    '''
    
    model = SVD(n_factors=50, n_epochs=10, lr_all=0.005, reg_all= 0.2)
    model.fit(trainset)
    
    # predict rating for all pairs of users & items that are not in the trainset
    
    testset = trainset.build_anti_testset()
    predictions = model.test(testset)
    predictions_df = pd.DataFrame(predictions)
    
    # get the top get_recommend predictions for userID
    
    predictions_userID = predictions_df[predictions_df['uid'] == userID].\
                         sort_values(by="est", ascending = False).head(get_recommend)
    
    recommendations = []
    recommendations.append(list(predictions_userID['iid']))
    recommendations = recommendations[0]
    
    return(recommendations)

In [336]:
recommendationsSVD = generate_recommendationsSVD(userID=13552, get_recommend =10)
recommendationsSVD

['Harry Potter and the Prisoner of Azkaban (Book 3)',
 'Harry Potter and the Goblet of Fire (Book 4)',
 'Harry Potter and the Chamber of Secrets (Book 2)',
 "Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))",
 'The Hobbit : The Enchanting Prelude to The Lord of the Rings',
 'Harry Potter and the Order of the Phoenix (Book 5)',
 "Harry Potter and the Sorcerer's Stone (Book 1)",
 'The Cat in the Hat',
 'Anne of Green Gables (Anne of Green Gables Novels (Paperback))',
 'The Lion, the Witch, and the Wardrobe (The Chronicles of Narnia, Book 2)']

# Swing

In [338]:
def get_uitems_iusers(train):
    u_items = dict()
    i_users = dict()
    for index, row in train.iterrows():
        u_items.setdefault(row["User-ID"], set())
        i_users.setdefault(row["Book-Title"], set())

        u_items[row["User-ID"]].add(row["Book-Title"])
        i_users[row["Book-Title"]].add(row["User-ID"])
    print("使用的用户个数为：{}".format(len(u_items)))
    print("使用的item个数为：{}".format(len(i_users)))
    return u_items, i_users

In [344]:
from itertools import combinations

In [349]:
def cal_similarity(u_items, i_users,alpha=0.5):
    item_pairs = list(combinations(i_users.keys(), 2))
    print("item pairs length：{}".format(len(item_pairs))) # 1410360
    item_sim_dict = dict()
    cnt = 0
    for (i, j) in item_pairs:
        cnt += 1
        user_pairs = list(combinations(i_users[i] & i_users[j], 2))
        result = 0.0
        for (u, v) in user_pairs:
            result += 1 / (alpha + list(u_items[u] & u_items[v]).__len__())

        item_sim_dict.setdefault(i, dict())
        item_sim_dict[i][j] = result
        # print(item_sim_dict[i][j])
    return item_sim_dict


In [350]:
u_items, i_users = get_uitems_iusers(rating)
item_sim_dict = cal_similarity(u_items, i_users)

使用的用户个数为：686
使用的item个数为：1913
item pairs length：1828828


In [395]:
def generate_recommendationsSwing(item_sim_dict,userID=13552, get_recommend =10):
    items = u_items[userID]
    recommendation = set()
    for item in items:
        sitems = dict(sorted(item_sim_dict[item].items(), key = lambda k:k[1], reverse=True)[:get_recommend])
        for it in sitems:
            if it in items:
                continue
            else:
                if len(recommendation) >= get_recommend:
                    break
                recommendation.add(it)
    return recommendation
        

In [369]:
recommendationSwing = generate_recommendationsSwing(13552,10,item_sim_dict)

In [370]:
recommendationSwing

{'Animal Farm',
 "Pretend You Don't See Her",
 'Red Storm Rising',
 'Sphere',
 "The Girls' Guide to Hunting and Fishing",
 'The Nanny Diaries: A Novel',
 'The Reader',
 'The Red Tent (Bestselling Backlist)',
 'While I Was Gone',
 'Whispers'}

## Adamic-Adar

* Item Based

In [385]:
i2i = dict()

In [386]:
for i in i_users:
    for u in i_users[i]:
        i2i.setdefault(i, set())
        i2i[i] |= u_items[u]
        i2i[i].remove(i)

In [387]:
print(len(i2i))

1913


In [388]:
def cal_similarity(i2i):
    item_pairs = list(combinations(i2i.keys(), 2))
    print("item pairs length：{}".format(len(item_pairs))) # 1410360
    item_sim_dict = dict()
    cnt = 0
    for (i, j) in item_pairs:
        cnt += 1
        iiset = i2i[i] & i2i[j]
        result = 0.0
        for ii in iiset:
            result += (1/np.log(1+len(i2i[ii])))
            
        item_sim_dict.setdefault(i, dict())
        item_sim_dict[i][j] = result
        # print(item_sim_dict[i][j])
    return item_sim_dict

In [389]:
item_combination = list(combinations(i2i.keys(), 2))

In [390]:
len(item_combination)

1828828

In [391]:
item_sim_dict = cal_similarity(i2i)

item pairs length：1828828


In [396]:
recommendationAAitem = generate_recommendationsSwing(item_sim_dict,13552,10,)

In [397]:
recommendationAAitem

{'Airframe',
 'Fahrenheit 451',
 'Midwives: A Novel',
 'Snow Falling on Cedars',
 'The Bean Trees',
 "The Girls' Guide to Hunting and Fishing",
 'The Joy Luck Club',
 'The Nanny Diaries: A Novel',
 'The Perfect Storm : A True Story of Men Against the Sea',
 'The Rapture of Canaan'}

* User Based

In [398]:
u2u = dict()

In [402]:
for u in u_items:
    for i in u_items[u]:
        u2u.setdefault(u, set())
        u2u[u] |= i_users[i]
        u2u[u].remove(u)

In [403]:
user_sim_dict = cal_similarity(u2u)

item pairs length：234955


In [413]:
def get_similar_users_recommendationsAA(uid, user_sim_dict,n=10):
    # 获取训练集，这里取数据集全部数据
    # 考虑基线评级的协同过滤算法
    user_list = dict(sorted(user_sim_dict[uid].items(),key=lambda x:x[1],reverse=True)[:n])
    recommendations = set()
    user_used_item = set(rating[rating['User-ID']==uid]['Book-Title'].values.tolist())
    #把评分为5的电影加入推荐列表
    for user in user_list:
        if len(recommendations) > n:
            break
        item = rating[rating['User-ID']==user]
        item['score'] = item['Book-Title'].apply(lambda x:getitem_score(x))
        item = item.sort_values(by='score',ascending=False)
        for i,col in item.iterrows():
            if len(recommendations) >= n:
                break
            if col[1] in user_used_item:
                continue

            recommendations.add(col[1])
            user_used_item.add(col[1])
    return recommendations

In [414]:
recommendationAAuser =get_similar_users_recommendationsAA(13552,user_sim_dict,10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats

In [415]:
recommendationAAuser

{'AGE OF INNOCENCE (MOVIE TIE-IN)',
 'An Instance of the Fingerpost',
 'Grave Secrets (Temperance Brennan Novel (Hardcover))',
 'Harry Potter and the Goblet of Fire (Book 4)',
 "I Know This Much Is True (Oprah's Book Club)",
 'Parallel Lies',
 'The Blue Day Book',
 'The Coffin Dancer (Lincoln Rhyme Novels (Paperback))',
 'The Curious Incident of the Dog in the Night-Time (Vintage Contemporaries)',
 'Xenocide (Ender Wiggins Saga (Paperback))'}