In [1]:
# Your user-user system should take as input a particular user ID and return a set of top 5 most similar users. For each of the similar users, 
# also retrieve their top rated books from the sampled dataset. Perform similar hyperparameter testing as in Problem 2 and detail your experiences using each system

In [166]:
##Script to build book recommendation systems.

import json
import pickle
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import pairwise_distances
from tqdm import tqdm

def read_raw_data(_num_samples, _fn):
    _df = pd.read_csv("goodreads_interactions.csv", nrows=_num_samples)
    _df = _df[_df.is_read == 1]
    _df = _df[0:_num_samples]  
    _df.to_csv('goodreads_{}.csv'.format(_fn, index=False))
 #   return _df  # Return the dataframe for further use


def build_rating_matrix(_df):
    
    _n_users = len(_df.user_id.unique()) + 1  # python indices start at zero, user_ids start at 1
    _n_books = _df.book_idx.max() + 1  # python indices start at zero, book_ids start at 1
    print('Users: {}'.format(_n_users))
    print('Books: {}'.format(_n_books))
    _ratings = np.zeros((_n_users, _n_books))
    for _, row in tqdm(_df.iterrows()):
        i = row.user_id
        j = row.book_idx
        _ratings[i, j] = row.rating
    #print(_ratings, 'THis is the rating Matrix')
    return _ratings

def recommend_user_similarity(_matrix, _eps, _n_latent):
    
  #  _user_svd = TruncatedSVD(n_components=min(_matrix.shape)-1)  # Adjust to prevent dimension issues
    _user_svd = TruncatedSVD(n_components=_n_latent)    
    _user_features = _user_svd.fit_transform(_matrix.transpose())    
    
    print('Converting to sparse')
    
    _user_similarity = sparse.csr_matrix(_user_features)
    return _user_similarity


def generate_similarity_matrix(_features, _metric):
    """
    Generates the similarity matrix from either item or user features
    based on the given similarity metric.
    :param _features: The matrix of user or item features.
    :param _metric: A string indicating which similarity metric should be used.
    :return: _similarity_matrix, The final similarity matrix.
    """
    assert _metric in ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
    print('Computing similarity')
    _similarity_matrix = pairwise_distances(_features, metric=_metric)
    return _similarity_matrix

def merge_meta(_meta_path, _map_path, _ratings):
    """
    Merges book metadata with ratings.

    :param _meta_path: Path to book metadata csv.
    :param _map_path: Path to book ID mapping.
    :param _ratings: Dataframe of rating interactions.
    :return: _ratings_meta, a dataframe of metadata and ratings and
    _metadata_lookup, dictionary for the UI.
    """
    _meta = pd.read_csv(_meta_path)
    _map = pd.read_csv(_map_path)
    _ratings_map = _ratings.merge(_map, how='left',
                                  left_on='user_id', right_on='book_id_csv')
    _ratings_map = _ratings_map[['user_id', 'book_id_csv', 'is_read',
                                 'rating', 'is_reviewed', 'book_id_y']]
    _ratings_map.columns = ['user_id', 'book_idx', 'is_read',
                            'rating', 'is_reviewed', 'book_id']
    _metadata_lookup = {}
    for _, row in _ratings_map.iterrows():
        _md = _meta[_meta['book_id'] == row['book_id']]
        _metadata_lookup[str(row.user_id)] = {
            'title': _md['title'].values[0],
            'link': _md['link'].values[0]}
        
    #print(_ratings_map, ' THis is the Rating MAP')
    #print(_metadata_lookup, 'This is the metadata lookup')
    return _ratings_map, _metadata_lookup


if __name__ == "__main__":
    NS = 9000
    FN = '9k'
    EPS = 1e-9
    FACTORS = 2
    METRIC = 'cosine'

    try:
        goodreads = pd.read_csv('goodreads_{}.csv'.format(FN))
        #print(goodreads)
    except FileNotFoundError:
        read_raw_data(NS, FN)
        goodreads = pd.read_csv('goodreads_{}.csv'.format(FN))


    ratings_meta, metadata_lookup = merge_meta(
        'book_metadata.csv',
        'book_id_map.csv', goodreads)
    
    print('Saving metadata')
    

   # print('This is the meataData',  ratings_meta)
   # print(' THis is the metadata lookup', metadata_lookup)
    
    with open('books_metadata_{records}.json'.format(records=FN), 'w', encoding='utf-8') as m:
        json.dump(metadata_lookup, m)
    
    ratings = build_rating_matrix(ratings_meta)
    
    print(ratings,  'This is the RATING Matrix')
    
    user_similarity = recommend_user_similarity(ratings, EPS, FACTORS)  # Calculating user-user similarity
    
    #print(user_similarity,  'This is the Recommender Similarity')
    
    sim = generate_similarity_matrix(user_similarity, METRIC)

    print(sim, 'This is the Generate similarity')
    
    #with open('user_similarity_{FACTORS}_{FN}_{METRIC}.pkl', 'wb') as f:
        #pickle.dump(user_similarity, f)
    print('Saving similarity')
    
    with open('user_similarity_{factors}_{records}_{metric}.pkl'.format(factors=FACTORS,
                                                                        records=FN,
                                                                        metric=METRIC), 'wb') as f:
        pickle.dump(sim, f)

Saving metadata
Users: 15
Books: 14


2476it [00:00, 51555.57it/s]

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 4. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 3. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 5. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 5. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 5.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]] This is the RATING Matrix
Converting to sparse
Computing similarity
[[0.         1.         1.         1.         1.         1.
  1.         1.         1.         1.         1.         1.
  1.         1.        ]
 [1.         0.         0.11404202 1.         1.         1.99933763
  1.         1.         1.         1.      


