# Recommender Systems 1 - Collaborative Filtering 1

### Import libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors     # Unsupervised learner for implementing neighbor searches
from sklearn.metrics import pairwise_distances     # Compute the distance matrix from either a vector array
                                                   #  or a matrix, and returns a distance matrix

### Declare global variables

In [2]:
# Notebook-wide defaults. The functions defined in later cells use these as
# default argument values, and Python binds defaults when a `def` statement runs,
# so re-run those definition cells after changing either value.
# (No `global` statement is needed: names assigned at cell level are already global.)
k = 3              # number of nearest neighbours to use
metric = 'cosine'  # can be changed to 'correlation' for Pearson correlation similarities

### Load data

In [3]:
# Toy user-item ratings matrix
#   rows    -> users
#   columns -> items
# Ratings are described as integers from 1-10; the 0 entries presumably mark
# items a user has not rated -- TODO confirm.
data = np.asarray(
    [
        [3, 7, 4, 9, 9, 7],
        [7, 0, 5, 3, 8, 8],
        [7, 5, 5, 0, 8, 4],
        [5, 6, 8, 5, 9, 8],
        [5, 8, 8, 8, 10, 9],
        [7, 7, 0, 4, 7, 8],
    ]
)

# Wrap the array in a DataFrame (default 0-based row and column labels) and display it.
df = pd.DataFrame(data=data)
df

Unnamed: 0,0,1,2,3,4,5
0,3,7,4,9,9,7
1,7,0,5,3,8,8
2,7,5,5,0,8,4
3,5,6,8,5,9,8
4,5,8,8,8,10,9
5,7,7,0,4,7,8


### Generate & Review Similarity Matrices
#### Note: The next 2 cells are presented for teaching purposes only.

In [4]:
# User-to-user cosine similarity matrix built from the user-item ratings matrix.
#  pairwise_distances yields the cosine *distance* between every pair of rows;
#  subtracting each distance from 1 turns it into a similarity.
cosine_sim = 1 - pairwise_distances(X=df, metric="cosine")
pd.DataFrame(data=cosine_sim)

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.799268,0.779227,0.934622,0.97389,0.8846
1,0.799268,1.0,0.874744,0.90585,0.866146,0.827036
2,0.779227,0.874744,1.0,0.909513,0.865454,0.853275
3,0.934622,0.90585,0.909513,1.0,0.989344,0.865614
4,0.97389,0.866146,0.865454,0.989344,1.0,0.88164
5,0.8846,0.827036,0.853275,0.865614,0.88164,1.0


In [5]:
# User-to-user Pearson correlation similarity matrix built from the user-item ratings matrix.
#  pairwise_distances yields the correlation *distance* between every pair of rows;
#  subtracting each distance from 1 turns it into a similarity.
pearson_sim = 1 - pairwise_distances(X=df, metric="correlation")
pd.DataFrame(data=pearson_sim)

Unnamed: 0,0,1,2,3,4,5
0,1.0,-0.137446,-0.357398,0.208179,0.761905,0.27735
1,-0.137446,1.0,0.453897,0.51591,0.112456,0.218328
2,-0.357398,0.453897,1.0,0.451378,-0.042888,0.297373
3,0.208179,0.51591,0.451378,1.0,0.763325,-0.057739
4,0.761905,0.112456,-0.042888,0.763325,1.0,0.039621
5,0.27735,0.218328,0.297373,-0.057739,0.039621,1.0


### User-based Recommendation System

In [6]:
def findksimilarusers(user_id, ratings, metric=metric, k=k, verbose=False):
    """Find the ``k`` users whose rating vectors are closest to ``user_id``'s.

    Similarities are computed as ``1 - distance`` and are therefore the same
    values obtained via ``pairwise_distances`` with the same metric.

    Parameters
    ----------
    user_id : int
        1-based user number; the query vector is row ``user_id - 1`` of ``ratings``.
    ratings : pandas.DataFrame
        User-item ratings matrix (rows = users, columns = items).
    metric : str
        Distance metric handed to ``NearestNeighbors``. Defaults to the value the
        notebook-level ``metric`` had when this cell was run.
    k : int
        Number of neighbours wanted. ``k + 1`` neighbours are requested because
        the query user is itself one of the fitted rows. Defaults to the value
        the notebook-level ``k`` had when this cell was run.
    verbose : bool
        When true, print the ranked neighbours (the query user is skipped).

    Returns
    -------
    similarities : numpy.ndarray
        1-D array with one similarity per returned neighbour, nearest first.
    indices : numpy.ndarray
        Array of shape ``(1, k + 1)`` with the 0-based row positions of the
        neighbours, in the same order as ``similarities``.

    Raises
    ------
    IndexError
        If ``user_id`` is outside ``1..number of users``. Without this check a
        value <= 0 would silently wrap around to the last rows of ``ratings``.
    """
    n_users = ratings.shape[0]
    if not 1 <= user_id <= n_users:
        raise IndexError(
            'user_id must be between 1 and {0}, got {1}'.format(n_users, user_id))

    # The brute force algorithm is based on pairwise_distances
    model_knn = NearestNeighbors(metric=metric, algorithm='brute')
    model_knn.fit(ratings)

    # kneighbors expects a 2-D query, hence the reshape to a single row.
    # It returns the distances to, and the row positions of, the nearest neighbours.
    query = ratings.iloc[user_id - 1, :].values.reshape(1, -1)
    distances, indices = model_knn.kneighbors(query, n_neighbors=k + 1)
    similarities = 1 - distances.flatten()     # Collapse the (1, k+1) array to 1-D

    if verbose:
        print('{0} most similar users for User {1} using {2} metric:'.format(k, user_id, metric))
        for rank, (neighbor, similarity) in enumerate(zip(indices.flatten(), similarities)):
            if neighbor + 1 == user_id:
                continue     # Do not list the query user as its own neighbour
            print('   {0}: User {1} with similarity of {2}'. \
                  format(rank, neighbor + 1, similarity))

    return similarities, indices

In [7]:
# Nearest neighbours of User 1 (user ids are 1-based) under cosine similarity,
#  using the default k; verbose=True prints the ranked neighbours
similarities,indices = findksimilarusers(1, df, metric='cosine', verbose=True)

3 most similar users for User 1 using cosine metric:
   1: User 5 with similarity of 0.9738899354018394
   2: User 4 with similarity of 0.934621684178377
   3: User 6 with similarity of 0.8846004572297814


In [8]:
# Similarities returned by the previous cell; the leading 1.0 is User 1 compared with itself
similarities

array([1.        , 0.97388994, 0.93462168, 0.88460046])

In [9]:
# 0-based row positions of the neighbours (add 1 to get the user id), in the same order as similarities
indices

array([[0, 4, 3, 5]])

In [10]:
# Same query with Pearson correlation; note that this rebinds similarities and indices
similarities,indices = findksimilarusers(1, df, metric='correlation', verbose=True)

3 most similar users for User 1 using correlation metric:
   1: User 5 with similarity of 0.7619047619047619
   2: User 6 with similarity of 0.2773500981126146
   3: User 4 with similarity of 0.20817945092665124


In [11]:
# Correlation-based similarities from the previous cell (the leading 1.0 is User 1 itself)
similarities

array([1.        , 0.76190476, 0.2773501 , 0.20817945])

In [12]:
# 0-based row positions of the neighbours; the ordering differs from the cosine result
indices

array([[0, 4, 5, 3]])

In [13]:
def predict_userbased(user_id, item_id, ratings, metric=metric, k=k):
    """Predict a user's rating for an item with the user-based approach.

    The prediction is the target user's mean rating plus the similarity-weighted
    average of how far each neighbour's rating for the item lies from that
    neighbour's own mean rating. The result is rounded to an integer, printed,
    and returned.

    Parameters
    ----------
    user_id : int
        1-based user number.
    item_id : int
        1-based item number.
    ratings : pandas.DataFrame
        User-item ratings matrix (rows = users, columns = items).
    metric : str
        Similarity metric forwarded to ``findksimilarusers``. Defaults to the
        value the notebook-level ``metric`` had when this cell was run.
    k : int
        Number of neighbours forwarded to ``findksimilarusers``. Defaults to the
        value the notebook-level ``k`` had when this cell was run.

    Returns
    -------
    int
        The rounded predicted rating.

    Raises
    ------
    IndexError
        If ``user_id`` or ``item_id`` is outside the matrix. Without this check
        a value <= 0 would silently wrap around to the last rows/columns.
    ValueError
        If the similarity weights of the neighbours sum to zero.

    Notes
    -----
    NOTE(review): the weights are the signed similarities. With 'correlation'
    they can be negative, so their sum can be close to zero or negative and the
    prediction unreliable -- consider normalising by absolute similarities or
    keeping only positively correlated neighbours.
    """
    n_users, n_items = ratings.shape
    if not 1 <= user_id <= n_users:
        raise IndexError(
            'user_id must be between 1 and {0}, got {1}'.format(n_users, user_id))
    if not 1 <= item_id <= n_items:
        raise IndexError(
            'item_id must be between 1 and {0}, got {1}'.format(n_items, item_id))

    # Similar users based on similarity metric
    similarities, indices = findksimilarusers(user_id, ratings, metric=metric, k=k)

    # Positional lookup (adjusted for zero based indexing), so the result does
    # not depend on the DataFrame's index labels.
    mean_rating = ratings.iloc[user_id - 1].mean()

    # Accumulate the weighted deviations and the weights over exactly the same
    # neighbours, instead of assuming the target user was returned with
    # similarity 1 and subtracting it from the total.
    wtd_sum = 0.0
    sum_wt = 0.0
    for neighbor, similarity in zip(indices.flatten(), similarities):
        if neighbor + 1 == user_id:
            continue     # The target user does not vote on its own prediction
        neighbor_ratings = ratings.iloc[neighbor]
        ratings_diff = neighbor_ratings.iloc[item_id - 1] - neighbor_ratings.mean()
        wtd_sum += ratings_diff * similarity
        sum_wt += similarity

    if sum_wt == 0:
        raise ValueError(
            'Similarity weights sum to zero; cannot predict a rating for user {0}, item {1}'
            .format(user_id, item_id))

    prediction = int(round(mean_rating + (wtd_sum / sum_wt)))
    print('Predicted rating for user {0} for item {1} = {2}'.format(user_id, item_id, prediction))

    return prediction

In [14]:
# User 3 has a 0 stored for item 4 in df; predict that rating with the default metric and k
predict_userbased(3, 4, df)

Predicted rating for user 3 for item 4 = 3


3

In [15]:
# Pearson-based prediction for a pair whose stored rating is known (User 5 rated item 4 as 8 in df)
predict_userbased(5, 4, df, metric='correlation')

Predicted rating for user 5 for item 4 = 8


8

### Item-based Recommendation System

In [16]:
def findksimilaritems(item_id, ratings, metric=metric, k=k, verbose=False):
    """Find the ``k`` items whose rating vectors are closest to ``item_id``'s.

    The ratings matrix is transposed so that every item becomes a row, then the
    nearest rows are looked up. Similarities are computed as ``1 - distance``
    and are therefore the same values obtained via ``pairwise_distances`` on
    the transposed matrix with the same metric.

    Parameters
    ----------
    item_id : int
        1-based item number; the query vector is column ``item_id - 1`` of ``ratings``.
    ratings : pandas.DataFrame
        User-item ratings matrix (rows = users, columns = items).
    metric : str
        Distance metric handed to ``NearestNeighbors``. Defaults to the value the
        notebook-level ``metric`` had when this cell was run.
    k : int
        Number of neighbours wanted. ``k + 1`` neighbours are requested because
        the query item is itself one of the fitted rows. Defaults to the value
        the notebook-level ``k`` had when this cell was run.
    verbose : bool
        When true, print the ranked neighbours (the query item is skipped).

    Returns
    -------
    similarities : numpy.ndarray
        1-D array with one similarity per returned neighbour, nearest first.
    indices : numpy.ndarray
        Array of shape ``(1, k + 1)`` with the 0-based column positions of the
        neighbouring items, in the same order as ``similarities``.

    Raises
    ------
    IndexError
        If ``item_id`` is outside ``1..number of items``. Without this check a
        value <= 0 would silently wrap around to the last columns of ``ratings``.
    """
    n_items = ratings.shape[1]
    if not 1 <= item_id <= n_items:
        raise IndexError(
            'item_id must be between 1 and {0}, got {1}'.format(n_items, item_id))

    item_vectors = ratings.T    # Transpose matrix: one row per item

    # The brute force algorithm is based on pairwise_distances
    model_knn = NearestNeighbors(metric=metric, algorithm='brute')
    model_knn.fit(item_vectors)

    # kneighbors expects a 2-D query, hence the reshape to a single row.
    # It returns the distances to, and the row positions of, the nearest neighbours.
    query = item_vectors.iloc[item_id - 1, :].values.reshape(1, -1)
    distances, indices = model_knn.kneighbors(query, n_neighbors=k + 1)
    similarities = 1 - distances.flatten()     # Collapse the (1, k+1) array to 1-D

    if verbose:
        print('{0} most similar items for item {1} using {2} metric:'.format(k, item_id, metric))
        for rank, (neighbor, similarity) in enumerate(zip(indices.flatten(), similarities)):
            if neighbor + 1 == item_id:
                continue     # Do not list the query item as its own neighbour
            print('   {0}: Item {1} with similarity of {2}'. \
                  format(rank, neighbor + 1, similarity))

    return similarities, indices

In [17]:
# Nearest neighbours of item 1 (item ids are 1-based) under cosine similarity,
#  using the default k; verbose=True prints the ranked neighbours
similarities,indices=findksimilaritems(1, df, metric='cosine', verbose=True)

3 most similar items for item 1 using cosine metric:
   1: Item 5 with similarity of 0.9410670449755878
   2: Item 6 with similarity of 0.932273086263139
   3: Item 2 with similarity of 0.8164921374451916


In [18]:
# Similarities returned by the previous cell; the leading 1.0 is item 1 compared with itself
similarities

array([1.        , 0.94106704, 0.93227309, 0.81649214])

In [19]:
# 0-based column positions of the neighbouring items (add 1 to get the item id)
indices

array([[0, 4, 5, 1]])

In [20]:
# Same query with Pearson correlation; note that this rebinds similarities and indices
similarities,indices=findksimilaritems(1, df, metric='correlation', verbose=True)

3 most similar items for item 1 using correlation metric:
   1: Item 6 with similarity of -0.2331262020600784
   2: Item 3 with similarity of -0.33028912953790823
   3: Item 2 with similarity of -0.5101381811386223


In [21]:
# Correlation-based similarities from the previous cell; apart from the item itself (1.0),
#  even the closest items are negatively correlated with item 1
similarities

array([ 1.        , -0.2331262 , -0.33028913, -0.51013818])

In [22]:
# 0-based column positions of the neighbouring items, in the same order as similarities
indices

array([[0, 5, 2, 1]])

In [23]:
def predict_itembased(user_id, item_id, ratings, metric=metric, k=k):
    """Predict a user's rating for an item with the item-based approach.

    The prediction is the similarity-weighted average of the ratings the user
    gave to the items most similar to ``item_id``. The result is rounded to an
    integer, printed, and returned.

    Parameters
    ----------
    user_id : int
        1-based user number.
    item_id : int
        1-based item number.
    ratings : pandas.DataFrame
        User-item ratings matrix (rows = users, columns = items).
    metric : str
        Similarity metric forwarded to ``findksimilaritems``. Defaults to the
        value the notebook-level ``metric`` had when this cell was run.
    k : int
        Number of neighbouring items forwarded to ``findksimilaritems``.
        Defaults to the value the notebook-level ``k`` had when this cell was run.

    Returns
    -------
    int
        The rounded predicted rating.

    Raises
    ------
    IndexError
        If ``user_id`` or ``item_id`` is outside the matrix. Without this check
        a value <= 0 would silently wrap around to the last rows/columns.
    ValueError
        If the similarity weights of the neighbours sum to zero.

    Notes
    -----
    NOTE(review): the weights are the signed similarities. With 'correlation'
    they can be negative, so their sum can be close to zero or negative and the
    prediction unreliable -- consider normalising by absolute similarities or
    keeping only positively correlated neighbours.
    """
    n_users, n_items = ratings.shape
    if not 1 <= user_id <= n_users:
        raise IndexError(
            'user_id must be between 1 and {0}, got {1}'.format(n_users, user_id))
    if not 1 <= item_id <= n_items:
        raise IndexError(
            'item_id must be between 1 and {0}, got {1}'.format(n_items, item_id))

    # Similar items based on similarity metric. k is forwarded explicitly so the
    # caller's choice is honoured rather than the helper's own default.
    similarities, indices = findksimilaritems(item_id, ratings, metric=metric, k=k)

    # Accumulate the weighted ratings and the weights over exactly the same
    # neighbours, instead of assuming the target item was returned with
    # similarity 1 and subtracting it from the total.
    user_ratings = ratings.iloc[user_id - 1]
    wtd_sum = 0.0
    sum_wt = 0.0
    for neighbor, similarity in zip(indices.flatten(), similarities):
        if neighbor + 1 == item_id:
            continue     # The target item does not vote on its own prediction
        wtd_sum += user_ratings.iloc[neighbor] * similarity
        sum_wt += similarity

    if sum_wt == 0:
        raise ValueError(
            'Similarity weights sum to zero; cannot predict a rating for user {0}, item {1}'
            .format(user_id, item_id))

    prediction = int(round(wtd_sum / sum_wt))
    print('Predicted rating for user {0} for item {1} = {2}'.format(user_id, item_id, prediction))

    return prediction

In [24]:
# Item-based prediction for User 1 / item 3 (stored rating in df: 4) with the default metric and k
prediction = predict_itembased(1, 3, df)

Predicted rating for user 1 for item 3 = 6


In [25]:
# Item-based prediction for User 3 / item 3 (stored rating in df: 5) using Pearson correlation
prediction = predict_itembased(3, 3, df, metric='correlation')

Predicted rating for user 3 for item 3 = 6
