# Item - Item Collaborative Filter Recommender

In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from time import time

## Load data to pandas

In [2]:
df_ratings_contents = pd.read_table("data/u.data",
                                    names=["user", "movie", "rating", "timestamp"])

In [3]:
df_ratings_contents.head()

Unnamed: 0,user,movie,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
df_ratings_contents.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
user         100000 non-null int64
movie        100000 non-null int64
rating       100000 non-null int64
timestamp    100000 non-null int64
dtypes: int64(4)
memory usage: 3.1 MB


In [5]:
df_ratings_contents.describe()

Unnamed: 0,user,movie,rating,timestamp
count,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986,883528900.0
std,266.61442,330.798356,1.125674,5343856.0
min,1.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


## Convert rating records to user-movie utility matrix

### Option #1, use pandas.pivot_table

In [6]:
df_utility = pd.pivot_table(data=df_ratings_contents, 
                            values='rating', 
                            index='user', 
                            columns='movie', 
                            fill_value=0)

In [7]:
df_utility.head()

movie,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,3,4,3,3,5,4,1,5,3,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
df_utility.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 943 entries, 1 to 943
Columns: 1682 entries, 1 to 1682
dtypes: int64(1682)
memory usage: 12.1 MB


### Option #2, convert to sparse matrix using scipy.sparse.lil_matrix

In [9]:
highest_user_id = df_ratings_contents.user.max()
highest_movie_id = df_ratings_contents.movie.max()

# make a row-based linked list with size # of users by # of movies
ratings_mat = sparse.lil_matrix((highest_user_id, highest_movie_id))
ratings_mat

<943x1682 sparse matrix of type '<type 'numpy.float64'>'
	with 0 stored elements in LInked List format>

In [10]:
for _, row in df_ratings_contents.iterrows():
    # subtract 1 from id's due to match 0 indexing
    ratings_mat[row.user-1, row.movie-1] = row.rating

In [12]:
ratings_mat.toarray()

array([[ 5.,  3.,  4., ...,  0.,  0.,  0.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 5.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  5.,  0., ...,  0.,  0.,  0.]])

### Let's carry on with option #2

In [13]:
utility_mat = ratings_mat.toarray()

### Calculate item-item similarity matrix

In [14]:
# Item-Item Similarity Matrix
item_sim_mat = cosine_similarity(utility_mat.T)

In [15]:
print item_sim_mat

[[ 1.          0.40238218  0.33024479 ...,  0.          0.04718307
   0.04718307]
 [ 0.40238218  1.          0.27306918 ...,  0.          0.07829936
   0.07829936]
 [ 0.33024479  0.27306918  1.         ...,  0.          0.          0.09687505]
 ..., 
 [ 0.          0.          0.         ...,  1.          0.          0.        ]
 [ 0.04718307  0.07829936  0.         ...,  0.          1.          0.        ]
 [ 0.04718307  0.07829936  0.09687505 ...,  0.          0.          1.        ]]


### Calculate neighborhood

In [16]:
least_to_most_sim_indexes = np.argsort(item_sim_mat)

In [17]:
least_to_most_sim_indexes

array([[1556, 1579, 1616, ...,  180,   49,    0],
       [1201, 1524, 1240, ...,  160,  232,    1],
       [ 676, 1463, 1464, ...,  762,  409,    2],
       ..., 
       [   0, 1125, 1124, ..., 1677, 1678, 1679],
       [ 840, 1120, 1119, ..., 1350, 1671, 1680],
       [ 840,  934,  933, ..., 1334,  766, 1681]], dtype=int64)

In [18]:
# Neighborhoods
neighborhood_size = 75
neighborhoods = least_to_most_sim_indexes[:, -neighborhood_size - 1: -1]

In [29]:
neighborhoods

array([[  98,  844,  273, ...,  120,  180,   49],
       [ 422,  684,   63, ...,  384,  160,  232],
       [ 551,  549,  420, ...,   41,  762,  409],
       ..., 
       [ 300,  351,  288, ..., 1394, 1677, 1678],
       [ 189,  326,  385, ..., 1621, 1350, 1671],
       [1078,  947,  205, ..., 1596, 1334,  766]], dtype=int64)

In [19]:
neighborhoods.shape

(1682L, 75L)

## Make rating prediction on a user

In [24]:
# Let's pick a lucky user
user_id = np.random.randint(utility_mat.shape[0])
print "luck user is", user_id

luck user is 153


In [28]:
utility_mat[user_id].nonzero()[0]

array([ 49,  60,  88, 134, 136, 142, 151, 171, 173, 174, 181, 184, 186,
       190, 196, 199, 201, 210, 221, 237, 241, 257, 285, 287, 288, 301,
       323, 332, 356, 413, 461, 473, 474, 478, 479, 481, 483, 487, 495,
       514, 522, 526, 639, 640, 641, 650, 707, 805, 873, 918, 944], dtype=int64)

In [31]:
n_users = utility_mat.shape[0]
n_items = utility_mat.shape[1]


items_rated_by_this_user = utility_mat[user_id].nonzero()[0]
# Just initializing so we have somewhere to put rating preds
out = np.zeros(n_items)
for item_to_rate in xrange(n_items):
    
    # find intersection between user-rated items and neighbors of items
    relevant_items = np.intersect1d(neighborhoods[item_to_rate],
                                    items_rated_by_this_user,
                                    assume_unique=True)  # assume_unique speeds up intersection op
    
    # prediction on items based on item-item similairty
    out[item_to_rate] = ratings_mat[user_id, relevant_items] * \
        item_sim_mat[item_to_rate, relevant_items] / \
        item_sim_mat[item_to_rate, relevant_items].sum()


pred_ratings = np.nan_to_num(out)
print pred_ratings
# print("Execution time: %f seconds" % (time()-start_time))



[ 3.74496834  4.26032068  4.01913047 ...,  2.45492161  0.          4.3186894 ]


In [32]:
import cProfile
def pred(n_items):
    for item_to_rate in xrange(n_items):
        relevanat_items = np.intersect1d(neighborhoods[item_to_rate],
                                        items_rated_by_this_user,
                                        assume_unique = True)
        out[item_to_rate] = ratings_mat[user_id, relevant_items] * \
                            item_sim_mat[item_to_rate,relevant_items] / \
                            item_sim_mat[item_to_rate,relevant_items].sum()
    return np.nan_to_num(out)
cProfile.run('pred(n_items)')



         365033 function calls in 0.642 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.053    0.053    0.642    0.642 <ipython-input-32-de597e0db943>:2(pred)
        1    0.000    0.000    0.642    0.642 <string>:1(<module>)
     1682    0.001    0.000    0.013    0.000 _methods.py:31(_sum)
     3364    0.003    0.000    0.003    0.000 _util.py:128(_prune_array)
     1682    0.012    0.000    0.027    0.000 arraysetops.py:297(intersect1d)
    13456    0.003    0.000    0.003    0.000 base.py:100(get_shape)
    10092    0.004    0.000    0.007    0.000 base.py:1111(isspmatrix)
     6728    0.004    0.000    0.008    0.000 base.py:193(nnz)
     1682    0.006    0.000    0.254    0.000 base.py:342(__mul__)
     1682    0.006    0.000    0.247    0.000 base.py:421(_mul_vector)
     3364    0.003    0.000    0.003    0.000 base.py:70(__init__)
     1682    0.004    0.000    0.005    0.000 base.py:77(set_shape)
  

In [33]:
pred_ratings.shape

(1682L,)

## Get final recommendations for a user

In [34]:
# Recommend n movies
n = 10

# Get item indexes sorted by predicted rating
item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))

# Find items that have been rated by user
items_rated_by_this_user = ratings_mat[user_id].nonzero()[1]

# We want to exclude the items that have been rated by user
unrated_items_by_pred_rating = [item for item in item_index_sorted_by_pred_rating
                                if item not in items_rated_by_this_user]

unrated_items_by_pred_rating[-n:]


[378, 1305, 1306, 946, 1030, 443, 1265, 439, 634, 562]