In [1]:
import pandas as pd

In [2]:
path = "./u.data" 
df = pd.read_csv(path, sep='\t') 

In [3]:
type(df)

pandas.core.frame.DataFrame

In [4]:
df.head()

Unnamed: 0,UserID,ItemId,Rating,Timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
 df.columns

Index(['UserID', 'ItemId', 'Rating', 'Timestamp'], dtype='object')

In [6]:
df.shape

(100000, 4)

In [7]:
#create a variable n_users to find the total number of unique users in the data.
n_users = df.UserID.unique().shape[0] 

In [8]:
#create a variable n_items to find the total number of unique movies in the data
n_items = df['ItemId'].unique().shape[0] 

In [9]:
#print the counts of unique users and movies
print(str(n_users) + ' users') 

943 users


In [10]:
print(str(n_items) + ' movies') 

1682 movies


In [11]:
import numpy as np
import sklearn

In [12]:
#create a zero value matrix of size (n_users X n_items) to store the ratings in the cell of the matrix ratings.
ratings = np.zeros((n_users, n_items)) 

In [13]:
# for each tuple in the dataframe, df extract the information of each column of the row and store into the rating matrix cell value as below
for  row in df.itertuples():
	ratings[row[1]-1, row[2]-1] = row[3] 

In [14]:
type(ratings)

numpy.ndarray

In [15]:
ratings.shape

(943, 1682)

In [16]:
ratings

array([[ 5.,  3.,  4., ...,  0.,  0.,  0.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 5.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  5.,  0., ...,  0.,  0.,  0.]])

In [17]:
sparsity = float(len(ratings.nonzero()[0]))

In [18]:
sparsity /= (ratings.shape[0] * ratings.shape[1])

In [19]:
sparsity *= 100

In [20]:
print('Sparsity: {:4.2f}%'.format(sparsity))

Sparsity: 6.30%


In [21]:
from sklearn.cross_validation import train_test_split 
ratings_train, ratings_test = train_test_split(ratings,test_size=0.33, random_state=42)
ratings_test.shape



(312, 1682)

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
ratings_train, ratings_test = train_test_split(ratings,test_size=0.33, random_state=42)

In [24]:
ratings_test.shape

(312, 1682)

In [25]:
import numpy as np
import sklearn

In [26]:
dist_out = 1-sklearn.metrics.pairwise.cosine_distances(ratings_train)

In [27]:
type(dist_out)

numpy.ndarray

In [28]:
dist_out.shape

(631, 631)

In [29]:
dist_out

array([[ 1.        ,  0.36475764,  0.44246231, ...,  0.02010641,
         0.33107929,  0.25638518],
       [ 0.36475764,  1.        ,  0.42635255, ...,  0.06694419,
         0.27339314,  0.22337268],
       [ 0.44246231,  0.42635255,  1.        , ...,  0.06675756,
         0.25424373,  0.22320126],
       ..., 
       [ 0.02010641,  0.06694419,  0.06675756, ...,  1.        ,
         0.04853428,  0.05142508],
       [ 0.33107929,  0.27339314,  0.25424373, ...,  0.04853428,
         1.        ,  0.1198022 ],
       [ 0.25638518,  0.22337268,  0.22320126, ...,  0.05142508,
         0.1198022 ,  1.        ]])

In [30]:
user_pred = dist_out.dot(ratings_train) / np.array([np.abs(dist_out).sum(axis=1)]).T

In [31]:
from sklearn.metrics import mean_squared_error
def get_mse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

In [32]:
get_mse(user_pred, ratings_train)

7.8821939915510031

In [33]:
get_mse(user_pred, ratings_test)

8.9224954316965484

In [34]:
#Find top N nearest neighbours

In [35]:
k=5
from sklearn.neighbors import NearestNeighbors

In [36]:
#define  NearestNeighbors object by passing k and the similarity method as parameters.
neigh = NearestNeighbors(k,'cosine')

In [37]:
#fit the training data to the nearestNeighbor object
neigh.fit(ratings_train)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius='cosine')

In [38]:
#calculate the top5 similar users for each user and their similarity  values, i.e. the distance values between each pair of users.
top_k_distances,top_k_users = neigh.kneighbors(ratings_train, return_distance=True)

In [39]:
top_k_distances.shape

(631, 5)

In [40]:
top_k_users.shape

(631, 5)

In [41]:
top_k_users[0]

array([  0,  82, 511, 184, 207], dtype=int64)

In [42]:
user_pred_k = np.zeros(ratings_train.shape)

In [43]:
for i in range(ratings_train.shape[0]):
    user_pred_k[i,:] =   top_k_distances[i].T.dot(ratings_train[top_k_users][i])/np.array([np.abs(top_k_distances[i].T).sum(axis=0)]).T

In [44]:
user_pred_k.shape

(631, 1682)

In [45]:
user_pred_k

array([[ 3.25379713,  1.75556855,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.48370298,  0.        ,  1.24948776, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.01011767,  0.73826825,  0.7451635 , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.74469557,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.9753676 ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [46]:
get_mse(user_pred_k, ratings_train)

8.1956065114538781

In [47]:
get_mse(user_pred_k, ratings_test)

11.602073790588975

In [48]:
#Since we have to calculate the similarity between movies, we use movie count as k instead of user count

k = ratings_train.shape[1]

In [49]:
neigh = NearestNeighbors(k,'cosine')

In [50]:
#we fit the transpose of the rating matrix to the Nearest Neighbors object
neigh.fit(ratings_train.T)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=1682, p=2,
         radius='cosine')

In [51]:
#calcualte the cosine similarity distance between each movie pairs
top_k_distances,top_k_users = neigh.kneighbors(ratings_train.T, return_distance=True)

In [52]:
top_k_distances.shape

(1682, 1682)

In [53]:
item__pred = ratings_train.dot(top_k_distances) / np.array([np.abs(top_k_distances).sum(axis=1)])

In [54]:
item__pred.shape

(631, 1682)

In [55]:
item__pred

array([[ 0.        ,  0.51752631,  0.60019695, ...,  2.31664301,
         2.34134745,  2.46671096],
       [ 0.        ,  0.31976603,  0.37168534, ...,  1.34680571,
         1.34897863,  1.44314592],
       [ 0.        ,  0.50619664,  0.58685005, ...,  2.5337623 ,
         2.57055505,  2.74749235],
       ..., 
       [ 0.        ,  0.08945322,  0.10271303, ...,  0.41949597,
         0.41995047,  0.45733339],
       [ 0.        ,  0.25785693,  0.29819614, ...,  1.30767892,
         1.32470838,  1.41198324],
       [ 0.        ,  0.07197376,  0.08524505, ...,  0.25523416,
         0.25259761,  0.26155752]])

In [57]:
get_mse(item__pred, ratings_train)

11.130000188318895

In [58]:
get_mse(item__pred,ratings_test)

12.128683035513326

In [59]:
k = 40

In [60]:
neigh2 = NearestNeighbors(k,'cosine')

In [61]:
neigh2.fit(ratings_train.T)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=40, p=2,
         radius='cosine')

In [62]:
top_k_distances,top_k_movies = neigh2.kneighbors(ratings_train.T, return_distance=True)

In [63]:
#rating prediction - top k user based

In [64]:
pred = np.zeros(ratings_train.T.shape)

In [None]:
for i in range(ratings_train.T.shape[0]):
    pred[i,:] = top_k_distances[i].dot(ratings_train.T[top_k_users][i])/np.array([np.abs(top_k_distances[i]).sum(axis=0)]).T

In [None]:
get_mse(item_pred_k, ratings_train)

In [None]:
# get_mse(item_pred_k,ratings_test)