In [17]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt

In [18]:
rating = pd.read_csv("ml-20m/ratings.csv")
rating

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580
5,1,112,3.5,1094785740
6,1,151,4.0,1094785734
7,1,223,4.0,1112485573
8,1,253,4.0,1112484940
9,1,260,4.0,1112484826


In [3]:
rating.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [4]:
# Neighborhood-based Collaborative Filtering

# Data preprocessing
rating['userId'] = rating['userId'].fillna(0)
rating['movieId'] = rating['movieId'].fillna(0)
rating['rating'] = rating['rating'].fillna(rating['rating'].mean())

# Start developing using a small dataset < 10,000 users / < 1,000 items
sample = rating.sample(frac = 0.0001)
print(sample.info())

# Split the data into testing and training sets
train,test = train_test_split(sample, test_size = 0.2)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 10593063 to 4289793
Data columns (total 4 columns):
userId       2000 non-null int64
movieId      2000 non-null int64
rating       2000 non-null float64
timestamp    2000 non-null int64
dtypes: float64(1), int64(3)
memory usage: 78.1 KB
None


In [5]:
# Transform into pivot table

train = train.pivot(index='userId', columns='movieId', values='rating')
train = train.fillna(0)

In [6]:
test = test.pivot(index='userId', columns='movieId', values='rating')
test = test.fillna(0)

In [16]:
test.head()

movieId,5,6,7,11,14,17,19,21,22,25,...,81591,82459,86206,88672,90890,92259,95088,105213,106487,112552
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1174,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
train.shape[0], train.shape[1]

(1562, 1102)

In [13]:
test.shape[0], test.shape[1]

(399, 353)

In [10]:
# Use Pearson Correlation Coefficient to calculate the item similarity Matrix
pearson_sim = 1 - pairwise_distances(train, metric="correlation")
pd.DataFrame(pearson_sim)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1552,1553,1554,1555,1556,1557,1558,1559,1560,1561
0,1.000000,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,...,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908
1,-0.000908,1.000000,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,...,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908
2,-0.000908,-0.000908,1.000000,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,...,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908
3,-0.000908,-0.000908,-0.000908,1.000000,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,...,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908
4,-0.000908,-0.000908,-0.000908,-0.000908,1.000000,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,...,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908
5,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,1.000000,-0.000908,-0.000908,-0.000908,-0.000908,...,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908
6,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,1.000000,-0.000908,-0.000908,-0.000908,...,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908
7,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,1.000000,-0.000908,-0.000908,...,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908
8,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,1.000000,-0.000908,...,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908
9,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,1.000000,...,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908,-0.000908


In [None]:
#This function predicts the rating for specified user-item combination based on item-based approach
def predict_itembased(user_id, item_id, ratings, metric = metric, k=k):
    prediction= wtd_sum =0
    similarities, indices=findksimilaritems(item_id, ratings) #similar users based on correlation coefficients
    sum_wt = np.sum(similarities)-1
    product=1
    
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i]+1 == item_id:
            continue;
        else:
            product = ratings.iloc[user_id-1,indices.flatten()[i]] * (similarities[i])
            wtd_sum = wtd_sum + product                              
    prediction = int(round(wtd_sum/sum_wt))
    print '\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction)      

    return prediction

In [11]:
# Evaluation using RMSE
def RMSE(predict, actual):
    predict = predict[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(predict,actual))

# Predict ratings of items
item_predict = predict_itembased(train, pearson_sim)

# Calculate RMSE on the dataset
print('Item-based Collaborative Filtering RMSE on train data is:', RMSE(item_predict[:,2], train_matrix[:,2]))

ValueError: Dot product shape mismatch, (1562, 1102) vs (1562, 1562)

In [None]:
item_predict[:,2]

In [None]:
train_matrix

In [None]:
RMSE(item_predict[:,2], train_matrix[:,2])