<a href="https://colab.research.google.com/github/sumitmamtani1/House-Price-Prediction/blob/main/AdvancedRecommenderSystem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
import pandas as pd

In [6]:
column_names = ['user_id', 'item_id', 'rating', 'timestamp']

In [7]:
df = pd.read_csv('/content/u.data', sep = '\t', names = column_names)

In [8]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [9]:
movie_titles = pd.read_csv('/content/Movie_Id_Titles')

In [10]:
movie_titles.head()

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [11]:
ratings = pd.merge(df,movie_titles, on ='item_id')

In [12]:
ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


In [13]:
n_users = ratings.user_id.nunique()
n_items = ratings.item_id.nunique()

In [14]:
print('Number of users :' + str(n_users))
print('Number of ratings :' + str(n_items) )

Number of users :943
Number of ratings :1682


In [15]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(ratings, test_size = 0.25)

In [16]:
print(train_data)

       user_id  item_id  rating  timestamp                               title
42497      122      135       4  879270327        2001: A Space Odyssey (1968)
65808      746      127       2  885075243               Godfather, The (1972)
11655      389      656       5  879991175                            M (1931)
55612      666      510       4  880139409       Magnificent Seven, The (1954)
64849      846       64       4  883948221    Shawshank Redemption, The (1994)
...        ...      ...     ...        ...                                 ...
86852      546      760       5  885140808                    Screamers (1995)
17137      372      649       3  876869977  Once Upon a Time in America (1984)
78212      881      195       4  876539636              Terminator, The (1984)
5201       293      194       4  888906045                   Sting, The (1973)
47881      548      344       1  891042530                 Apostle, The (1997)

[75000 rows x 5 columns]


In [17]:
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
  train_data_matrix[line[1] - 1, line[2] - 1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
  test_data_matrix[line[1] - 1, line[2] - 1] = line[3]


In [18]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric = 'cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric = 'cosine')


In [19]:
def predict(ratings, similarity, type = 'user'):
  if type == 'user':
    mean_user_rating = ratings.mean(axis = 1)
    ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
    pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
  elif type == 'item':
    pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis = 1)])
  return pred

In [20]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [21]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(prediction, ground_truth):
  prediction = prediction[ground_truth.nonzero()].flatten()
  ground_truth = ground_truth[ground_truth.nonzero()].flatten()
  return sqrt(mean_squared_error(prediction, ground_truth))

In [22]:
print('User-based CF rmse :' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF rmse :' + str(rmse(item_prediction, test_data_matrix)))

User-based CF rmse :3.138027310309598
Item-based CF rmse :3.463884648932448


In [27]:
sparsity = round(1.0 - len(ratings) / float(n_users * n_items), 3) * 100
print('Sparsity for MovieLens dataset is ' + str(sparsity) + '%')

Sparsity for MovieLens dataset is 93.7%


In [34]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

u, s, vt = svds(train_data_matrix, k = 20)
s_diag_matrix = np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)

print("User-based CF rmse : " + str(rmse(X_pred, test_data_matrix)))

User-based CF rmse : 2.7309183266360253
