# Project 4: Collaborative filtering
## Group 8: Suraj and Pankajdeer

In [29]:
# import libraries
import random
import os
import numpy as np
import pandas as pd
import collections
import math

from pathlib import Path
import time

from random import randrange
from shutil import copyfile
from numpy.linalg import norm
from math import sqrt

from sklearn.metrics import mean_squared_error


In [30]:
# Mount Google Drive (Colab only) so the dataset CSVs under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
# Directory that holds both rating CSVs. Switch environments by changing this
# single line instead of editing every path.
DATA_DIR = '/content/drive/My Drive/Coursework/CS 5683_Big_Data/Project-4'
#DATA_DIR = '/content/drive/My Drive/3Sem/CS5683/Projects/Project 4'
TRAIN_RATINGS_PATH = os.path.join(DATA_DIR, 'training_dataset.csv')
TEST_RATINGS_PATH = os.path.join(DATA_DIR, 'test_dataset.csv')

# read the data
# Both files are parsed identically: the first line is skipped (presumably a
# header row -- confirm against the CSVs) and the first three columns are
# loaded under the names below.
RATING_COLUMNS = ["user_id", "item_id", "rating"]
train_data = pd.read_csv(TRAIN_RATINGS_PATH, sep=",", engine="python",
                         names=RATING_COLUMNS, usecols=[0, 1, 2], skiprows=1)
test_data = pd.read_csv(TEST_RATINGS_PATH, sep=",", engine="python",
                        names=RATING_COLUMNS, usecols=[0, 1, 2], skiprows=1)

In [32]:
# checking sparsity: share of (user, item) cells in the utility matrix that
# have no rating.
x = train_data.pivot_table(index='user_id',columns='item_id',values='rating')

# Per-item fraction of users without a rating for that item. Every column has
# the same length, so the mean of these fractions is the overall sparsity.
sparsity = x.isna().mean(axis=0).tolist()
print('Sparsity = ', round(np.mean(sparsity),4))

Sparsity =  0.9395


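 Goal: Predict the rating a user would give to a movie. <br>
 Step 1: For each (user, movie) pair in the test set, find the movies the user has already rated that are most similar to the target movie, using item-item cosine (or adjusted cosine) similarity. <br>
 Step 2: Predict the movie rating as the similarity-weighted average of the user's ratings on those similar movies. <br>
 Step 3: Evaluate the predictions on the test set using RMSE.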

In [33]:
# construct the utility matrix: one row per user, one column per movie,
# NaN wherever the user has not rated the movie
umDf = train_data.pivot_table(index='user_id', columns='item_id', values='rating')

# user and movie labels of the train data, in the same order as the matrix
# rows and columns
users = list(umDf.index)
movies = list(umDf.columns)

# one mean-rating slot per user, filled in further down
user_mean = [0] * umDf.shape[0]

# two independent array copies of the utility matrix:
#   um     -> used with the plain cosine similarity
#   um_adj -> used with the adjusted cosine similarity
um = umDf.copy().values
um_adj = umDf.copy().values

# Computing the user mean and global mean.
# `um` still marks unrated cells with NaN at this point (they are only replaced
# by zeros further down), so NaN is the "no rating" marker here.
has_rating = ~np.isnan(um)
ratings_per_user = has_rating.sum(axis=1)
rating_sum_per_user = np.nansum(um, axis=1)

# Mean rating per user; a user without any rating keeps a mean of 0.
user_mean = list(np.divide(rating_sum_per_user, ratings_per_user,
                           out=np.zeros(um.shape[0]),
                           where=ratings_per_user > 0))

ratings_sum = rating_sum_per_user.sum()
ratings_count = int(ratings_per_user.sum())

# Computing the global mean
if ratings_count == 0:
  # Fail with an explicit message instead of a bare division-by-zero error.
  raise ValueError("training data contains no ratings; cannot compute the global mean")
global_mean = ratings_sum / ratings_count

# removing user bias in um_adj and replace NaN values with zero in both um_adj and um
# NOTE(review): a rated cell whose rating equals the user's mean also becomes
# exactly 0 in um_adj, so it cannot be told apart from an unrated cell by the
# `!= 0` check in top_sim_items.
unrated = np.isnan(um)

# Subtract each user's mean from that user's whole row (in place); the unrated
# cells turn into NaN - mean = NaN and are zeroed right after.
um_adj -= np.asarray(user_mean, dtype=float)[:, None]
um_adj[unrated] = 0
um[unrated] = 0

# Computing movie mean
# `um` stores 0 for unrated cells at this point, so zeros are left out of the
# count and contribute nothing to the column sums.
ratings_per_movie = (um != 0).sum(axis=0)
rating_sum_per_movie = um.sum(axis=0)

# Mean rating per movie; a movie without any non-zero rating keeps a mean of 0.
movie_mean = list(np.divide(rating_sum_per_movie, ratings_per_movie,
                            out=np.zeros(um.shape[1]),
                            where=ratings_per_movie > 0))


In [34]:
def item_sim(um, i, j):
  """Cosine similarity between the i-th and j-th item columns of `um`.

  Args:
    um: 2-D array; columns are items (indexed as um[:, i]).
    i: column index of the first item.
    j: column index of the second item.

  Returns:
    dot(col_i, col_j) / (||col_i|| * ||col_j||), or 0.0 when either column
    has zero norm.
  """
  col_i = um[:, i]
  col_j = um[:, j]
  denom = norm(col_i) * norm(col_j)
  if denom == 0:
    # An all-zero column would give 0/0 = NaN; NaN makes the ordering of the
    # (similarity, index) tuples undefined when they are sorted later on.
    return 0.0
  return np.dot(col_i, col_j) / denom

In [35]:
def top_sim_items(um, movies, user_index, i, min_sim=0.01, top_k=20):
  """Select the items most similar to item `i` among those the user has rated.

  Only items with a non-zero entry in the user's row of `um` are candidates
  (item `i` itself is excluded). Candidates whose similarity is below
  `min_sim` are dropped and at most `top_k` of the remaining ones are kept.
  The original authors chose 20 neighbours because it gave better predictions
  than selecting on a similarity threshold alone.

  Args:
    um: 2-D array, rows are users and columns are items.
    movies: list of item ids; only its length is used here.
    user_index: row index of the user in `um`.
    i: column index of the target item.
    min_sim: minimum similarity a neighbour must reach (default 0.01).
    top_k: maximum number of neighbours returned (default 20).

  Returns:
    List of (similarity, item_index) tuples sorted in descending order.
  """
  candidates = []
  for j in range(len(movies)):
    if j == i or um[user_index][j] == 0:
      continue
    sim = item_sim(um, i, j)
    # Filter before sorting: a NaN similarity fails this comparison, so it is
    # dropped here and can never disturb the sort order.
    if sim >= min_sim:
      candidates.append((sim, j))

  candidates.sort(reverse=True)
  return candidates[:top_k]


In [36]:
def predict_rating(um,movies,user_mean, u, m):
  """Predict user `u`'s rating of movie `m` from cosine-similar items.

  The prediction is the similarity-weighted average of the user's entries in
  `um` over the neighbours returned by top_sim_items. The user's mean rating
  is returned instead when the movie is absent from the train data, or when
  no neighbour is available (the original code divided by zero in that case).

  Note: the user's row is looked up in the module-level `users` list, so
  `u` must be present in it (list.index raises ValueError otherwise).

  Args:
    um: 2-D array, rows are users and columns are items.
    movies: list of item ids in column order of `um`.
    user_mean: per-user mean ratings, in row order of `um`.
    u: user id.
    m: movie id.

  Returns:
    The predicted rating.
  """
  u_ind = users.index(u)
  if m not in movies:
    return user_mean[u_ind]

  m_ind = movies.index(m)
  neighbours = top_sim_items(um, movies, u_ind, m_ind)
  num = sum(sim * um[u_ind][j] for sim, j in neighbours)
  den = sum(sim for sim, _ in neighbours)
  if den == 0:
    # No usable neighbour: fall back to the user's mean rating.
    return user_mean[u_ind]
  return num / den

def predict_rating_adj(um,movies,user_mean, u, m):
  """Predict user `u`'s rating of movie `m` from adjusted-cosine-similar items.

  `um` is expected to be the mean-centred matrix (it is called with `um_adj`
  in this notebook), so the similarity-weighted average is an offset that is
  added back onto the user's mean rating. The user's mean rating alone is
  returned when the movie is absent from the train data, or when no
  neighbour is available (the original code divided by zero in that case).

  Note: the user's row is looked up in the module-level `users` list, so
  `u` must be present in it (list.index raises ValueError otherwise).

  Args:
    um: 2-D array, rows are users and columns are items.
    movies: list of item ids in column order of `um`.
    user_mean: per-user mean ratings, in row order of `um`.
    u: user id.
    m: movie id.

  Returns:
    The predicted rating.
  """
  u_ind = users.index(u)
  if m not in movies:
    return user_mean[u_ind]

  m_ind = movies.index(m)
  neighbours = top_sim_items(um, movies, u_ind, m_ind)
  num = sum(sim * um[u_ind][j] for sim, j in neighbours)
  den = sum(sim for sim, _ in neighbours)
  if den == 0:
    # No usable neighbour: the best estimate left is the user's mean rating.
    return user_mean[u_ind]
  return num / den + user_mean[u_ind]

In [38]:
# Score the plain cosine-similarity predictor on the test data.
u_id, m_id = test_data["user_id"], test_data["item_id"]
Y = test_data["rating"].values

predictions = []
start = time.time()
for i in range(len(u_id)):
  predictions.append(predict_rating(um, movies, user_mean, u_id[i], m_id[i]))
  # Running RMSE over the rows predicted so far, reported every 200 rows.
  if i > 0 and i % 200 == 0:
    print("RMSE at ", i, " :", round(sqrt(mean_squared_error(Y[: i + 1], predictions)),4))

print('Total CPU time = ', time.time() - start)
print("RMSE for the test data: ", round(sqrt(mean_squared_error(Y, predictions)),4))

RMSE at  200  : 0.9917
RMSE at  400  : 0.9429
RMSE at  600  : 0.9618
RMSE at  800  : 0.961
RMSE at  1000  : 0.9601
RMSE at  1200  : 0.9425
RMSE at  1400  : 0.9302
RMSE at  1600  : 0.9268
RMSE at  1800  : 0.9291
RMSE at  2000  : 0.9277
RMSE at  2200  : 0.9263
RMSE at  2400  : 0.9286
RMSE at  2600  : 0.9305
RMSE at  2800  : 0.9357
RMSE at  3000  : 0.9425
RMSE at  3200  : 0.9373
RMSE at  3400  : 0.9401
RMSE at  3600  : 0.9397
RMSE at  3800  : 0.9377
RMSE at  4000  : 0.9381
RMSE at  4200  : 0.9409
RMSE at  4400  : 0.9432
RMSE at  4600  : 0.9404
RMSE at  4800  : 0.9379
Total CPU time =  36.045615673065186
RMSE for the test data:  0.9393


In [39]:
# Score the adjusted cosine-similarity predictor (mean-centred matrix) on the test data.
u_id, m_id = test_data["user_id"], test_data["item_id"]
Y = test_data["rating"].values

predictions = []
start = time.time()
for i in range(len(u_id)):
  predictions.append(predict_rating_adj(um_adj, movies, user_mean, u_id[i], m_id[i]))
  # Running RMSE over the rows predicted so far, reported every 200 rows.
  if i > 0 and i % 200 == 0:
    print("RMSE at ", i, " :", round(sqrt(mean_squared_error(Y[: i + 1], predictions)),4))

print('Total CPU time = ', time.time() - start)
print("RMSE for the test data", round(sqrt(mean_squared_error(Y, predictions)),4))

RMSE at  200  : 0.9559
RMSE at  400  : 0.8989
RMSE at  600  : 0.8955
RMSE at  800  : 0.8977
RMSE at  1000  : 0.9001
RMSE at  1200  : 0.8906
RMSE at  1400  : 0.8773
RMSE at  1600  : 0.8773
RMSE at  1800  : 0.8811
RMSE at  2000  : 0.878
RMSE at  2200  : 0.8797
RMSE at  2400  : 0.8788
RMSE at  2600  : 0.8809
RMSE at  2800  : 0.8826
RMSE at  3000  : 0.8882
RMSE at  3200  : 0.8859
RMSE at  3400  : 0.8885
RMSE at  3600  : 0.8862
RMSE at  3800  : 0.8869
RMSE at  4000  : 0.8849
RMSE at  4200  : 0.8837
RMSE at  4400  : 0.8869
RMSE at  4600  : 0.8855
RMSE at  4800  : 0.8819
Total CPU time =  35.049400091171265
RMSE for the test data 0.8829
